third_party/hunspell/src/parsers/textparser.cxx - Issue 2544793003: [spellcheck] Updated Hunspell to 1.5.4

Unified Diff: third_party/hunspell/src/parsers/textparser.cxx

Issue 2544793003: [spellcheck] Updated Hunspell to 1.5.4 (Closed)

Patch Set: Test Created 4 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Index: third_party/hunspell/src/parsers/textparser.cxx

diff --git a/third_party/hunspell/src/parsers/textparser.cxx b/third_party/hunspell/src/parsers/textparser.cxx

index 0338136808370e52b0bcd5955ad3469fd9faed38..7f94a2aa7b04b451fe2678cb7571a718e623cca7 100644

--- a/third_party/hunspell/src/parsers/textparser.cxx

+++ b/third_party/hunspell/src/parsers/textparser.cxx

@@ -1,3 +1,43 @@

+/* ***** BEGIN LICENSE BLOCK *****

+ * Version: MPL 1.1/GPL 2.0/LGPL 2.1

+ *

+ * The contents of this file are subject to the Mozilla Public License Version

+ * 1.1 (the "License"); you may not use this file except in compliance with

+ * the License. You may obtain a copy of the License at

+ * http://www.mozilla.org/MPL/

+ *

+ * Software distributed under the License is distributed on an "AS IS" basis,

+ * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License

+ * for the specific language governing rights and limitations under the

+ * License.

+ *

+ * The Original Code is Hunspell, based on MySpell.

+ *

+ * The Initial Developers of the Original Code are

+ * Kevin Hendricks (MySpell) and Németh László (Hunspell).

+ *

+ * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,

+ * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,

+ * Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,

+ * Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,

+ * Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen

+ *

+ * Alternatively, the contents of this file may be used under the terms of

+ * either the GNU General Public License Version 2 or later (the "GPL"), or

+ * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),

+ * in which case the provisions of the GPL or the LGPL are applicable instead

+ * of those above. If you wish to allow use of your version of this file only

+ * under the terms of either the GPL or the LGPL, and not to allow others to

+ * use your version of this file under the terms of the MPL, indicate your

+ * decision by deleting the provisions above and replace them with the notice

+ * and other provisions required by the GPL or the LGPL. If you do not delete

+ * the provisions above, a recipient may use your version of this file under

+ * the terms of any one of the MPL, the GPL or the LGPL.

+ *

+ * ***** END LICENSE BLOCK ***** */

#include <cstdlib>

#include <cstring>

#include <cstdio>

@@ -6,286 +46,259 @@

#include "../hunspell/csutil.hxx"

#include "textparser.hxx"

+#include <algorithm>

#ifndef W32

using namespace std;

#endif

// ISO-8859-1 HTML character entities

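-static const char * LATIN1[] = {

- "&Agrave;",

- "&Atilde;",

- "&Aring;",

- "&AElig;",

- "&Egrave;",

- "&Ecirc;",

- "&Igrave;",

- "&Iuml;",

- "&ETH;",

- "&Ntilde;",

- "&Ograve;",

- "&Oslash;",

- "&Ugrave;",

- "&THORN;",

- "&agrave;",

- "&atilde;",

- "&aring;",

- "&aelig;",

- "&egrave;",

- "&ecirc;",

- "&igrave;",

- "&iuml;",

- "&eth;",

- "&ntilde;",

- "&ograve;",

- "&oslash;",

- "&ugrave;",

- "&thorn;",

- "&yuml;"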

-};

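+static const char* LATIN1[] = {

+ "&Agrave;", "&Atilde;", "&Aring;", "&AElig;", "&Egrave;", "&Ecirc;",

+ "&Igrave;", "&Iuml;", "&ETH;", "&Ntilde;", "&Ograve;", "&Oslash;",

+ "&Ugrave;", "&THORN;", "&agrave;", "&atilde;", "&aring;", "&aelig;",

+ "&egrave;", "&ecirc;", "&igrave;", "&iuml;", "&eth;", "&ntilde;",

+ "&ograve;", "&oslash;", "&ugrave;", "&thorn;", "&yuml;"};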

-#define LATIN1_LEN (sizeof(LATIN1) / sizeof(char *))

+#define LATIN1_LEN (sizeof(LATIN1) / sizeof(char*))

-TextParser::TextParser() {

- init((char *) NULL);

+#define ENTITY_APOS "&apos;"

+#define UTF8_APOS "\xe2\x80\x99"

+#define APOSTROPHE "'"

-TextParser::TextParser(const char * wordchars)

- init(wordchars);

+TextParser::TextParser(const char* wordchars) {

+ init(wordchars);

}

-TextParser::TextParser(unsigned short * wordchars, int len)

- init(wordchars, len);

+TextParser::TextParser(const w_char* wordchars, int len) {

+ init(wordchars, len);

}

-TextParser::~TextParser()

+TextParser::~TextParser() {}

-int TextParser::is_wordchar(char * w)

- if (*w == '\0') return 0;

- if (utf8) {

- w_char wc;

- unsigned short idx;

- u8_u16(&wc, 1, w);

- idx = (wc.h << 8) + wc.l;

- return (unicodeisalpha(idx) || (wordchars_utf16 && flag_bsearch(wordchars_utf16, *((unsigned short *) &wc), wclen)));

- } else {

- return wordcharacters[(*w + 256) % 256];

- }

+int TextParser::is_wordchar(const char* w) {

+ if (*w == '\0')

+ return 0;

+ if (utf8) {

+ std::vector<w_char> wc;

+ unsigned short idx;

+ u8_u16(wc, w);

+ if (wc.empty())

+ return 0;

+ idx = (wc[0].h << 8) + wc[0].l;

+ return (unicodeisalpha(idx) ||

+ (wordchars_utf16 &&

+ std::binary_search(wordchars_utf16, wordchars_utf16 + wclen, wc[0])));

+ } else {

+ return wordcharacters[(*w + 256) % 256];

+ }

}

-const char * TextParser::get_latin1(char * s)

- if (s[0] == '&') {

- unsigned int i = 0;

- while ((i < LATIN1_LEN) &&

- strncmp(LATIN1[i], s, strlen(LATIN1[i]))) i++;

- if (i != LATIN1_LEN) return LATIN1[i];

- }

- return NULL;

+const char* TextParser::get_latin1(const char* s) {

+ if (s[0] == '&') {

+ unsigned int i = 0;

+ while ((i < LATIN1_LEN) && strncmp(LATIN1[i], s, strlen(LATIN1[i])))

+ i++;

+ if (i != LATIN1_LEN)

+ return LATIN1[i];

+ }

+ return NULL;

}

-void TextParser::init(const char * wordchars)

- for (int i = 0; i < MAXPREVLINE; i++) {

- line[i][0] = '\0';

- }

- actual = 0;

- head = 0;

- token = 0;

- state = 0;

- utf8 = 0;

- checkurl = 0;

- unsigned int j;

- for (j = 0; j < 256; j++) {

- wordcharacters[j] = 0;

- }

- if (!wordchars) wordchars = "qwertzuiopasdfghjklyxcvbnmQWERTZUIOPASDFGHJKLYXCVBNM";

- for (j = 0; j < strlen(wordchars); j++) {

- wordcharacters[(wordchars[j] + 256) % 256] = 1;

- }

+void TextParser::init(const char* wordchars) {

+ actual = 0;

+ head = 0;

+ token = 0;

+ state = 0;

+ utf8 = 0;

+ checkurl = 0;

+ wordchars_utf16 = NULL;

+ wclen = 0;

+ unsigned int j;

+ for (j = 0; j < 256; j++) {

+ wordcharacters[j] = 0;

+ }

+ if (!wordchars)

+ wordchars = "qwertzuiopasdfghjklyxcvbnmQWERTZUIOPASDFGHJKLYXCVBNM";

+ for (j = 0; j < strlen(wordchars); j++) {

+ wordcharacters[(wordchars[j] + 256) % 256] = 1;

+ }

}

-void TextParser::init(unsigned short * wc, int len)

- for (int i = 0; i < MAXPREVLINE; i++) {

- line[i][0] = '\0';

- }

- actual = 0;

- head = 0;

- token = 0;

- state = 0;

- utf8 = 1;

- checkurl = 0;

- wordchars_utf16 = wc;

- wclen = len;

+void TextParser::init(const w_char* wc, int len) {

+ actual = 0;

+ head = 0;

+ token = 0;

+ state = 0;

+ utf8 = 1;

+ checkurl = 0;

+ wordchars_utf16 = wc;

+ wclen = len;

}

-int TextParser::next_char(char * line, int * pos) {

- if (*(line + *pos) == '\0') return 1;

- if (utf8) {

- if (*(line + *pos) >> 7) {

- // jump to next UTF-8 character

- for((*pos)++; (*(line + *pos) & 0xc0) == 0x80; (*pos)++);

- } else {

- (*pos)++;

- }

- } else (*pos)++;

- return 0;

+int TextParser::next_char(const char* ln, size_t* pos) {

+ if (*(ln + *pos) == '\0')

+ return 1;

+ if (utf8) {

+ if (*(ln + *pos) >> 7) {

+ // jump to next UTF-8 character

+ for ((*pos)++; (*(ln + *pos) & 0xc0) == 0x80; (*pos)++)

+ ;

+ } else {

+ (*pos)++;

+ }

+ } else

+ (*pos)++;

+ return 0;

}

-void TextParser::put_line(char * word)

- actual = (actual + 1) % MAXPREVLINE;

- strcpy(line[actual], word);

- token = 0;

- head = 0;

- check_urls();

+void TextParser::put_line(const char* word) {

+ actual = (actual + 1) % MAXPREVLINE;

+ line[actual].assign(word);

+ token = 0;

+ head = 0;

+ check_urls();

}

-char * TextParser::get_prevline(int n)

- return mystrdup(line[(actual + MAXPREVLINE - n) % MAXPREVLINE]);

+std::string TextParser::get_prevline(int n) const {

+ return line[(actual + MAXPREVLINE - n) % MAXPREVLINE];

}

-char * TextParser::get_line()

- return get_prevline(0);

+std::string TextParser::get_line() const {

+ return get_prevline(0);

}

-char * TextParser::next_token()

- const char * latin1;

- for (;;) {

- switch (state)

- {

- case 0: // non word chars

- if (is_wordchar(line[actual] + head)) {

- state = 1;

- token = head;

- } else if ((latin1 = get_latin1(line[actual] + head))) {

- state = 1;

- token = head;

- head += strlen(latin1);

- }

- break;

- case 1: // wordchar

- if ((latin1 = get_latin1(line[actual] + head))) {

- head += strlen(latin1);

- } else if (! is_wordchar(line[actual] + head)) {

- state = 0;

- char * t = alloc_token(token, &head);

- if (t) return t;

- }

- break;

- }

- if (next_char(line[actual], &head)) return NULL;

- }

+bool TextParser::next_token(std::string &t) {

+ const char* latin1;

-int TextParser::get_tokenpos()

- return token;

+ for (;;) {

+ switch (state) {

+ case 0: // non word chars

+ if (is_wordchar(line[actual].c_str() + head)) {

+ state = 1;

+ token = head;

+ } else if ((latin1 = get_latin1(line[actual].c_str() + head))) {

+ state = 1;

+ token = head;

+ head += strlen(latin1);

+ }

+ break;

+ case 1: // wordchar

+ if ((latin1 = get_latin1(line[actual].c_str() + head))) {

+ head += strlen(latin1);

+ } else if ((is_wordchar((char*)APOSTROPHE) ||

+ (is_utf8() && is_wordchar((char*)UTF8_APOS))) &&

+ !line[actual].empty() && line[actual][head] == '\'' &&

+ is_wordchar(line[actual].c_str() + head + 1)) {

+ head++;

+ } else if (is_utf8() &&

+ is_wordchar((char*)APOSTROPHE) && // add Unicode apostrophe

+ // to the WORDCHARS, if

+ // needed

+ strncmp(line[actual].c_str() + head, UTF8_APOS, strlen(UTF8_APOS)) ==

+ 0 &&

+ is_wordchar(line[actual].c_str() + head + strlen(UTF8_APOS))) {

+ head += strlen(UTF8_APOS) - 1;

+ } else if (!is_wordchar(line[actual].c_str() + head)) {

+ state = 0;

+ if (alloc_token(token, &head, t))

+ return true;

+ }

+ break;

+ }

+ if (next_char(line[actual].c_str(), &head))

+ return false;

+ }

}

-int TextParser::change_token(const char * word)

- if (word) {

- char * r = mystrdup(line[actual] + head);

- strcpy(line[actual] + token, word);

- strcat(line[actual], r);

- head = token;

- free(r);

- return 1;

- }

- return 0;

+size_t TextParser::get_tokenpos() {

+ return token;

}

-void TextParser::check_urls()

- int url_state = 0;

- int url_head = 0;

- int url_token = 0;

- int url = 0;

- for (;;) {

- switch (url_state)

- {

- case 0: // non word chars

- if (is_wordchar(line[actual] + url_head)) {

- url_state = 1;

- url_token = url_head;

- // Unix path

- } else if (*(line[actual] + url_head) == '/') {

- url_state = 1;

- url_token = url_head;

- url = 1;

- }

- break;

- case 1: // wordchar

- char ch = *(line[actual] + url_head);

- // e-mail address

- if ((ch == '@') ||

- // MS-DOS, Windows path

- (strncmp(line[actual] + url_head, ":\\", 2) == 0) ||

- // URL

- (strncmp(line[actual] + url_head, "://", 3) == 0)) {

- url = 1;

- } else if (! (is_wordchar(line[actual] + url_head) ||

- (ch == '-') || (ch == '_') || (ch == '\\') ||

- (ch == '.') || (ch == ':') || (ch == '/') ||

- (ch == '~') || (ch == '%') || (ch == '*') ||

- (ch == '$') || (ch == '[') || (ch == ']') ||

- (ch == '?') || (ch == '!') ||

- ((ch >= '0') && (ch <= '9')))) {

- url_state = 0;

- if (url == 1) {

- for (int i = url_token; i < url_head; i++) {

- *(urlline + i) = 1;

- }

- url = 0;

- }

- break;

- }

- *(urlline + url_head) = 0;

- if (next_char(line[actual], &url_head)) return;

- }

+int TextParser::change_token(const char* word) {

+ if (word) {

+ std::string remainder(line[actual].substr(head));

+ line[actual].resize(token);

+ line[actual].append(word);

+ line[actual].append(remainder);

+ head = token;

+ return 1;

+ }

+ return 0;

}

-int TextParser::get_url(int token_pos, int * head)

- for (int i = *head; urlline[i] && *(line[actual]+i); i++, (*head)++);

- return checkurl ? 0 : urlline[token_pos];

+void TextParser::check_urls() {

+ urlline.resize(line[actual].size() + 1);

+ int url_state = 0;

+ size_t url_head = 0;

+ size_t url_token = 0;

+ int url = 0;

+ for (;;) {

+ switch (url_state) {

+ case 0: // non word chars

+ if (is_wordchar(line[actual].c_str() + url_head)) {

+ url_state = 1;

+ url_token = url_head;

+ // Unix path

+ } else if (line[actual][url_head] == '/') {

+ url_state = 1;

+ url_token = url_head;

+ url = 1;

+ }

+ break;

+ case 1: // wordchar

+ char ch = line[actual][url_head];

+ // e-mail address

+ if ((ch == '@') ||

+ // MS-DOS, Windows path

+ (strncmp(line[actual].c_str() + url_head, ":\\", 2) == 0) ||

+ // URL

+ (strncmp(line[actual].c_str() + url_head, "://", 3) == 0)) {

+ url = 1;

+ } else if (!(is_wordchar(line[actual].c_str() + url_head) || (ch == '-') ||

+ (ch == '_') || (ch == '\\') || (ch == '.') ||

+ (ch == ':') || (ch == '/') || (ch == '~') || (ch == '%') ||

+ (ch == '*') || (ch == '$') || (ch == '[') || (ch == ']') ||

+ (ch == '?') || (ch == '!') ||

+ ((ch >= '0') && (ch <= '9')))) {

+ url_state = 0;

+ if (url == 1) {

+ for (size_t i = url_token; i < url_head; ++i) {

+ urlline[i] = true;

+ }

+ url = 0;

+ }

+ break;

+ }

+ urlline[url_head] = false;

+ if (next_char(line[actual].c_str(), &url_head))

+ return;

+ }

}

-void TextParser::set_url_checking(int check)

- checkurl = check;

+int TextParser::get_url(size_t token_pos, size_t* hd) {

+ for (size_t i = *hd; i < line[actual].size() && urlline[i]; i++, (*hd)++)

+ ;

+ return checkurl ? 0 : urlline[token_pos];

}

+void TextParser::set_url_checking(int check) {

+ checkurl = check;

-char * TextParser::alloc_token(int token, int * head)

- if (get_url(token, head)) return NULL;

- char * t = (char *) malloc(*head - token + 1);

- if (t) {

- t[*head - token] = '\0';

- strncpy(t, line[actual] + token, *head - token);

- // remove colon for Finnish and Swedish language

- if (t[*head - token - 1] == ':') {

- t[*head - token - 1] = '\0';

- if (!t[0]) {

- free(t);

- return NULL;

- }

- return t;

+bool TextParser::alloc_token(size_t tokn, size_t* hd, std::string& t) {

+ size_t url_head = *hd;

+ if (get_url(tokn, &url_head))

+ return false;

+ t = line[actual].substr(tokn, *hd - tokn);

+ // remove colon for Finnish and Swedish language

+ if (!t.empty() && t[t.size() - 1] == ':') {

+ t.resize(t.size() - 1);

+ if (t.empty()) {

+ return false;

}

- fprintf(stderr,"Error - Insufficient Memory\n");

- return NULL;

+ }

+ return true;

}

« no previous file with comments | « third_party/hunspell/src/parsers/textparser.hxx ('k') | third_party/hunspell/src/parsers/xmlparser.hxx » ('j') | no next file with comments »