| Index: third_party/hunspell/src/parsers/textparser.cxx
|
| diff --git a/third_party/hunspell/src/parsers/textparser.cxx b/third_party/hunspell/src/parsers/textparser.cxx
|
| index 0338136808370e52b0bcd5955ad3469fd9faed38..7f94a2aa7b04b451fe2678cb7571a718e623cca7 100644
|
| --- a/third_party/hunspell/src/parsers/textparser.cxx
|
| +++ b/third_party/hunspell/src/parsers/textparser.cxx
|
| @@ -1,3 +1,43 @@
|
| +/* ***** BEGIN LICENSE BLOCK *****
|
| + * Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
| + *
|
| + * The contents of this file are subject to the Mozilla Public License Version
|
| + * 1.1 (the "License"); you may not use this file except in compliance with
|
| + * the License. You may obtain a copy of the License at
|
| + * http://www.mozilla.org/MPL/
|
| + *
|
| + * Software distributed under the License is distributed on an "AS IS" basis,
|
| + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
| + * for the specific language governing rights and limitations under the
|
| + * License.
|
| + *
|
| + * The Original Code is Hunspell, based on MySpell.
|
| + *
|
| + * The Initial Developers of the Original Code are
|
| + * Kevin Hendricks (MySpell) and Németh László (Hunspell).
|
| + * Portions created by the Initial Developers are Copyright (C) 2002-2005
|
| + * the Initial Developers. All Rights Reserved.
|
| + *
|
| + * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,
|
| + * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,
|
| + * Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,
|
| + * Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,
|
| + * Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen
|
| + *
|
| + * Alternatively, the contents of this file may be used under the terms of
|
| + * either the GNU General Public License Version 2 or later (the "GPL"), or
|
| + * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
| + * in which case the provisions of the GPL or the LGPL are applicable instead
|
| + * of those above. If you wish to allow use of your version of this file only
|
| + * under the terms of either the GPL or the LGPL, and not to allow others to
|
| + * use your version of this file under the terms of the MPL, indicate your
|
| + * decision by deleting the provisions above and replace them with the notice
|
| + * and other provisions required by the GPL or the LGPL. If you do not delete
|
| + * the provisions above, a recipient may use your version of this file under
|
| + * the terms of any one of the MPL, the GPL or the LGPL.
|
| + *
|
| + * ***** END LICENSE BLOCK ***** */
|
| +
|
| #include <cstdlib>
|
| #include <cstring>
|
| #include <cstdio>
|
| @@ -6,286 +46,259 @@
|
| #include "../hunspell/csutil.hxx"
|
| #include "textparser.hxx"
|
|
|
| +#include <algorithm>
|
| +
|
| #ifndef W32
|
| using namespace std;
|
| #endif
|
|
|
| // ISO-8859-1 HTML character entities
|
|
|
| -static const char * LATIN1[] = {
|
| - "&Agrave;",
|
| - "&Atilde;",
|
| - "&Aring;",
|
| - "&AElig;",
|
| - "&Egrave;",
|
| - "&Ecirc;",
|
| - "&Igrave;",
|
| - "&Iuml;",
|
| - "&ETH;",
|
| - "&Ntilde;",
|
| - "&Ograve;",
|
| - "&Oslash;",
|
| - "&Ugrave;",
|
| - "&THORN;",
|
| - "&agrave;",
|
| - "&atilde;",
|
| - "&aring;",
|
| - "&aelig;",
|
| - "&egrave;",
|
| - "&ecirc;",
|
| - "&igrave;",
|
| - "&iuml;",
|
| - "&eth;",
|
| - "&ntilde;",
|
| - "&ograve;",
|
| - "&oslash;",
|
| - "&ugrave;",
|
| - "&thorn;",
|
| - "&yuml;"
|
| -};
|
| +static const char* LATIN1[] = {  // HTML named entities matched by get_latin1(); each entry must keep its leading '&' because get_latin1() only compares input that starts with '&'
|
| + "&Agrave;", "&Atilde;", "&Aring;", "&AElig;", "&Egrave;", "&Ecirc;",
|
| + "&Igrave;", "&Iuml;", "&ETH;", "&Ntilde;", "&Ograve;", "&Oslash;",
|
| + "&Ugrave;", "&THORN;", "&agrave;", "&atilde;", "&aring;", "&aelig;",
|
| + "&egrave;", "&ecirc;", "&igrave;", "&iuml;", "&eth;", "&ntilde;",
|
| + "&ograve;", "&oslash;", "&ugrave;", "&thorn;", "&yuml;"};
|
|
|
| -#define LATIN1_LEN (sizeof(LATIN1) / sizeof(char *))
|
| +#define LATIN1_LEN (sizeof(LATIN1) / sizeof(char*))  // number of entries in LATIN1
|
|
|
| -TextParser::TextParser() {
|
| - init((char *) NULL);
|
| -}
|
| +#define ENTITY_APOS "&apos;"  // HTML entity spelling of the apostrophe (kept distinct from APOSTROPHE below)
|
| +#define UTF8_APOS "\xe2\x80\x99"  // U+2019 RIGHT SINGLE QUOTATION MARK as three UTF-8 bytes
|
| +#define APOSTROPHE "'"  // plain ASCII apostrophe
|
|
|
| -TextParser::TextParser(const char * wordchars)
|
| -{
|
| - init(wordchars);
|
| +TextParser::TextParser(const char* wordchars) {  // Builds a parser from an 8-bit word-character list; all setup is delegated to init(const char*).
|
| + init(wordchars);
|
| }
|
|
|
| -TextParser::TextParser(unsigned short * wordchars, int len)
|
| -{
|
| - init(wordchars, len);
|
| +TextParser::TextParser(const w_char* wordchars, int len) {  // Builds a parser from an array of len w_char word characters; setup is delegated to init(const w_char*, int), which selects UTF-8 mode.
|
| + init(wordchars, len);
|
| }
|
|
|
| -TextParser::~TextParser()
|
| -{
|
| -}
|
| +TextParser::~TextParser() {}  // Empty body: the destructor performs no explicit cleanup.
|
|
|
| -int TextParser::is_wordchar(char * w)
|
| -{
|
| - if (*w == '\0') return 0;
|
| - if (utf8) {
|
| - w_char wc;
|
| - unsigned short idx;
|
| - u8_u16(&wc, 1, w);
|
| - idx = (wc.h << 8) + wc.l;
|
| - return (unicodeisalpha(idx) || (wordchars_utf16 && flag_bsearch(wordchars_utf16, *((unsigned short *) &wc), wclen)));
|
| - } else {
|
| - return wordcharacters[(*w + 256) % 256];
|
| - }
|
| +int TextParser::is_wordchar(const char* w) {  // Returns non-zero when the character starting at w counts as a word character, 0 otherwise.
|
| + if (*w == '\0')  // the string terminator is never a word character
|
| + return 0;
|
| + if (utf8) {
|
| + std::vector<w_char> wc;
|
| + unsigned short idx;
|
| + u8_u16(wc, w);  // presumably decodes the UTF-8 text at w into w_char units (helper defined elsewhere); only wc[0] is used below
|
| + if (wc.empty())  // nothing was decoded: treat as a non-word character
|
| + return 0;
|
| + idx = (wc[0].h << 8) + wc[0].l;  // combine the high and low bytes into one 16-bit value
|
| + return (unicodeisalpha(idx) ||
|
| + (wordchars_utf16 &&
|
| + std::binary_search(wordchars_utf16, wordchars_utf16 + wclen, wc[0])));  // NOTE(review): binary_search assumes wordchars_utf16 is sorted — confirm at the caller
|
| + } else {
|
| + return wordcharacters[(*w + 256) % 256];  // + 256 then % 256 maps a negative char value onto table index 0..255
|
| + }
|
| }
|
|
|
| -const char * TextParser::get_latin1(char * s)
|
| -{
|
| - if (s[0] == '&') {
|
| - unsigned int i = 0;
|
| - while ((i < LATIN1_LEN) &&
|
| - strncmp(LATIN1[i], s, strlen(LATIN1[i]))) i++;
|
| - if (i != LATIN1_LEN) return LATIN1[i];
|
| - }
|
| - return NULL;
|
| +const char* TextParser::get_latin1(const char* s) {  // Returns the first LATIN1 entry that s begins with, or NULL when there is none.
|
| + if (s[0] == '&') {  // only text starting with '&' is compared against the table
|
| + unsigned int i = 0;
|
| + while ((i < LATIN1_LEN) && strncmp(LATIN1[i], s, strlen(LATIN1[i])))  // linear scan; stops at the first entry that is a prefix of s
|
| + i++;
|
| + if (i != LATIN1_LEN)  // i == LATIN1_LEN means no entry matched
|
| + return LATIN1[i];
|
| + }
|
| + return NULL;
|
| }
|
|
|
| -void TextParser::init(const char * wordchars)
|
| -{
|
| - for (int i = 0; i < MAXPREVLINE; i++) {
|
| - line[i][0] = '\0';
|
| - }
|
| - actual = 0;
|
| - head = 0;
|
| - token = 0;
|
| - state = 0;
|
| - utf8 = 0;
|
| - checkurl = 0;
|
| - unsigned int j;
|
| - for (j = 0; j < 256; j++) {
|
| - wordcharacters[j] = 0;
|
| - }
|
| - if (!wordchars) wordchars = "qwertzuiopasdfghjklyxcvbnmQWERTZUIOPASDFGHJKLYXCVBNM";
|
| - for (j = 0; j < strlen(wordchars); j++) {
|
| - wordcharacters[(wordchars[j] + 256) % 256] = 1;
|
| - }
|
| +void TextParser::init(const char* wordchars) {  // Resets the parser state for 8-bit text and builds the 256-entry word-character table from wordchars.
|
| + actual = 0;
|
| + head = 0;
|
| + token = 0;
|
| + state = 0;
|
| + utf8 = 0;  // 8-bit mode: is_wordchar() uses the wordcharacters table
|
| + checkurl = 0;
|
| + wordchars_utf16 = NULL;  // no w_char word-character list in this mode
|
| + wclen = 0;
|
| + unsigned int j;
|
| + for (j = 0; j < 256; j++) {  // clear every table slot first
|
| + wordcharacters[j] = 0;
|
| + }
|
| + if (!wordchars)  // no list supplied: fall back to the built-in ASCII letters
|
| + wordchars = "qwertzuiopasdfghjklyxcvbnmQWERTZUIOPASDFGHJKLYXCVBNM";
|
| + for (j = 0; j < strlen(wordchars); j++) {  // mark each listed character as a word character
|
| + wordcharacters[(wordchars[j] + 256) % 256] = 1;  // + 256 then % 256 keeps the index in 0..255 for negative char values
|
| + }
|
| }
|
|
|
| -void TextParser::init(unsigned short * wc, int len)
|
| -{
|
| - for (int i = 0; i < MAXPREVLINE; i++) {
|
| - line[i][0] = '\0';
|
| - }
|
| - actual = 0;
|
| - head = 0;
|
| - token = 0;
|
| - state = 0;
|
| - utf8 = 1;
|
| - checkurl = 0;
|
| - wordchars_utf16 = wc;
|
| - wclen = len;
|
| +void TextParser::init(const w_char* wc, int len) {  // Resets the parser state for UTF-8 text and records the caller's w_char word-character array.
|
| + actual = 0;
|
| + head = 0;
|
| + token = 0;
|
| + state = 0;
|
| + utf8 = 1;  // UTF-8 mode: is_wordchar() decodes characters and consults wordchars_utf16
|
| + checkurl = 0;
|
| + wordchars_utf16 = wc;  // the pointer is stored as-is; the array is not copied
|
| + wclen = len;  // number of elements in wc
|
| }
|
|
|
| -int TextParser::next_char(char * line, int * pos) {
|
| - if (*(line + *pos) == '\0') return 1;
|
| - if (utf8) {
|
| - if (*(line + *pos) >> 7) {
|
| - // jump to next UTF-8 character
|
| - for((*pos)++; (*(line + *pos) & 0xc0) == 0x80; (*pos)++);
|
| - } else {
|
| - (*pos)++;
|
| - }
|
| - } else (*pos)++;
|
| - return 0;
|
| +int TextParser::next_char(const char* ln, size_t* pos) {  // Advances *pos by one character of ln; returns 1 (leaving *pos unchanged) when already at the terminator, else 0.
|
| + if (*(ln + *pos) == '\0')
|
| + return 1;
|
| + if (utf8) {
|
| + if (*(ln + *pos) >> 7) {  // non-zero for a byte outside the 7-bit ASCII range
|
| + // jump to next UTF-8 character
|
| + for ((*pos)++; (*(ln + *pos) & 0xc0) == 0x80; (*pos)++)  // skip bytes of the form 10xxxxxx
|
| + ;
|
| + } else {
|
| + (*pos)++;  // ASCII byte: a single step
|
| + }
|
| + } else
|
| + (*pos)++;  // 8-bit mode: every character is one byte
|
| + return 0;
|
| }
|
|
|
| -void TextParser::put_line(char * word)
|
| -{
|
| - actual = (actual + 1) % MAXPREVLINE;
|
| - strcpy(line[actual], word);
|
| - token = 0;
|
| - head = 0;
|
| - check_urls();
|
| +void TextParser::put_line(const char* word) {  // Stores word as the newest of the MAXPREVLINE remembered lines and restarts tokenizing on it.
|
| + actual = (actual + 1) % MAXPREVLINE;  // move to the next slot, wrapping around
|
| + line[actual].assign(word);
|
| + token = 0;  // restart scanning at the beginning of the new line
|
| + head = 0;
|
| + check_urls();  // recompute the URL marks for the new line
|
| }
|
|
|
| -char * TextParser::get_prevline(int n)
|
| -{
|
| - return mystrdup(line[(actual + MAXPREVLINE - n) % MAXPREVLINE]);
|
| +std::string TextParser::get_prevline(int n) const {  // Returns a copy of the line stored n slots before the current one (n == 0 is the current line).
|
| + return line[(actual + MAXPREVLINE - n) % MAXPREVLINE];  // adding MAXPREVLINE before the modulo keeps the index non-negative for n up to MAXPREVLINE
|
| }
|
|
|
| -char * TextParser::get_line()
|
| -{
|
| - return get_prevline(0);
|
| +std::string TextParser::get_line() const {  // Returns a copy of the current line.
|
| + return get_prevline(0);
|
| }
|
|
|
| -char * TextParser::next_token()
|
| -{
|
| - const char * latin1;
|
| -
|
| - for (;;) {
|
| - switch (state)
|
| - {
|
| - case 0: // non word chars
|
| - if (is_wordchar(line[actual] + head)) {
|
| - state = 1;
|
| - token = head;
|
| - } else if ((latin1 = get_latin1(line[actual] + head))) {
|
| - state = 1;
|
| - token = head;
|
| - head += strlen(latin1);
|
| - }
|
| - break;
|
| - case 1: // wordchar
|
| - if ((latin1 = get_latin1(line[actual] + head))) {
|
| - head += strlen(latin1);
|
| - } else if (! is_wordchar(line[actual] + head)) {
|
| - state = 0;
|
| - char * t = alloc_token(token, &head);
|
| - if (t) return t;
|
| - }
|
| - break;
|
| - }
|
| - if (next_char(line[actual], &head)) return NULL;
|
| - }
|
| -}
|
| +bool TextParser::next_token(std::string &t) {  // Scans the current line from head; stores the next token in t and returns true, or returns false once the end of the line is reached.
|
| + const char* latin1;
|
|
|
| -int TextParser::get_tokenpos()
|
| -{
|
| - return token;
|
| + for (;;) {
|
| + switch (state) {
|
| + case 0: // non word chars
|
| + if (is_wordchar(line[actual].c_str() + head)) {
|
| + state = 1;
|
| + token = head;  // a word starts here: remember its offset
|
| + } else if ((latin1 = get_latin1(line[actual].c_str() + head))) {
|
| + state = 1;
|
| + token = head;
|
| + head += strlen(latin1);  // jump over the matched entity text
|
| + }
|
| + break;
|
| + case 1: // wordchar
|
| + if ((latin1 = get_latin1(line[actual].c_str() + head))) {
|
| + head += strlen(latin1);
|
| + } else if ((is_wordchar((char*)APOSTROPHE) ||
|
| + (is_utf8() && is_wordchar((char*)UTF8_APOS))) &&
|
| + !line[actual].empty() && line[actual][head] == '\'' &&
|
| + is_wordchar(line[actual].c_str() + head + 1)) {
|
| + head++;  // ASCII apostrophe followed by a word character: stay inside the token
|
| + } else if (is_utf8() &&
|
| + is_wordchar((char*)APOSTROPHE) && // add Unicode apostrophe
|
| + // to the WORDCHARS, if
|
| + // needed
|
| + strncmp(line[actual].c_str() + head, UTF8_APOS, strlen(UTF8_APOS)) ==
|
| + 0 &&
|
| + is_wordchar(line[actual].c_str() + head + strlen(UTF8_APOS))) {
|
| + head += strlen(UTF8_APOS) - 1;  // land on the last byte of the apostrophe; next_char() below steps past it
|
| + } else if (!is_wordchar(line[actual].c_str() + head)) {
|
| + state = 0;  // the word ended at head
|
| + if (alloc_token(token, &head, t))  // false means the token was rejected (URL-marked or empty), so keep scanning
|
| + return true;
|
| + }
|
| + break;
|
| + }
|
| + if (next_char(line[actual].c_str(), &head))  // non-zero: head is at the line terminator
|
| + return false;
|
| + }
|
| }
|
|
|
| -int TextParser::change_token(const char * word)
|
| -{
|
| - if (word) {
|
| - char * r = mystrdup(line[actual] + head);
|
| - strcpy(line[actual] + token, word);
|
| - strcat(line[actual], r);
|
| - head = token;
|
| - free(r);
|
| - return 1;
|
| - }
|
| - return 0;
|
| +size_t TextParser::get_tokenpos() {  // Returns token, the offset in the current line where the most recent word started.
|
| + return token;
|
| }
|
|
|
| -void TextParser::check_urls()
|
| -{
|
| - int url_state = 0;
|
| - int url_head = 0;
|
| - int url_token = 0;
|
| - int url = 0;
|
| - for (;;) {
|
| - switch (url_state)
|
| - {
|
| - case 0: // non word chars
|
| - if (is_wordchar(line[actual] + url_head)) {
|
| - url_state = 1;
|
| - url_token = url_head;
|
| - // Unix path
|
| - } else if (*(line[actual] + url_head) == '/') {
|
| - url_state = 1;
|
| - url_token = url_head;
|
| - url = 1;
|
| - }
|
| - break;
|
| - case 1: // wordchar
|
| - char ch = *(line[actual] + url_head);
|
| - // e-mail address
|
| - if ((ch == '@') ||
|
| - // MS-DOS, Windows path
|
| - (strncmp(line[actual] + url_head, ":\\", 2) == 0) ||
|
| - // URL
|
| - (strncmp(line[actual] + url_head, "://", 3) == 0)) {
|
| - url = 1;
|
| - } else if (! (is_wordchar(line[actual] + url_head) ||
|
| - (ch == '-') || (ch == '_') || (ch == '\\') ||
|
| - (ch == '.') || (ch == ':') || (ch == '/') ||
|
| - (ch == '~') || (ch == '%') || (ch == '*') ||
|
| - (ch == '$') || (ch == '[') || (ch == ']') ||
|
| - (ch == '?') || (ch == '!') ||
|
| - ((ch >= '0') && (ch <= '9')))) {
|
| - url_state = 0;
|
| - if (url == 1) {
|
| - for (int i = url_token; i < url_head; i++) {
|
| - *(urlline + i) = 1;
|
| - }
|
| - }
|
| - url = 0;
|
| - }
|
| - break;
|
| - }
|
| - *(urlline + url_head) = 0;
|
| - if (next_char(line[actual], &url_head)) return;
|
| - }
|
| +int TextParser::change_token(const char* word) {  // Replaces the text between token and head in the current line with word; returns 1, or 0 when word is NULL.
|
| + if (word) {
|
| + std::string remainder(line[actual].substr(head));  // keep everything from head onward
|
| + line[actual].resize(token);  // cut the line back to where the token starts
|
| + line[actual].append(word);
|
| + line[actual].append(remainder);
|
| + head = token;  // scanning resumes at the start of the replacement
|
| + return 1;
|
| + }
|
| + return 0;
|
| }
|
|
|
| -int TextParser::get_url(int token_pos, int * head)
|
| -{
|
| - for (int i = *head; urlline[i] && *(line[actual]+i); i++, (*head)++);
|
| - return checkurl ? 0 : urlline[token_pos];
|
| +void TextParser::check_urls() {  // Fills urlline with one flag per position of the current line, set for words that look like e-mail addresses, paths or URLs.
|
| + urlline.resize(line[actual].size() + 1);  // one flag per character plus one for the terminator position
|
| + int url_state = 0;  // 0 = between words, 1 = inside a word
|
| + size_t url_head = 0;  // current scan position
|
| + size_t url_token = 0;  // start of the word being scanned
|
| + int url = 0;  // 1 once the current word has shown a URL-like feature
|
| + for (;;) {
|
| + switch (url_state) {
|
| + case 0: // non word chars
|
| + if (is_wordchar(line[actual].c_str() + url_head)) {
|
| + url_state = 1;
|
| + url_token = url_head;
|
| + // Unix path
|
| + } else if (line[actual][url_head] == '/') {
|
| + url_state = 1;
|
| + url_token = url_head;
|
| + url = 1;
|
| + }
|
| + break;
|
| + case 1: // wordchar
|
| + char ch = line[actual][url_head];
|
| + // e-mail address
|
| + if ((ch == '@') ||
|
| + // MS-DOS, Windows path
|
| + (strncmp(line[actual].c_str() + url_head, ":\\", 2) == 0) ||
|
| + // URL
|
| + (strncmp(line[actual].c_str() + url_head, "://", 3) == 0)) {
|
| + url = 1;
|
| + } else if (!(is_wordchar(line[actual].c_str() + url_head) || (ch == '-') ||  // none of the characters allowed inside a URL-like word: the word ends here
|
| + (ch == '_') || (ch == '\\') || (ch == '.') ||
|
| + (ch == ':') || (ch == '/') || (ch == '~') || (ch == '%') ||
|
| + (ch == '*') || (ch == '$') || (ch == '[') || (ch == ']') ||
|
| + (ch == '?') || (ch == '!') ||
|
| + ((ch >= '0') && (ch <= '9')))) {
|
| + url_state = 0;
|
| + if (url == 1) {
|
| + for (size_t i = url_token; i < url_head; ++i) {  // flag every position of the finished word
|
| + urlline[i] = true;
|
| + }
|
| + }
|
| + url = 0;
|
| + }
|
| + break;
|
| + }
|
| + urlline[url_head] = false;  // the current position itself is left unflagged
|
| + if (next_char(line[actual].c_str(), &url_head))  // non-zero: the line terminator was reached
|
| + return;
|
| + }
|
| }
|
|
|
| -void TextParser::set_url_checking(int check)
|
| -{
|
| - checkurl = check;
|
| +int TextParser::get_url(size_t token_pos, size_t* hd) {  // Advances *hd past URL-flagged positions; returns 0 when checkurl is set, otherwise the URL flag at token_pos.
|
| + for (size_t i = *hd; i < line[actual].size() && urlline[i]; i++, (*hd)++)  // i and *hd move together until an unflagged position or the end of the line
|
| + ;
|
| + return checkurl ? 0 : urlline[token_pos];
|
| }
|
|
|
| +void TextParser::set_url_checking(int check) {  // Stores check in checkurl; while it is non-zero get_url() returns 0 for every token.
|
| + checkurl = check;
|
| +}
|
|
|
| -char * TextParser::alloc_token(int token, int * head)
|
| -{
|
| - if (get_url(token, head)) return NULL;
|
| - char * t = (char *) malloc(*head - token + 1);
|
| - if (t) {
|
| - t[*head - token] = '\0';
|
| - strncpy(t, line[actual] + token, *head - token);
|
| - // remove colon for Finnish and Swedish language
|
| - if (t[*head - token - 1] == ':') {
|
| - t[*head - token - 1] = '\0';
|
| - if (!t[0]) {
|
| - free(t);
|
| - return NULL;
|
| - }
|
| - }
|
| - return t;
|
| +bool TextParser::alloc_token(size_t tokn, size_t* hd, std::string& t) {  // Copies the characters from tokn up to *hd of the current line into t; returns false for URL-flagged tokens and for tokens that are empty after colon removal.
|
| + size_t url_head = *hd;  // get_url() advances its position argument, so it gets a copy and *hd stays unchanged
|
| + if (get_url(tokn, &url_head))
|
| + return false;
|
| + t = line[actual].substr(tokn, *hd - tokn);
|
| + // remove colon for Finnish and Swedish language
|
| + if (!t.empty() && t[t.size() - 1] == ':') {
|
| + t.resize(t.size() - 1);  // drop the trailing ':'
|
| + if (t.empty()) {  // the token consisted of the colon only
|
| + return false;
|
| }
|
| - fprintf(stderr,"Error - Insufficient Memory\n");
|
| - return NULL;
|
| + }
|
| + return true;
|
| }
|
|
|