| Index: third_party/hunspell/src/parsers/htmlparser.cxx
|
| diff --git a/third_party/hunspell/src/parsers/htmlparser.cxx b/third_party/hunspell/src/parsers/htmlparser.cxx
|
| index 341be4e8948b0aee66621b301b6a17a6341ca271..7509c651b3ba317778cfd56828ce7ca2b7888ebf 100644
|
| --- a/third_party/hunspell/src/parsers/htmlparser.cxx
|
| +++ b/third_party/hunspell/src/parsers/htmlparser.cxx
|
| @@ -1,3 +1,43 @@
|
| +/* ***** BEGIN LICENSE BLOCK *****
|
| + * Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
| + *
|
| + * The contents of this file are subject to the Mozilla Public License Version
|
| + * 1.1 (the "License"); you may not use this file except in compliance with
|
| + * the License. You may obtain a copy of the License at
|
| + * http://www.mozilla.org/MPL/
|
| + *
|
| + * Software distributed under the License is distributed on an "AS IS" basis,
|
| + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
| + * for the specific language governing rights and limitations under the
|
| + * License.
|
| + *
|
| + * The Original Code is Hunspell, based on MySpell.
|
| + *
|
| + * The Initial Developers of the Original Code are
|
| + * Kevin Hendricks (MySpell) and Németh László (Hunspell).
|
| + * Portions created by the Initial Developers are Copyright (C) 2002-2005
|
| + * the Initial Developers. All Rights Reserved.
|
| + *
|
| + * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,
|
| + * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,
|
| + * Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,
|
| + * Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,
|
| + * Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen
|
| + *
|
| + * Alternatively, the contents of this file may be used under the terms of
|
| + * either the GNU General Public License Version 2 or later (the "GPL"), or
|
| + * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
| + * in which case the provisions of the GPL or the LGPL are applicable instead
|
| + * of those above. If you wish to allow use of your version of this file only
|
| + * under the terms of either the GPL or the LGPL, and not to allow others to
|
| + * use your version of this file under the terms of the MPL, indicate your
|
| + * decision by deleting the provisions above and replace them with the notice
|
| + * and other provisions required by the GPL or the LGPL. If you do not delete
|
| + * the provisions above, a recipient may use your version of this file under
|
| + * the terms of any one of the MPL, the GPL or the LGPL.
|
| + *
|
| + * ***** END LICENSE BLOCK ***** */
|
| +
|
| #include <cstdlib>
|
| #include <cstring>
|
| #include <cstdio>
|
| @@ -6,146 +46,42 @@
|
| #include "../hunspell/csutil.hxx"
|
| #include "htmlparser.hxx"
|
|
|
| -
|
| #ifndef W32
|
| using namespace std;
|
| #endif
|
|
|
| -enum { ST_NON_WORD, ST_WORD, ST_TAG, ST_CHAR_ENTITY, ST_OTHER_TAG, ST_ATTRIB };
|
| -
|
| -static const char * PATTERN[][2] = {
|
| - { "<script", "</script>" },
|
| - { "<style", "</style>" },
|
| - { "<code", "</code>" },
|
| - { "<samp", "</samp>" },
|
| - { "<kbd", "</kbd>" },
|
| - { "<var", "</var>" },
|
| - { "<listing", "</listing>" },
|
| - { "<address", "</address>" },
|
| - { "<pre", "</pre>" },
|
| - { "<!--", "-->" },
|
| - { "<[cdata[", "]]>" }, // XML comment
|
| - { "<", ">" }
|
| -};
|
| -
|
| -#define PATTERN_LEN (sizeof(PATTERN) / (sizeof(char *) * 2))
|
| -
|
| -static const char * PATTERN2[][2] = {
|
| - { "<img", "alt=" }, // ALT and TITLE attrib handled spec.
|
| - { "<img", "title=" },
|
| - { "<a ", "title=" }
|
| -};
|
| -
|
| -#define PATTERN_LEN2 (sizeof(PATTERN2) / (sizeof(char *) * 2))
|
| -
|
| -HTMLParser::HTMLParser(const char * wordchars)
|
| -{
|
| - init(wordchars);
|
| +static const char* PATTERN[][2] = {{"<script", "</script>"},  // rows are {opening text, closing text}; the table is passed to XMLParser::next_token
|
| + {"<style", "</style>"},
|
| + {"<code", "</code>"},
|
| + {"<samp", "</samp>"},
|
| + {"<kbd", "</kbd>"},
|
| + {"<var", "</var>"},
|
| + {"<listing", "</listing>"},
|
| + {"<address", "</address>"},
|
| + {"<pre", "</pre>"},
|
| + {"<!--", "-->"},
|
| + {"<![cdata[", "]]>"}, // CDATA section: XML spells the opener "<![CDATA[", written lowercase here like the other rows
|
| + {"<", ">"}};  // any other tag
|
| +
|
| +#define PATTERN_LEN (sizeof(PATTERN) / sizeof(PATTERN[0]))  // number of rows in PATTERN
|
| +
|
| +static const char* PATTERN2[][2] = {  // rows are {tag opener, attribute name including "="}; the table is passed to XMLParser::next_token
|
| + {"<img", "alt="}, // ALT and TITLE attribute values are handled specially. NOTE(review): presumably their words are still tokenized, as in the removed in-file state machine - confirm in XMLParser
|
| + {"<img", "title="},
|
| + {"<a ", "title="}};  // NOTE(review): the trailing space in "<a " presumably keeps it from matching longer tag names such as "<abbr" - confirm
|
| +
|
| +#define PATTERN_LEN2 (sizeof(PATTERN2) / sizeof(PATTERN2[0]))  // number of rows in PATTERN2
|
| +
|
| +HTMLParser::HTMLParser(const char* wordchars)  // char-string overload: forwards wordchars unchanged to the XMLParser base
|
| + : XMLParser(wordchars) {  // the body is empty; all setup is left to the base-class constructor
|
| }
|
|
|
| -HTMLParser::HTMLParser(unsigned short * wordchars, int len)
|
| -{
|
| - init(wordchars, len);
|
| +HTMLParser::HTMLParser(const w_char* wordchars, int len)  // w_char overload: forwards wordchars and len unchanged to the XMLParser base
|
| + : XMLParser(wordchars, len) {  // NOTE(review): len is presumably the number of w_char entries in wordchars - confirm against XMLParser
|
| }
|
|
|
| -HTMLParser::~HTMLParser()
|
| -{
|
| -}
|
| -
|
| -
|
| -int HTMLParser::look_pattern(const char * p[][2], unsigned int len, int column)
|
| -{
|
| - for (unsigned int i = 0; i < len; i++) {
|
| - char * j = line[actual] + head;
|
| - const char * k = p[i][column];
|
| - while ((*k != '\0') && (tolower(*j) == *k)) {
|
| - j++;
|
| - k++;
|
| - }
|
| - if (*k == '\0') return i;
|
| - }
|
| - return -1;
|
| +bool HTMLParser::next_token(std::string& t) {  // thin wrapper: t is handed through to XMLParser::next_token (presumably filled with the next word - confirm there)
|
| + return XMLParser::next_token(PATTERN, PATTERN_LEN, PATTERN2, PATTERN_LEN2, t);  // supplies the HTML tables and their row counts; the base-class bool result is returned as-is
|
| }
|
|
|
| -/*
|
| - * HTML parser
|
| - *
|
| - */
|
| -
|
| -
|
| -char * HTMLParser::next_token()
|
| -{
|
| - const char * latin1;
|
| -
|
| - for (;;) {
|
| - //fprintf(stderr, "%d:%c:%s\n", state, line[actual][head], line[actual]);
|
| - //getch();
|
| - switch (state)
|
| - {
|
| - case ST_NON_WORD: // non word chars
|
| - prevstate = ST_NON_WORD;
|
| - if ((pattern_num = look_pattern(PATTERN, PATTERN_LEN, 0)) != -1) {
|
| - checkattr = 0;
|
| - if ((pattern2_num = look_pattern(PATTERN2, PATTERN_LEN2, 0)) != -1) {
|
| - checkattr = 1;
|
| - }
|
| - state = ST_TAG;
|
| - } else if (is_wordchar(line[actual] + head)) {
|
| - state = ST_WORD;
|
| - token = head;
|
| - } else if ((latin1 = get_latin1(line[actual] + head))) {
|
| - state = ST_WORD;
|
| - token = head;
|
| - head += strlen(latin1);
|
| - } else if (line[actual][head] == '&') {
|
| - state = ST_CHAR_ENTITY;
|
| - }
|
| - break;
|
| - case ST_WORD: // wordchar
|
| - if ((latin1 = get_latin1(line[actual] + head))) {
|
| - head += strlen(latin1);
|
| - } else if (! is_wordchar(line[actual] + head)) {
|
| - state = prevstate;
|
| - char * t = alloc_token(token, &head);
|
| - if (t) return t;
|
| - }
|
| - break;
|
| - case ST_TAG: // comment, labels, etc
|
| - int i;
|
| - if ((checkattr == 1) && ((i = look_pattern(PATTERN2, PATTERN_LEN2, 1)) != -1)
|
| - && (strcmp(PATTERN2[i][0],PATTERN2[pattern2_num][0]) == 0)) {
|
| - checkattr = 2;
|
| - } else if ((checkattr > 0) && (line[actual][head] == '>')) {
|
| - state = ST_NON_WORD;
|
| - } else if (((i = look_pattern(PATTERN, PATTERN_LEN, 1)) != -1) &&
|
| - (strcmp(PATTERN[i][1],PATTERN[pattern_num][1]) == 0)) {
|
| - state = ST_NON_WORD;
|
| - head += strlen(PATTERN[pattern_num][1]) - 1;
|
| - } else if ( (strcmp(PATTERN[pattern_num][0], "<") == 0) &&
|
| - ((line[actual][head] == '"') || (line[actual][head] == '\''))) {
|
| - quotmark = line[actual][head];
|
| - state = ST_ATTRIB;
|
| - }
|
| - break;
|
| - case ST_ATTRIB: // non word chars
|
| - prevstate = ST_ATTRIB;
|
| - if (line[actual][head] == quotmark) {
|
| - state = ST_TAG;
|
| - if (checkattr == 2) checkattr = 1;
|
| - // for IMG ALT
|
| - } else if (is_wordchar(line[actual] + head) && (checkattr == 2)) {
|
| - state = ST_WORD;
|
| - token = head;
|
| - } else if (line[actual][head] == '&') {
|
| - state = ST_CHAR_ENTITY;
|
| - }
|
| - break;
|
| - case ST_CHAR_ENTITY: // SGML element
|
| - if ((tolower(line[actual][head]) == ';')) {
|
| - state = prevstate;
|
| - head--;
|
| - }
|
| - }
|
| - if (next_char(line[actual], &head)) return NULL;
|
| - }
|
| -}
|
| +HTMLParser::~HTMLParser() {}  // empty body: HTMLParser performs no cleanup of its own here
|
|
|