Index: third_party/hunspell_new/src/parsers/htmlparser.cxx |
diff --git a/third_party/hunspell_new/src/parsers/htmlparser.cxx b/third_party/hunspell_new/src/parsers/htmlparser.cxx |
deleted file mode 100644 |
index 341be4e8948b0aee66621b301b6a17a6341ca271..0000000000000000000000000000000000000000 |
--- a/third_party/hunspell_new/src/parsers/htmlparser.cxx |
+++ /dev/null |
@@ -1,151 +0,0 @@ |
-#include <cstdlib> |
-#include <cstring> |
-#include <cstdio> |
-#include <ctype.h> |
- |
-#include "../hunspell/csutil.hxx" |
-#include "htmlparser.hxx" |
- |
- |
-#ifndef W32 |
-using namespace std; |
-#endif |
- |
-enum { ST_NON_WORD, ST_WORD, ST_TAG, ST_CHAR_ENTITY, ST_OTHER_TAG, ST_ATTRIB }; |
- |
-static const char * PATTERN[][2] = { |
- { "<script", "</script>" }, |
- { "<style", "</style>" }, |
- { "<code", "</code>" }, |
- { "<samp", "</samp>" }, |
- { "<kbd", "</kbd>" }, |
- { "<var", "</var>" }, |
- { "<listing", "</listing>" }, |
- { "<address", "</address>" }, |
- { "<pre", "</pre>" }, |
- { "<!--", "-->" }, |
- { "<[cdata[", "]]>" }, // XML comment |
- { "<", ">" } |
-}; |
- |
-#define PATTERN_LEN (sizeof(PATTERN) / (sizeof(char *) * 2)) |
- |
-static const char * PATTERN2[][2] = { |
- { "<img", "alt=" }, // ALT and TITLE attrib handled spec. |
- { "<img", "title=" }, |
- { "<a ", "title=" } |
-}; |
- |
-#define PATTERN_LEN2 (sizeof(PATTERN2) / (sizeof(char *) * 2)) |
- |
-HTMLParser::HTMLParser(const char * wordchars) |
-{ |
- init(wordchars); |
-} |
- |
-HTMLParser::HTMLParser(unsigned short * wordchars, int len) |
-{ |
- init(wordchars, len); |
-} |
- |
-HTMLParser::~HTMLParser() |
-{ |
-} |
- |
- |
-int HTMLParser::look_pattern(const char * p[][2], unsigned int len, int column) |
-{ |
- for (unsigned int i = 0; i < len; i++) { |
- char * j = line[actual] + head; |
- const char * k = p[i][column]; |
- while ((*k != '\0') && (tolower(*j) == *k)) { |
- j++; |
- k++; |
- } |
- if (*k == '\0') return i; |
- } |
- return -1; |
-} |
- |
-/* |
- * HTML parser |
- * |
- */ |
- |
- |
-char * HTMLParser::next_token() |
-{ |
- const char * latin1; |
- |
- for (;;) { |
- //fprintf(stderr, "%d:%c:%s\n", state, line[actual][head], line[actual]); |
- //getch(); |
- switch (state) |
- { |
- case ST_NON_WORD: // non word chars |
- prevstate = ST_NON_WORD; |
- if ((pattern_num = look_pattern(PATTERN, PATTERN_LEN, 0)) != -1) { |
- checkattr = 0; |
- if ((pattern2_num = look_pattern(PATTERN2, PATTERN_LEN2, 0)) != -1) { |
- checkattr = 1; |
- } |
- state = ST_TAG; |
- } else if (is_wordchar(line[actual] + head)) { |
- state = ST_WORD; |
- token = head; |
- } else if ((latin1 = get_latin1(line[actual] + head))) { |
- state = ST_WORD; |
- token = head; |
- head += strlen(latin1); |
- } else if (line[actual][head] == '&') { |
- state = ST_CHAR_ENTITY; |
- } |
- break; |
- case ST_WORD: // wordchar |
- if ((latin1 = get_latin1(line[actual] + head))) { |
- head += strlen(latin1); |
- } else if (! is_wordchar(line[actual] + head)) { |
- state = prevstate; |
- char * t = alloc_token(token, &head); |
- if (t) return t; |
- } |
- break; |
- case ST_TAG: // comment, labels, etc |
- int i; |
- if ((checkattr == 1) && ((i = look_pattern(PATTERN2, PATTERN_LEN2, 1)) != -1) |
- && (strcmp(PATTERN2[i][0],PATTERN2[pattern2_num][0]) == 0)) { |
- checkattr = 2; |
- } else if ((checkattr > 0) && (line[actual][head] == '>')) { |
- state = ST_NON_WORD; |
- } else if (((i = look_pattern(PATTERN, PATTERN_LEN, 1)) != -1) && |
- (strcmp(PATTERN[i][1],PATTERN[pattern_num][1]) == 0)) { |
- state = ST_NON_WORD; |
- head += strlen(PATTERN[pattern_num][1]) - 1; |
- } else if ( (strcmp(PATTERN[pattern_num][0], "<") == 0) && |
- ((line[actual][head] == '"') || (line[actual][head] == '\''))) { |
- quotmark = line[actual][head]; |
- state = ST_ATTRIB; |
- } |
- break; |
- case ST_ATTRIB: // non word chars |
- prevstate = ST_ATTRIB; |
- if (line[actual][head] == quotmark) { |
- state = ST_TAG; |
- if (checkattr == 2) checkattr = 1; |
- // for IMG ALT |
- } else if (is_wordchar(line[actual] + head) && (checkattr == 2)) { |
- state = ST_WORD; |
- token = head; |
- } else if (line[actual][head] == '&') { |
- state = ST_CHAR_ENTITY; |
- } |
- break; |
- case ST_CHAR_ENTITY: // SGML element |
- if ((tolower(line[actual][head]) == ';')) { |
- state = prevstate; |
- head--; |
- } |
- } |
- if (next_char(line[actual], &head)) return NULL; |
- } |
-} |