| OLD | NEW |
| (Empty) |
| 1 #include <cstdlib> | |
| 2 #include <cstring> | |
| 3 #include <cstdio> | |
| 4 #include <ctype.h> | |
| 5 | |
| 6 #include "../hunspell/csutil.hxx" | |
| 7 #include "htmlparser.hxx" | |
| 8 | |
| 9 | |
| 10 #ifndef W32 | |
| 11 using namespace std; | |
| 12 #endif | |
| 13 | |
// Parser states used by HTMLParser::next_token().
enum {
        ST_NON_WORD    = 0, // outside of words and markup
        ST_WORD        = 1, // inside a word that is being collected
        ST_TAG         = 2, // inside a tag, comment or other skipped region
        ST_CHAR_ENTITY = 3, // after '&', waiting for the closing ';'
        ST_OTHER_TAG   = 4, // not referenced by the code in this part of the file
        ST_ATTRIB      = 5  // inside a quoted attribute value
};
| 15 | |
/*
 * Start and end markers of the regions that next_token() walks through in
 * its tag state: column 0 opens a region, column 1 closes it.
 *
 * All markers are written in lower case because look_pattern() lower-cases
 * the input, not the pattern. Rows are tried in order and the first match
 * wins, so the generic "<" / ">" row has to stay last.
 */
static const char * PATTERN[][2] = {
        { "<script", "</script>" },
        { "<style", "</style>" },
        { "<code", "</code>" },
        { "<samp", "</samp>" },
        { "<kbd", "</kbd>" },
        { "<var", "</var>" },
        { "<listing", "</listing>" },
        { "<address", "</address>" },
        { "<pre", "</pre>" },
        { "<!--", "-->" },        // comment
        { "<![cdata[", "]]>" },   // XML CDATA section ("<![CDATA[" lower-cased)
        { "<", ">" }              // any other tag
};

// Number of rows in PATTERN.
#define PATTERN_LEN (sizeof(PATTERN) / sizeof(PATTERN[0]))
| 32 | |
/*
 * Tag / attribute pairs that get special handling in next_token():
 * column 0 is the start of the tag, column 1 the attribute name including
 * the '=' sign. Both are written in lower case.
 */
static const char * PATTERN2[][2] = {
        { "<img", "alt=" },     // ALT of IMG
        { "<img", "title=" },   // TITLE of IMG
        { "<a ", "title=" }     // TITLE of A
};

// Number of rows in PATTERN2.
#define PATTERN_LEN2 (sizeof(PATTERN2) / sizeof(PATTERN2[0]))
| 40 | |
| 41 HTMLParser::HTMLParser(const char * wordchars) | |
| 42 { | |
| 43 init(wordchars); | |
| 44 } | |
| 45 | |
| 46 HTMLParser::HTMLParser(unsigned short * wordchars, int len) | |
| 47 { | |
| 48 init(wordchars, len); | |
| 49 } | |
| 50 | |
| 51 HTMLParser::~HTMLParser() | |
| 52 { | |
| 53 } | |
| 54 | |
| 55 | |
| 56 int HTMLParser::look_pattern(const char * p[][2], unsigned int len, int column) | |
| 57 { | |
| 58 for (unsigned int i = 0; i < len; i++) { | |
| 59 char * j = line[actual] + head; | |
| 60 const char * k = p[i][column]; | |
| 61 while ((*k != '\0') && (tolower(*j) == *k)) { | |
| 62 j++; | |
| 63 k++; | |
| 64 } | |
| 65 if (*k == '\0') return i; | |
| 66 } | |
| 67 return -1; | |
| 68 } | |
| 69 | |
| 70 /* | |
| 71 * HTML parser | |
| 72 * | |
| 73 */ | |
| 74 | |
| 75 | |
| 76 char * HTMLParser::next_token() | |
| 77 { | |
| 78 const char * latin1; | |
| 79 | |
| 80 for (;;) { | |
| 81 //fprintf(stderr, "%d:%c:%s\n", state, line[actual][head], line[
actual]); | |
| 82 //getch(); | |
| 83 switch (state) | |
| 84 { | |
| 85 case ST_NON_WORD: // non word chars | |
| 86 prevstate = ST_NON_WORD; | |
| 87 if ((pattern_num = look_pattern(PATTERN, PATTERN_LEN, 0)
) != -1) { | |
| 88 checkattr = 0; | |
| 89 if ((pattern2_num = look_pattern(PATTERN2, PATTE
RN_LEN2, 0)) != -1) { | |
| 90 checkattr = 1; | |
| 91 } | |
| 92 state = ST_TAG; | |
| 93 } else if (is_wordchar(line[actual] + head)) { | |
| 94 state = ST_WORD; | |
| 95 token = head; | |
| 96 } else if ((latin1 = get_latin1(line[actual] + head))) { | |
| 97 state = ST_WORD; | |
| 98 token = head; | |
| 99 head += strlen(latin1); | |
| 100 } else if (line[actual][head] == '&') { | |
| 101 state = ST_CHAR_ENTITY; | |
| 102 } | |
| 103 break; | |
| 104 case ST_WORD: // wordchar | |
| 105 if ((latin1 = get_latin1(line[actual] + head))) { | |
| 106 head += strlen(latin1); | |
| 107 } else if (! is_wordchar(line[actual] + head)) { | |
| 108 state = prevstate; | |
| 109 char * t = alloc_token(token, &head); | |
| 110 if (t) return t; | |
| 111 } | |
| 112 break; | |
| 113 case ST_TAG: // comment, labels, etc | |
| 114 int i; | |
| 115 if ((checkattr == 1) && ((i = look_pattern(PATTERN2, PAT
TERN_LEN2, 1)) != -1) | |
| 116 && (strcmp(PATTERN2[i][0],PATTERN2[pattern2_num]
[0]) == 0)) { | |
| 117 checkattr = 2; | |
| 118 } else if ((checkattr > 0) && (line[actual][head] == '>'
)) { | |
| 119 state = ST_NON_WORD; | |
| 120 } else if (((i = look_pattern(PATTERN, PATTERN_LEN, 1))
!= -1) && | |
| 121 (strcmp(PATTERN[i][1],PATTERN[pattern_num][1]) =
= 0)) { | |
| 122 state = ST_NON_WORD; | |
| 123 head += strlen(PATTERN[pattern_num][1])
- 1; | |
| 124 } else if ( (strcmp(PATTERN[pattern_num][0], "<") == 0)
&& | |
| 125 ((line[actual][head] == '"') || (line[actual][he
ad] == '\''))) { | |
| 126 quotmark = line[actual][head]; | |
| 127 state = ST_ATTRIB; | |
| 128 } | |
| 129 break; | |
| 130 case ST_ATTRIB: // non word chars | |
| 131 prevstate = ST_ATTRIB; | |
| 132 if (line[actual][head] == quotmark) { | |
| 133 state = ST_TAG; | |
| 134 if (checkattr == 2) checkattr = 1; | |
| 135 // for IMG ALT | |
| 136 } else if (is_wordchar(line[actual] + head) && (checkatt
r == 2)) { | |
| 137 state = ST_WORD; | |
| 138 token = head; | |
| 139 } else if (line[actual][head] == '&') { | |
| 140 state = ST_CHAR_ENTITY; | |
| 141 } | |
| 142 break; | |
| 143 case ST_CHAR_ENTITY: // SGML element | |
| 144 if ((tolower(line[actual][head]) == ';')) { | |
| 145 state = prevstate; | |
| 146 head--; | |
| 147 } | |
| 148 } | |
| 149 if (next_char(line[actual], &head)) return NULL; | |
| 150 } | |
| 151 } | |
| OLD | NEW |