OLD | NEW |
| (Empty) |
1 #include <cstdlib> | |
2 #include <cstring> | |
3 #include <cstdio> | |
4 #include <ctype.h> | |
5 | |
6 #include "../hunspell/csutil.hxx" | |
7 #include "htmlparser.hxx" | |
8 | |
9 | |
10 #ifndef W32 | |
11 using namespace std; | |
12 #endif | |
13 | |
// States of the tokenizer state machine driven by HTMLParser::next_token().
enum {
    ST_NON_WORD,     // outside of any word or tag
    ST_WORD,         // inside a word that is being collected
    ST_TAG,          // inside a tag, comment or other skipped region
    ST_CHAR_ENTITY,  // inside a character entity, waiting for its ';'
    ST_OTHER_TAG,    // not referenced by the code in this file section
    ST_ATTRIB        // inside a quoted attribute value
};
15 | |
// Regions that are skipped by the tokenizer. Each row is
// { opening pattern, closing pattern }. The opening patterns are matched
// against lower-cased input (see HTMLParser::look_pattern), so they must be
// written in lower case. Rows are tried in order and the first match wins,
// therefore the generic "<" ... ">" row has to stay last.
static const char * PATTERN[][2] = {
    { "<script", "</script>" },
    { "<style", "</style>" },
    { "<code", "</code>" },
    { "<samp", "</samp>" },
    { "<kbd", "</kbd>" },
    { "<var", "</var>" },
    { "<listing", "</listing>" },
    { "<address", "</address>" },
    { "<pre", "</pre>" },
    { "<!--", "-->" },
    // XML CDATA section: the opening delimiter is "<![CDATA[" (with '!').
    { "<![cdata[", "]]>" },
    { "<", ">" }
};

// Number of rows in PATTERN.
#define PATTERN_LEN (sizeof(PATTERN) / (sizeof(char *) * 2))
32 | |
// Attributes whose quoted values are tokenized even though they sit inside a
// tag. Each row is { opening tag pattern, attribute pattern }; both are
// compared against lower-cased input, so they are written in lower case.
static const char * PATTERN2[][2] = {
    { "<img", "alt=" },
    { "<img", "title=" },
    { "<a ", "title=" }
};

// Number of rows in PATTERN2.
#define PATTERN_LEN2 (sizeof(PATTERN2) / sizeof(PATTERN2[0]))
40 | |
41 HTMLParser::HTMLParser(const char * wordchars) | |
42 { | |
43 init(wordchars); | |
44 } | |
45 | |
46 HTMLParser::HTMLParser(unsigned short * wordchars, int len) | |
47 { | |
48 init(wordchars, len); | |
49 } | |
50 | |
51 HTMLParser::~HTMLParser() | |
52 { | |
53 } | |
54 | |
55 | |
56 int HTMLParser::look_pattern(const char * p[][2], unsigned int len, int column) | |
57 { | |
58 for (unsigned int i = 0; i < len; i++) { | |
59 char * j = line[actual] + head; | |
60 const char * k = p[i][column]; | |
61 while ((*k != '\0') && (tolower(*j) == *k)) { | |
62 j++; | |
63 k++; | |
64 } | |
65 if (*k == '\0') return i; | |
66 } | |
67 return -1; | |
68 } | |
69 | |
70 /* | |
71 * HTML parser | |
72 * | |
73 */ | |
74 | |
75 | |
76 char * HTMLParser::next_token() | |
77 { | |
78 const char * latin1; | |
79 | |
80 for (;;) { | |
81 //fprintf(stderr, "%d:%c:%s\n", state, line[actual][head], line[
actual]); | |
82 //getch(); | |
83 switch (state) | |
84 { | |
85 case ST_NON_WORD: // non word chars | |
86 prevstate = ST_NON_WORD; | |
87 if ((pattern_num = look_pattern(PATTERN, PATTERN_LEN, 0)
) != -1) { | |
88 checkattr = 0; | |
89 if ((pattern2_num = look_pattern(PATTERN2, PATTE
RN_LEN2, 0)) != -1) { | |
90 checkattr = 1; | |
91 } | |
92 state = ST_TAG; | |
93 } else if (is_wordchar(line[actual] + head)) { | |
94 state = ST_WORD; | |
95 token = head; | |
96 } else if ((latin1 = get_latin1(line[actual] + head))) { | |
97 state = ST_WORD; | |
98 token = head; | |
99 head += strlen(latin1); | |
100 } else if (line[actual][head] == '&') { | |
101 state = ST_CHAR_ENTITY; | |
102 } | |
103 break; | |
104 case ST_WORD: // wordchar | |
105 if ((latin1 = get_latin1(line[actual] + head))) { | |
106 head += strlen(latin1); | |
107 } else if (! is_wordchar(line[actual] + head)) { | |
108 state = prevstate; | |
109 char * t = alloc_token(token, &head); | |
110 if (t) return t; | |
111 } | |
112 break; | |
113 case ST_TAG: // comment, labels, etc | |
114 int i; | |
115 if ((checkattr == 1) && ((i = look_pattern(PATTERN2, PAT
TERN_LEN2, 1)) != -1) | |
116 && (strcmp(PATTERN2[i][0],PATTERN2[pattern2_num]
[0]) == 0)) { | |
117 checkattr = 2; | |
118 } else if ((checkattr > 0) && (line[actual][head] == '>'
)) { | |
119 state = ST_NON_WORD; | |
120 } else if (((i = look_pattern(PATTERN, PATTERN_LEN, 1))
!= -1) && | |
121 (strcmp(PATTERN[i][1],PATTERN[pattern_num][1]) =
= 0)) { | |
122 state = ST_NON_WORD; | |
123 head += strlen(PATTERN[pattern_num][1])
- 1; | |
124 } else if ( (strcmp(PATTERN[pattern_num][0], "<") == 0)
&& | |
125 ((line[actual][head] == '"') || (line[actual][he
ad] == '\''))) { | |
126 quotmark = line[actual][head]; | |
127 state = ST_ATTRIB; | |
128 } | |
129 break; | |
130 case ST_ATTRIB: // non word chars | |
131 prevstate = ST_ATTRIB; | |
132 if (line[actual][head] == quotmark) { | |
133 state = ST_TAG; | |
134 if (checkattr == 2) checkattr = 1; | |
135 // for IMG ALT | |
136 } else if (is_wordchar(line[actual] + head) && (checkatt
r == 2)) { | |
137 state = ST_WORD; | |
138 token = head; | |
139 } else if (line[actual][head] == '&') { | |
140 state = ST_CHAR_ENTITY; | |
141 } | |
142 break; | |
143 case ST_CHAR_ENTITY: // SGML element | |
144 if ((tolower(line[actual][head]) == ';')) { | |
145 state = prevstate; | |
146 head--; | |
147 } | |
148 } | |
149 if (next_char(line[actual], &head)) return NULL; | |
150 } | |
151 } | |
OLD | NEW |