Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(221)

Side by Side Diff: third_party/hunspell_new/src/parsers/htmlparser.cxx

Issue 1135173004: Rename third_party/hunspell_new back to third_party/hunspell. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Created 5 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 #include <cstdlib>
2 #include <cstring>
3 #include <cstdio>
4 #include <ctype.h>
5
6 #include "../hunspell/csutil.hxx"
7 #include "htmlparser.hxx"
8
9
10 #ifndef W32
11 using namespace std;
12 #endif
13
// States of the scanner driven by HTMLParser::next_token().
// The enumerator order (and therefore the numeric values) is unchanged.
enum {
  ST_NON_WORD,     // between words, outside any tag
  ST_WORD,         // inside a run of word characters
  ST_TAG,          // inside a tag/comment matched through PATTERN
  ST_CHAR_ENTITY,  // inside an "&...;" character entity
  ST_OTHER_TAG,    // not referenced by the code visible in this file
  ST_ATTRIB        // inside a quoted attribute value
};
15
// Markup regions the parser skips, as {opening, closing} pairs.
//
// look_pattern() lower-cases the input before comparing, so every entry
// must be written in lower case. look_pattern() also returns the FIRST
// entry that matches, so the generic "<" / ">" pair has to stay last.
static const char* PATTERN[][2] = {
    {"<script", "</script>"},
    {"<style", "</style>"},
    {"<code", "</code>"},
    {"<samp", "</samp>"},
    {"<kbd", "</kbd>"},
    {"<var", "</var>"},
    {"<listing", "</listing>"},
    {"<address", "</address>"},
    {"<pre", "</pre>"},
    {"<!--", "-->"},       // comment
    // CDATA sections open with "<![CDATA[". The previous spelling
    // "<[cdata[" lacked the '!' and therefore never matched, so a CDATA
    // section was treated as an ordinary tag ending at the first '>'.
    {"<![cdata[", "]]>"},  // XML CDATA section
    {"<", ">"}             // any other tag
};

// Number of {opening, closing} rows in PATTERN.
#define PATTERN_LEN (sizeof(PATTERN) / (sizeof(char *) * 2))
32
// {tag opening, attribute name} pairs that get special treatment:
// next_token() only extracts words from a quoted attribute value after
// one of these attribute names has been seen inside the matching tag.
static const char* PATTERN2[][2] = {
    {"<img", "alt="},
    {"<img", "title="},
    {"<a ", "title="}
};

// Number of {tag, attribute} rows in PATTERN2.
#define PATTERN_LEN2 (sizeof(PATTERN2) / (sizeof(char *) * 2))
40
41 HTMLParser::HTMLParser(const char * wordchars)
42 {
43 init(wordchars);
44 }
45
46 HTMLParser::HTMLParser(unsigned short * wordchars, int len)
47 {
48 init(wordchars, len);
49 }
50
51 HTMLParser::~HTMLParser()
52 {
53 }
54
55
56 int HTMLParser::look_pattern(const char * p[][2], unsigned int len, int column)
57 {
58 for (unsigned int i = 0; i < len; i++) {
59 char * j = line[actual] + head;
60 const char * k = p[i][column];
61 while ((*k != '\0') && (tolower(*j) == *k)) {
62 j++;
63 k++;
64 }
65 if (*k == '\0') return i;
66 }
67 return -1;
68 }
69
70 /*
71 * HTML parser
72 *
73 */
74
75
76 char * HTMLParser::next_token()
77 {
78 const char * latin1;
79
80 for (;;) {
81 //fprintf(stderr, "%d:%c:%s\n", state, line[actual][head], line[ actual]);
82 //getch();
83 switch (state)
84 {
85 case ST_NON_WORD: // non word chars
86 prevstate = ST_NON_WORD;
87 if ((pattern_num = look_pattern(PATTERN, PATTERN_LEN, 0) ) != -1) {
88 checkattr = 0;
89 if ((pattern2_num = look_pattern(PATTERN2, PATTE RN_LEN2, 0)) != -1) {
90 checkattr = 1;
91 }
92 state = ST_TAG;
93 } else if (is_wordchar(line[actual] + head)) {
94 state = ST_WORD;
95 token = head;
96 } else if ((latin1 = get_latin1(line[actual] + head))) {
97 state = ST_WORD;
98 token = head;
99 head += strlen(latin1);
100 } else if (line[actual][head] == '&') {
101 state = ST_CHAR_ENTITY;
102 }
103 break;
104 case ST_WORD: // wordchar
105 if ((latin1 = get_latin1(line[actual] + head))) {
106 head += strlen(latin1);
107 } else if (! is_wordchar(line[actual] + head)) {
108 state = prevstate;
109 char * t = alloc_token(token, &head);
110 if (t) return t;
111 }
112 break;
113 case ST_TAG: // comment, labels, etc
114 int i;
115 if ((checkattr == 1) && ((i = look_pattern(PATTERN2, PAT TERN_LEN2, 1)) != -1)
116 && (strcmp(PATTERN2[i][0],PATTERN2[pattern2_num] [0]) == 0)) {
117 checkattr = 2;
118 } else if ((checkattr > 0) && (line[actual][head] == '>' )) {
119 state = ST_NON_WORD;
120 } else if (((i = look_pattern(PATTERN, PATTERN_LEN, 1)) != -1) &&
121 (strcmp(PATTERN[i][1],PATTERN[pattern_num][1]) = = 0)) {
122 state = ST_NON_WORD;
123 head += strlen(PATTERN[pattern_num][1]) - 1;
124 } else if ( (strcmp(PATTERN[pattern_num][0], "<") == 0) &&
125 ((line[actual][head] == '"') || (line[actual][he ad] == '\''))) {
126 quotmark = line[actual][head];
127 state = ST_ATTRIB;
128 }
129 break;
130 case ST_ATTRIB: // non word chars
131 prevstate = ST_ATTRIB;
132 if (line[actual][head] == quotmark) {
133 state = ST_TAG;
134 if (checkattr == 2) checkattr = 1;
135 // for IMG ALT
136 } else if (is_wordchar(line[actual] + head) && (checkatt r == 2)) {
137 state = ST_WORD;
138 token = head;
139 } else if (line[actual][head] == '&') {
140 state = ST_CHAR_ENTITY;
141 }
142 break;
143 case ST_CHAR_ENTITY: // SGML element
144 if ((tolower(line[actual][head]) == ';')) {
145 state = prevstate;
146 head--;
147 }
148 }
149 if (next_char(line[actual], &head)) return NULL;
150 }
151 }
OLDNEW
« no previous file with comments | « third_party/hunspell_new/src/parsers/htmlparser.hxx ('k') | third_party/hunspell_new/src/parsers/latexparser.hxx » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698