OLD | NEW |
| (Empty) |
1 #include <cstdlib> | |
2 #include <cstring> | |
3 #include <cstdio> | |
4 #include <ctype.h> | |
5 | |
6 #include "../hunspell/csutil.hxx" | |
7 #include "textparser.hxx" | |
8 | |
9 #ifndef W32 | |
10 using namespace std; | |
11 #endif | |
12 | |
// ISO-8859-1 HTML character entities.
//
// get_latin1() only looks at input that starts with '&' and compares it
// against these strings as prefixes, so every entry must be spelled as the
// named entity ("&Agrave;"), not as the character it stands for.
static const char * LATIN1[] = {
    "&Agrave;",
    "&Atilde;",
    "&Aring;",
    "&AElig;",
    "&Egrave;",
    "&Ecirc;",
    "&Igrave;",
    "&Iuml;",
    "&ETH;",
    "&Ntilde;",
    "&Ograve;",
    "&Oslash;",
    "&Ugrave;",
    "&THORN;",
    "&agrave;",
    "&atilde;",
    "&aring;",
    "&aelig;",
    "&egrave;",
    "&ecirc;",
    "&igrave;",
    "&iuml;",
    "&eth;",
    "&ntilde;",
    "&ograve;",
    "&oslash;",
    "&ugrave;",
    "&thorn;",
    "&yuml;"
};

// Number of entries in LATIN1.
#define LATIN1_LEN (sizeof(LATIN1) / sizeof(char *))
48 | |
49 TextParser::TextParser() { | |
50 init((char *) NULL); | |
51 } | |
52 | |
53 TextParser::TextParser(const char * wordchars) | |
54 { | |
55 init(wordchars); | |
56 } | |
57 | |
58 TextParser::TextParser(unsigned short * wordchars, int len) | |
59 { | |
60 init(wordchars, len); | |
61 } | |
62 | |
63 TextParser::~TextParser() | |
64 { | |
65 } | |
66 | |
67 int TextParser::is_wordchar(char * w) | |
68 { | |
69 if (*w == '\0') return 0; | |
70 if (utf8) { | |
71 w_char wc; | |
72 unsigned short idx; | |
73 u8_u16(&wc, 1, w); | |
74 idx = (wc.h << 8) + wc.l; | |
75 return (unicodeisalpha(idx) || (wordchars_utf16 && flag_bsearch(
wordchars_utf16, *((unsigned short *) &wc), wclen))); | |
76 } else { | |
77 return wordcharacters[(*w + 256) % 256]; | |
78 } | |
79 } | |
80 | |
81 const char * TextParser::get_latin1(char * s) | |
82 { | |
83 if (s[0] == '&') { | |
84 unsigned int i = 0; | |
85 while ((i < LATIN1_LEN) && | |
86 strncmp(LATIN1[i], s, strlen(LATIN1[i]))) i++; | |
87 if (i != LATIN1_LEN) return LATIN1[i]; | |
88 } | |
89 return NULL; | |
90 } | |
91 | |
92 void TextParser::init(const char * wordchars) | |
93 { | |
94 for (int i = 0; i < MAXPREVLINE; i++) { | |
95 line[i][0] = '\0'; | |
96 } | |
97 actual = 0; | |
98 head = 0; | |
99 token = 0; | |
100 state = 0; | |
101 utf8 = 0; | |
102 checkurl = 0; | |
103 unsigned int j; | |
104 for (j = 0; j < 256; j++) { | |
105 wordcharacters[j] = 0; | |
106 } | |
107 if (!wordchars) wordchars = "qwertzuiopasdfghjklyxcvbnmQWERTZUIOPASDFGHJ
KLYXCVBNM"; | |
108 for (j = 0; j < strlen(wordchars); j++) { | |
109 wordcharacters[(wordchars[j] + 256) % 256] = 1; | |
110 } | |
111 } | |
112 | |
113 void TextParser::init(unsigned short * wc, int len) | |
114 { | |
115 for (int i = 0; i < MAXPREVLINE; i++) { | |
116 line[i][0] = '\0'; | |
117 } | |
118 actual = 0; | |
119 head = 0; | |
120 token = 0; | |
121 state = 0; | |
122 utf8 = 1; | |
123 checkurl = 0; | |
124 wordchars_utf16 = wc; | |
125 wclen = len; | |
126 } | |
127 | |
128 int TextParser::next_char(char * line, int * pos) { | |
129 if (*(line + *pos) == '\0') return 1; | |
130 if (utf8) { | |
131 if (*(line + *pos) >> 7) { | |
132 // jump to next UTF-8 character | |
133 for((*pos)++; (*(line + *pos) & 0xc0) == 0x80; (*pos)++); | |
134 } else { | |
135 (*pos)++; | |
136 } | |
137 } else (*pos)++; | |
138 return 0; | |
139 } | |
140 | |
141 void TextParser::put_line(char * word) | |
142 { | |
143 actual = (actual + 1) % MAXPREVLINE; | |
144 strcpy(line[actual], word); | |
145 token = 0; | |
146 head = 0; | |
147 check_urls(); | |
148 } | |
149 | |
150 char * TextParser::get_prevline(int n) | |
151 { | |
152 return mystrdup(line[(actual + MAXPREVLINE - n) % MAXPREVLINE]); | |
153 } | |
154 | |
155 char * TextParser::get_line() | |
156 { | |
157 return get_prevline(0); | |
158 } | |
159 | |
160 char * TextParser::next_token() | |
161 { | |
162 const char * latin1; | |
163 | |
164 for (;;) { | |
165 switch (state) | |
166 { | |
167 case 0: // non word chars | |
168 if (is_wordchar(line[actual] + head)) { | |
169 state = 1; | |
170 token = head; | |
171 } else if ((latin1 = get_latin1(line[actual] + head))) { | |
172 state = 1; | |
173 token = head; | |
174 head += strlen(latin1); | |
175 } | |
176 break; | |
177 case 1: // wordchar | |
178 if ((latin1 = get_latin1(line[actual] + head))) { | |
179 head += strlen(latin1); | |
180 } else if (! is_wordchar(line[actual] + head)) { | |
181 state = 0; | |
182 char * t = alloc_token(token, &head); | |
183 if (t) return t; | |
184 } | |
185 break; | |
186 } | |
187 if (next_char(line[actual], &head)) return NULL; | |
188 } | |
189 } | |
190 | |
191 int TextParser::get_tokenpos() | |
192 { | |
193 return token; | |
194 } | |
195 | |
196 int TextParser::change_token(const char * word) | |
197 { | |
198 if (word) { | |
199 char * r = mystrdup(line[actual] + head); | |
200 strcpy(line[actual] + token, word); | |
201 strcat(line[actual], r); | |
202 head = token; | |
203 free(r); | |
204 return 1; | |
205 } | |
206 return 0; | |
207 } | |
208 | |
209 void TextParser::check_urls() | |
210 { | |
211 int url_state = 0; | |
212 int url_head = 0; | |
213 int url_token = 0; | |
214 int url = 0; | |
215 for (;;) { | |
216 switch (url_state) | |
217 { | |
218 case 0: // non word chars | |
219 if (is_wordchar(line[actual] + url_head)) { | |
220 url_state = 1; | |
221 url_token = url_head; | |
222 // Unix path | |
223 } else if (*(line[actual] + url_head) == '/') { | |
224 url_state = 1; | |
225 url_token = url_head; | |
226 url = 1; | |
227 } | |
228 break; | |
229 case 1: // wordchar | |
230 char ch = *(line[actual] + url_head); | |
231 // e-mail address | |
232 if ((ch == '@') || | |
233 // MS-DOS, Windows path | |
234 (strncmp(line[actual] + url_head, ":\\", 2) == 0) || | |
235 // URL | |
236 (strncmp(line[actual] + url_head, "://", 3) == 0)) { | |
237 url = 1; | |
238 } else if (! (is_wordchar(line[actual] + url_head) || | |
239 (ch == '-') || (ch == '_') || (ch == '\\') || | |
240 (ch == '.') || (ch == ':') || (ch == '/') || | |
241 (ch == '~') || (ch == '%') || (ch == '*') || | |
242 (ch == '$') || (ch == '[') || (ch == ']') || | |
243 (ch == '?') || (ch == '!') || | |
244 ((ch >= '0') && (ch <= '9')))) { | |
245 url_state = 0; | |
246 if (url == 1) { | |
247 for (int i = url_token; i < url_head; i+
+) { | |
248 *(urlline + i) = 1; | |
249 } | |
250 } | |
251 url = 0; | |
252 } | |
253 break; | |
254 } | |
255 *(urlline + url_head) = 0; | |
256 if (next_char(line[actual], &url_head)) return; | |
257 } | |
258 } | |
259 | |
260 int TextParser::get_url(int token_pos, int * head) | |
261 { | |
262 for (int i = *head; urlline[i] && *(line[actual]+i); i++, (*head)++); | |
263 return checkurl ? 0 : urlline[token_pos]; | |
264 } | |
265 | |
266 void TextParser::set_url_checking(int check) | |
267 { | |
268 checkurl = check; | |
269 } | |
270 | |
271 | |
272 char * TextParser::alloc_token(int token, int * head) | |
273 { | |
274 if (get_url(token, head)) return NULL; | |
275 char * t = (char *) malloc(*head - token + 1); | |
276 if (t) { | |
277 t[*head - token] = '\0'; | |
278 strncpy(t, line[actual] + token, *head - token); | |
279 // remove colon for Finnish and Swedish language | |
280 if (t[*head - token - 1] == ':') { | |
281 t[*head - token - 1] = '\0'; | |
282 if (!t[0]) { | |
283 free(t); | |
284 return NULL; | |
285 } | |
286 } | |
287 return t; | |
288 } | |
289 fprintf(stderr,"Error - Insufficient Memory\n"); | |
290 return NULL; | |
291 } | |
OLD | NEW |