| OLD | NEW |
| (Empty) |
| 1 #include <cstdlib> | |
| 2 #include <cstring> | |
| 3 #include <cstdio> | |
| 4 #include <ctype.h> | |
| 5 | |
| 6 #include "../hunspell/csutil.hxx" | |
| 7 #include "textparser.hxx" | |
| 8 | |
| 9 #ifndef W32 | |
| 10 using namespace std; | |
| 11 #endif | |
| 12 | |
// ISO-8859-1 HTML character entities
//
// Each entry is the literal entity text as it appears in a document
// ("&name;").  get_latin1() only consults this table when the input
// starts with '&', so every entry must begin with that character.

static const char * LATIN1[] = {
    "&Agrave;",
    "&Atilde;",
    "&Aring;",
    "&AElig;",
    "&Egrave;",
    "&Ecirc;",
    "&Igrave;",
    "&Iuml;",
    "&ETH;",
    "&Ntilde;",
    "&Ograve;",
    "&Oslash;",
    "&Ugrave;",
    "&THORN;",
    "&agrave;",
    "&atilde;",
    "&aring;",
    "&aelig;",
    "&egrave;",
    "&ecirc;",
    "&igrave;",
    "&iuml;",
    "&eth;",
    "&ntilde;",
    "&ograve;",
    "&oslash;",
    "&ugrave;",
    "&thorn;",
    "&yuml;"
};

// Number of entries in LATIN1.
#define LATIN1_LEN (sizeof(LATIN1) / sizeof(LATIN1[0]))
| 48 | |
| 49 TextParser::TextParser() { | |
| 50 init((char *) NULL); | |
| 51 } | |
| 52 | |
| 53 TextParser::TextParser(const char * wordchars) | |
| 54 { | |
| 55 init(wordchars); | |
| 56 } | |
| 57 | |
| 58 TextParser::TextParser(unsigned short * wordchars, int len) | |
| 59 { | |
| 60 init(wordchars, len); | |
| 61 } | |
| 62 | |
| 63 TextParser::~TextParser() | |
| 64 { | |
| 65 } | |
| 66 | |
| 67 int TextParser::is_wordchar(char * w) | |
| 68 { | |
| 69 if (*w == '\0') return 0; | |
| 70 if (utf8) { | |
| 71 w_char wc; | |
| 72 unsigned short idx; | |
| 73 u8_u16(&wc, 1, w); | |
| 74 idx = (wc.h << 8) + wc.l; | |
| 75 return (unicodeisalpha(idx) || (wordchars_utf16 && flag_bsearch(
wordchars_utf16, *((unsigned short *) &wc), wclen))); | |
| 76 } else { | |
| 77 return wordcharacters[(*w + 256) % 256]; | |
| 78 } | |
| 79 } | |
| 80 | |
| 81 const char * TextParser::get_latin1(char * s) | |
| 82 { | |
| 83 if (s[0] == '&') { | |
| 84 unsigned int i = 0; | |
| 85 while ((i < LATIN1_LEN) && | |
| 86 strncmp(LATIN1[i], s, strlen(LATIN1[i]))) i++; | |
| 87 if (i != LATIN1_LEN) return LATIN1[i]; | |
| 88 } | |
| 89 return NULL; | |
| 90 } | |
| 91 | |
| 92 void TextParser::init(const char * wordchars) | |
| 93 { | |
| 94 for (int i = 0; i < MAXPREVLINE; i++) { | |
| 95 line[i][0] = '\0'; | |
| 96 } | |
| 97 actual = 0; | |
| 98 head = 0; | |
| 99 token = 0; | |
| 100 state = 0; | |
| 101 utf8 = 0; | |
| 102 checkurl = 0; | |
| 103 unsigned int j; | |
| 104 for (j = 0; j < 256; j++) { | |
| 105 wordcharacters[j] = 0; | |
| 106 } | |
| 107 if (!wordchars) wordchars = "qwertzuiopasdfghjklyxcvbnmQWERTZUIOPASDFGHJ
KLYXCVBNM"; | |
| 108 for (j = 0; j < strlen(wordchars); j++) { | |
| 109 wordcharacters[(wordchars[j] + 256) % 256] = 1; | |
| 110 } | |
| 111 } | |
| 112 | |
| 113 void TextParser::init(unsigned short * wc, int len) | |
| 114 { | |
| 115 for (int i = 0; i < MAXPREVLINE; i++) { | |
| 116 line[i][0] = '\0'; | |
| 117 } | |
| 118 actual = 0; | |
| 119 head = 0; | |
| 120 token = 0; | |
| 121 state = 0; | |
| 122 utf8 = 1; | |
| 123 checkurl = 0; | |
| 124 wordchars_utf16 = wc; | |
| 125 wclen = len; | |
| 126 } | |
| 127 | |
| 128 int TextParser::next_char(char * line, int * pos) { | |
| 129 if (*(line + *pos) == '\0') return 1; | |
| 130 if (utf8) { | |
| 131 if (*(line + *pos) >> 7) { | |
| 132 // jump to next UTF-8 character | |
| 133 for((*pos)++; (*(line + *pos) & 0xc0) == 0x80; (*pos)++); | |
| 134 } else { | |
| 135 (*pos)++; | |
| 136 } | |
| 137 } else (*pos)++; | |
| 138 return 0; | |
| 139 } | |
| 140 | |
| 141 void TextParser::put_line(char * word) | |
| 142 { | |
| 143 actual = (actual + 1) % MAXPREVLINE; | |
| 144 strcpy(line[actual], word); | |
| 145 token = 0; | |
| 146 head = 0; | |
| 147 check_urls(); | |
| 148 } | |
| 149 | |
| 150 char * TextParser::get_prevline(int n) | |
| 151 { | |
| 152 return mystrdup(line[(actual + MAXPREVLINE - n) % MAXPREVLINE]); | |
| 153 } | |
| 154 | |
| 155 char * TextParser::get_line() | |
| 156 { | |
| 157 return get_prevline(0); | |
| 158 } | |
| 159 | |
| 160 char * TextParser::next_token() | |
| 161 { | |
| 162 const char * latin1; | |
| 163 | |
| 164 for (;;) { | |
| 165 switch (state) | |
| 166 { | |
| 167 case 0: // non word chars | |
| 168 if (is_wordchar(line[actual] + head)) { | |
| 169 state = 1; | |
| 170 token = head; | |
| 171 } else if ((latin1 = get_latin1(line[actual] + head))) { | |
| 172 state = 1; | |
| 173 token = head; | |
| 174 head += strlen(latin1); | |
| 175 } | |
| 176 break; | |
| 177 case 1: // wordchar | |
| 178 if ((latin1 = get_latin1(line[actual] + head))) { | |
| 179 head += strlen(latin1); | |
| 180 } else if (! is_wordchar(line[actual] + head)) { | |
| 181 state = 0; | |
| 182 char * t = alloc_token(token, &head); | |
| 183 if (t) return t; | |
| 184 } | |
| 185 break; | |
| 186 } | |
| 187 if (next_char(line[actual], &head)) return NULL; | |
| 188 } | |
| 189 } | |
| 190 | |
| 191 int TextParser::get_tokenpos() | |
| 192 { | |
| 193 return token; | |
| 194 } | |
| 195 | |
| 196 int TextParser::change_token(const char * word) | |
| 197 { | |
| 198 if (word) { | |
| 199 char * r = mystrdup(line[actual] + head); | |
| 200 strcpy(line[actual] + token, word); | |
| 201 strcat(line[actual], r); | |
| 202 head = token; | |
| 203 free(r); | |
| 204 return 1; | |
| 205 } | |
| 206 return 0; | |
| 207 } | |
| 208 | |
| 209 void TextParser::check_urls() | |
| 210 { | |
| 211 int url_state = 0; | |
| 212 int url_head = 0; | |
| 213 int url_token = 0; | |
| 214 int url = 0; | |
| 215 for (;;) { | |
| 216 switch (url_state) | |
| 217 { | |
| 218 case 0: // non word chars | |
| 219 if (is_wordchar(line[actual] + url_head)) { | |
| 220 url_state = 1; | |
| 221 url_token = url_head; | |
| 222 // Unix path | |
| 223 } else if (*(line[actual] + url_head) == '/') { | |
| 224 url_state = 1; | |
| 225 url_token = url_head; | |
| 226 url = 1; | |
| 227 } | |
| 228 break; | |
| 229 case 1: // wordchar | |
| 230 char ch = *(line[actual] + url_head); | |
| 231 // e-mail address | |
| 232 if ((ch == '@') || | |
| 233 // MS-DOS, Windows path | |
| 234 (strncmp(line[actual] + url_head, ":\\", 2) == 0) || | |
| 235 // URL | |
| 236 (strncmp(line[actual] + url_head, "://", 3) == 0)) { | |
| 237 url = 1; | |
| 238 } else if (! (is_wordchar(line[actual] + url_head) || | |
| 239 (ch == '-') || (ch == '_') || (ch == '\\') || | |
| 240 (ch == '.') || (ch == ':') || (ch == '/') || | |
| 241 (ch == '~') || (ch == '%') || (ch == '*') || | |
| 242 (ch == '$') || (ch == '[') || (ch == ']') || | |
| 243 (ch == '?') || (ch == '!') || | |
| 244 ((ch >= '0') && (ch <= '9')))) { | |
| 245 url_state = 0; | |
| 246 if (url == 1) { | |
| 247 for (int i = url_token; i < url_head; i+
+) { | |
| 248 *(urlline + i) = 1; | |
| 249 } | |
| 250 } | |
| 251 url = 0; | |
| 252 } | |
| 253 break; | |
| 254 } | |
| 255 *(urlline + url_head) = 0; | |
| 256 if (next_char(line[actual], &url_head)) return; | |
| 257 } | |
| 258 } | |
| 259 | |
| 260 int TextParser::get_url(int token_pos, int * head) | |
| 261 { | |
| 262 for (int i = *head; urlline[i] && *(line[actual]+i); i++, (*head)++); | |
| 263 return checkurl ? 0 : urlline[token_pos]; | |
| 264 } | |
| 265 | |
| 266 void TextParser::set_url_checking(int check) | |
| 267 { | |
| 268 checkurl = check; | |
| 269 } | |
| 270 | |
| 271 | |
| 272 char * TextParser::alloc_token(int token, int * head) | |
| 273 { | |
| 274 if (get_url(token, head)) return NULL; | |
| 275 char * t = (char *) malloc(*head - token + 1); | |
| 276 if (t) { | |
| 277 t[*head - token] = '\0'; | |
| 278 strncpy(t, line[actual] + token, *head - token); | |
| 279 // remove colon for Finnish and Swedish language | |
| 280 if (t[*head - token - 1] == ':') { | |
| 281 t[*head - token - 1] = '\0'; | |
| 282 if (!t[0]) { | |
| 283 free(t); | |
| 284 return NULL; | |
| 285 } | |
| 286 } | |
| 287 return t; | |
| 288 } | |
| 289 fprintf(stderr,"Error - Insufficient Memory\n"); | |
| 290 return NULL; | |
| 291 } | |
| OLD | NEW |