| OLD | NEW |
| 1 /* ***** BEGIN LICENSE BLOCK ***** |
| 2 * Version: MPL 1.1/GPL 2.0/LGPL 2.1 |
| 3 * |
| 4 * The contents of this file are subject to the Mozilla Public License Version |
| 5 * 1.1 (the "License"); you may not use this file except in compliance with |
| 6 * the License. You may obtain a copy of the License at |
| 7 * http://www.mozilla.org/MPL/ |
| 8 * |
| 9 * Software distributed under the License is distributed on an "AS IS" basis, |
| 10 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License |
| 11 * for the specific language governing rights and limitations under the |
| 12 * License. |
| 13 * |
| 14 * The Original Code is Hunspell, based on MySpell. |
| 15 * |
| 16 * The Initial Developers of the Original Code are |
| 17 * Kevin Hendricks (MySpell) and Németh László (Hunspell). |
| 18 * Portions created by the Initial Developers are Copyright (C) 2002-2005 |
| 19 * the Initial Developers. All Rights Reserved. |
| 20 * |
| 21 * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno, |
| 22 * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád, |
| 23 * Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter, |
| 24 * Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls, |
| 25 * Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen |
| 26 * |
| 27 * Alternatively, the contents of this file may be used under the terms of |
| 28 * either the GNU General Public License Version 2 or later (the "GPL"), or |
| 29 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), |
| 30 * in which case the provisions of the GPL or the LGPL are applicable instead |
| 31 * of those above. If you wish to allow use of your version of this file only |
| 32 * under the terms of either the GPL or the LGPL, and not to allow others to |
| 33 * use your version of this file under the terms of the MPL, indicate your |
| 34 * decision by deleting the provisions above and replace them with the notice |
| 35 * and other provisions required by the GPL or the LGPL. If you do not delete |
| 36 * the provisions above, a recipient may use your version of this file under |
| 37 * the terms of any one of the MPL, the GPL or the LGPL. |
| 38 * |
| 39 * ***** END LICENSE BLOCK ***** */ |
| 40 |
| 1 #include <cstdlib> | 41 #include <cstdlib> |
| 2 #include <cstring> | 42 #include <cstring> |
| 3 #include <cstdio> | 43 #include <cstdio> |
| 4 #include <ctype.h> | 44 #include <ctype.h> |
| 5 | 45 |
| 6 #include "../hunspell/csutil.hxx" | 46 #include "../hunspell/csutil.hxx" |
| 7 #include "textparser.hxx" | 47 #include "textparser.hxx" |
| 8 | 48 |
| 49 #include <algorithm> |
| 50 |
| 9 #ifndef W32 | 51 #ifndef W32 |
| 10 using namespace std; | 52 using namespace std; |
| 11 #endif | 53 #endif |
| 12 | 54 |
| 13 // ISO-8859-1 HTML character entities | 55 // ISO-8859-1 HTML character entities |
| 14 | 56 |
| 15 static const char * LATIN1[] = { | 57 static const char* LATIN1[] = { |
| 16 » "&Agrave;", | 58 "&Agrave;", "&Atilde;", "&Aring;", "&AElig;", "&Egrave;", "&Ecirc;", |
| 17 » "&Atilde;", | 59 "&Igrave;", "&Iuml;", "&ETH;", "&Ntilde;", "&Ograve;", "&Oslash;", |
| 18 » "&Aring;", | 60 "&Ugrave;", "&THORN;", "&agrave;", "&atilde;", "&aring;", "&aelig;", |
| 19 » "&AElig;", | 61 "&egrave;", "&ecirc;", "&igrave;", "&iuml;", "&eth;", "&ntilde;", |
| 20 » "&Egrave;", | 62 "&ograve;", "&oslash;", "&ugrave;", "&thorn;", "&yuml;"}; |
| 21 » "&Ecirc;", | 63 |
| 22 » "&Igrave;", | 64 #define LATIN1_LEN (sizeof(LATIN1) / sizeof(char*)) |
| 23 » "&Iuml;", | 65 |
| 24 » "&ETH;", | 66 #define ENTITY_APOS "&apos;" |
| 25 » "&Ntilde;", | 67 #define UTF8_APOS "\xe2\x80\x99" |
| 26 » "&Ograve;", | 68 #define APOSTROPHE "'" |
| 27 » "&Oslash;", | 69 |
| 28 » "&Ugrave;", | 70 TextParser::TextParser(const char* wordchars) { |
| 29 » "&THORN;", | 71 init(wordchars); |
| 30 » "&agrave;", | 72 } |
| 31 » "&atilde;", | 73 |
| 32 » "&aring;", | 74 TextParser::TextParser(const w_char* wordchars, int len) { |
| 33 » "&aelig;", | 75 init(wordchars, len); |
| 34 » "&egrave;", | 76 } |
| 35 » "&ecirc;", | 77 |
| 36 » "&igrave;", | 78 TextParser::~TextParser() {} |
| 37 » "&iuml;", | 79 |
| 38 » "&eth;", | 80 int TextParser::is_wordchar(const char* w) { |
| 39 » "&ntilde;", | 81 if (*w == '\0') |
| 40 » "&ograve;", | 82 return 0; |
| 41 » "&oslash;", | 83 if (utf8) { |
| 42 » "&ugrave;", | 84 std::vector<w_char> wc; |
| 43 » "&thorn;", | 85 unsigned short idx; |
| 44 » "&yuml;" | 86 u8_u16(wc, w); |
| 45 }; | 87 if (wc.empty()) |
| 46 | 88 return 0; |
| 47 #define LATIN1_LEN (sizeof(LATIN1) / sizeof(char *)) | 89 idx = (wc[0].h << 8) + wc[0].l; |
| 48 | 90 return (unicodeisalpha(idx) || |
| 49 TextParser::TextParser() { | 91 (wordchars_utf16 && |
| 50 » init((char *) NULL); | 92 std::binary_search(wordchars_utf16, wordchars_utf16 + wclen, wc[0]))); |
| 51 } | 93 } else { |
| 52 | 94 return wordcharacters[(*w + 256) % 256]; |
| 53 TextParser::TextParser(const char * wordchars) | 95 } |
| 54 { | 96 } |
| 55 » init(wordchars); | 97 |
| 56 } | 98 const char* TextParser::get_latin1(const char* s) { |
| 57 | 99 if (s[0] == '&') { |
| 58 TextParser::TextParser(unsigned short * wordchars, int len) | 100 unsigned int i = 0; |
| 59 { | 101 while ((i < LATIN1_LEN) && strncmp(LATIN1[i], s, strlen(LATIN1[i]))) |
| 60 » init(wordchars, len); | 102 i++; |
| 61 } | 103 if (i != LATIN1_LEN) |
| 62 | 104 return LATIN1[i]; |
| 63 TextParser::~TextParser() | 105 } |
| 64 { | 106 return NULL; |
| 65 } | 107 } |
| 66 | 108 |
| 67 int TextParser::is_wordchar(char * w) | 109 void TextParser::init(const char* wordchars) { |
| 68 { | 110 actual = 0; |
| 69 if (*w == '\0') return 0; | 111 head = 0; |
| 70 » if (utf8) { | 112 token = 0; |
| 71 w_char wc; | 113 state = 0; |
| 72 unsigned short idx; | 114 utf8 = 0; |
| 73 » » u8_u16(&wc, 1, w); | 115 checkurl = 0; |
| 74 idx = (wc.h << 8) + wc.l; | 116 wordchars_utf16 = NULL; |
| 75 return (unicodeisalpha(idx) || (wordchars_utf16 && flag_bsearch(wordchars_utf16, *((unsigned short *) &wc), wclen))); | 117 wclen = 0; |
| 76 } else { | 118 unsigned int j; |
| 77 » » return wordcharacters[(*w + 256) % 256]; | 119 for (j = 0; j < 256; j++) { |
| 78 » } | 120 wordcharacters[j] = 0; |
| 79 } | 121 } |
| 80 | 122 if (!wordchars) |
| 81 const char * TextParser::get_latin1(char * s) | 123 wordchars = "qwertzuiopasdfghjklyxcvbnmQWERTZUIOPASDFGHJKLYXCVBNM"; |
| 82 { | 124 for (j = 0; j < strlen(wordchars); j++) { |
| 83 » if (s[0] == '&') { | 125 wordcharacters[(wordchars[j] + 256) % 256] = 1; |
| 84 » » unsigned int i = 0; | 126 } |
| 85 » » while ((i < LATIN1_LEN) && | 127 } |
| 86 » » » strncmp(LATIN1[i], s, strlen(LATIN1[i]))) i++; | 128 |
| 87 » » if (i != LATIN1_LEN) return LATIN1[i]; | 129 void TextParser::init(const w_char* wc, int len) { |
| 88 » } | 130 actual = 0; |
| 89 » return NULL; | 131 head = 0; |
| 90 } | 132 token = 0; |
| 91 | 133 state = 0; |
| 92 void TextParser::init(const char * wordchars) | 134 utf8 = 1; |
| 93 { | 135 checkurl = 0; |
| 94 » for (int i = 0; i < MAXPREVLINE; i++) { | 136 wordchars_utf16 = wc; |
| 95 » » line[i][0] = '\0'; | 137 wclen = len; |
| 96 » } | 138 } |
| 97 » actual = 0; | 139 |
| 98 » head = 0; | 140 int TextParser::next_char(const char* ln, size_t* pos) { |
| 99 » token = 0; | 141 if (*(ln + *pos) == '\0') |
| 100 » state = 0; | 142 return 1; |
| 101 utf8 = 0; | 143 if (utf8) { |
| 102 checkurl = 0; | 144 if (*(ln + *pos) >> 7) { |
| 103 » unsigned int j; | 145 // jump to next UTF-8 character |
| 104 » for (j = 0; j < 256; j++) { | 146 for ((*pos)++; (*(ln + *pos) & 0xc0) == 0x80; (*pos)++) |
| 105 » » wordcharacters[j] = 0; | 147 ; |
| 106 » } | 148 } else { |
| 107 if (!wordchars) wordchars = "qwertzuiopasdfghjklyxcvbnmQWERTZUIOPASDFGHJKLYXCVBNM"; | 149 (*pos)++; |
| 108 » for (j = 0; j < strlen(wordchars); j++) { | 150 } |
| 109 » » wordcharacters[(wordchars[j] + 256) % 256] = 1; | 151 } else |
| 110 » } | 152 (*pos)++; |
| 111 } | 153 return 0; |
| 112 | 154 } |
| 113 void TextParser::init(unsigned short * wc, int len) | 155 |
| 114 { | 156 void TextParser::put_line(const char* word) { |
| 115 » for (int i = 0; i < MAXPREVLINE; i++) { | 157 actual = (actual + 1) % MAXPREVLINE; |
| 116 » » line[i][0] = '\0'; | 158 line[actual].assign(word); |
| 117 » } | 159 token = 0; |
| 118 » actual = 0; | 160 head = 0; |
| 119 » head = 0; | 161 check_urls(); |
| 120 » token = 0; | 162 } |
| 121 » state = 0; | 163 |
| 122 » utf8 = 1; | 164 std::string TextParser::get_prevline(int n) const { |
| 123 » checkurl = 0; | 165 return line[(actual + MAXPREVLINE - n) % MAXPREVLINE]; |
| 124 wordchars_utf16 = wc; | 166 } |
| 125 wclen = len; | 167 |
| 126 } | 168 std::string TextParser::get_line() const { |
| 127 | 169 return get_prevline(0); |
| 128 int TextParser::next_char(char * line, int * pos) { | 170 } |
| 129 if (*(line + *pos) == '\0') return 1; | 171 |
| 130 » if (utf8) { | 172 bool TextParser::next_token(std::string &t) { |
| 131 if (*(line + *pos) >> 7) { | 173 const char* latin1; |
| 132 // jump to next UTF-8 character | 174 |
| 133 for((*pos)++; (*(line + *pos) & 0xc0) == 0x80; (*pos)++); | 175 for (;;) { |
| 134 } else { | 176 switch (state) { |
| 135 (*pos)++; | 177 case 0: // non word chars |
| 178 if (is_wordchar(line[actual].c_str() + head)) { |
| 179 state = 1; |
| 180 token = head; |
| 181 } else if ((latin1 = get_latin1(line[actual].c_str() + head))) { |
| 182 state = 1; |
| 183 token = head; |
| 184 head += strlen(latin1); |
| 185 } |
| 186 break; |
| 187 case 1: // wordchar |
| 188 if ((latin1 = get_latin1(line[actual].c_str() + head))) { |
| 189 head += strlen(latin1); |
| 190 } else if ((is_wordchar((char*)APOSTROPHE) || |
| 191 (is_utf8() && is_wordchar((char*)UTF8_APOS))) && |
| 192 !line[actual].empty() && line[actual][head] == '\'' && |
| 193 is_wordchar(line[actual].c_str() + head + 1)) { |
| 194 head++; |
| 195 } else if (is_utf8() && |
| 196 is_wordchar((char*)APOSTROPHE) && // add Unicode apostrophe |
| 197 // to the WORDCHARS, if |
| 198 // needed |
| 199 strncmp(line[actual].c_str() + head, UTF8_APOS, strlen(UTF8_APOS)) == |
| 200 0 && |
| 201 is_wordchar(line[actual].c_str() + head + strlen(UTF8_APOS))) { |
| 202 head += strlen(UTF8_APOS) - 1; |
| 203 } else if (!is_wordchar(line[actual].c_str() + head)) { |
| 204 state = 0; |
| 205 if (alloc_token(token, &head, t)) |
| 206 return true; |
| 207 } |
| 208 break; |
| 209 } |
| 210 if (next_char(line[actual].c_str(), &head)) |
| 211 return false; |
| 212 } |
| 213 } |
| 214 |
| 215 size_t TextParser::get_tokenpos() { |
| 216 return token; |
| 217 } |
| 218 |
| 219 int TextParser::change_token(const char* word) { |
| 220 if (word) { |
| 221 std::string remainder(line[actual].substr(head)); |
| 222 line[actual].resize(token); |
| 223 line[actual].append(word); |
| 224 line[actual].append(remainder); |
| 225 head = token; |
| 226 return 1; |
| 227 } |
| 228 return 0; |
| 229 } |
| 230 |
| 231 void TextParser::check_urls() { |
| 232 urlline.resize(line[actual].size() + 1); |
| 233 int url_state = 0; |
| 234 size_t url_head = 0; |
| 235 size_t url_token = 0; |
| 236 int url = 0; |
| 237 for (;;) { |
| 238 switch (url_state) { |
| 239 case 0: // non word chars |
| 240 if (is_wordchar(line[actual].c_str() + url_head)) { |
| 241 url_state = 1; |
| 242 url_token = url_head; |
| 243 // Unix path |
| 244 } else if (line[actual][url_head] == '/') { |
| 245 url_state = 1; |
| 246 url_token = url_head; |
| 247 url = 1; |
| 248 } |
| 249 break; |
| 250 case 1: // wordchar |
| 251 char ch = line[actual][url_head]; |
| 252 // e-mail address |
| 253 if ((ch == '@') || |
| 254 // MS-DOS, Windows path |
| 255 (strncmp(line[actual].c_str() + url_head, ":\\", 2) == 0) || |
| 256 // URL |
| 257 (strncmp(line[actual].c_str() + url_head, "://", 3) == 0)) { |
| 258 url = 1; |
| 259 } else if (!(is_wordchar(line[actual].c_str() + url_head) || (ch == '-') || |
| 260 (ch == '_') || (ch == '\\') || (ch == '.') || |
| 261 (ch == ':') || (ch == '/') || (ch == '~') || (ch == '%') || |
| 262 (ch == '*') || (ch == '$') || (ch == '[') || (ch == ']') || |
| 263 (ch == '?') || (ch == '!') || |
| 264 ((ch >= '0') && (ch <= '9')))) { |
| 265 url_state = 0; |
| 266 if (url == 1) { |
| 267 for (size_t i = url_token; i < url_head; ++i) { |
| 268 urlline[i] = true; |
| 136 } | 269 } |
| 137 } else (*pos)++; | 270 } |
| 138 return 0; | 271 url = 0; |
| 139 } | 272 } |
| 140 | 273 break; |
| 141 void TextParser::put_line(char * word) | 274 } |
| 142 { | 275 urlline[url_head] = false; |
| 143 » actual = (actual + 1) % MAXPREVLINE; | 276 if (next_char(line[actual].c_str(), &url_head)) |
| 144 » strcpy(line[actual], word); | 277 return; |
| 145 » token = 0; | 278 } |
| 146 » head = 0; | 279 } |
| 147 » check_urls(); | 280 |
| 148 } | 281 int TextParser::get_url(size_t token_pos, size_t* hd) { |
| 149 | 282 for (size_t i = *hd; i < line[actual].size() && urlline[i]; i++, (*hd)++) |
| 150 char * TextParser::get_prevline(int n) | 283 ; |
| 151 { | 284 return checkurl ? 0 : urlline[token_pos]; |
| 152 » return mystrdup(line[(actual + MAXPREVLINE - n) % MAXPREVLINE]); | 285 } |
| 153 } | 286 |
| 154 | 287 void TextParser::set_url_checking(int check) { |
| 155 char * TextParser::get_line() | 288 checkurl = check; |
| 156 { | 289 } |
| 157 » return get_prevline(0); | 290 |
| 158 } | 291 bool TextParser::alloc_token(size_t tokn, size_t* hd, std::string& t) { |
| 159 | 292 size_t url_head = *hd; |
| 160 char * TextParser::next_token() | 293 if (get_url(tokn, &url_head)) |
| 161 { | 294 return false; |
| 162 » const char * latin1; | 295 t = line[actual].substr(tokn, *hd - tokn); |
| 163 » | 296 // remove colon for Finnish and Swedish language |
| 164 » for (;;) { | 297 if (!t.empty() && t[t.size() - 1] == ':') { |
| 165 » » switch (state) | 298 t.resize(t.size() - 1); |
| 166 » » { | 299 if (t.empty()) { |
| 167 » » case 0: // non word chars | 300 return false; |
| 168 » » » if (is_wordchar(line[actual] + head)) { | 301 } |
| 169 » » » » state = 1; | 302 } |
| 170 » » » » token = head; | 303 return true; |
| 171 » » » } else if ((latin1 = get_latin1(line[actual] + head))) { | 304 } |
| 172 » » » » state = 1; | |
| 173 » » » » token = head; | |
| 174 » » » » head += strlen(latin1); | |
| 175 » » » } | |
| 176 » » » break; | |
| 177 » » case 1: // wordchar | |
| 178 » » » if ((latin1 = get_latin1(line[actual] + head))) { | |
| 179 » » » » head += strlen(latin1); | |
| 180 » » » } else if (! is_wordchar(line[actual] + head)) { | |
| 181 » » » » state = 0; | |
| 182 » » » » char * t = alloc_token(token, &head); | |
| 183 » » » » if (t) return t; | |
| 184 » » » } | |
| 185 » » » break; | |
| 186 » » } | |
| 187 if (next_char(line[actual], &head)) return NULL; | |
| 188 » } | |
| 189 } | |
| 190 | |
| 191 int TextParser::get_tokenpos() | |
| 192 { | |
| 193 » return token; | |
| 194 } | |
| 195 | |
| 196 int TextParser::change_token(const char * word) | |
| 197 { | |
| 198 » if (word) { | |
| 199 » » char * r = mystrdup(line[actual] + head); | |
| 200 » » strcpy(line[actual] + token, word); | |
| 201 » » strcat(line[actual], r); | |
| 202 » » head = token; | |
| 203 » » free(r); | |
| 204 » » return 1; | |
| 205 » } | |
| 206 » return 0; | |
| 207 } | |
| 208 | |
| 209 void TextParser::check_urls() | |
| 210 { | |
| 211 » int url_state = 0; | |
| 212 » int url_head = 0; | |
| 213 » int url_token = 0; | |
| 214 » int url = 0; | |
| 215 » for (;;) { | |
| 216 » » switch (url_state) | |
| 217 » » { | |
| 218 » » case 0: // non word chars | |
| 219 » » » if (is_wordchar(line[actual] + url_head)) { | |
| 220 » » » » url_state = 1; | |
| 221 » » » » url_token = url_head; | |
| 222 » » » // Unix path | |
| 223 » » » } else if (*(line[actual] + url_head) == '/') { | |
| 224 » » » » url_state = 1; | |
| 225 » » » » url_token = url_head; | |
| 226 » » » » url = 1; | |
| 227 » » » } | |
| 228 » » » break; | |
| 229 » » case 1: // wordchar | |
| 230 » » » char ch = *(line[actual] + url_head); | |
| 231 » » » // e-mail address | |
| 232 » » » if ((ch == '@') || | |
| 233 » » » // MS-DOS, Windows path | |
| 234 » » » (strncmp(line[actual] + url_head, ":\\", 2) == 0) || | |
| 235 » » » // URL | |
| 236 » » » (strncmp(line[actual] + url_head, "://", 3) == 0)) { | |
| 237 » » » » url = 1; | |
| 238 » » » } else if (! (is_wordchar(line[actual] + url_head) || | |
| 239 » » » (ch == '-') || (ch == '_') || (ch == '\\') || | |
| 240 » » » (ch == '.') || (ch == ':') || (ch == '/') || | |
| 241 » » » (ch == '~') || (ch == '%') || (ch == '*') || | |
| 242 » » » (ch == '$') || (ch == '[') || (ch == ']') || | |
| 243 » » » (ch == '?') || (ch == '!') || | |
| 244 » » » ((ch >= '0') && (ch <= '9')))) { | |
| 245 » » » » url_state = 0; | |
| 246 » » » » if (url == 1) { | |
| 247 » » » » » for (int i = url_token; i < url_head; i++) { | |
| 248 » » » » » » *(urlline + i) = 1; | |
| 249 » » » » » } | |
| 250 » » » » } | |
| 251 » » » » url = 0; | |
| 252 » » » } | |
| 253 » » » break; | |
| 254 » » } | |
| 255 » » *(urlline + url_head) = 0; | |
| 256 if (next_char(line[actual], &url_head)) return; | |
| 257 » } | |
| 258 } | |
| 259 | |
| 260 int TextParser::get_url(int token_pos, int * head) | |
| 261 { | |
| 262 » for (int i = *head; urlline[i] && *(line[actual]+i); i++, (*head)++); | |
| 263 » return checkurl ? 0 : urlline[token_pos]; | |
| 264 } | |
| 265 | |
| 266 void TextParser::set_url_checking(int check) | |
| 267 { | |
| 268 » checkurl = check; | |
| 269 } | |
| 270 | |
| 271 | |
| 272 char * TextParser::alloc_token(int token, int * head) | |
| 273 { | |
| 274 if (get_url(token, head)) return NULL; | |
| 275 char * t = (char *) malloc(*head - token + 1); | |
| 276 if (t) { | |
| 277 t[*head - token] = '\0'; | |
| 278 strncpy(t, line[actual] + token, *head - token); | |
| 279 » // remove colon for Finnish and Swedish language | |
| 280 if (t[*head - token - 1] == ':') { | |
| 281 » t[*head - token - 1] = '\0'; | |
| 282 » if (!t[0]) { | |
| 283 » » free(t); | |
| 284 » » return NULL; | |
| 285 » } | |
| 286 » } | |
| 287 return t; | |
| 288 } | |
| 289 fprintf(stderr,"Error - Insufficient Memory\n"); | |
| 290 return NULL; | |
| 291 } | |
| OLD | NEW |