Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(128)

Side by Side Diff: third_party/hunspell/src/parsers/textparser.cxx

Issue 2544793003: [spellcheck] Updated Hunspell to 1.5.4 (Closed)
Patch Set: Test Created 4 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 /* ***** BEGIN LICENSE BLOCK *****
2 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
3 *
4 * The contents of this file are subject to the Mozilla Public License Version
5 * 1.1 (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 * http://www.mozilla.org/MPL/
8 *
9 * Software distributed under the License is distributed on an "AS IS" basis,
10 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
11 * for the specific language governing rights and limitations under the
12 * License.
13 *
14 * The Original Code is Hunspell, based on MySpell.
15 *
16 * The Initial Developers of the Original Code are
17 * Kevin Hendricks (MySpell) and Németh László (Hunspell).
18 * Portions created by the Initial Developers are Copyright (C) 2002-2005
19 * the Initial Developers. All Rights Reserved.
20 *
21 * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,
22 * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,
23 * Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,
24 * Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,
25 * Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen
26 *
27 * Alternatively, the contents of this file may be used under the terms of
28 * either the GNU General Public License Version 2 or later (the "GPL"), or
29 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
30 * in which case the provisions of the GPL or the LGPL are applicable instead
31 * of those above. If you wish to allow use of your version of this file only
32 * under the terms of either the GPL or the LGPL, and not to allow others to
33 * use your version of this file under the terms of the MPL, indicate your
34 * decision by deleting the provisions above and replace them with the notice
35 * and other provisions required by the GPL or the LGPL. If you do not delete
36 * the provisions above, a recipient may use your version of this file under
37 * the terms of any one of the MPL, the GPL or the LGPL.
38 *
39 * ***** END LICENSE BLOCK ***** */
40
1 #include <cstdlib> 41 #include <cstdlib>
2 #include <cstring> 42 #include <cstring>
3 #include <cstdio> 43 #include <cstdio>
4 #include <ctype.h> 44 #include <ctype.h>
5 45
6 #include "../hunspell/csutil.hxx" 46 #include "../hunspell/csutil.hxx"
7 #include "textparser.hxx" 47 #include "textparser.hxx"
8 48
49 #include <algorithm>
50
9 #ifndef W32 51 #ifndef W32
10 using namespace std; 52 using namespace std;
11 #endif 53 #endif
12 54
13 // ISO-8859-1 HTML character entities 55 // ISO-8859-1 HTML character entities
14 56
15 static const char * LATIN1[] = { 57 static const char* LATIN1[] = {
16 » "&Agrave;", 58 "&Agrave;", "&Atilde;", "&Aring;", "&AElig;", "&Egrave;", "&Ecirc;",
17 » "&Atilde;", 59 "&Igrave;", "&Iuml;", "&ETH;", "&Ntilde;", "&Ograve;", "&Oslash;",
18 » "&Aring;", 60 "&Ugrave;", "&THORN;", "&agrave;", "&atilde;", "&aring;", "&aelig;",
19 » "&AElig;", 61 "&egrave;", "&ecirc;", "&igrave;", "&iuml;", "&eth;", "&ntilde;",
20 » "&Egrave;", 62 "&ograve;", "&oslash;", "&ugrave;", "&thorn;", "&yuml;"};
21 » "&Ecirc;", 63
22 » "&Igrave;", 64 #define LATIN1_LEN (sizeof(LATIN1) / sizeof(char*))
23 » "&Iuml;", 65
24 » "&ETH;", 66 #define ENTITY_APOS "&apos;"
25 » "&Ntilde;", 67 #define UTF8_APOS "\xe2\x80\x99"
26 » "&Ograve;", 68 #define APOSTROPHE "'"
27 » "&Oslash;", 69
28 » "&Ugrave;", 70 TextParser::TextParser(const char* wordchars) {
29 » "&THORN;", 71 init(wordchars);
30 » "&agrave;", 72 }
31 » "&atilde;", 73
32 » "&aring;", 74 TextParser::TextParser(const w_char* wordchars, int len) {
33 » "&aelig;", 75 init(wordchars, len);
34 » "&egrave;", 76 }
35 » "&ecirc;", 77
36 » "&igrave;", 78 TextParser::~TextParser() {}
37 » "&iuml;", 79
38 » "&eth;", 80 int TextParser::is_wordchar(const char* w) {
39 » "&ntilde;", 81 if (*w == '\0')
40 » "&ograve;", 82 return 0;
41 » "&oslash;", 83 if (utf8) {
42 » "&ugrave;", 84 std::vector<w_char> wc;
43 » "&thorn;", 85 unsigned short idx;
44 » "&yuml;" 86 u8_u16(wc, w);
45 }; 87 if (wc.empty())
46 88 return 0;
47 #define LATIN1_LEN (sizeof(LATIN1) / sizeof(char *)) 89 idx = (wc[0].h << 8) + wc[0].l;
48 90 return (unicodeisalpha(idx) ||
49 TextParser::TextParser() { 91 (wordchars_utf16 &&
50 » init((char *) NULL); 92 std::binary_search(wordchars_utf16, wordchars_utf16 + wclen, wc[0])));
51 } 93 } else {
52 94 return wordcharacters[(*w + 256) % 256];
53 TextParser::TextParser(const char * wordchars) 95 }
54 { 96 }
55 » init(wordchars); 97
56 } 98 const char* TextParser::get_latin1(const char* s) {
57 99 if (s[0] == '&') {
58 TextParser::TextParser(unsigned short * wordchars, int len) 100 unsigned int i = 0;
59 { 101 while ((i < LATIN1_LEN) && strncmp(LATIN1[i], s, strlen(LATIN1[i])))
60 » init(wordchars, len); 102 i++;
61 } 103 if (i != LATIN1_LEN)
62 104 return LATIN1[i];
63 TextParser::~TextParser() 105 }
64 { 106 return NULL;
65 } 107 }
66 108
67 int TextParser::is_wordchar(char * w) 109 void TextParser::init(const char* wordchars) {
68 { 110 actual = 0;
69 if (*w == '\0') return 0; 111 head = 0;
70 » if (utf8) { 112 token = 0;
71 w_char wc; 113 state = 0;
72 unsigned short idx; 114 utf8 = 0;
73 » » u8_u16(&wc, 1, w); 115 checkurl = 0;
74 idx = (wc.h << 8) + wc.l; 116 wordchars_utf16 = NULL;
75 return (unicodeisalpha(idx) || (wordchars_utf16 && flag_bsearch( wordchars_utf16, *((unsigned short *) &wc), wclen))); 117 wclen = 0;
76 } else { 118 unsigned int j;
77 » » return wordcharacters[(*w + 256) % 256]; 119 for (j = 0; j < 256; j++) {
78 » } 120 wordcharacters[j] = 0;
79 } 121 }
80 122 if (!wordchars)
81 const char * TextParser::get_latin1(char * s) 123 wordchars = "qwertzuiopasdfghjklyxcvbnmQWERTZUIOPASDFGHJKLYXCVBNM";
82 { 124 for (j = 0; j < strlen(wordchars); j++) {
83 » if (s[0] == '&') { 125 wordcharacters[(wordchars[j] + 256) % 256] = 1;
84 » » unsigned int i = 0; 126 }
85 » » while ((i < LATIN1_LEN) && 127 }
86 » » » strncmp(LATIN1[i], s, strlen(LATIN1[i]))) i++; 128
87 » » if (i != LATIN1_LEN) return LATIN1[i]; 129 void TextParser::init(const w_char* wc, int len) {
88 » } 130 actual = 0;
89 » return NULL; 131 head = 0;
90 } 132 token = 0;
91 133 state = 0;
92 void TextParser::init(const char * wordchars) 134 utf8 = 1;
93 { 135 checkurl = 0;
94 » for (int i = 0; i < MAXPREVLINE; i++) { 136 wordchars_utf16 = wc;
95 » » line[i][0] = '\0'; 137 wclen = len;
96 » } 138 }
97 » actual = 0; 139
98 » head = 0; 140 int TextParser::next_char(const char* ln, size_t* pos) {
99 » token = 0; 141 if (*(ln + *pos) == '\0')
100 » state = 0; 142 return 1;
101 utf8 = 0; 143 if (utf8) {
102 checkurl = 0; 144 if (*(ln + *pos) >> 7) {
103 » unsigned int j; 145 // jump to next UTF-8 character
104 » for (j = 0; j < 256; j++) { 146 for ((*pos)++; (*(ln + *pos) & 0xc0) == 0x80; (*pos)++)
105 » » wordcharacters[j] = 0; 147 ;
106 » } 148 } else {
107 if (!wordchars) wordchars = "qwertzuiopasdfghjklyxcvbnmQWERTZUIOPASDFGHJKLYXCVBNM"; 149 (*pos)++;
108 » for (j = 0; j < strlen(wordchars); j++) { 150 }
109 » » wordcharacters[(wordchars[j] + 256) % 256] = 1; 151 } else
110 » } 152 (*pos)++;
111 } 153 return 0;
112 154 }
113 void TextParser::init(unsigned short * wc, int len) 155
114 { 156 void TextParser::put_line(const char* word) {
115 » for (int i = 0; i < MAXPREVLINE; i++) { 157 actual = (actual + 1) % MAXPREVLINE;
116 » » line[i][0] = '\0'; 158 line[actual].assign(word);
117 » } 159 token = 0;
118 » actual = 0; 160 head = 0;
119 » head = 0; 161 check_urls();
120 » token = 0; 162 }
121 » state = 0; 163
122 » utf8 = 1; 164 std::string TextParser::get_prevline(int n) const {
123 » checkurl = 0; 165 return line[(actual + MAXPREVLINE - n) % MAXPREVLINE];
124 wordchars_utf16 = wc; 166 }
125 wclen = len; 167
126 } 168 std::string TextParser::get_line() const {
127 169 return get_prevline(0);
128 int TextParser::next_char(char * line, int * pos) { 170 }
129 if (*(line + *pos) == '\0') return 1; 171
130 » if (utf8) { 172 bool TextParser::next_token(std::string &t) {
131 if (*(line + *pos) >> 7) { 173 const char* latin1;
132 // jump to next UTF-8 character 174
133 for((*pos)++; (*(line + *pos) & 0xc0) == 0x80; (*pos)++); 175 for (;;) {
134 } else { 176 switch (state) {
135 (*pos)++; 177 case 0: // non word chars
178 if (is_wordchar(line[actual].c_str() + head)) {
179 state = 1;
180 token = head;
181 } else if ((latin1 = get_latin1(line[actual].c_str() + head))) {
182 state = 1;
183 token = head;
184 head += strlen(latin1);
185 }
186 break;
187 case 1: // wordchar
188 if ((latin1 = get_latin1(line[actual].c_str() + head))) {
189 head += strlen(latin1);
190 } else if ((is_wordchar((char*)APOSTROPHE) ||
191 (is_utf8() && is_wordchar((char*)UTF8_APOS))) &&
192 !line[actual].empty() && line[actual][head] == '\'' &&
193 is_wordchar(line[actual].c_str() + head + 1)) {
194 head++;
195 } else if (is_utf8() &&
196 is_wordchar((char*)APOSTROPHE) && // add Unicode apostrophe
197 // to the WORDCHARS, if
198 // needed
199 strncmp(line[actual].c_str() + head, UTF8_APOS, strlen(UTF8_APOS)) ==
200 0 &&
201 is_wordchar(line[actual].c_str() + head + strlen(UTF8_APOS))) {
202 head += strlen(UTF8_APOS) - 1;
203 } else if (!is_wordchar(line[actual].c_str() + head)) {
204 state = 0;
205 if (alloc_token(token, &head, t))
206 return true;
207 }
208 break;
209 }
210 if (next_char(line[actual].c_str(), &head))
211 return false;
212 }
213 }
214
215 size_t TextParser::get_tokenpos() {
216 return token;
217 }
218
219 int TextParser::change_token(const char* word) {
220 if (word) {
221 std::string remainder(line[actual].substr(head));
222 line[actual].resize(token);
223 line[actual].append(word);
224 line[actual].append(remainder);
225 head = token;
226 return 1;
227 }
228 return 0;
229 }
230
231 void TextParser::check_urls() {
232 urlline.resize(line[actual].size() + 1);
233 int url_state = 0;
234 size_t url_head = 0;
235 size_t url_token = 0;
236 int url = 0;
237 for (;;) {
238 switch (url_state) {
239 case 0: // non word chars
240 if (is_wordchar(line[actual].c_str() + url_head)) {
241 url_state = 1;
242 url_token = url_head;
243 // Unix path
244 } else if (line[actual][url_head] == '/') {
245 url_state = 1;
246 url_token = url_head;
247 url = 1;
248 }
249 break;
250 case 1: // wordchar
251 char ch = line[actual][url_head];
252 // e-mail address
253 if ((ch == '@') ||
254 // MS-DOS, Windows path
255 (strncmp(line[actual].c_str() + url_head, ":\\", 2) == 0) ||
256 // URL
257 (strncmp(line[actual].c_str() + url_head, "://", 3) == 0)) {
258 url = 1;
259 } else if (!(is_wordchar(line[actual].c_str() + url_head) || (ch == '-') ||
260 (ch == '_') || (ch == '\\') || (ch == '.') ||
261 (ch == ':') || (ch == '/') || (ch == '~') || (ch == '%') ||
262 (ch == '*') || (ch == '$') || (ch == '[') || (ch == ']') ||
263 (ch == '?') || (ch == '!') ||
264 ((ch >= '0') && (ch <= '9')))) {
265 url_state = 0;
266 if (url == 1) {
267 for (size_t i = url_token; i < url_head; ++i) {
268 urlline[i] = true;
136 } 269 }
137 } else (*pos)++; 270 }
138 return 0; 271 url = 0;
139 } 272 }
140 273 break;
141 void TextParser::put_line(char * word) 274 }
142 { 275 urlline[url_head] = false;
143 » actual = (actual + 1) % MAXPREVLINE; 276 if (next_char(line[actual].c_str(), &url_head))
144 » strcpy(line[actual], word); 277 return;
145 » token = 0; 278 }
146 » head = 0; 279 }
147 » check_urls(); 280
148 } 281 int TextParser::get_url(size_t token_pos, size_t* hd) {
149 282 for (size_t i = *hd; i < line[actual].size() && urlline[i]; i++, (*hd)++)
150 char * TextParser::get_prevline(int n) 283 ;
151 { 284 return checkurl ? 0 : urlline[token_pos];
152 » return mystrdup(line[(actual + MAXPREVLINE - n) % MAXPREVLINE]); 285 }
153 } 286
154 287 void TextParser::set_url_checking(int check) {
155 char * TextParser::get_line() 288 checkurl = check;
156 { 289 }
157 » return get_prevline(0); 290
158 } 291 bool TextParser::alloc_token(size_t tokn, size_t* hd, std::string& t) {
159 292 size_t url_head = *hd;
160 char * TextParser::next_token() 293 if (get_url(tokn, &url_head))
161 { 294 return false;
162 » const char * latin1; 295 t = line[actual].substr(tokn, *hd - tokn);
163 » 296 // remove colon for Finnish and Swedish language
164 » for (;;) { 297 if (!t.empty() && t[t.size() - 1] == ':') {
165 » » switch (state) 298 t.resize(t.size() - 1);
166 » » { 299 if (t.empty()) {
167 » » case 0: // non word chars 300 return false;
168 » » » if (is_wordchar(line[actual] + head)) { 301 }
169 » » » » state = 1; 302 }
170 » » » » token = head; 303 return true;
171 » » » } else if ((latin1 = get_latin1(line[actual] + head))) { 304 }
172 » » » » state = 1;
173 » » » » token = head;
174 » » » » head += strlen(latin1);
175 » » » }
176 » » » break;
177 » » case 1: // wordchar
178 » » » if ((latin1 = get_latin1(line[actual] + head))) {
179 » » » » head += strlen(latin1);
180 » » » } else if (! is_wordchar(line[actual] + head)) {
181 » » » » state = 0;
182 » » » » char * t = alloc_token(token, &head);
183 » » » » if (t) return t;
184 » » » }
185 » » » break;
186 » » }
187 if (next_char(line[actual], &head)) return NULL;
188 » }
189 }
190
191 int TextParser::get_tokenpos()
192 {
193 » return token;
194 }
195
196 int TextParser::change_token(const char * word)
197 {
198 » if (word) {
199 » » char * r = mystrdup(line[actual] + head);
200 » » strcpy(line[actual] + token, word);
201 » » strcat(line[actual], r);
202 » » head = token;
203 » » free(r);
204 » » return 1;
205 » }
206 » return 0;
207 }
208
209 void TextParser::check_urls()
210 {
211 » int url_state = 0;
212 » int url_head = 0;
213 » int url_token = 0;
214 » int url = 0;
215 » for (;;) {
216 » » switch (url_state)
217 » » {
218 » » case 0: // non word chars
219 » » » if (is_wordchar(line[actual] + url_head)) {
220 » » » » url_state = 1;
221 » » » » url_token = url_head;
222 » » » // Unix path
223 » » » } else if (*(line[actual] + url_head) == '/') {
224 » » » » url_state = 1;
225 » » » » url_token = url_head;
226 » » » » url = 1;
227 » » » }
228 » » » break;
229 » » case 1: // wordchar
230 » » » char ch = *(line[actual] + url_head);
231 » » » // e-mail address
232 » » » if ((ch == '@') ||
233 » » » // MS-DOS, Windows path
234 » » » (strncmp(line[actual] + url_head, ":\\", 2) == 0) ||
235 » » » // URL
236 » » » (strncmp(line[actual] + url_head, "://", 3) == 0)) {
237 » » » » url = 1;
238 » » » } else if (! (is_wordchar(line[actual] + url_head) ||
239 » » » (ch == '-') || (ch == '_') || (ch == '\\') ||
240 » » » (ch == '.') || (ch == ':') || (ch == '/') ||
241 » » » (ch == '~') || (ch == '%') || (ch == '*') ||
242 » » » (ch == '$') || (ch == '[') || (ch == ']') ||
243 » » » (ch == '?') || (ch == '!') ||
244 » » » ((ch >= '0') && (ch <= '9')))) {
245 » » » » url_state = 0;
246 » » » » if (url == 1) {
247 » » » » » for (int i = url_token; i < url_head; i++) {
248 » » » » » » *(urlline + i) = 1;
249 » » » » » }
250 » » » » }
251 » » » » url = 0;
252 » » » }
253 » » » break;
254 » » }
255 » » *(urlline + url_head) = 0;
256 if (next_char(line[actual], &url_head)) return;
257 » }
258 }
259
260 int TextParser::get_url(int token_pos, int * head)
261 {
262 » for (int i = *head; urlline[i] && *(line[actual]+i); i++, (*head)++);
263 » return checkurl ? 0 : urlline[token_pos];
264 }
265
266 void TextParser::set_url_checking(int check)
267 {
268 » checkurl = check;
269 }
270
271
272 char * TextParser::alloc_token(int token, int * head)
273 {
274 if (get_url(token, head)) return NULL;
275 char * t = (char *) malloc(*head - token + 1);
276 if (t) {
277 t[*head - token] = '\0';
278 strncpy(t, line[actual] + token, *head - token);
279 » // remove colon for Finnish and Swedish language
280 if (t[*head - token - 1] == ':') {
281 » t[*head - token - 1] = '\0';
282 » if (!t[0]) {
283 » » free(t);
284 » » return NULL;
285 » }
286 » }
287 return t;
288 }
289 fprintf(stderr,"Error - Insufficient Memory\n");
290 return NULL;
291 }
OLDNEW
« no previous file with comments | « third_party/hunspell/src/parsers/textparser.hxx ('k') | third_party/hunspell/src/parsers/xmlparser.hxx » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698