Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(22)

Side by Side Diff: third_party/hunspell_new/src/parsers/textparser.cxx

Issue 1135173004: Rename third_party/hunspell_new back to third_party/hunspell. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Created 5 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 #include <cstdlib>
2 #include <cstring>
3 #include <cstdio>
4 #include <ctype.h>
5
6 #include "../hunspell/csutil.hxx"
7 #include "textparser.hxx"
8
9 #ifndef W32
10 using namespace std;
11 #endif
12
13 // ISO-8859-1 HTML character entities
14
// Named HTML entities (ISO-8859-1 range) that the tokenizer accepts as
// letters.  get_latin1() scans this table front to back and returns the
// first entry that prefixes the input.
static const char * LATIN1[] = {
    // upper-case forms
    "&Agrave;", "&Atilde;", "&Aring;",  "&AElig;",
    "&Egrave;", "&Ecirc;",  "&Igrave;", "&Iuml;",
    "&ETH;",    "&Ntilde;", "&Ograve;", "&Oslash;",
    "&Ugrave;", "&THORN;",
    // lower-case forms
    "&agrave;", "&atilde;", "&aring;",  "&aelig;",
    "&egrave;", "&ecirc;",  "&igrave;", "&iuml;",
    "&eth;",    "&ntilde;", "&ograve;", "&oslash;",
    "&ugrave;", "&thorn;",  "&yuml;"
};

// Number of entries in LATIN1.
#define LATIN1_LEN (sizeof(LATIN1) / sizeof(LATIN1[0]))
48
49 TextParser::TextParser() {
50 init((char *) NULL);
51 }
52
53 TextParser::TextParser(const char * wordchars)
54 {
55 init(wordchars);
56 }
57
58 TextParser::TextParser(unsigned short * wordchars, int len)
59 {
60 init(wordchars, len);
61 }
62
63 TextParser::~TextParser()
64 {
65 }
66
67 int TextParser::is_wordchar(char * w)
68 {
69 if (*w == '\0') return 0;
70 if (utf8) {
71 w_char wc;
72 unsigned short idx;
73 u8_u16(&wc, 1, w);
74 idx = (wc.h << 8) + wc.l;
75 return (unicodeisalpha(idx) || (wordchars_utf16 && flag_bsearch( wordchars_utf16, *((unsigned short *) &wc), wclen)));
76 } else {
77 return wordcharacters[(*w + 256) % 256];
78 }
79 }
80
81 const char * TextParser::get_latin1(char * s)
82 {
83 if (s[0] == '&') {
84 unsigned int i = 0;
85 while ((i < LATIN1_LEN) &&
86 strncmp(LATIN1[i], s, strlen(LATIN1[i]))) i++;
87 if (i != LATIN1_LEN) return LATIN1[i];
88 }
89 return NULL;
90 }
91
92 void TextParser::init(const char * wordchars)
93 {
94 for (int i = 0; i < MAXPREVLINE; i++) {
95 line[i][0] = '\0';
96 }
97 actual = 0;
98 head = 0;
99 token = 0;
100 state = 0;
101 utf8 = 0;
102 checkurl = 0;
103 unsigned int j;
104 for (j = 0; j < 256; j++) {
105 wordcharacters[j] = 0;
106 }
107 if (!wordchars) wordchars = "qwertzuiopasdfghjklyxcvbnmQWERTZUIOPASDFGHJ KLYXCVBNM";
108 for (j = 0; j < strlen(wordchars); j++) {
109 wordcharacters[(wordchars[j] + 256) % 256] = 1;
110 }
111 }
112
113 void TextParser::init(unsigned short * wc, int len)
114 {
115 for (int i = 0; i < MAXPREVLINE; i++) {
116 line[i][0] = '\0';
117 }
118 actual = 0;
119 head = 0;
120 token = 0;
121 state = 0;
122 utf8 = 1;
123 checkurl = 0;
124 wordchars_utf16 = wc;
125 wclen = len;
126 }
127
128 int TextParser::next_char(char * line, int * pos) {
129 if (*(line + *pos) == '\0') return 1;
130 if (utf8) {
131 if (*(line + *pos) >> 7) {
132 // jump to next UTF-8 character
133 for((*pos)++; (*(line + *pos) & 0xc0) == 0x80; (*pos)++);
134 } else {
135 (*pos)++;
136 }
137 } else (*pos)++;
138 return 0;
139 }
140
141 void TextParser::put_line(char * word)
142 {
143 actual = (actual + 1) % MAXPREVLINE;
144 strcpy(line[actual], word);
145 token = 0;
146 head = 0;
147 check_urls();
148 }
149
150 char * TextParser::get_prevline(int n)
151 {
152 return mystrdup(line[(actual + MAXPREVLINE - n) % MAXPREVLINE]);
153 }
154
155 char * TextParser::get_line()
156 {
157 return get_prevline(0);
158 }
159
160 char * TextParser::next_token()
161 {
162 const char * latin1;
163
164 for (;;) {
165 switch (state)
166 {
167 case 0: // non word chars
168 if (is_wordchar(line[actual] + head)) {
169 state = 1;
170 token = head;
171 } else if ((latin1 = get_latin1(line[actual] + head))) {
172 state = 1;
173 token = head;
174 head += strlen(latin1);
175 }
176 break;
177 case 1: // wordchar
178 if ((latin1 = get_latin1(line[actual] + head))) {
179 head += strlen(latin1);
180 } else if (! is_wordchar(line[actual] + head)) {
181 state = 0;
182 char * t = alloc_token(token, &head);
183 if (t) return t;
184 }
185 break;
186 }
187 if (next_char(line[actual], &head)) return NULL;
188 }
189 }
190
191 int TextParser::get_tokenpos()
192 {
193 return token;
194 }
195
196 int TextParser::change_token(const char * word)
197 {
198 if (word) {
199 char * r = mystrdup(line[actual] + head);
200 strcpy(line[actual] + token, word);
201 strcat(line[actual], r);
202 head = token;
203 free(r);
204 return 1;
205 }
206 return 0;
207 }
208
209 void TextParser::check_urls()
210 {
211 int url_state = 0;
212 int url_head = 0;
213 int url_token = 0;
214 int url = 0;
215 for (;;) {
216 switch (url_state)
217 {
218 case 0: // non word chars
219 if (is_wordchar(line[actual] + url_head)) {
220 url_state = 1;
221 url_token = url_head;
222 // Unix path
223 } else if (*(line[actual] + url_head) == '/') {
224 url_state = 1;
225 url_token = url_head;
226 url = 1;
227 }
228 break;
229 case 1: // wordchar
230 char ch = *(line[actual] + url_head);
231 // e-mail address
232 if ((ch == '@') ||
233 // MS-DOS, Windows path
234 (strncmp(line[actual] + url_head, ":\\", 2) == 0) ||
235 // URL
236 (strncmp(line[actual] + url_head, "://", 3) == 0)) {
237 url = 1;
238 } else if (! (is_wordchar(line[actual] + url_head) ||
239 (ch == '-') || (ch == '_') || (ch == '\\') ||
240 (ch == '.') || (ch == ':') || (ch == '/') ||
241 (ch == '~') || (ch == '%') || (ch == '*') ||
242 (ch == '$') || (ch == '[') || (ch == ']') ||
243 (ch == '?') || (ch == '!') ||
244 ((ch >= '0') && (ch <= '9')))) {
245 url_state = 0;
246 if (url == 1) {
247 for (int i = url_token; i < url_head; i+ +) {
248 *(urlline + i) = 1;
249 }
250 }
251 url = 0;
252 }
253 break;
254 }
255 *(urlline + url_head) = 0;
256 if (next_char(line[actual], &url_head)) return;
257 }
258 }
259
260 int TextParser::get_url(int token_pos, int * head)
261 {
262 for (int i = *head; urlline[i] && *(line[actual]+i); i++, (*head)++);
263 return checkurl ? 0 : urlline[token_pos];
264 }
265
266 void TextParser::set_url_checking(int check)
267 {
268 checkurl = check;
269 }
270
271
272 char * TextParser::alloc_token(int token, int * head)
273 {
274 if (get_url(token, head)) return NULL;
275 char * t = (char *) malloc(*head - token + 1);
276 if (t) {
277 t[*head - token] = '\0';
278 strncpy(t, line[actual] + token, *head - token);
279 // remove colon for Finnish and Swedish language
280 if (t[*head - token - 1] == ':') {
281 t[*head - token - 1] = '\0';
282 if (!t[0]) {
283 free(t);
284 return NULL;
285 }
286 }
287 return t;
288 }
289 fprintf(stderr,"Error - Insufficient Memory\n");
290 return NULL;
291 }
OLDNEW
« no previous file with comments | « third_party/hunspell_new/src/parsers/textparser.hxx ('k') | third_party/hunspell_new/tests/1463589.aff » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698