third_party/hunspell_new/src/parsers/textparser.cxx - Issue 1135173004: Rename third_party/hunspell_new back to third_party/hunspell.

Side by Side Diff: third_party/hunspell_new/src/parsers/textparser.cxx

Issue 1135173004: Rename third_party/hunspell_new back to third_party/hunspell. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Created 5 years, 7 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
	(Empty)
1 #include <cstdlib>

2 #include <cstring>

3 #include <cstdio>

4 #include <ctype.h>

5

6 #include "../hunspell/csutil.hxx"

7 #include "textparser.hxx"

8

9 #ifndef W32

10 using namespace std;

11 #endif

12

// ISO-8859-1 HTML character entities that may appear inside a word.
//
// Each entry is the literal entity text as written in the input
// ("&Agrave;", not the decoded character): get_latin1() only consults this
// table when the text starts with '&' and prefix-matches the entries with
// strncmp(), so decoded characters could never match.
static const char * LATIN1[] = {
    "&Agrave;",
    "&Atilde;",
    "&Aring;",
    "&AElig;",
    "&Egrave;",
    "&Ecirc;",
    "&Igrave;",
    "&Iuml;",
    "&ETH;",
    "&Ntilde;",
    "&Ograve;",
    "&Oslash;",
    "&Ugrave;",
    "&THORN;",
    "&agrave;",
    "&atilde;",
    "&aring;",
    "&aelig;",
    "&egrave;",
    "&ecirc;",
    "&igrave;",
    "&iuml;",
    "&eth;",
    "&ntilde;",
    "&ograve;",
    "&oslash;",
    "&ugrave;",
    "&thorn;",
    "&yuml;"
};

// Number of entries in LATIN1.
#define LATIN1_LEN (sizeof(LATIN1) / sizeof(LATIN1[0]))

48

49 TextParser::TextParser() {

50 init((char *) NULL);

51 }

52

53 TextParser::TextParser(const char * wordchars)

54 {

55 init(wordchars);

56 }

57

58 TextParser::TextParser(unsigned short * wordchars, int len)

59 {

60 init(wordchars, len);

61 }

62

63 TextParser::~TextParser()

64 {

65 }

66

67 int TextParser::is_wordchar(char * w)

68 {

69 if (*w == '\0') return 0;

70 if (utf8) {

71 w_char wc;

72 unsigned short idx;

73 u8_u16(&wc, 1, w);

74 idx = (wc.h << 8) + wc.l;

75 return (unicodeisalpha(idx) \|\| (wordchars_utf16 && flag_bsearch( wordchars_utf16, ((unsigned short ) &wc), wclen)));

76 } else {

77 return wordcharacters[(*w + 256) % 256];

78 }

79 }

80

81 const char * TextParser::get_latin1(char * s)

82 {

83 if (s[0] == '&') {

84 unsigned int i = 0;

85 while ((i < LATIN1_LEN) &&

86 strncmp(LATIN1[i], s, strlen(LATIN1[i]))) i++;

87 if (i != LATIN1_LEN) return LATIN1[i];

88 }

89 return NULL;

90 }

91

92 void TextParser::init(const char * wordchars)

93 {

94 for (int i = 0; i < MAXPREVLINE; i++) {

95 line[i][0] = '\0';

96 }

97 actual = 0;

98 head = 0;

99 token = 0;

100 state = 0;

101 utf8 = 0;

102 checkurl = 0;

103 unsigned int j;

104 for (j = 0; j < 256; j++) {

105 wordcharacters[j] = 0;

106 }

107 if (!wordchars) wordchars = "qwertzuiopasdfghjklyxcvbnmQWERTZUIOPASDFGHJ KLYXCVBNM";

108 for (j = 0; j < strlen(wordchars); j++) {

109 wordcharacters[(wordchars[j] + 256) % 256] = 1;

110 }

111 }

112

113 void TextParser::init(unsigned short * wc, int len)

114 {

115 for (int i = 0; i < MAXPREVLINE; i++) {

116 line[i][0] = '\0';

117 }

118 actual = 0;

119 head = 0;

120 token = 0;

121 state = 0;

122 utf8 = 1;

123 checkurl = 0;

124 wordchars_utf16 = wc;

125 wclen = len;

126 }

127

128 int TextParser::next_char(char * line, int * pos) {

129 if ((line + pos) == '\0') return 1;

130 if (utf8) {

131 if ((line + pos) >> 7) {

132 // jump to next UTF-8 character

133 for((pos)++; ((line + pos) & 0xc0) == 0x80; (pos)++);

134 } else {

135 (*pos)++;

136 }

137 } else (*pos)++;

138 return 0;

139 }

140

141 void TextParser::put_line(char * word)

142 {

143 actual = (actual + 1) % MAXPREVLINE;

144 strcpy(line[actual], word);

145 token = 0;

146 head = 0;

147 check_urls();

148 }

149

150 char * TextParser::get_prevline(int n)

151 {

152 return mystrdup(line[(actual + MAXPREVLINE - n) % MAXPREVLINE]);

153 }

154

155 char * TextParser::get_line()

156 {

157 return get_prevline(0);

158 }

159

160 char * TextParser::next_token()

161 {

162 const char * latin1;

163

164 for (;;) {

165 switch (state)

166 {

167 case 0: // non word chars

168 if (is_wordchar(line[actual] + head)) {

169 state = 1;

170 token = head;

171 } else if ((latin1 = get_latin1(line[actual] + head))) {

172 state = 1;

173 token = head;

174 head += strlen(latin1);

175 }

176 break;

177 case 1: // wordchar

178 if ((latin1 = get_latin1(line[actual] + head))) {

179 head += strlen(latin1);

180 } else if (! is_wordchar(line[actual] + head)) {

181 state = 0;

182 char * t = alloc_token(token, &head);

183 if (t) return t;

184 }

185 break;

186 }

187 if (next_char(line[actual], &head)) return NULL;

188 }

189 }

190

191 int TextParser::get_tokenpos()

192 {

193 return token;

194 }

195

196 int TextParser::change_token(const char * word)

197 {

198 if (word) {

199 char * r = mystrdup(line[actual] + head);

200 strcpy(line[actual] + token, word);

201 strcat(line[actual], r);

202 head = token;

203 free(r);

204 return 1;

205 }

206 return 0;

207 }

208

209 void TextParser::check_urls()

210 {

211 int url_state = 0;

212 int url_head = 0;

213 int url_token = 0;

214 int url = 0;

215 for (;;) {

216 switch (url_state)

217 {

218 case 0: // non word chars

219 if (is_wordchar(line[actual] + url_head)) {

220 url_state = 1;

221 url_token = url_head;

222 // Unix path

223 } else if (*(line[actual] + url_head) == '/') {

224 url_state = 1;

225 url_token = url_head;

226 url = 1;

227 }

228 break;

229 case 1: // wordchar

230 char ch = *(line[actual] + url_head);

231 // e-mail address

232 if ((ch == '@') \|\|

233 // MS-DOS, Windows path

234 (strncmp(line[actual] + url_head, ":\\", 2) == 0) \|\|

235 // URL

236 (strncmp(line[actual] + url_head, "://", 3) == 0)) {

237 url = 1;

238 } else if (! (is_wordchar(line[actual] + url_head) \|\|

239 (ch == '-') \|\| (ch == '_') \|\| (ch == '\\') \|\|

240 (ch == '.') \|\| (ch == ':') \|\| (ch == '/') \|\|

241 (ch == '~') \|\| (ch == '%') \|\| (ch == '*') \|\|

242 (ch == '$') \|\| (ch == '[') \|\| (ch == ']') \|\|

243 (ch == '?') \|\| (ch == '!') \|\|

244 ((ch >= '0') && (ch <= '9')))) {

245 url_state = 0;

246 if (url == 1) {

247 for (int i = url_token; i < url_head; i+ +) {

248 *(urlline + i) = 1;

249 }

250 }

251 url = 0;

252 }

253 break;

254 }

255 *(urlline + url_head) = 0;

256 if (next_char(line[actual], &url_head)) return;

257 }

258 }

259

260 int TextParser::get_url(int token_pos, int * head)

261 {

262 for (int i = head; urlline[i] && (line[actual]+i); i++, (*head)++);

263 return checkurl ? 0 : urlline[token_pos];

264 }

265

266 void TextParser::set_url_checking(int check)

267 {

268 checkurl = check;

269 }

270

271

272 char * TextParser::alloc_token(int token, int * head)

273 {

274 if (get_url(token, head)) return NULL;

275 char * t = (char ) malloc(head - token + 1);

276 if (t) {

277 t[*head - token] = '\0';

278 strncpy(t, line[actual] + token, *head - token);

279 // remove colon for Finnish and Swedish language

280 if (t[*head - token - 1] == ':') {

281 t[*head - token - 1] = '\0';

282 if (!t[0]) {

283 free(t);

284 return NULL;

285 }

286 }

287 return t;

288 }

289 fprintf(stderr,"Error - Insufficient Memory\n");

290 return NULL;

291 }

OLD	NEW

« no previous file with comments | « third_party/hunspell_new/src/parsers/textparser.hxx ('k') | third_party/hunspell_new/tests/1463589.aff » ('j') | no next file with comments »