third_party/hunspell/src/parsers/textparser.cxx - Issue 2544793003: [spellcheck] Updated Hunspell to 1.5.4

Side by Side Diff: third_party/hunspell/src/parsers/textparser.cxx

Issue 2544793003: [spellcheck] Updated Hunspell to 1.5.4 (Closed)

Patch Set: Test Created 4 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
	1 /* *** BEGIN LICENSE BLOCK ***

	2 * Version: MPL 1.1/GPL 2.0/LGPL 2.1

	3 *

	4 * The contents of this file are subject to the Mozilla Public License Version

	5 * 1.1 (the "License"); you may not use this file except in compliance with

	6 * the License. You may obtain a copy of the License at

	7 * http://www.mozilla.org/MPL/

	8 *

	9 * Software distributed under the License is distributed on an "AS IS" basis,

	10 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License

	11 * for the specific language governing rights and limitations under the

	12 * License.

	13 *

	14 * The Original Code is Hunspell, based on MySpell.

	15 *

	16 * The Initial Developers of the Original Code are

	17 * Kevin Hendricks (MySpell) and Németh László (Hunspell).

	18 * Portions created by the Initial Developers are Copyright (C) 2002-2005

	19 * the Initial Developers. All Rights Reserved.

	20 *

	21 * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,

	22 * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,

	23 * Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,

	24 * Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,

	25 * Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen

	26 *

	27 * Alternatively, the contents of this file may be used under the terms of

	28 * either the GNU General Public License Version 2 or later (the "GPL"), or

	29 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),

	30 * in which case the provisions of the GPL or the LGPL are applicable instead

	31 * of those above. If you wish to allow use of your version of this file only

	32 * under the terms of either the GPL or the LGPL, and not to allow others to

	33 * use your version of this file under the terms of the MPL, indicate your

	34 * decision by deleting the provisions above and replace them with the notice

	35 * and other provisions required by the GPL or the LGPL. If you do not delete

	36 * the provisions above, a recipient may use your version of this file under

	37 * the terms of any one of the MPL, the GPL or the LGPL.

	38 *

	39 * *** END LICENSE BLOCK *** */

	40

1 #include <cstdlib>	41 #include <cstdlib>

2 #include <cstring>	42 #include <cstring>

3 #include <cstdio>	43 #include <cstdio>

4 #include <ctype.h>	44 #include <ctype.h>

5	45

6 #include "../hunspell/csutil.hxx"	46 #include "../hunspell/csutil.hxx"

7 #include "textparser.hxx"	47 #include "textparser.hxx"

8	48

	49 #include <algorithm>

	50

9 #ifndef W32	51 #ifndef W32

10 using namespace std;	52 using namespace std;

11 #endif	53 #endif

12	54

13 // ISO-8859-1 HTML character entities	55 // ISO-8859-1 HTML character entities

14	56

15 static const char * LATIN1[] = {	57 static const char* LATIN1[] = {

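16 » "&Agrave;",	58 "&Agrave;", "&Atilde;", "&Aring;", "&AElig;", "&Egrave;", "&Ecirc;",

17 » "&Atilde;",	59 "&Igrave;", "&Iuml;", "&ETH;", "&Ntilde;", "&Ograve;", "&Oslash;",

18 » "&Aring;",	60 "&Ugrave;", "&THORN;", "&agrave;", "&atilde;", "&aring;", "&aelig;",

19 » "&AElig;",	61 "&egrave;", "&ecirc;", "&igrave;", "&iuml;", "&eth;", "&ntilde;",

20 » "&Egrave;",	62 "&ograve;", "&oslash;", "&ugrave;", "&thorn;", "&yuml;"};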

21 » "&Ecirc;",	63

22 » "&Igrave;",	64 #define LATIN1_LEN (sizeof(LATIN1) / sizeof(char*))

23 » "&Iuml;",	65

24 » "&ETH;",	66 #define ENTITY_APOS "&apos;"

25 » "&Ntilde;",	67 #define UTF8_APOS "\xe2\x80\x99"

26 » "&Ograve;",	68 #define APOSTROPHE "'"

27 » "&Oslash;",	69

28 » "&Ugrave;",	70 TextParser::TextParser(const char* wordchars) {

29 » "&THORN;",	71 init(wordchars);

30 » "&agrave;",	72 }

31 » "&atilde;",	73

32 » "&aring;",	74 TextParser::TextParser(const w_char* wordchars, int len) {

33 » "&aelig;",	75 init(wordchars, len);

34 » "&egrave;",	76 }

35 » "&ecirc;",	77

36 » "&igrave;",	78 TextParser::~TextParser() {}

37 » "&iuml;",	79

38 » "&eth;",	80 int TextParser::is_wordchar(const char* w) {

39 » "&ntilde;",	81 if (*w == '\0')

40 » "&ograve;",	82 return 0;

41 » "&oslash;",	83 if (utf8) {

42 » "&ugrave;",	84 std::vector<w_char> wc;

43 » "&thorn;",	85 unsigned short idx;

44 » "&yuml;"	86 u8_u16(wc, w);

45 };	87 if (wc.empty())

46	88 return 0;

47 #define LATIN1_LEN (sizeof(LATIN1) / sizeof(char *))	89 idx = (wc[0].h << 8) + wc[0].l;

48	90 return (unicodeisalpha(idx) ||

49 TextParser::TextParser() {	91 (wordchars_utf16 &&

50 » init((char *) NULL);	92 std::binary_search(wordchars_utf16, wordchars_utf16 + wclen, wc[0]) ));

51 }	93 } else {

52	94 return wordcharacters[(*w + 256) % 256];

53 TextParser::TextParser(const char * wordchars)	95 }

54 {	96 }

55 » init(wordchars);	97

56 }	98 const char* TextParser::get_latin1(const char* s) {

57	99 if (s[0] == '&') {

58 TextParser::TextParser(unsigned short * wordchars, int len)	100 unsigned int i = 0;

59 {	101 while ((i < LATIN1_LEN) && strncmp(LATIN1[i], s, strlen(LATIN1[i])))

60 » init(wordchars, len);	102 i++;

61 }	103 if (i != LATIN1_LEN)

62	104 return LATIN1[i];

63 TextParser::~TextParser()	105 }

64 {	106 return NULL;

65 }	107 }

66	108

67 int TextParser::is_wordchar(char * w)	109 void TextParser::init(const char* wordchars) {

68 {	110 actual = 0;

69 if (*w == '\0') return 0;	111 head = 0;

70 » if (utf8) {	112 token = 0;

71 w_char wc;	113 state = 0;

72 unsigned short idx;	114 utf8 = 0;

73 » » u8_u16(&wc, 1, w);	115 checkurl = 0;

74 idx = (wc.h << 8) + wc.l;	116 wordchars_utf16 = NULL;

75 return (unicodeisalpha(idx) || (wordchars_utf16 && flag_bsearch(wordchars_utf16, *((unsigned short *) &wc), wclen)));	117 wclen = 0;

76 } else {	118 unsigned int j;

77 » » return wordcharacters[(*w + 256) % 256];	119 for (j = 0; j < 256; j++) {

78 » }	120 wordcharacters[j] = 0;

79 }	121 }

80	122 if (!wordchars)

81 const char * TextParser::get_latin1(char * s)	123 wordchars = "qwertzuiopasdfghjklyxcvbnmQWERTZUIOPASDFGHJKLYXCVBNM";

82 {	124 for (j = 0; j < strlen(wordchars); j++) {

83 » if (s[0] == '&') {	125 wordcharacters[(wordchars[j] + 256) % 256] = 1;

84 » » unsigned int i = 0;	126 }

85 » » while ((i < LATIN1_LEN) &&	127 }

86 » » » strncmp(LATIN1[i], s, strlen(LATIN1[i]))) i++;	128

87 » » if (i != LATIN1_LEN) return LATIN1[i];	129 void TextParser::init(const w_char* wc, int len) {

88 » }	130 actual = 0;

89 » return NULL;	131 head = 0;

90 }	132 token = 0;

91	133 state = 0;

92 void TextParser::init(const char * wordchars)	134 utf8 = 1;

93 {	135 checkurl = 0;

94 » for (int i = 0; i < MAXPREVLINE; i++) {	136 wordchars_utf16 = wc;

95 » » line[i][0] = '\0';	137 wclen = len;

96 » }	138 }

97 » actual = 0;	139

98 » head = 0;	140 int TextParser::next_char(const char* ln, size_t* pos) {

99 » token = 0;	141 if (*(ln + *pos) == '\0')

100 » state = 0;	142 return 1;

101 utf8 = 0;	143 if (utf8) {

102 checkurl = 0;	144 if (*(ln + *pos) >> 7) {

103 » unsigned int j;	145 // jump to next UTF-8 character

104 » for (j = 0; j < 256; j++) {	146 for ((*pos)++; (*(ln + *pos) & 0xc0) == 0x80; (*pos)++)

105 » » wordcharacters[j] = 0;	147 ;

106 » }	148 } else {

107 if (!wordchars) wordchars = "qwertzuiopasdfghjklyxcvbnmQWERTZUIOPASDFGHJKLYXCVBNM";	149 (*pos)++;

108 » for (j = 0; j < strlen(wordchars); j++) {	150 }

109 » » wordcharacters[(wordchars[j] + 256) % 256] = 1;	151 } else

110 » }	152 (*pos)++;

111 }	153 return 0;

112	154 }

113 void TextParser::init(unsigned short * wc, int len)	155

114 {	156 void TextParser::put_line(const char* word) {

115 » for (int i = 0; i < MAXPREVLINE; i++) {	157 actual = (actual + 1) % MAXPREVLINE;

116 » » line[i][0] = '\0';	158 line[actual].assign(word);

117 » }	159 token = 0;

118 » actual = 0;	160 head = 0;

119 » head = 0;	161 check_urls();

120 » token = 0;	162 }

121 » state = 0;	163

122 » utf8 = 1;	164 std::string TextParser::get_prevline(int n) const {

123 » checkurl = 0;	165 return line[(actual + MAXPREVLINE - n) % MAXPREVLINE];

124 wordchars_utf16 = wc;	166 }

125 wclen = len;	167

126 }	168 std::string TextParser::get_line() const {

127	169 return get_prevline(0);

128 int TextParser::next_char(char * line, int * pos) {	170 }

129 if (*(line + *pos) == '\0') return 1;	171

130 » if (utf8) {	172 bool TextParser::next_token(std::string &t) {

131 if (*(line + *pos) >> 7) {	173 const char* latin1;

132 // jump to next UTF-8 character	174

133 for((*pos)++; (*(line + *pos) & 0xc0) == 0x80; (*pos)++);	175 for (;;) {

134 } else {	176 switch (state) {

135 (*pos)++;	177 case 0: // non word chars

	178 if (is_wordchar(line[actual].c_str() + head)) {

	179 state = 1;

	180 token = head;

	181 } else if ((latin1 = get_latin1(line[actual].c_str() + head))) {

	182 state = 1;

	183 token = head;

	184 head += strlen(latin1);

	185 }

	186 break;

	187 case 1: // wordchar

	188 if ((latin1 = get_latin1(line[actual].c_str() + head))) {

	189 head += strlen(latin1);

	190 } else if ((is_wordchar((char*)APOSTROPHE) ||

	191 (is_utf8() && is_wordchar((char*)UTF8_APOS))) &&

	192 !line[actual].empty() && line[actual][head] == '\'' &&

	193 is_wordchar(line[actual].c_str() + head + 1)) {

	194 head++;

	195 } else if (is_utf8() &&

	196 is_wordchar((char*)APOSTROPHE) && // add Unicode apostrophe

	197 // to the WORDCHARS, if

	198 // needed

	199 strncmp(line[actual].c_str() + head, UTF8_APOS, strlen(UTF8_APOS)) ==

	200 0 &&

	201 is_wordchar(line[actual].c_str() + head + strlen(UTF8_APOS))) {

	202 head += strlen(UTF8_APOS) - 1;

	203 } else if (!is_wordchar(line[actual].c_str() + head)) {

	204 state = 0;

	205 if (alloc_token(token, &head, t))

	206 return true;

	207 }

	208 break;

	209 }

	210 if (next_char(line[actual].c_str(), &head))

	211 return false;

	212 }

	213 }

	214

	215 size_t TextParser::get_tokenpos() {

	216 return token;

	217 }

	218

	219 int TextParser::change_token(const char* word) {

	220 if (word) {

	221 std::string remainder(line[actual].substr(head));

	222 line[actual].resize(token);

	223 line[actual].append(word);

	224 line[actual].append(remainder);

	225 head = token;

	226 return 1;

	227 }

	228 return 0;

	229 }

	230

	231 void TextParser::check_urls() {

	232 urlline.resize(line[actual].size() + 1);

	233 int url_state = 0;

	234 size_t url_head = 0;

	235 size_t url_token = 0;

	236 int url = 0;

	237 for (;;) {

	238 switch (url_state) {

	239 case 0: // non word chars

	240 if (is_wordchar(line[actual].c_str() + url_head)) {

	241 url_state = 1;

	242 url_token = url_head;

	243 // Unix path

	244 } else if (line[actual][url_head] == '/') {

	245 url_state = 1;

	246 url_token = url_head;

	247 url = 1;

	248 }

	249 break;

	250 case 1: // wordchar

	251 char ch = line[actual][url_head];

	252 // e-mail address

	253 if ((ch == '@') ||

	254 // MS-DOS, Windows path

	255 (strncmp(line[actual].c_str() + url_head, ":\\", 2) == 0) ||

	256 // URL

	257 (strncmp(line[actual].c_str() + url_head, "://", 3) == 0)) {

	258 url = 1;

	259 } else if (!(is_wordchar(line[actual].c_str() + url_head) || (ch == '-') ||

	260 (ch == '_') || (ch == '\\') || (ch == '.') ||

	261 (ch == ':') || (ch == '/') || (ch == '~') || (ch == '%') ||

	262 (ch == '*') || (ch == '$') || (ch == '[') || (ch == ']') ||

	263 (ch == '?') || (ch == '!') ||

	264 ((ch >= '0') && (ch <= '9')))) {

	265 url_state = 0;

	266 if (url == 1) {

	267 for (size_t i = url_token; i < url_head; ++i) {

	268 urlline[i] = true;

136 }	269 }

137 } else (*pos)++;	270 }

138 return 0;	271 url = 0;

139 }	272 }

140	273 break;

141 void TextParser::put_line(char * word)	274 }

142 {	275 urlline[url_head] = false;

143 » actual = (actual + 1) % MAXPREVLINE;	276 if (next_char(line[actual].c_str(), &url_head))

144 » strcpy(line[actual], word);	277 return;

145 » token = 0;	278 }

146 » head = 0;	279 }

147 » check_urls();	280

148 }	281 int TextParser::get_url(size_t token_pos, size_t* hd) {

149	282 for (size_t i = *hd; i < line[actual].size() && urlline[i]; i++, (*hd)++)

150 char * TextParser::get_prevline(int n)	283 ;

151 {	284 return checkurl ? 0 : urlline[token_pos];

152 » return mystrdup(line[(actual + MAXPREVLINE - n) % MAXPREVLINE]);	285 }

153 }	286

154	287 void TextParser::set_url_checking(int check) {

155 char * TextParser::get_line()	288 checkurl = check;

156 {	289 }

157 » return get_prevline(0);	290

158 }	291 bool TextParser::alloc_token(size_t tokn, size_t* hd, std::string& t) {

159	292 size_t url_head = *hd;

160 char * TextParser::next_token()	293 if (get_url(tokn, &url_head))

161 {	294 return false;

162 » const char * latin1;	295 t = line[actual].substr(tokn, *hd - tokn);

163 »	296 // remove colon for Finnish and Swedish language

164 » for (;;) {	297 if (!t.empty() && t[t.size() - 1] == ':') {

165 » » switch (state)	298 t.resize(t.size() - 1);

166 » » {	299 if (t.empty()) {

167 » » case 0: // non word chars	300 return false;

168 » » » if (is_wordchar(line[actual] + head)) {	301 }

169 » » » » state = 1;	302 }

170 » » » » token = head;	303 return true;

171 » » » } else if ((latin1 = get_latin1(line[actual] + head))) {	304 }

172 » » » » state = 1;

173 » » » » token = head;

174 » » » » head += strlen(latin1);

175 » » » }

176 » » » break;

177 » » case 1: // wordchar

178 » » » if ((latin1 = get_latin1(line[actual] + head))) {

179 » » » » head += strlen(latin1);

180 » » » } else if (! is_wordchar(line[actual] + head)) {

181 » » » » state = 0;

182 » » » » char * t = alloc_token(token, &head);

183 » » » » if (t) return t;

184 » » » }

185 » » » break;

186 » » }

187 if (next_char(line[actual], &head)) return NULL;

188 » }

189 }

190

191 int TextParser::get_tokenpos()

192 {

193 » return token;

194 }

195

196 int TextParser::change_token(const char * word)

197 {

198 » if (word) {

199 » » char * r = mystrdup(line[actual] + head);

200 » » strcpy(line[actual] + token, word);

201 » » strcat(line[actual], r);

202 » » head = token;

203 » » free(r);

204 » » return 1;

205 » }

206 » return 0;

207 }

208

209 void TextParser::check_urls()

210 {

211 » int url_state = 0;

212 » int url_head = 0;

213 » int url_token = 0;

214 » int url = 0;

215 » for (;;) {

216 » » switch (url_state)

217 » » {

218 » » case 0: // non word chars

219 » » » if (is_wordchar(line[actual] + url_head)) {

220 » » » » url_state = 1;

221 » » » » url_token = url_head;

222 » » » // Unix path

223 » » » } else if (*(line[actual] + url_head) == '/') {

224 » » » » url_state = 1;

225 » » » » url_token = url_head;

226 » » » » url = 1;

227 » » » }

228 » » » break;

229 » » case 1: // wordchar

230 » » » char ch = *(line[actual] + url_head);

231 » » » // e-mail address

232 » » » if ((ch == '@') ||

233 » » » // MS-DOS, Windows path

234 » » » (strncmp(line[actual] + url_head, ":\\", 2) == 0) ||

235 » » » // URL

236 » » » (strncmp(line[actual] + url_head, "://", 3) == 0)) {

237 » » » » url = 1;

238 » » » } else if (! (is_wordchar(line[actual] + url_head) ||

239 » » » (ch == '-') || (ch == '_') || (ch == '\\') ||

240 » » » (ch == '.') || (ch == ':') || (ch == '/') ||

241 » » » (ch == '~') || (ch == '%') || (ch == '*') ||

242 » » » (ch == '$') || (ch == '[') || (ch == ']') ||

243 » » » (ch == '?') || (ch == '!') ||

244 » » » ((ch >= '0') && (ch <= '9')))) {

245 » » » » url_state = 0;

246 » » » » if (url == 1) {

247 » » » » » for (int i = url_token; i < url_head; i++) {

248 » » » » » » *(urlline + i) = 1;

249 » » » » » }

250 » » » » }

251 » » » » url = 0;

252 » » » }

253 » » » break;

254 » » }

255 » » *(urlline + url_head) = 0;

256 if (next_char(line[actual], &url_head)) return;

257 » }

258 }

259

260 int TextParser::get_url(int token_pos, int * head)

261 {

262 » for (int i = *head; urlline[i] && *(line[actual]+i); i++, (*head)++);

263 » return checkurl ? 0 : urlline[token_pos];

264 }

265

266 void TextParser::set_url_checking(int check)

267 {

268 » checkurl = check;

269 }

270

271

272 char * TextParser::alloc_token(int token, int * head)

273 {

274 if (get_url(token, head)) return NULL;

275 char * t = (char *) malloc(*head - token + 1);

276 if (t) {

277 t[*head - token] = '\0';

278 strncpy(t, line[actual] + token, *head - token);

279 » // remove colon for Finnish and Swedish language

280 if (t[*head - token - 1] == ':') {

281 » t[*head - token - 1] = '\0';

282 » if (!t[0]) {

283 » » free(t);

284 » » return NULL;

285 » }

286 » }

287 return t;

288 }

289 fprintf(stderr,"Error - Insufficient Memory\n");

290 return NULL;

291 }

OLD	NEW

« no previous file with comments | « third_party/hunspell/src/parsers/textparser.hxx ('k') | third_party/hunspell/src/parsers/xmlparser.hxx » ('j') | no next file with comments »