Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(137)

Side by Side Diff: third_party/hunspell/src/parsers/xmlparser.cxx

Issue 2544793003: [spellcheck] Updated Hunspell to 1.5.4 (Closed)
Patch Set: Test Created 4 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « third_party/hunspell/src/parsers/xmlparser.hxx ('k') | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 /* ***** BEGIN LICENSE BLOCK *****
2 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
3 *
4 * The contents of this file are subject to the Mozilla Public License Version
5 * 1.1 (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 * http://www.mozilla.org/MPL/
8 *
9 * Software distributed under the License is distributed on an "AS IS" basis,
10 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
11 * for the specific language governing rights and limitations under the
12 * License.
13 *
14 * The Original Code is Hunspell, based on MySpell.
15 *
16 * The Initial Developers of the Original Code are
17 * Kevin Hendricks (MySpell) and Németh László (Hunspell).
18 * Portions created by the Initial Developers are Copyright (C) 2002-2005
19 * the Initial Developers. All Rights Reserved.
20 *
21 * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,
22 * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,
23 * Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,
24 * Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,
25 * Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen
26 *
27 * Alternatively, the contents of this file may be used under the terms of
28 * either the GNU General Public License Version 2 or later (the "GPL"), or
29 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
30 * in which case the provisions of the GPL or the LGPL are applicable instead
31 * of those above. If you wish to allow use of your version of this file only
32 * under the terms of either the GPL or the LGPL, and not to allow others to
33 * use your version of this file under the terms of the MPL, indicate your
34 * decision by deleting the provisions above and replace them with the notice
35 * and other provisions required by the GPL or the LGPL. If you do not delete
36 * the provisions above, a recipient may use your version of this file under
37 * the terms of any one of the MPL, the GPL or the LGPL.
38 *
39 * ***** END LICENSE BLOCK ***** */
40
41 #include <cstdlib>
42 #include <cstring>
43 #include <cstdio>
44 #include <ctype.h>
45
46 #include "../hunspell/csutil.hxx"
47 #include "xmlparser.hxx"
48
49 #ifndef W32
50 using namespace std;
51 #endif
52
// States of the tokenizer implemented by XMLParser::next_token().
enum {
  ST_NON_WORD,     // between words, outside of any markup
  ST_WORD,         // inside a word token
  ST_TAG,          // inside markup opened by one of the patterns below
  ST_CHAR_ENTITY,  // inside a character entity, after '&' and before ';'
  ST_OTHER_TAG,    // not referenced by next_token()
  ST_ATTRIB        // inside a quoted attribute value of a "<" tag
};

// Opening/closing delimiter pairs of the markup that next_token() skips.
// look_pattern() lower-cases the input before comparing, so every pattern
// must be written in lower case.  The first matching row wins, therefore
// the more specific openers have to precede the generic "<".
static const char* __PATTERN__[][2] = {{"<!--", "-->"},       // XML comment
                                       {"<![cdata[", "]]>"},  // CDATA section
                                       {"<", ">"}};           // any other tag

// Number of rows in __PATTERN__.
#define __PATTERN_LEN__ (sizeof(__PATTERN__) / (sizeof(char*) * 2))

// Second pattern table consulted by next_token(): a match at the start of a
// tag enables checking of quoted attribute values.  The plain XML parser
// has no such patterns, hence the null table with length 0.
static const char* (*__PATTERN2__)[2] = NULL;

#define __PATTERN_LEN2__ 0

#define ENTITY_APOS "&apos;"      // XML entity for the apostrophe
#define UTF8_APOS "\xe2\x80\x99"  // U+2019 (typographic apostrophe) in UTF-8
#define APOSTROPHE "'"            // ASCII apostrophe
68
69 XMLParser::XMLParser(const char* wordchars)
70 : TextParser(wordchars)
71 , pattern_num(0), pattern2_num(0), prevstate(0), checkattr(0), quotmark(0) {
72 }
73
74 XMLParser::XMLParser(const w_char* wordchars, int len)
75 : TextParser(wordchars, len)
76 , pattern_num(0), pattern2_num(0), prevstate(0), checkattr(0), quotmark(0) {
77 }
78
79 XMLParser::~XMLParser() {}
80
81 int XMLParser::look_pattern(const char* p[][2], unsigned int len, int column) {
82 for (unsigned int i = 0; i < len; i++) {
83 const char* j = line[actual].c_str() + head;
84 const char* k = p[i][column];
85 while ((*k != '\0') && (tolower(*j) == *k)) {
86 j++;
87 k++;
88 }
89 if (*k == '\0')
90 return i;
91 }
92 return -1;
93 }
94
95 /*
96 * XML parser
97 *
98 */
99
100 bool XMLParser::next_token(const char* PATTERN[][2],
101 unsigned int PATTERN_LEN,
102 const char* PATTERN2[][2],
103 unsigned int PATTERN_LEN2,
104 std::string& t) {
105 t.clear();
106 const char* latin1;
107
108 for (;;) {
109 switch (state) {
110 case ST_NON_WORD: // non word chars
111 prevstate = ST_NON_WORD;
112 if ((pattern_num = look_pattern(PATTERN, PATTERN_LEN, 0)) != -1) {
113 checkattr = 0;
114 if ((pattern2_num = look_pattern(PATTERN2, PATTERN_LEN2, 0)) != -1) {
115 checkattr = 1;
116 }
117 state = ST_TAG;
118 } else if (is_wordchar(line[actual].c_str() + head)) {
119 state = ST_WORD;
120 token = head;
121 } else if ((latin1 = get_latin1(line[actual].c_str() + head))) {
122 state = ST_WORD;
123 token = head;
124 head += strlen(latin1);
125 } else if (line[actual][head] == '&') {
126 state = ST_CHAR_ENTITY;
127 }
128 break;
129 case ST_WORD: // wordchar
130 if ((latin1 = get_latin1(line[actual].c_str() + head))) {
131 head += strlen(latin1);
132 } else if ((is_wordchar((char*)APOSTROPHE) ||
133 (is_utf8() && is_wordchar((char*)UTF8_APOS))) &&
134 strncmp(line[actual].c_str() + head, ENTITY_APOS,
135 strlen(ENTITY_APOS)) == 0 &&
136 is_wordchar(line[actual].c_str() + head + strlen(ENTITY_APOS) )) {
137 head += strlen(ENTITY_APOS) - 1;
138 } else if (is_utf8() &&
139 is_wordchar((char*)APOSTROPHE) && // add Unicode apostrophe
140 // to the WORDCHARS, if
141 // needed
142 strncmp(line[actual].c_str() + head, UTF8_APOS, strlen(UTF8_A POS)) ==
143 0 &&
144 is_wordchar(line[actual].c_str() + head + strlen(UTF8_APOS))) {
145 head += strlen(UTF8_APOS) - 1;
146 } else if (!is_wordchar(line[actual].c_str() + head)) {
147 state = prevstate;
148 if (alloc_token(token, &head, t))
149 return true;
150 }
151 break;
152 case ST_TAG: // comment, labels, etc
153 int i;
154 if ((checkattr == 1) &&
155 ((i = look_pattern(PATTERN2, PATTERN_LEN2, 1)) != -1) &&
156 (strcmp(PATTERN2[i][0], PATTERN2[pattern2_num][0]) == 0)) {
157 checkattr = 2;
158 } else if ((checkattr > 0) && (line[actual][head] == '>')) {
159 state = ST_NON_WORD;
160 } else if (((i = look_pattern(PATTERN, PATTERN_LEN, 1)) != -1) &&
161 (strcmp(PATTERN[i][1], PATTERN[pattern_num][1]) == 0)) {
162 state = ST_NON_WORD;
163 head += strlen(PATTERN[pattern_num][1]) - 1;
164 } else if ((strcmp(PATTERN[pattern_num][0], "<") == 0) &&
165 ((line[actual][head] == '"') ||
166 (line[actual][head] == '\''))) {
167 quotmark = line[actual][head];
168 state = ST_ATTRIB;
169 }
170 break;
171 case ST_ATTRIB: // non word chars
172 prevstate = ST_ATTRIB;
173 if (line[actual][head] == quotmark) {
174 state = ST_TAG;
175 if (checkattr == 2)
176 checkattr = 1;
177 // for IMG ALT
178 } else if (is_wordchar(line[actual].c_str() + head) && (checkattr == 2)) {
179 state = ST_WORD;
180 token = head;
181 } else if (line[actual][head] == '&') {
182 state = ST_CHAR_ENTITY;
183 }
184 break;
185 case ST_CHAR_ENTITY: // SGML element
186 if ((tolower(line[actual][head]) == ';')) {
187 state = prevstate;
188 head--;
189 }
190 }
191 if (next_char(line[actual].c_str(), &head))
192 return false;
193 }
194 }
195
196 bool XMLParser::next_token(std::string& t) {
197 return next_token(__PATTERN__, __PATTERN_LEN__, __PATTERN2__,
198 __PATTERN_LEN2__, t);
199 }
200
201 int XMLParser::change_token(const char* word) {
202 if (strstr(word, APOSTROPHE) != NULL || strchr(word, '"') != NULL ||
203 strchr(word, '&') != NULL || strchr(word, '<') != NULL ||
204 strchr(word, '>') != NULL) {
205 std::string r(word);
206 mystrrep(r, "&", "__namp;__");
207 mystrrep(r, "__namp;__", "&amp;");
208 mystrrep(r, APOSTROPHE, ENTITY_APOS);
209 mystrrep(r, "\"", "&quot;");
210 mystrrep(r, ">", "&gt;");
211 mystrrep(r, "<", "&lt;");
212 return TextParser::change_token(r.c_str());
213 }
214 return TextParser::change_token(word);
215 }
OLDNEW
« no previous file with comments | « third_party/hunspell/src/parsers/xmlparser.hxx ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698