| OLD | NEW |
| (Empty) | |
| 1 /* ***** BEGIN LICENSE BLOCK ***** |
| 2 * Version: MPL 1.1/GPL 2.0/LGPL 2.1 |
| 3 * |
| 4 * The contents of this file are subject to the Mozilla Public License Version |
| 5 * 1.1 (the "License"); you may not use this file except in compliance with |
| 6 * the License. You may obtain a copy of the License at |
| 7 * http://www.mozilla.org/MPL/ |
| 8 * |
| 9 * Software distributed under the License is distributed on an "AS IS" basis, |
| 10 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License |
| 11 * for the specific language governing rights and limitations under the |
| 12 * License. |
| 13 * |
| 14 * The Original Code is Hunspell, based on MySpell. |
| 15 * |
| 16 * The Initial Developers of the Original Code are |
| 17 * Kevin Hendricks (MySpell) and Németh László (Hunspell). |
| 18 * Portions created by the Initial Developers are Copyright (C) 2002-2005 |
| 19 * the Initial Developers. All Rights Reserved. |
| 20 * |
| 21 * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno, |
| 22 * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád, |
| 23 * Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter, |
| 24 * Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls, |
| 25 * Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen |
| 26 * |
| 27 * Alternatively, the contents of this file may be used under the terms of |
| 28 * either the GNU General Public License Version 2 or later (the "GPL"), or |
| 29 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), |
| 30 * in which case the provisions of the GPL or the LGPL are applicable instead |
| 31 * of those above. If you wish to allow use of your version of this file only |
| 32 * under the terms of either the GPL or the LGPL, and not to allow others to |
| 33 * use your version of this file under the terms of the MPL, indicate your |
| 34 * decision by deleting the provisions above and replace them with the notice |
| 35 * and other provisions required by the GPL or the LGPL. If you do not delete |
| 36 * the provisions above, a recipient may use your version of this file under |
| 37 * the terms of any one of the MPL, the GPL or the LGPL. |
| 38 * |
| 39 * ***** END LICENSE BLOCK ***** */ |
| 40 |
| 41 #include <cstdlib> |
| 42 #include <cstring> |
| 43 #include <cstdio> |
| 44 #include <ctype.h> |
| 45 |
| 46 #include "../hunspell/csutil.hxx" |
| 47 #include "xmlparser.hxx" |
| 48 |
| 49 #ifndef W32 |
| 50 using namespace std; |
| 51 #endif |
| 52 |
// Tokenizer states used by XMLParser::next_token().
enum {
  ST_NON_WORD,     // between words, outside of markup
  ST_WORD,         // inside a word
  ST_TAG,          // inside markup opened by a pattern table entry
  ST_CHAR_ENTITY,  // inside a character entity, between '&' and ';'
  ST_OTHER_TAG,    // not referenced by the code in this part of the file
  ST_ATTRIB        // inside a quoted attribute value
};

// Default {opening, closing} delimiter pairs of markup recognized by
// next_token(). look_pattern() lower-cases the input before comparing, so
// every entry must be written in lower case. Rows are tried in order, which
// is why the generic "<" row comes last.
static const char* __PATTERN__[][2] = {{"<!--", "-->"},  // XML comment
                                       // CDATA section; the XML opener is
                                       // "<![CDATA[" (with the '!')
                                       {"<![cdata[", "]]>"},
                                       {"<", ">"}};  // any other tag

// Number of rows in __PATTERN__.
#define __PATTERN_LEN__ (sizeof(__PATTERN__) / (sizeof(char*) * 2))

// Secondary pattern table; empty for plain XML (length 0, never dereferenced).
static const char* (*__PATTERN2__)[2] = NULL;

#define __PATTERN_LEN2__ 0

// Apostrophe as an XML entity reference, as the UTF-8 encoding of U+2019
// (right single quotation mark), and as the plain ASCII character.
#define ENTITY_APOS "&apos;"
#define UTF8_APOS "\xe2\x80\x99"
#define APOSTROPHE "'"
| 68 |
| 69 XMLParser::XMLParser(const char* wordchars) |
| 70 : TextParser(wordchars) |
| 71 , pattern_num(0), pattern2_num(0), prevstate(0), checkattr(0), quotmark(0) { |
| 72 } |
| 73 |
| 74 XMLParser::XMLParser(const w_char* wordchars, int len) |
| 75 : TextParser(wordchars, len) |
| 76 , pattern_num(0), pattern2_num(0), prevstate(0), checkattr(0), quotmark(0) { |
| 77 } |
| 78 |
| 79 XMLParser::~XMLParser() {} |
| 80 |
| 81 int XMLParser::look_pattern(const char* p[][2], unsigned int len, int column) { |
| 82 for (unsigned int i = 0; i < len; i++) { |
| 83 const char* j = line[actual].c_str() + head; |
| 84 const char* k = p[i][column]; |
| 85 while ((*k != '\0') && (tolower(*j) == *k)) { |
| 86 j++; |
| 87 k++; |
| 88 } |
| 89 if (*k == '\0') |
| 90 return i; |
| 91 } |
| 92 return -1; |
| 93 } |
| 94 |
| 95 /* |
| 96 * XML parser |
| 97 * |
| 98 */ |
| 99 |
| 100 bool XMLParser::next_token(const char* PATTERN[][2], |
| 101 unsigned int PATTERN_LEN, |
| 102 const char* PATTERN2[][2], |
| 103 unsigned int PATTERN_LEN2, |
| 104 std::string& t) { |
| 105 t.clear(); |
| 106 const char* latin1; |
| 107 |
| 108 for (;;) { |
| 109 switch (state) { |
| 110 case ST_NON_WORD: // non word chars |
| 111 prevstate = ST_NON_WORD; |
| 112 if ((pattern_num = look_pattern(PATTERN, PATTERN_LEN, 0)) != -1) { |
| 113 checkattr = 0; |
| 114 if ((pattern2_num = look_pattern(PATTERN2, PATTERN_LEN2, 0)) != -1) { |
| 115 checkattr = 1; |
| 116 } |
| 117 state = ST_TAG; |
| 118 } else if (is_wordchar(line[actual].c_str() + head)) { |
| 119 state = ST_WORD; |
| 120 token = head; |
| 121 } else if ((latin1 = get_latin1(line[actual].c_str() + head))) { |
| 122 state = ST_WORD; |
| 123 token = head; |
| 124 head += strlen(latin1); |
| 125 } else if (line[actual][head] == '&') { |
| 126 state = ST_CHAR_ENTITY; |
| 127 } |
| 128 break; |
| 129 case ST_WORD: // wordchar |
| 130 if ((latin1 = get_latin1(line[actual].c_str() + head))) { |
| 131 head += strlen(latin1); |
| 132 } else if ((is_wordchar((char*)APOSTROPHE) || |
| 133 (is_utf8() && is_wordchar((char*)UTF8_APOS))) && |
| 134 strncmp(line[actual].c_str() + head, ENTITY_APOS, |
| 135 strlen(ENTITY_APOS)) == 0 && |
| 136 is_wordchar(line[actual].c_str() + head + strlen(ENTITY_APOS)
)) { |
| 137 head += strlen(ENTITY_APOS) - 1; |
| 138 } else if (is_utf8() && |
| 139 is_wordchar((char*)APOSTROPHE) && // add Unicode apostrophe |
| 140 // to the WORDCHARS, if |
| 141 // needed |
| 142 strncmp(line[actual].c_str() + head, UTF8_APOS, strlen(UTF8_A
POS)) == |
| 143 0 && |
| 144 is_wordchar(line[actual].c_str() + head + strlen(UTF8_APOS)))
{ |
| 145 head += strlen(UTF8_APOS) - 1; |
| 146 } else if (!is_wordchar(line[actual].c_str() + head)) { |
| 147 state = prevstate; |
| 148 if (alloc_token(token, &head, t)) |
| 149 return true; |
| 150 } |
| 151 break; |
| 152 case ST_TAG: // comment, labels, etc |
| 153 int i; |
| 154 if ((checkattr == 1) && |
| 155 ((i = look_pattern(PATTERN2, PATTERN_LEN2, 1)) != -1) && |
| 156 (strcmp(PATTERN2[i][0], PATTERN2[pattern2_num][0]) == 0)) { |
| 157 checkattr = 2; |
| 158 } else if ((checkattr > 0) && (line[actual][head] == '>')) { |
| 159 state = ST_NON_WORD; |
| 160 } else if (((i = look_pattern(PATTERN, PATTERN_LEN, 1)) != -1) && |
| 161 (strcmp(PATTERN[i][1], PATTERN[pattern_num][1]) == 0)) { |
| 162 state = ST_NON_WORD; |
| 163 head += strlen(PATTERN[pattern_num][1]) - 1; |
| 164 } else if ((strcmp(PATTERN[pattern_num][0], "<") == 0) && |
| 165 ((line[actual][head] == '"') || |
| 166 (line[actual][head] == '\''))) { |
| 167 quotmark = line[actual][head]; |
| 168 state = ST_ATTRIB; |
| 169 } |
| 170 break; |
| 171 case ST_ATTRIB: // non word chars |
| 172 prevstate = ST_ATTRIB; |
| 173 if (line[actual][head] == quotmark) { |
| 174 state = ST_TAG; |
| 175 if (checkattr == 2) |
| 176 checkattr = 1; |
| 177 // for IMG ALT |
| 178 } else if (is_wordchar(line[actual].c_str() + head) && (checkattr == 2))
{ |
| 179 state = ST_WORD; |
| 180 token = head; |
| 181 } else if (line[actual][head] == '&') { |
| 182 state = ST_CHAR_ENTITY; |
| 183 } |
| 184 break; |
| 185 case ST_CHAR_ENTITY: // SGML element |
| 186 if ((tolower(line[actual][head]) == ';')) { |
| 187 state = prevstate; |
| 188 head--; |
| 189 } |
| 190 } |
| 191 if (next_char(line[actual].c_str(), &head)) |
| 192 return false; |
| 193 } |
| 194 } |
| 195 |
| 196 bool XMLParser::next_token(std::string& t) { |
| 197 return next_token(__PATTERN__, __PATTERN_LEN__, __PATTERN2__, |
| 198 __PATTERN_LEN2__, t); |
| 199 } |
| 200 |
| 201 int XMLParser::change_token(const char* word) { |
| 202 if (strstr(word, APOSTROPHE) != NULL || strchr(word, '"') != NULL || |
| 203 strchr(word, '&') != NULL || strchr(word, '<') != NULL || |
| 204 strchr(word, '>') != NULL) { |
| 205 std::string r(word); |
| 206 mystrrep(r, "&", "__namp;__"); |
| 207 mystrrep(r, "__namp;__", "&"); |
| 208 mystrrep(r, APOSTROPHE, ENTITY_APOS); |
| 209 mystrrep(r, "\"", """); |
| 210 mystrrep(r, ">", ">"); |
| 211 mystrrep(r, "<", "<"); |
| 212 return TextParser::change_token(r.c_str()); |
| 213 } |
| 214 return TextParser::change_token(word); |
| 215 } |
| OLD | NEW |