| OLD | NEW |
| 1 /* ***** BEGIN LICENSE BLOCK ***** |
| 2 * Version: MPL 1.1/GPL 2.0/LGPL 2.1 |
| 3 * |
| 4 * The contents of this file are subject to the Mozilla Public License Version |
| 5 * 1.1 (the "License"); you may not use this file except in compliance with |
| 6 * the License. You may obtain a copy of the License at |
| 7 * http://www.mozilla.org/MPL/ |
| 8 * |
| 9 * Software distributed under the License is distributed on an "AS IS" basis, |
| 10 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License |
| 11 * for the specific language governing rights and limitations under the |
| 12 * License. |
| 13 * |
| 14 * The Original Code is Hunspell, based on MySpell. |
| 15 * |
| 16 * The Initial Developers of the Original Code are |
| 17 * Kevin Hendricks (MySpell) and Németh László (Hunspell). |
| 18 * Portions created by the Initial Developers are Copyright (C) 2002-2005 |
| 19 * the Initial Developers. All Rights Reserved. |
| 20 * |
| 21 * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno, |
| 22 * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád, |
| 23 * Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter, |
| 24 * Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls, |
| 25 * Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen |
| 26 * |
| 27 * Alternatively, the contents of this file may be used under the terms of |
| 28 * either the GNU General Public License Version 2 or later (the "GPL"), or |
| 29 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), |
| 30 * in which case the provisions of the GPL or the LGPL are applicable instead |
| 31 * of those above. If you wish to allow use of your version of this file only |
| 32 * under the terms of either the GPL or the LGPL, and not to allow others to |
| 33 * use your version of this file under the terms of the MPL, indicate your |
| 34 * decision by deleting the provisions above and replace them with the notice |
| 35 * and other provisions required by the GPL or the LGPL. If you do not delete |
| 36 * the provisions above, a recipient may use your version of this file under |
| 37 * the terms of any one of the MPL, the GPL or the LGPL. |
| 38 * |
| 39 * ***** END LICENSE BLOCK ***** */ |
| 40 |
| 1 #include <cstdlib> | 41 #include <cstdlib> |
| 2 #include <cstring> | 42 #include <cstring> |
| 3 #include <cstdio> | 43 #include <cstdio> |
| 4 #include <ctype.h> | 44 #include <ctype.h> |
| 5 | 45 |
| 6 #include "../hunspell/csutil.hxx" | 46 #include "../hunspell/csutil.hxx" |
| 7 #include "htmlparser.hxx" | 47 #include "htmlparser.hxx" |
| 8 | 48 |
| 9 | |
| 10 #ifndef W32 | 49 #ifndef W32 |
| 11 using namespace std; | 50 using namespace std; |
| 12 #endif | 51 #endif |
| 13 | 52 |
// Side-by-side rows: left column = old revision, right column = new revision.
//
// Old column, first row: ids for the `state` values that the old
// HTMLParser::next_token() switches on. ST_OTHER_TAG has no case in that switch.
//
// PATTERN (new revision first, old revision further down): each row is
// {opening string, closing string}. The old next_token() enters ST_TAG when
// the input matches column 0 and returns to ST_NON_WORD when it matches the
// closing string of that same row. Entries are lowercase because the old
// look_pattern() lowercases the input before comparing. look_pattern() returns
// the first matching row, so the catch-all {"<", ">"} has to stay last.
| 14 enum { ST_NON_WORD, ST_WORD, ST_TAG, ST_CHAR_ENTITY, ST_OTHER_TAG, ST_ATTRIB }; | 53 static const char* PATTERN[][2] = {{"<script", "</script>"}, |
| 54 {"<style", "</style>"}, |
| 55 {"<code", "</code>"}, |
| 56 {"<samp", "</samp>"}, |
| 57 {"<kbd", "</kbd>"}, |
| 58 {"<var", "</var>"}, |
| 59 {"<listing", "</listing>"}, |
| 60 {"<address", "</address>"}, |
| 61 {"<pre", "</pre>"}, |
| 62 {"<!--", "-->"}, |
// NOTE(review): an XML CDATA section opens with "<![CDATA[", but this entry has
// no '!' after '<' - confirm whether "<![cdata[" was intended.
| 63 {"<[cdata[", "]]>"}, // XML comment |
| 64 {"<", ">"}}; |
| 15 | 65 |
// PATTERN_LEN (new column of the next row): number of rows in PATTERN, i.e. the
// array size divided by the size of one row (two char pointers).
| 16 static const char * PATTERN[][2] = { | 66 #define PATTERN_LEN (sizeof(PATTERN) / (sizeof(char*) * 2)) |
| 17 » { "<script", "</script>" }, | |
| 18 » { "<style", "</style>" }, | |
| 19 » { "<code", "</code>" }, | |
| 20 » { "<samp", "</samp>" }, | |
| 21 » { "<kbd", "</kbd>" }, | |
| 22 » { "<var", "</var>" }, | |
| 23 » { "<listing", "</listing>" }, | |
| 24 » { "<address", "</address>" }, | |
| 25 » { "<pre", "</pre>" }, | |
| 26 » { "<!--", "-->" }, | |
// NOTE(review): same entry as in the new revision - the '!' of "<![CDATA[" is
// missing here as well; confirm.
| 27 » { "<[cdata[", "]]>" }, // XML comment | |
| 28 » { "<", ">" } | |
| 29 }; | |
| 30 | 67 |
// Side-by-side rows: left column = old revision, right column = new revision.
//
// PATTERN_LEN / PATTERN_LEN2: row counts of PATTERN / PATTERN2 (array size
// divided by the size of one row of two char pointers).
//
// PATTERN2: each row is {tag opening, attribute prefix}. In the old
// next_token(), a column-0 match when a tag starts sets checkattr to 1; a later
// column-1 match belonging to the same tag sets checkattr to 2, which is the
// only condition under which words inside a quoted attribute value are
// returned. Entries are lowercase because the old look_pattern() lowercases the
// input before comparing.
| 31 #define PATTERN_LEN (sizeof(PATTERN) / (sizeof(char *) * 2)) | 68 static const char* PATTERN2[][2] = { |
| 69 {"<img", "alt="}, // ALT and TITLE attrib handled spec. |
| 70 {"<img", "title="}, |
| 71 {"<a ", "title="}}; |
| 32 | 72 |
| 33 static const char * PATTERN2[][2] = { | 73 #define PATTERN_LEN2 (sizeof(PATTERN2) / (sizeof(char*) * 2)) |
| 34 » { "<img", "alt=" }, // ALT and TITLE attrib handled spec. | |
| 35 » { "<img", "title=" }, | |
| 36 » { "<a ", "title=" } | |
| 37 }; | |
| 38 | 74 |
// Side-by-side rows: left column = old revision, right column = new revision.
//
// Old column, first row: PATTERN_LEN2 is the number of rows in PATTERN2.
//
// Constructors: both overloads only pass their arguments on. The old revision
// calls init() from the constructor body; the new revision forwards the same
// arguments to the XMLParser base-class constructor and has an empty body.
// The second overload's first parameter changes from `unsigned short *` to
// `const w_char*`.
| 39 #define PATTERN_LEN2 (sizeof(PATTERN2) / (sizeof(char *) * 2)) | 75 HTMLParser::HTMLParser(const char* wordchars) |
| 40 | 76 : XMLParser(wordchars) { |
| 41 HTMLParser::HTMLParser(const char * wordchars) | |
| 42 { | |
| 43 » init(wordchars); | |
| 44 } | 77 } |
| 45 | 78 |
| 46 HTMLParser::HTMLParser(unsigned short * wordchars, int len) | 79 HTMLParser::HTMLParser(const w_char* wordchars, int len) |
| 47 { | 80 : XMLParser(wordchars, len) { |
| 48 » init(wordchars, len); | |
| 49 } | 81 } |
| 50 | 82 |
// Side-by-side rows: left column = old revision, right column = new revision.
//
// New next_token(std::string& t): delegates to XMLParser::next_token(), passing
// the HTML-specific PATTERN and PATTERN2 tables with their row counts, and
// returns that call's bool result unchanged.
// NOTE(review): presumably the token text is delivered through `t` - confirm
// against XMLParser::next_token().
//
// The destructor is empty in both revisions.
| 51 HTMLParser::~HTMLParser() | 83 bool HTMLParser::next_token(std::string& t) { |
| 52 { | 84 return XMLParser::next_token(PATTERN, PATTERN_LEN, PATTERN2, PATTERN_LEN2, t); |
| 53 } | 85 } |
| 54 | 86 |
| 55 | 87 HTMLParser::~HTMLParser() {} |
| 56 int HTMLParser::look_pattern(const char * p[][2], unsigned int len, int column) | |
| 57 { | |
| 58 » for (unsigned int i = 0; i < len; i++) { | |
| 59 » » char * j = line[actual] + head; | |
| 60 » » const char * k = p[i][column]; | |
| 61 » » while ((*k != '\0') && (tolower(*j) == *k)) { | |
| 62 » » » j++; | |
| 63 » » » k++; | |
| 64 » » } | |
| 65 » » if (*k == '\0') return i; | |
| 66 » } | |
| 67 » return -1; | |
| 68 } | |
| 69 | |
| 70 /* | |
| 71 * HTML parser | |
| 72 * | |
| 73 */ | |
| 74 | |
| 75 | |
| 76 char * HTMLParser::next_token() | |
| 77 { | |
| 78 » const char * latin1; | |
| 79 | |
| 80 » for (;;) { | |
| 81 » » //fprintf(stderr, "%d:%c:%s\n", state, line[actual][head], line[
actual]); | |
| 82 » » //getch(); | |
| 83 » » switch (state) | |
| 84 » » { | |
| 85 » » case ST_NON_WORD: // non word chars | |
| 86 » » » prevstate = ST_NON_WORD; | |
| 87 » » » if ((pattern_num = look_pattern(PATTERN, PATTERN_LEN, 0)
) != -1) { | |
| 88 » » » » checkattr = 0; | |
| 89 » » » » if ((pattern2_num = look_pattern(PATTERN2, PATTE
RN_LEN2, 0)) != -1) { | |
| 90 » » » » » checkattr = 1; | |
| 91 » » » » } | |
| 92 » » » » state = ST_TAG; | |
| 93 » » » } else if (is_wordchar(line[actual] + head)) { | |
| 94 » » » » state = ST_WORD; | |
| 95 » » » » token = head; | |
| 96 » » » } else if ((latin1 = get_latin1(line[actual] + head))) { | |
| 97 » » » » state = ST_WORD; | |
| 98 » » » » token = head; | |
| 99 » » » » head += strlen(latin1); | |
| 100 » » » } else if (line[actual][head] == '&') { | |
| 101 » » » » state = ST_CHAR_ENTITY; | |
| 102 » » » } » » » | |
| 103 » » » break; | |
| 104 » » case ST_WORD: // wordchar | |
| 105 » » » if ((latin1 = get_latin1(line[actual] + head))) { | |
| 106 » » » » head += strlen(latin1); | |
| 107 » » » } else if (! is_wordchar(line[actual] + head)) { | |
| 108 » » » » state = prevstate; | |
| 109 » » » » char * t = alloc_token(token, &head); | |
| 110 » » » » if (t) return t; | |
| 111 » » » } | |
| 112 » » » break; | |
| 113 » » case ST_TAG: // comment, labels, etc | |
| 114 » » » int i; | |
| 115 » » » if ((checkattr == 1) && ((i = look_pattern(PATTERN2, PAT
TERN_LEN2, 1)) != -1) | |
| 116 » » » » && (strcmp(PATTERN2[i][0],PATTERN2[pattern2_num]
[0]) == 0)) { | |
| 117 » » » » » checkattr = 2; | |
| 118 » » » } else if ((checkattr > 0) && (line[actual][head] == '>'
)) { | |
| 119 » » » » » state = ST_NON_WORD; | |
| 120 » » » } else if (((i = look_pattern(PATTERN, PATTERN_LEN, 1))
!= -1) && | |
| 121 » » » » (strcmp(PATTERN[i][1],PATTERN[pattern_num][1]) =
= 0)) { | |
| 122 » » » » » state = ST_NON_WORD; | |
| 123 » » » » » head += strlen(PATTERN[pattern_num][1])
- 1; | |
| 124 » » » } else if ( (strcmp(PATTERN[pattern_num][0], "<") == 0)
&& | |
| 125 » » » » ((line[actual][head] == '"') || (line[actual][he
ad] == '\''))) { | |
| 126 » » » » quotmark = line[actual][head]; | |
| 127 » » » » state = ST_ATTRIB; | |
| 128 » » » } | |
| 129 » » » break; | |
| 130 » » case ST_ATTRIB: // non word chars | |
| 131 » » » prevstate = ST_ATTRIB; | |
| 132 » » » if (line[actual][head] == quotmark) { | |
| 133 » » » » state = ST_TAG; | |
| 134 » » » » if (checkattr == 2) checkattr = 1; | |
| 135 » » » // for IMG ALT | |
| 136 » » » } else if (is_wordchar(line[actual] + head) && (checkatt
r == 2)) { | |
| 137 » » » » state = ST_WORD; | |
| 138 » » » » token = head; | |
| 139 » » » } else if (line[actual][head] == '&') { | |
| 140 » » » » state = ST_CHAR_ENTITY; | |
| 141 » » » } » » » | |
| 142 » » » break; | |
| 143 » » case ST_CHAR_ENTITY: // SGML element | |
| 144 » » » if ((tolower(line[actual][head]) == ';')) { | |
| 145 » » » » state = prevstate; | |
| 146 » » » » head--; | |
| 147 » » » } | |
| 148 » » } | |
| 149 if (next_char(line[actual], &head)) return NULL; | |
| 150 » } | |
| 151 } | |
| OLD | NEW |