Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(9)

Side by Side Diff: third_party/hunspell/src/parsers/htmlparser.cxx

Issue 2544793003: [spellcheck] Updated Hunspell to 1.5.4 (Closed)
Patch Set: Test Created 4 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 /* ***** BEGIN LICENSE BLOCK *****
2 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
3 *
4 * The contents of this file are subject to the Mozilla Public License Version
5 * 1.1 (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 * http://www.mozilla.org/MPL/
8 *
9 * Software distributed under the License is distributed on an "AS IS" basis,
10 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
11 * for the specific language governing rights and limitations under the
12 * License.
13 *
14 * The Original Code is Hunspell, based on MySpell.
15 *
16 * The Initial Developers of the Original Code are
17 * Kevin Hendricks (MySpell) and Németh László (Hunspell).
18 * Portions created by the Initial Developers are Copyright (C) 2002-2005
19 * the Initial Developers. All Rights Reserved.
20 *
21 * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,
22 * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,
23 * Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,
24 * Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,
25 * Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen
26 *
27 * Alternatively, the contents of this file may be used under the terms of
28 * either the GNU General Public License Version 2 or later (the "GPL"), or
29 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
30 * in which case the provisions of the GPL or the LGPL are applicable instead
31 * of those above. If you wish to allow use of your version of this file only
32 * under the terms of either the GPL or the LGPL, and not to allow others to
33 * use your version of this file under the terms of the MPL, indicate your
34 * decision by deleting the provisions above and replace them with the notice
35 * and other provisions required by the GPL or the LGPL. If you do not delete
36 * the provisions above, a recipient may use your version of this file under
37 * the terms of any one of the MPL, the GPL or the LGPL.
38 *
39 * ***** END LICENSE BLOCK ***** */
40
1 #include <cstdlib> 41 #include <cstdlib>
2 #include <cstring> 42 #include <cstring>
3 #include <cstdio> 43 #include <cstdio>
4 #include <ctype.h> 44 #include <ctype.h>
5 45
6 #include "../hunspell/csutil.hxx" 46 #include "../hunspell/csutil.hxx"
7 #include "htmlparser.hxx" 47 #include "htmlparser.hxx"
8 48
9
10 #ifndef W32 49 #ifndef W32
11 using namespace std; 50 using namespace std;
12 #endif 51 #endif
13 52
14 enum { ST_NON_WORD, ST_WORD, ST_TAG, ST_CHAR_ENTITY, ST_OTHER_TAG, ST_ATTRIB }; 53 static const char* PATTERN[][2] = {{"<script", "</script>"},
54 {"<style", "</style>"},
55 {"<code", "</code>"},
56 {"<samp", "</samp>"},
57 {"<kbd", "</kbd>"},
58 {"<var", "</var>"},
59 {"<listing", "</listing>"},
60 {"<address", "</address>"},
61 {"<pre", "</pre>"},
62 {"<!--", "-->"},
63 {"<[cdata[", "]]>"}, // XML comment
64 {"<", ">"}};
15 65
16 static const char * PATTERN[][2] = { 66 #define PATTERN_LEN (sizeof(PATTERN) / (sizeof(char*) * 2))
17 » { "<script", "</script>" },
18 » { "<style", "</style>" },
19 » { "<code", "</code>" },
20 » { "<samp", "</samp>" },
21 » { "<kbd", "</kbd>" },
22 » { "<var", "</var>" },
23 » { "<listing", "</listing>" },
24 » { "<address", "</address>" },
25 » { "<pre", "</pre>" },
26 » { "<!--", "-->" },
27 » { "<[cdata[", "]]>" }, // XML comment
28 » { "<", ">" }
29 };
30 67
31 #define PATTERN_LEN (sizeof(PATTERN) / (sizeof(char *) * 2)) 68 static const char* PATTERN2[][2] = {
69 {"<img", "alt="}, // ALT and TITLE attrib handled spec.
70 {"<img", "title="},
71 {"<a ", "title="}};
32 72
33 static const char * PATTERN2[][2] = { 73 #define PATTERN_LEN2 (sizeof(PATTERN2) / (sizeof(char*) * 2))
34 » { "<img", "alt=" }, // ALT and TITLE attrib handled spec.
35 » { "<img", "title=" },
36 » { "<a ", "title=" }
37 };
38 74
39 #define PATTERN_LEN2 (sizeof(PATTERN2) / (sizeof(char *) * 2)) 75 HTMLParser::HTMLParser(const char* wordchars)
40 76 : XMLParser(wordchars) {
41 HTMLParser::HTMLParser(const char * wordchars)
42 {
43 » init(wordchars);
44 } 77 }
45 78
46 HTMLParser::HTMLParser(unsigned short * wordchars, int len) 79 HTMLParser::HTMLParser(const w_char* wordchars, int len)
47 { 80 : XMLParser(wordchars, len) {
48 » init(wordchars, len);
49 } 81 }
50 82
51 HTMLParser::~HTMLParser() 83 bool HTMLParser::next_token(std::string& t) {
52 { 84 return XMLParser::next_token(PATTERN, PATTERN_LEN, PATTERN2, PATTERN_LEN2, t);
53 } 85 }
54 86
55 87 HTMLParser::~HTMLParser() {}
56 int HTMLParser::look_pattern(const char * p[][2], unsigned int len, int column)
57 {
58 » for (unsigned int i = 0; i < len; i++) {
59 » » char * j = line[actual] + head;
60 » » const char * k = p[i][column];
61 » » while ((*k != '\0') && (tolower(*j) == *k)) {
62 » » » j++;
63 » » » k++;
64 » » }
65 » » if (*k == '\0') return i;
66 » }
67 » return -1;
68 }
69
70 /*
71 * HTML parser
72 *
73 */
74
75
76 char * HTMLParser::next_token()
77 {
78 » const char * latin1;
79
80 » for (;;) {
81 » » //fprintf(stderr, "%d:%c:%s\n", state, line[actual][head], line[actual]);
82 » » //getch();
83 » » switch (state)
84 » » {
85 » » case ST_NON_WORD: // non word chars
86 » » » prevstate = ST_NON_WORD;
87 » » » if ((pattern_num = look_pattern(PATTERN, PATTERN_LEN, 0)) != -1) {
88 » » » » checkattr = 0;
89 » » » » if ((pattern2_num = look_pattern(PATTERN2, PATTERN_LEN2, 0)) != -1) {
90 » » » » » checkattr = 1;
91 » » » » }
92 » » » » state = ST_TAG;
93 » » » } else if (is_wordchar(line[actual] + head)) {
94 » » » » state = ST_WORD;
95 » » » » token = head;
96 » » » } else if ((latin1 = get_latin1(line[actual] + head))) {
97 » » » » state = ST_WORD;
98 » » » » token = head;
99 » » » » head += strlen(latin1);
100 » » » } else if (line[actual][head] == '&') {
101 » » » » state = ST_CHAR_ENTITY;
102 » » » } » » »
103 » » » break;
104 » » case ST_WORD: // wordchar
105 » » » if ((latin1 = get_latin1(line[actual] + head))) {
106 » » » » head += strlen(latin1);
107 » » » } else if (! is_wordchar(line[actual] + head)) {
108 » » » » state = prevstate;
109 » » » » char * t = alloc_token(token, &head);
110 » » » » if (t) return t;
111 » » » }
112 » » » break;
113 » » case ST_TAG: // comment, labels, etc
114 » » » int i;
115 » » » if ((checkattr == 1) && ((i = look_pattern(PATTERN2, PATTERN_LEN2, 1)) != -1)
116 » » » » && (strcmp(PATTERN2[i][0],PATTERN2[pattern2_num][0]) == 0)) {
117 » » » » » checkattr = 2;
118 » » » } else if ((checkattr > 0) && (line[actual][head] == '>')) {
119 » » » » » state = ST_NON_WORD;
120 » » » } else if (((i = look_pattern(PATTERN, PATTERN_LEN, 1)) != -1) &&
121 » » » » (strcmp(PATTERN[i][1],PATTERN[pattern_num][1]) == 0)) {
122 » » » » » state = ST_NON_WORD;
123 » » » » » head += strlen(PATTERN[pattern_num][1]) - 1;
124 » » » } else if ( (strcmp(PATTERN[pattern_num][0], "<") == 0) &&
125 » » » » ((line[actual][head] == '"') || (line[actual][head] == '\''))) {
126 » » » » quotmark = line[actual][head];
127 » » » » state = ST_ATTRIB;
128 » » » }
129 » » » break;
130 » » case ST_ATTRIB: // non word chars
131 » » » prevstate = ST_ATTRIB;
132 » » » if (line[actual][head] == quotmark) {
133 » » » » state = ST_TAG;
134 » » » » if (checkattr == 2) checkattr = 1;
135 » » » // for IMG ALT
136 » » » } else if (is_wordchar(line[actual] + head) && (checkattr == 2)) {
137 » » » » state = ST_WORD;
138 » » » » token = head;
139 » » » } else if (line[actual][head] == '&') {
140 » » » » state = ST_CHAR_ENTITY;
141 » » » } » » »
142 » » » break;
143 » » case ST_CHAR_ENTITY: // SGML element
144 » » » if ((tolower(line[actual][head]) == ';')) {
145 » » » » state = prevstate;
146 » » » » head--;
147 » » » }
148 » » }
149 if (next_char(line[actual], &head)) return NULL;
150 » }
151 }
OLDNEW
« no previous file with comments | « third_party/hunspell/src/parsers/htmlparser.hxx ('k') | third_party/hunspell/src/parsers/latexparser.hxx » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698