third_party/hunspell/src/parsers/htmlparser.cxx - Issue 2544793003: [spellcheck] Updated Hunspell to 1.5.4

Side by Side Diff: third_party/hunspell/src/parsers/htmlparser.cxx

Issue 2544793003: [spellcheck] Updated Hunspell to 1.5.4 (Closed)

Patch Set: Test Created 4 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
	1 /* *** BEGIN LICENSE BLOCK ***

	2 * Version: MPL 1.1/GPL 2.0/LGPL 2.1

	3 *

	4 * The contents of this file are subject to the Mozilla Public License Version

	5 * 1.1 (the "License"); you may not use this file except in compliance with

	6 * the License. You may obtain a copy of the License at

	7 * http://www.mozilla.org/MPL/

	8 *

	9 * Software distributed under the License is distributed on an "AS IS" basis,

	10 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License

	11 * for the specific language governing rights and limitations under the

	12 * License.

	13 *

	14 * The Original Code is Hunspell, based on MySpell.

	15 *

	16 * The Initial Developers of the Original Code are

	17 * Kevin Hendricks (MySpell) and Németh László (Hunspell).

	18 * Portions created by the Initial Developers are Copyright (C) 2002-2005

	19 * the Initial Developers. All Rights Reserved.

	20 *

	21 * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,

	22 * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,

	23 * Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,

	24 * Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,

	25 * Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen

	26 *

	27 * Alternatively, the contents of this file may be used under the terms of

	28 * either the GNU General Public License Version 2 or later (the "GPL"), or

	29 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),

	30 * in which case the provisions of the GPL or the LGPL are applicable instead

	31 * of those above. If you wish to allow use of your version of this file only

	32 * under the terms of either the GPL or the LGPL, and not to allow others to

	33 * use your version of this file under the terms of the MPL, indicate your

	34 * decision by deleting the provisions above and replace them with the notice

	35 * and other provisions required by the GPL or the LGPL. If you do not delete

	36 * the provisions above, a recipient may use your version of this file under

	37 * the terms of any one of the MPL, the GPL or the LGPL.

	38 *

	39 * *** END LICENSE BLOCK *** */

	40

1 #include <cstdlib>	41 #include <cstdlib>

2 #include <cstring>	42 #include <cstring>

3 #include <cstdio>	43 #include <cstdio>

4 #include <ctype.h>	44 #include <ctype.h>

5	45

6 #include "../hunspell/csutil.hxx"	46 #include "../hunspell/csutil.hxx"

7 #include "htmlparser.hxx"	47 #include "htmlparser.hxx"

8	48

9

10 #ifndef W32	49 #ifndef W32

11 using namespace std;	50 using namespace std;

12 #endif	51 #endif

13	52

14 enum { ST_NON_WORD, ST_WORD, ST_TAG, ST_CHAR_ENTITY, ST_OTHER_TAG, ST_ATTRIB };	53 static const char* PATTERN[][2] = {{"<script", "</script>"},

	54 {"<style", "</style>"},

	55 {"<code", "</code>"},

	56 {"<samp", "</samp>"},

	57 {"<kbd", "</kbd>"},

	58 {"<var", "</var>"},

	59 {"<listing", "</listing>"},

	60 {"<address", "</address>"},

	61 {"<pre", "</pre>"},

	62 {"<!--", "-->"},

	63 {"<[cdata[", "]]>"}, // XML comment

	64 {"<", ">"}};

15	65

16 static const char * PATTERN[][2] = {	66 #define PATTERN_LEN (sizeof(PATTERN) / (sizeof(char*) * 2))

17 » { "<script", "</script>" },

18 » { "<style", "</style>" },

19 » { "<code", "</code>" },

20 » { "<samp", "</samp>" },

21 » { "<kbd", "</kbd>" },

22 » { "<var", "</var>" },

23 » { "<listing", "</listing>" },

24 » { "<address", "</address>" },

25 » { "<pre", "</pre>" },

26 » { "<!--", "-->" },

27 » { "<[cdata[", "]]>" }, // XML comment

28 » { "<", ">" }

29 };

30	67

31 #define PATTERN_LEN (sizeof(PATTERN) / (sizeof(char *) * 2))	68 static const char* PATTERN2[][2] = {

	69 {"<img", "alt="}, // ALT and TITLE attrib handled spec.

	70 {"<img", "title="},

	71 {"<a ", "title="}};

32	72

33 static const char * PATTERN2[][2] = {	73 #define PATTERN_LEN2 (sizeof(PATTERN2) / (sizeof(char*) * 2))

34 » { "<img", "alt=" }, // ALT and TITLE attrib handled spec.

35 » { "<img", "title=" },

36 » { "<a ", "title=" }

37 };

38	74

39 #define PATTERN_LEN2 (sizeof(PATTERN2) / (sizeof(char *) * 2))	75 HTMLParser::HTMLParser(const char* wordchars)

40	76 : XMLParser(wordchars) {

41 HTMLParser::HTMLParser(const char * wordchars)

42 {

43 » init(wordchars);

44 }	77 }

45	78

46 HTMLParser::HTMLParser(unsigned short * wordchars, int len)	79 HTMLParser::HTMLParser(const w_char* wordchars, int len)

47 {	80 : XMLParser(wordchars, len) {

48 » init(wordchars, len);

49 }	81 }

50	82

51 HTMLParser::~HTMLParser()	83 bool HTMLParser::next_token(std::string& t) {

52 {	84 return XMLParser::next_token(PATTERN, PATTERN_LEN, PATTERN2, PATTERN_LEN2, t);

53 }	85 }

54	86

55	87 HTMLParser::~HTMLParser() {}

56 int HTMLParser::look_pattern(const char * p[][2], unsigned int len, int column)

57 {

58 » for (unsigned int i = 0; i < len; i++) {

59 » » char * j = line[actual] + head;

60 » » const char * k = p[i][column];

61 » » while ((*k != '\0') && (tolower(*j) == *k)) {

62 » » » j++;

63 » » » k++;

64 » » }

65 » » if (*k == '\0') return i;

66 » }

67 » return -1;

68 }

69

70 /*

71 * HTML parser

72 *

73 */

74

75

76 char * HTMLParser::next_token()

77 {

78 » const char * latin1;

79

80 » for (;;) {

81 » » //fprintf(stderr, "%d:%c:%s\n", state, line[actual][head], line[actual]);

82 » » //getch();

83 » » switch (state)

84 » » {

85 » » case ST_NON_WORD: // non word chars

86 » » » prevstate = ST_NON_WORD;

87 » » » if ((pattern_num = look_pattern(PATTERN, PATTERN_LEN, 0)) != -1) {

88 » » » » checkattr = 0;

89 » » » » if ((pattern2_num = look_pattern(PATTERN2, PATTERN_LEN2, 0)) != -1) {

90 » » » » » checkattr = 1;

91 » » » » }

92 » » » » state = ST_TAG;

93 » » » } else if (is_wordchar(line[actual] + head)) {

94 » » » » state = ST_WORD;

95 » » » » token = head;

96 » » » } else if ((latin1 = get_latin1(line[actual] + head))) {

97 » » » » state = ST_WORD;

98 » » » » token = head;

99 » » » » head += strlen(latin1);

100 » » » } else if (line[actual][head] == '&') {

101 » » » » state = ST_CHAR_ENTITY;

102 » » » } » » »

103 » » » break;

104 » » case ST_WORD: // wordchar

105 » » » if ((latin1 = get_latin1(line[actual] + head))) {

106 » » » » head += strlen(latin1);

107 » » » } else if (! is_wordchar(line[actual] + head)) {

108 » » » » state = prevstate;

109 » » » » char * t = alloc_token(token, &head);

110 » » » » if (t) return t;

111 » » » }

112 » » » break;

113 » » case ST_TAG: // comment, labels, etc

114 » » » int i;

115 » » » if ((checkattr == 1) && ((i = look_pattern(PATTERN2, PATTERN_LEN2, 1)) != -1)

116 » » » » && (strcmp(PATTERN2[i][0],PATTERN2[pattern2_num][0]) == 0)) {

117 » » » » » checkattr = 2;

118 » » » } else if ((checkattr > 0) && (line[actual][head] == '>')) {

119 » » » » » state = ST_NON_WORD;

120 » » » } else if (((i = look_pattern(PATTERN, PATTERN_LEN, 1)) != -1) &&

121 » » » » (strcmp(PATTERN[i][1],PATTERN[pattern_num][1]) == 0)) {

122 » » » » » state = ST_NON_WORD;

123 » » » » » head += strlen(PATTERN[pattern_num][1]) - 1;

124 » » » } else if ( (strcmp(PATTERN[pattern_num][0], "<") == 0) &&

125 » » » » ((line[actual][head] == '"') || (line[actual][head] == '\''))) {

126 » » » » quotmark = line[actual][head];

127 » » » » state = ST_ATTRIB;

128 » » » }

129 » » » break;

130 » » case ST_ATTRIB: // non word chars

131 » » » prevstate = ST_ATTRIB;

132 » » » if (line[actual][head] == quotmark) {

133 » » » » state = ST_TAG;

134 » » » » if (checkattr == 2) checkattr = 1;

135 » » » // for IMG ALT

136 » » » } else if (is_wordchar(line[actual] + head) && (checkattr == 2)) {

137 » » » » state = ST_WORD;

138 » » » » token = head;

139 » » » } else if (line[actual][head] == '&') {

140 » » » » state = ST_CHAR_ENTITY;

141 » » » } » » »

142 » » » break;

143 » » case ST_CHAR_ENTITY: // SGML element

144 » » » if ((tolower(line[actual][head]) == ';')) {

145 » » » » state = prevstate;

146 » » » » head--;

147 » » » }

148 » » }

149 if (next_char(line[actual], &head)) return NULL;

150 » }

151 }

OLD	NEW

« no previous file with comments | « third_party/hunspell/src/parsers/htmlparser.hxx ('k') | third_party/hunspell/src/parsers/latexparser.hxx » ('j') | no next file with comments »