third_party/hunspell_new/src/parsers/htmlparser.cxx - Issue 1135173004: Rename third_party/hunspell_new back to third_party/hunspell.

Side by Side Diff: third_party/hunspell_new/src/parsers/htmlparser.cxx

Issue 1135173004: Rename third_party/hunspell_new back to third_party/hunspell. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Created 5 years, 7 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
	(Empty)
1 #include <cstdlib>

2 #include <cstring>

3 #include <cstdio>

4 #include <ctype.h>

5

6 #include "../hunspell/csutil.hxx"

7 #include "htmlparser.hxx"

8

9

10 #ifndef W32

11 using namespace std;

12 #endif

13

// Parser states of HTMLParser::next_token(). The numeric values are the
// default enumerator values (0..5).
enum {
    ST_NON_WORD,    // between tokens; looks for a tag, a word or an entity
    ST_WORD,        // inside a word that started at `token`
    ST_TAG,         // inside a region opened by one of the PATTERN entries
    ST_CHAR_ENTITY, // after '&', waiting for the terminating ';'
    ST_OTHER_TAG,   // not referenced by the code visible in this file
    ST_ATTRIB       // inside a quoted attribute value of a generic tag
};

15

// Pairs of { opening marker, closing marker }. When the text at the read
// position starts with an opening marker, HTMLParser::next_token() switches
// to ST_TAG and stays there until the closing marker of the same row is seen.
//
// Matching lower-cases the input but not these strings, so every entry must
// be written in lower case. look_pattern() returns the FIRST matching row,
// therefore the catch-all { "<", ">" } has to stay last.
static const char * PATTERN[][2] = {
    { "<script", "</script>" },
    { "<style", "</style>" },
    { "<code", "</code>" },
    { "<samp", "</samp>" },
    { "<kbd", "</kbd>" },
    { "<var", "</var>" },
    { "<listing", "</listing>" },
    { "<address", "</address>" },
    { "<pre", "</pre>" },
    { "<!--", "-->" },
    // NOTE(review): XML CDATA sections open with "<![CDATA[" (with a '!');
    // as written this entry looks like it cannot match one -- confirm
    // before changing, since the string is runtime data.
    { "<[cdata[", "]]>" }, // XML comment
    { "<", ">" }
};

// Number of rows in PATTERN (each row holds two `const char *`).
#define PATTERN_LEN (sizeof(PATTERN) / (sizeof(char *) * 2))

32

// Pairs of { tag opener, attribute name }. For a tag whose opener is listed
// here, words inside the quoted value of the listed attribute are returned
// as tokens by HTMLParser::next_token() (ALT and TITLE get special handling).
// Entries are matched against lower-cased input, so keep them lower case.
static const char * PATTERN2[][2] = {
    { "<img", "alt=" },
    { "<img", "title=" },
    { "<a ", "title=" }
};

// Number of rows in PATTERN2 (each row holds two `const char *`).
#define PATTERN_LEN2 (sizeof(PATTERN2) / (sizeof(char *) * 2))

40

41 HTMLParser::HTMLParser(const char * wordchars)

42 {

43 init(wordchars);

44 }

45

46 HTMLParser::HTMLParser(unsigned short * wordchars, int len)

47 {

48 init(wordchars, len);

49 }

50

51 HTMLParser::~HTMLParser()

52 {

53 }

54

55

56 int HTMLParser::look_pattern(const char * p[][2], unsigned int len, int column)

57 {

58 for (unsigned int i = 0; i < len; i++) {

59 char * j = line[actual] + head;

60 const char * k = p[i][column];

61 while ((k != '\0') && (tolower(j) == *k)) {

62 j++;

63 k++;

64 }

65 if (*k == '\0') return i;

66 }

67 return -1;

68 }

69

70 /*

71 * HTML parser

72 *

73 */

74

75

76 char * HTMLParser::next_token()

77 {

78 const char * latin1;

79

80 for (;;) {

81 //fprintf(stderr, "%d:%c:%s\n", state, line[actual][head], line[ actual]);

82 //getch();

83 switch (state)

84 {

85 case ST_NON_WORD: // non word chars

86 prevstate = ST_NON_WORD;

87 if ((pattern_num = look_pattern(PATTERN, PATTERN_LEN, 0) ) != -1) {

88 checkattr = 0;

89 if ((pattern2_num = look_pattern(PATTERN2, PATTE RN_LEN2, 0)) != -1) {

90 checkattr = 1;

91 }

92 state = ST_TAG;

93 } else if (is_wordchar(line[actual] + head)) {

94 state = ST_WORD;

95 token = head;

96 } else if ((latin1 = get_latin1(line[actual] + head))) {

97 state = ST_WORD;

98 token = head;

99 head += strlen(latin1);

100 } else if (line[actual][head] == '&') {

101 state = ST_CHAR_ENTITY;

102 }

103 break;

104 case ST_WORD: // wordchar

105 if ((latin1 = get_latin1(line[actual] + head))) {

106 head += strlen(latin1);

107 } else if (! is_wordchar(line[actual] + head)) {

108 state = prevstate;

109 char * t = alloc_token(token, &head);

110 if (t) return t;

111 }

112 break;

113 case ST_TAG: // comment, labels, etc

114 int i;

115 if ((checkattr == 1) && ((i = look_pattern(PATTERN2, PAT TERN_LEN2, 1)) != -1)

116 && (strcmp(PATTERN2[i][0],PATTERN2[pattern2_num] [0]) == 0)) {

117 checkattr = 2;

118 } else if ((checkattr > 0) && (line[actual][head] == '>' )) {

119 state = ST_NON_WORD;

120 } else if (((i = look_pattern(PATTERN, PATTERN_LEN, 1)) != -1) &&

121 (strcmp(PATTERN[i][1],PATTERN[pattern_num][1]) = = 0)) {

122 state = ST_NON_WORD;

123 head += strlen(PATTERN[pattern_num][1]) - 1;

124 } else if ( (strcmp(PATTERN[pattern_num][0], "<") == 0) &&

125 ((line[actual][head] == '"') \|\| (line[actual][he ad] == '\''))) {

126 quotmark = line[actual][head];

127 state = ST_ATTRIB;

128 }

129 break;

130 case ST_ATTRIB: // non word chars

131 prevstate = ST_ATTRIB;

132 if (line[actual][head] == quotmark) {

133 state = ST_TAG;

134 if (checkattr == 2) checkattr = 1;

135 // for IMG ALT

136 } else if (is_wordchar(line[actual] + head) && (checkatt r == 2)) {

137 state = ST_WORD;

138 token = head;

139 } else if (line[actual][head] == '&') {

140 state = ST_CHAR_ENTITY;

141 }

142 break;

143 case ST_CHAR_ENTITY: // SGML element

144 if ((tolower(line[actual][head]) == ';')) {

145 state = prevstate;

146 head--;

147 }

148 }

149 if (next_char(line[actual], &head)) return NULL;

150 }

151 }

OLD	NEW

« no previous file with comments | « third_party/hunspell_new/src/parsers/htmlparser.hxx ('k') | third_party/hunspell_new/src/parsers/latexparser.hxx » ('j') | no next file with comments »