trunk/src/third_party/markdown/preprocessors.py - Issue 132753002: Revert 243980 "Docserver: Support markdown for HTML content."

Side by Side Diff: trunk/src/third_party/markdown/preprocessors.py

Issue 132753002: Revert 243980 "Docserver: Support markdown for HTML content." (Closed) Base URL: svn://svn.chromium.org/chrome/

Patch Set: Created 6 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
	(Empty)
1 """

2 PRE-PROCESSORS

3 =============================================================================

4

5 Preprocessors work on source text before we start doing anything too

6 complicated.

7 """

8

9 from __future__ import absolute_import

10 from __future__ import unicode_literals

11 from . import util

12 from . import odict

13 import re

14

15

def build_preprocessors(md_instance, **kwargs):
    """Assemble the default, ordered set of preprocessors for Markdown.

    The registry is an ordered mapping of name -> preprocessor instance, each
    constructed with ``md_instance``.  The ``html_block`` entry is left out
    when ``md_instance.safeMode`` equals ``'escape'``.  ``kwargs`` is accepted
    but not used here.
    """
    registry = odict.OrderedDict()
    registry['normalize_whitespace'] = NormalizeWhitespace(md_instance)
    escaping_html = md_instance.safeMode == 'escape'
    if not escaping_html:
        registry["html_block"] = HtmlBlockPreprocessor(md_instance)
    registry["reference"] = ReferencePreprocessor(md_instance)
    return registry

24

25

class Preprocessor(util.Processor):
    """Base class for processors that work on the document's lines.

    Preprocessors are run once the source text has been broken into lines.
    A concrete preprocessor supplies a ``run`` method which receives the list
    of lines, changes it as needed, and hands back either that same list or a
    brand new one.

    Every preprocessor must extend markdown.Preprocessor.
    """

    def run(self, lines):
        """Process the document; subclasses are expected to override this.

        ``lines`` is the document as a list of strings split on newlines.
        An override returns the (possibly modified) list of lines.  This base
        implementation does nothing and returns ``None``.
        """
        return None

45

46

class NormalizeWhitespace(Preprocessor):
    """Normalize whitespace for consistent parsing."""

    def run(self, lines):
        """Return the document's lines with whitespace normalized.

        The lines are joined into one string, every occurrence of
        ``util.STX`` and ``util.ETX`` is removed, all line endings become
        ``\\n``, two newlines are appended, tabs are expanded to
        ``self.markdown.tab_length`` columns, and lines consisting only of
        spaces are emptied.  The result is split back into a list of lines.
        """
        text = '\n'.join(lines)
        for marker in (util.STX, util.ETX):
            text = text.replace(marker, "")
        # Windows line endings first, then any remaining bare carriage returns.
        text = text.replace("\r\n", "\n")
        text = text.replace("\r", "\n")
        text += "\n\n"
        text = text.expandtabs(self.markdown.tab_length)
        # A run of spaces that makes up a whole line collapses to nothing.
        text = re.sub(r'(?<=\n) +\n', '\n', text)
        return text.split('\n')

57

58

class HtmlBlockPreprocessor(Preprocessor):
    """Remove html blocks from the text and store them for later retrieval."""

    # Templates used, in order, to build a candidate closing tag from the
    # opening tag name.
    right_tag_patterns = ["</%s>", "%s>"]
    # Verbose-mode regex matching a single attribute in one of three forms.
    # The alternatives must be joined with a bare ``|``; an escaped ``\|``
    # would match a literal pipe character instead.
    attrs_pattern = r"""
        \s+(?P<attr>[^>"'/= ]+)=(?P<q>['"])(?P<value>.*?)(?P=q)   # attr="value"
        |                                                         # OR
        \s+(?P<attr1>[^>"'/= ]+)=(?P<value1>[^> ]+)               # attr=value
        |                                                         # OR
        \s+(?P<attr2>[^>"'/= ]+)                                  # attr
        """
    # Opening tag: name, zero or more attributes, optional whitespace,
    # optional self-closing slash and optional closing bracket.
    left_tag_pattern = (r'^\<(?P<tag>[^> ]+)(?P<attrs>(%s)*)\s*\/?\>?'
                        % attrs_pattern)
    attrs_re = re.compile(attrs_pattern, re.VERBOSE)
    left_tag_re = re.compile(left_tag_pattern, re.VERBOSE)
    # When true, blocks whose opening tag carries a ``markdown`` attribute
    # have only their opening/closing tags stashed; the content in between is
    # left in the text.
    markdown_in_raw = False

    def _get_left_tag(self, block):
        """Parse the opening tag at the start of ``block``.

        Returns a ``(tag, length, attrs)`` tuple: the tag name, the number of
        characters the opening tag occupies, and a dict of its attributes
        (attributes without a value map to ``""``).  If the opening-tag regex
        does not match, the tag is taken as everything between the first
        character and the first ``>``, lower-cased, with no attributes.
        """
        m = self.left_tag_re.match(block)
        if m:
            tag = m.group('tag')
            raw_attrs = m.group('attrs')
            attrs = {}
            if raw_attrs:
                for ma in self.attrs_re.finditer(raw_attrs):
                    if ma.group('attr'):
                        if ma.group('value'):
                            attrs[ma.group('attr').strip()] = ma.group('value')
                        else:
                            attrs[ma.group('attr').strip()] = ""
                    elif ma.group('attr1'):
                        if ma.group('value1'):
                            attrs[ma.group('attr1').strip()] = \
                                ma.group('value1')
                        else:
                            attrs[ma.group('attr1').strip()] = ""
                    elif ma.group('attr2'):
                        attrs[ma.group('attr2').strip()] = ""
            return tag, len(m.group(0)), attrs
        else:
            tag = block[1:].split(">", 1)[0].lower()
            # +2 accounts for the surrounding "<" and ">".
            return tag, len(tag) + 2, {}

    def _recursive_tagfind(self, ltag, rtag, start_index, block):
        """Find the ``rtag`` that balances an already-opened ``ltag``.

        Searches ``block`` from ``start_index``; nested ``ltag`` occurrences
        are skipped together with their own closing ``rtag``.  Returns the
        index just past the balancing ``rtag``, or ``-1`` if there is none.
        """
        while True:
            i = block.find(rtag, start_index)
            if i == -1:
                return -1
            j = block.find(ltag, start_index)
            # if no ltag, or rtag found before another ltag, return index
            if (j > i or j == -1):
                return i + len(rtag)
            # another ltag found before rtag, use end of ltag as starting
            # point and search again
            j = block.find('>', j)
            start_index = self._recursive_tagfind(ltag, rtag, j + 1, block)
            if start_index == -1:
                # HTML potentially malformed- ltag has no corresponding
                # rtag
                return -1

    def _get_right_tag(self, left_tag, left_index, block):
        """Locate the closing tag for ``left_tag`` inside ``block``.

        Each template in ``right_tag_patterns`` is tried in turn, searching
        from ``left_index``.  Returns ``(right_tag, end_index)`` where
        ``right_tag`` has its angle brackets stripped and ``end_index`` is the
        position just past the closing tag.  When nothing is found, the tail
        of the block (lower-cased) and ``len(block)`` are returned instead.
        """
        for p in self.right_tag_patterns:
            tag = p % left_tag
            i = self._recursive_tagfind("<%s" % left_tag, tag, left_index,
                                        block)
            if i > 2:
                return tag.lstrip("<").rstrip(">"), i
        return block.rstrip()[-left_index:-1].lower(), len(block)

    def _equal_tags(self, left_tag, right_tag):
        """Return True if ``right_tag`` is an acceptable close for ``left_tag``.

        Tags starting with ``?``, ``@`` or ``%`` always count as closed; a
        comment (``--``) is closed by ``--``; otherwise ``right_tag`` must be
        ``"/" + left_tag``.
        """
        # Slicing (rather than indexing) keeps an empty tag from raising.
        if left_tag[:1] in ('?', '@', '%'):  # handle PHP, etc.
            return True
        if ("/" + left_tag) == right_tag:
            return True
        return right_tag == "--" and left_tag == "--"

    def _is_oneliner(self, tag):
        """Return True for tags treated as complete on their own (``hr``)."""
        return (tag in ['hr', 'hr/'])

    def run(self, lines):
        """Replace raw HTML blocks in ``lines`` with stash placeholders.

        The document is split into blank-line-separated blocks.  A block that
        starts with a block-level tag, a comment, or a ``?``/``@``/``%``
        construct is stored through ``self.markdown.htmlStash.store`` and the
        value returned by ``store`` takes its place; an element spanning
        several blocks is collected until its closing tag is found.  Other
        blocks pass through unchanged.  Returns the new list of lines.
        """
        # Local import keeps the module's import block untouched.
        from collections import deque

        new_blocks = []
        # Blocks are consumed from, and leftovers pushed back to, the left
        # end, so a deque avoids re-copying the list on every step.
        pending = deque("\n".join(lines).rsplit("\n\n"))
        items = []
        left_tag = ''
        right_tag = ''
        left_index = 0
        attrs = {}
        in_tag = False  # flag: currently inside an unclosed HTML block

        while pending:
            block = pending.popleft()
            # Up to two leading newlines are dropped from each block.
            if block.startswith("\n"):
                block = block[1:]
            if block.startswith("\n"):
                block = block[1:]

            if not in_tag:
                if block.startswith("<") and len(block.strip()) > 1:

                    if block[1] == "!":
                        # is a comment block
                        left_tag, left_index, attrs = "--", 2, {}
                    else:
                        left_tag, left_index, attrs = \
                            self._get_left_tag(block)
                    right_tag, data_index = self._get_right_tag(left_tag,
                                                                left_index,
                                                                block)
                    # keep checking conditions below and maybe just append

                    if data_index < len(block) \
                            and (util.isBlockLevel(left_tag)
                                 or left_tag == '--'):
                        # Text follows the closing tag: push it back so it is
                        # handled as a block of its own.
                        pending.appendleft(block[data_index:])
                        block = block[:data_index]

                    if not (util.isBlockLevel(left_tag)
                            or block[1] in ["!", "?", "@", "%"]):
                        new_blocks.append(block)
                        continue

                    if self._is_oneliner(left_tag):
                        new_blocks.append(block.strip())
                        continue

                    if block.rstrip().endswith(">") \
                            and self._equal_tags(left_tag, right_tag):
                        if self.markdown_in_raw and 'markdown' in attrs:
                            # Stash only the tags (minus the markdown
                            # attribute); the content stays in the text.
                            start = re.sub(
                                r'\smarkdown(=[\'"]?[^> ]*[\'"]?)?',
                                '', block[:left_index])
                            end = block[-len(right_tag)-2:]
                            block = block[left_index:-len(right_tag)-2]
                            new_blocks.append(
                                self.markdown.htmlStash.store(start))
                            new_blocks.append(block)
                            new_blocks.append(
                                self.markdown.htmlStash.store(end))
                        else:
                            new_blocks.append(
                                self.markdown.htmlStash.store(block.strip()))
                        continue
                    else:
                        # if is block level tag and is not complete
                        # NOTE(review): ``and`` binds tighter than ``or``, so
                        # the "not complete" test only applies to comments;
                        # the parentheses make that existing behaviour
                        # explicit -- confirm it is what was intended.
                        if util.isBlockLevel(left_tag) or (
                                left_tag == "--"
                                and not block.rstrip().endswith(">")):
                            items.append(block.strip())
                            in_tag = True
                        else:
                            new_blocks.append(
                                self.markdown.htmlStash.store(block.strip()))

                        continue

                new_blocks.append(block)

            else:
                items.append(block)

                right_tag, data_index = self._get_right_tag(left_tag, 0,
                                                            block)

                if self._equal_tags(left_tag, right_tag):
                    # if find closing tag

                    if data_index < len(block):
                        # we have more text after right_tag
                        items[-1] = block[:data_index]
                        pending.appendleft(block[data_index:])

                    in_tag = False
                    if self.markdown_in_raw and 'markdown' in attrs:
                        start = re.sub(r'\smarkdown(=[\'"]?[^> ]*[\'"]?)?',
                                       '', items[0][:left_index])
                        items[0] = items[0][left_index:]
                        end = items[-1][-len(right_tag)-2:]
                        items[-1] = items[-1][:-len(right_tag)-2]
                        new_blocks.append(
                            self.markdown.htmlStash.store(start))
                        new_blocks.extend(items)
                        new_blocks.append(
                            self.markdown.htmlStash.store(end))
                    else:
                        new_blocks.append(
                            self.markdown.htmlStash.store('\n\n'.join(items)))
                    items = []

        if items:
            # Input ended while still inside an unclosed block: flush what
            # was collected.
            if self.markdown_in_raw and 'markdown' in attrs:
                start = re.sub(r'\smarkdown(=[\'"]?[^> ]*[\'"]?)?',
                               '', items[0][:left_index])
                items[0] = items[0][left_index:]
                end = items[-1][-len(right_tag)-2:]
                items[-1] = items[-1][:-len(right_tag)-2]
                new_blocks.append(
                    self.markdown.htmlStash.store(start))
                new_blocks.extend(items)
                if end.strip():
                    new_blocks.append(
                        self.markdown.htmlStash.store(end))
            else:
                new_blocks.append(
                    self.markdown.htmlStash.store('\n\n'.join(items)))
            new_blocks.append('\n')

        new_text = "\n\n".join(new_blocks)
        return new_text.split("\n")

270

271

class ReferencePreprocessor(Preprocessor):
    """Remove reference definitions from text and store for later use."""

    # Optional title: "double quoted", 'single quoted' or (parenthesised),
    # with optional surrounding spaces.
    TITLE = r'[ ]*(\"(.*)\"|\'(.*)\'|\((.*)\))[ ]*'
    # A definition line: up to three leading spaces, ``[id]:``, the link,
    # then an optional title.  Groups: 1 = id, 2 = link, 5/6/7 = title text
    # for the three quoting styles.
    RE = re.compile(
        r'^[ ]{0,3}\[([^\]]*)\]:\s*([^ ]*)[ ]*(%s)?$' % TITLE, re.DOTALL)
    # A line holding nothing but a title; groups 2/3/4 = title text.
    TITLE_RE = re.compile(r'^%s$' % TITLE)

    def run(self, lines):
        """Strip reference definitions out of ``lines`` and record them.

        For every line matching ``RE`` the lower-cased id is mapped to a
        ``(link, title)`` tuple in ``self.markdown.references``; the link has
        leading ``<`` and trailing ``>`` characters stripped.  When the
        definition has no title on its own line, the following line is used
        as the title if it matches ``TITLE_RE`` (and is then removed too).
        The title is falsy when none is found.

        Returns a new list with the remaining lines.  ``lines`` itself is
        left unmodified.
        """
        new_text = []
        total = len(lines)
        index = 0
        while index < total:
            line = lines[index]
            index += 1
            m = self.RE.match(line)
            if not m:
                new_text.append(line)
                continue
            ref_id = m.group(1).strip().lower()
            link = m.group(2).lstrip('<').rstrip('>')
            title = m.group(5) or m.group(6) or m.group(7)
            # Check next line for title -- but only if there is a next line;
            # a definition on the very last line must not raise.
            if not title and index < total:
                tm = self.TITLE_RE.match(lines[index])
                if tm:
                    index += 1
                    title = tm.group(2) or tm.group(3) or tm.group(4)
            self.markdown.references[ref_id] = (link, title)

        return new_text

OLD	NEW

« no previous file with comments | « trunk/src/third_party/markdown/postprocessors.py ('k') | trunk/src/third_party/markdown/serializers.py » ('j') | no next file with comments »