OLD | NEW |
| (Empty) |
1 #!/usr/bin/env python | |
2 # Copyright (c) 2012 The Chromium Authors. All rights reserved. | |
3 # Use of this source code is governed by a BSD-style license that can be | |
4 # found in the LICENSE file. | |
5 | |
6 """Flattens a HTML file by inlining its external resources. | |
7 | |
8 This is a small script that takes a HTML file, looks for src attributes | |
9 and inlines the specified file, producing one HTML file with no external | |
10 dependencies. It recursively inlines the included files. | |
11 """ | |
12 | |
13 import os | |
14 import re | |
15 import sys | |
16 import base64 | |
17 import mimetypes | |
18 | |
19 from grit import lazy_re | |
20 from grit import util | |
21 | |
# There is a python bug that makes mimetypes crash if the Windows
# registry contains non-Latin keys ( http://bugs.python.org/issue9291
# ). Initing manually and blocking external mime-type databases will
# prevent that bug and if we add svg manually, it will still give us
# the data we need.
mimetypes.init([])
mimetypes.add_type('image/svg+xml', '.svg')

# Distribution name used when DIST_ENV_VAR is not set in the environment.
DIST_DEFAULT = 'chromium'
# Environment variable that names the distribution being built.
DIST_ENV_VAR = 'CHROMIUM_BUILD'
# Placeholder in file names that is replaced with the distribution name.
DIST_SUBSTR = '%DISTRIBUTION%'

# NOTE: every pattern below is a raw string. In a normal string literal '\1'
# is the octal escape for chr(1), not a regex backreference, and '\s' is an
# invalid string escape on newer Python versions.

# Matches beginning of an "if" block with trailing spaces.
_BEGIN_IF_BLOCK = lazy_re.compile(
    r'<if [^>]*?expr="(?P<expression>[^"]*)"[^>]*?>\s*')

# Matches ending of an "if" block with preceding spaces.
_END_IF_BLOCK = lazy_re.compile(r'\s*</if>')

# Used by DoInline to replace various links with inline content.
_STYLESHEET_RE = lazy_re.compile(
    r'<link rel="stylesheet"[^>]+?href="(?P<filename>[^"]*)".*?>(\s*</link>)?',
    re.DOTALL)
_INCLUDE_RE = lazy_re.compile(
    r'<include[^>]+?src="(?P<filename>[^"\']*)".*?>(\s*</include>)?',
    re.DOTALL)
_SRC_RE = lazy_re.compile(
    r'<(?!script)(?:[^>]+?\s)src=(?P<quote>")'
    r'(?!\[\[|{{)(?P<filename>[^"\']*)\1',
    re.MULTILINE)
# Both literals must be raw: the trailing \1 is a backreference to the
# opening quote captured by the 'quote' group.
_ICON_RE = lazy_re.compile(
    r'<link rel="icon"\s(?:[^>]+?\s)?'
    r'href=(?P<quote>")(?P<filename>[^"\']*)\1',
    re.MULTILINE)
55 | |
56 | |
def GetDistribution():
  """Returns the name of the distribution being built.

  The name is read from the environment variable named by DIST_ENV_VAR;
  DIST_DEFAULT is used when that variable is not set. A value longer than
  one character that starts with '_' has the underscore dropped and the
  remainder lower-cased.

  Returns:
    string
  """
  if DIST_ENV_VAR not in os.environ:
    return DIST_DEFAULT

  env_value = os.environ[DIST_ENV_VAR]
  if len(env_value) > 1 and env_value.startswith('_'):
    return env_value[1:].lower()
  return env_value
69 | |
70 | |
def SrcInlineAsDataURL(
    src_match, base_path, distribution, inlined_files, names_only=False,
    filename_expansion_function=None):
  """regex replace function.

  Takes a regex match for src="filename", attempts to read the file
  at 'filename' and returns the src attribute with the file inlined
  as a data URI. If it finds DIST_SUBSTR string in file name, replaces
  it with distribution.

  Args:
    src_match: regex match object with a 'filename' named capturing group
    base_path: path to look for files in
    distribution: string that should replace DIST_SUBSTR
    inlined_files: set to which the normalized path of the referenced file
        is added.
    names_only: If true, the function will not read the file but just return "".
        It will still add the filename to |inlined_files|.
    filename_expansion_function: optional function(filename) applied to the
        matched filename before anything else is done with it.

  Returns:
    string: the matched text unchanged when the filename contains ':', ""
    when names_only is set, otherwise the matched text with the filename
    replaced by a base64 data URL.

  Raises:
    Exception: if the mime type of the file cannot be guessed from its name.
  """
  filename = src_match.group('filename')
  if filename_expansion_function:
    filename = filename_expansion_function(filename)

  if ':' in filename:
    # filename is probably a URL, which we don't want to bother inlining
    return src_match.group(0)

  filename = filename.replace(DIST_SUBSTR, distribution)
  filepath = os.path.normpath(os.path.join(base_path, filename))
  inlined_files.add(filepath)

  if names_only:
    return ""

  mimetype = mimetypes.guess_type(filename)[0]
  if mimetype is None:
    raise Exception('%s is of an an unknown type and '
                    'cannot be stored in a data url.' % filename)
  inline_data = base64.standard_b64encode(util.ReadFile(filepath, util.BINARY))
  # Where b64encode returns bytes rather than str, '%s' formatting would embed
  # the b'...' repr in the URL. Base64 output is pure ASCII, so decode it.
  if not isinstance(inline_data, str):
    inline_data = inline_data.decode('ascii')

  # Keep everything the pattern matched around the filename (attribute name,
  # quotes, ...) and swap only the filename itself for the data URL.
  prefix = src_match.string[src_match.start():src_match.start('filename')]
  suffix = src_match.string[src_match.end('filename'):src_match.end()]
  return '%sdata:%s;base64,%s%s' % (prefix, mimetype, inline_data, suffix)
118 | |
119 | |
class InlinedData:
  """Result container returned by DoInline().

  Stores, unchanged, the two values handed to the constructor: the
  flattened content and the set of filenames of all the inlined files.
  """
  def __init__(self, inlined_data, inlined_files):
    self.inlined_data, self.inlined_files = inlined_data, inlined_files
129 | |
def DoInline(
    input_filename, grd_node, allow_external_script=False, names_only=False,
    rewrite_function=None, filename_expansion_function=None):
  """Helper function that inlines the resources in a specified file.

  Reads input_filename, finds all the src attributes and attempts to
  inline the files they are referring to, then returns the result and
  the set of inlined files.

  Args:
    input_filename: name of file to read in
    grd_node: html node from the grd file for this include tag. It is used
        to evaluate <if expr="..."> conditions; when it is None every
        condition is treated as satisfied.
    allow_external_script: when true, <script src="..."> tags are left as
        they are instead of being replaced by the script contents.
    names_only: when true, None is returned for the inlined contents and
        only the set of filenames is meaningful (faster).
    rewrite_function: function(filepath, text, distribution) which will be
        called to rewrite html content before inlining images.
    filename_expansion_function: function(filename) which will be called to
        rewrite filenames before attempting to read them.
  Returns:
    an InlinedData object holding the inlined data as a string (None when
    names_only is set) and the set of filenames of all the inlined files
  Raises:
    Exception: if an <if> block has no matching </if> or vice versa.
  """
  if filename_expansion_function:
    input_filename = filename_expansion_function(input_filename)
  input_filepath = os.path.dirname(input_filename)
  distribution = GetDistribution()

  # Keep track of all the files we inline.
  inlined_files = set()

  def SrcReplace(src_match, filepath=input_filepath,
                 inlined_files=inlined_files):
    """Helper function to provide SrcInlineAsDataURL with the base file path"""
    return SrcInlineAsDataURL(
        src_match, filepath, distribution, inlined_files, names_only=names_only,
        filename_expansion_function=filename_expansion_function)

  def GetFilepath(src_match, base_path=input_filepath):
    """Returns the normalized path for a match, or None if it looks like a URL.
    """
    filename = src_match.group('filename')

    if ':' in filename:
      # filename is probably a URL, which we don't want to bother inlining
      return None

    filename = filename.replace(DIST_SUBSTR, distribution)
    if filename_expansion_function:
      filename = filename_expansion_function(filename)
    return os.path.normpath(os.path.join(base_path, filename))

  def IsConditionSatisfied(src_match):
    """Evaluates the expr attribute of a matched <if> tag."""
    expression = src_match.group('expression')
    return grd_node is None or grd_node.EvaluateCondition(expression)

  def CheckConditionalElements(text):
    """Helper function to conditionally inline inner elements"""
    while True:
      begin_if = _BEGIN_IF_BLOCK.search(text)
      if begin_if is None:
        if _END_IF_BLOCK.search(text) is not None:
          raise Exception('Unmatched </if>')
        return text

      condition_satisfied = IsConditionSatisfied(begin_if)
      leading = text[0:begin_if.start()]
      content_start = begin_if.end()

      # Find matching "if" block end. |count| is the current nesting depth:
      # it goes up for every <if> found before the next </if> and down for
      # every </if>; the block ends when it drops back to zero.
      count = 1
      pos = begin_if.end()
      while True:
        end_if = _END_IF_BLOCK.search(text, pos)
        if end_if is None:
          raise Exception('Unmatched <if>')

        next_if = _BEGIN_IF_BLOCK.search(text, pos)
        if next_if is None or next_if.start() >= end_if.end():
          count = count - 1
          if count == 0:
            break
          pos = end_if.end()
        else:
          count = count + 1
          pos = next_if.end()

      content = text[content_start:end_if.start()]
      trailing = text[end_if.end():]

      if condition_satisfied:
        text = leading + CheckConditionalElements(content) + trailing
      else:
        text = leading + trailing

  def InlineFileContents(src_match, pattern, inlined_files=inlined_files):
    """Helper function to inline external files of various types"""
    filepath = GetFilepath(src_match)
    if filepath is None:
      return src_match.group(0)
    inlined_files.add(filepath)

    if names_only:
      inlined_files.update(GetResourceFilenames(
          filepath,
          allow_external_script,
          rewrite_function,
          filename_expansion_function=filename_expansion_function))
      return ""

    return pattern % InlineToString(
        filepath, grd_node, allow_external_script,
        filename_expansion_function=filename_expansion_function)

  def InlineIncludeFiles(src_match):
    """Helper function to directly inline generic external files (without
       wrapping them with any kind of tags).
    """
    return InlineFileContents(src_match, '%s')

  def InlineScript(match):
    """Helper function to inline external script files"""
    attrs = (match.group('attrs1') + match.group('attrs2')).strip()
    if attrs:
      attrs = ' ' + attrs
    return InlineFileContents(match, '<script' + attrs + '>%s</script>')

  def InlineCSSText(text, css_filepath):
    """Helper function that inlines external resources in CSS text"""
    filepath = os.path.dirname(css_filepath)
    # Allow custom modifications before inlining images.
    if rewrite_function:
      text = rewrite_function(filepath, text, distribution)
    text = InlineCSSImages(text, filepath)
    return InlineCSSImports(text, filepath)

  def InlineCSSFile(src_match, pattern, base_path=input_filepath):
    """Helper function to inline external CSS files.

    Args:
      src_match: A regular expression match with a named group named "filename".
      pattern: The pattern to replace with the contents of the CSS file.
      base_path: The base path to use for resolving the CSS file.

    Returns:
      The text that should replace the reference to the CSS file.
    """
    filepath = GetFilepath(src_match, base_path)
    if filepath is None:
      return src_match.group(0)

    # Even if names_only is set, the CSS file needs to be opened, because it
    # can link to images that need to be added to the file set.
    inlined_files.add(filepath)
    # When resolving CSS files we need to pass in the path so that relative URLs
    # can be resolved.
    return pattern % InlineCSSText(util.ReadFile(filepath, util.BINARY),
                                   filepath)

  def InlineCSSImages(text, filepath=input_filepath):
    """Helper function that inlines external images in CSS backgrounds."""
    # Replace contents of url() for css attributes: content, background,
    # or *-image.
    return re.sub(r'(content|background|[\w-]*-image):[^;]*'
                  r'(url\((?P<quote1>"|\'|)[^"\'()]*(?P=quote1)\)|'
                  r'image-set\('
                  r'([ ]*url\((?P<quote2>"|\'|)[^"\'()]*(?P=quote2)\)'
                  r'[ ]*[0-9.]*x[ ]*(,[ ]*)?)+\))',
                  lambda m: InlineCSSUrls(m, filepath),
                  text)

  def InlineCSSUrls(src_match, filepath=input_filepath):
    """Helper function that inlines each url on a CSS image rule match."""
    # Replace contents of url() references in matches.
    return re.sub(r'url\((?P<quote>"|\'|)(?P<filename>[^"\'()]*)(?P=quote)\)',
                  lambda m: SrcReplace(m, filepath),
                  src_match.group(0))

  def InlineCSSImports(text, filepath=input_filepath):
    """Helper function that inlines CSS files included via the @import
       directive.
    """
    return re.sub(r'@import\s+url\((?P<quote>"|\'|)(?P<filename>[^"\'()]*)'
                  r'(?P=quote)\);',
                  lambda m: InlineCSSFile(m, '%s', filepath),
                  text)


  flat_text = util.ReadFile(input_filename, util.BINARY)

  # Check conditional elements, remove unsatisfied ones from the file. We do
  # this twice. The first pass is so that we don't even bother calling
  # InlineScript, InlineCSSFile and InlineIncludeFiles on text we're eventually
  # going to throw out anyway.
  flat_text = CheckConditionalElements(flat_text)

  if not allow_external_script:
    # We need to inline css and js before we inline images so that image
    # references gets inlined in the css and js
    flat_text = re.sub(r'<script (?P<attrs1>.*?)src="(?P<filename>[^"\']*)"'
                       r'(?P<attrs2>.*?)></script>',
                       InlineScript,
                       flat_text)

  flat_text = _STYLESHEET_RE.sub(
      lambda m: InlineCSSFile(m, '<style>%s</style>'),
      flat_text)

  flat_text = _INCLUDE_RE.sub(InlineIncludeFiles, flat_text)

  # Check conditional elements, second pass. This catches conditionals in any
  # of the text we just inlined.
  flat_text = CheckConditionalElements(flat_text)

  # Allow custom modifications before inlining images.
  if rewrite_function:
    flat_text = rewrite_function(input_filepath, flat_text, distribution)

  flat_text = _SRC_RE.sub(SrcReplace, flat_text)

  # TODO(arv): Only do this inside <style> tags.
  flat_text = InlineCSSImages(flat_text)

  flat_text = _ICON_RE.sub(SrcReplace, flat_text)

  if names_only:
    flat_text = None  # Will contains garbage if the flag is set anyway.
  return InlinedData(flat_text, inlined_files)
354 | |
355 | |
def InlineToString(input_filename, grd_node, allow_external_script=False,
                   rewrite_function=None, filename_expansion_function=None):
  """Inlines the resources in a specified file and returns it as a string.

  Args:
    input_filename: name of file to read in
    grd_node: html node from the grd file for this include tag
    allow_external_script: passed through to DoInline().
    rewrite_function: passed through to DoInline().
    filename_expansion_function: passed through to DoInline().
  Returns:
    the inlined data as a string
  Raises:
    Exception: if DoInline() raises an IOError; the message names the file
        that could not be opened and the file being flattened.
  """
  try:
    return DoInline(
        input_filename,
        grd_node,
        allow_external_script=allow_external_script,
        rewrite_function=rewrite_function,
        filename_expansion_function=filename_expansion_function).inlined_data
  except IOError as e:
    raise Exception("Failed to open %s while trying to flatten %s. (%s)" %
                    (e.filename, input_filename, e.strerror))
376 | |
377 | |
def InlineToFile(input_filename, output_filename, grd_node):
  """Inlines the resources in a specified file and writes it.

  Reads input_filename, finds all the src attributes and attempts to
  inline the files they are referring to, then writes the result
  to output_filename.

  Args:
    input_filename: name of file to read in
    output_filename: name of file to be written to
    grd_node: html node from the grd file for this include tag
  Returns:
    None. The flattened content is only written to output_filename.
  """
  inlined_data = InlineToString(input_filename, grd_node)
  with open(output_filename, 'wb') as out_file:
    # write(), not writelines(): the data is one string, and writelines()
    # would iterate over it and write it one character at a time.
    out_file.write(inlined_data)
395 | |
396 | |
def GetResourceFilenames(filename,
                         allow_external_script=False,
                         rewrite_function=None,
                         filename_expansion_function=None):
  """Returns a set of all the files that would be inlined into a file.

  Runs DoInline() in names_only mode with no grd node.

  Args:
    filename: name of the file whose references should be collected
    allow_external_script: passed through to DoInline().
    rewrite_function: passed through to DoInline().
    filename_expansion_function: passed through to DoInline().
  Returns:
    the set of filenames collected by DoInline()
  Raises:
    Exception: if DoInline() raises an IOError; the message names the file
        that could not be opened and the file being flattened.
  """
  try:
    return DoInline(
        filename,
        None,
        names_only=True,
        allow_external_script=allow_external_script,
        rewrite_function=rewrite_function,
        filename_expansion_function=filename_expansion_function).inlined_files
  except IOError as e:
    raise Exception("Failed to open %s while trying to flatten %s. (%s)" %
                    (e.filename, filename, e.strerror))
413 | |
414 | |
def main():
  """Command-line entry point: html_inline.py inputfile outputfile.

  Prints a short usage message when fewer than two arguments are given,
  otherwise flattens sys.argv[1] into sys.argv[2].
  """
  if len(sys.argv) <= 2:
    # Single-argument print(...) calls give the same output whether print is
    # a statement or a function.
    print("Flattens a HTML file by inlining its external resources.\n")
    print("html_inline.py inputfile outputfile")
  else:
    InlineToFile(sys.argv[1], sys.argv[2], None)

if __name__ == '__main__':
  main()
OLD | NEW |