| OLD | NEW |
| (Empty) |
| 1 #!/usr/bin/env python | |
| 2 # Copyright (c) 2012 The Chromium Authors. All rights reserved. | |
| 3 # Use of this source code is governed by a BSD-style license that can be | |
| 4 # found in the LICENSE file. | |
| 5 | |
| 6 """Flattens a HTML file by inlining its external resources. | |
| 7 | |
| 8 This is a small script that takes a HTML file, looks for src attributes | |
| 9 and inlines the specified file, producing one HTML file with no external | |
| 10 dependencies. It recursively inlines the included files. | |
| 11 """ | |
| 12 | |
| 13 import os | |
| 14 import re | |
| 15 import sys | |
| 16 import base64 | |
| 17 import mimetypes | |
| 18 | |
| 19 from grit import lazy_re | |
| 20 from grit import util | |
| 21 | |
# Work around a Python bug that makes mimetypes crash if the Windows
# registry contains non-Latin keys (http://bugs.python.org/issue9291).
# Initializing manually with an empty file list blocks the external
# mime-type databases, which prevents the bug. Registering svg by hand
# still gives us the data we need.
mimetypes.init([])
mimetypes.add_type('image/svg+xml', '.svg')
| 29 | |
# Distribution name used when DIST_ENV_VAR is not set in the environment.
DIST_DEFAULT = 'chromium'
# Name of the environment variable the distribution is read from.
DIST_ENV_VAR = 'CHROMIUM_BUILD'
# Placeholder inside file names that is replaced by the distribution name.
DIST_SUBSTR = '%DISTRIBUTION%'
| 33 | |
# NOTE: every pattern below is a raw string so that regex escapes (\s, \1,
# ...) reach the regex engine untouched. In a normal string literal '\1' is
# the octal escape for chr(1), which silently turns a backreference into a
# literal control character.

# Matches beginning of an "if" block with trailing spaces.
_BEGIN_IF_BLOCK = lazy_re.compile(
    r'<if [^>]*?expr="(?P<expression>[^"]*)"[^>]*?>\s*')

# Matches ending of an "if" block with preceding spaces.
_END_IF_BLOCK = lazy_re.compile(r'\s*</if>')

# Used by DoInline to replace various links with inline content.
_STYLESHEET_RE = lazy_re.compile(
    r'<link rel="stylesheet"[^>]+?href="(?P<filename>[^"]*)".*?>'
    r'(\s*</link>)?',
    re.DOTALL)
_INCLUDE_RE = lazy_re.compile(
    r'<include[^>]+?src="(?P<filename>[^"\']*)".*?>(\s*</include>)?',
    re.DOTALL)
# \1 refers to the 'quote' group (the only numbered capturing group before
# it), so the attribute value must be closed by the quote that opened it.
_SRC_RE = lazy_re.compile(
    r'<(?!script)(?:[^>]+?\s)src=(?P<quote>")(?!\[\[|{{)'
    r'(?P<filename>[^"\']*)\1',
    re.MULTILINE)
_ICON_RE = lazy_re.compile(
    r'<link rel="icon"\s(?:[^>]+?\s)?'
    r'href=(?P<quote>")(?P<filename>[^"\']*)\1',
    re.MULTILINE)
| 55 | |
| 56 | |
def GetDistribution():
  """Helper function that gets the distribution we are building.

  The value of the DIST_ENV_VAR environment variable is used when it is set,
  DIST_DEFAULT otherwise. A value longer than one character that starts with
  an underscore has that underscore stripped and is lower-cased; any other
  value is returned unchanged.

  Returns:
    string
  """
  # Single lookup with a fallback instead of "in os.environ.keys()" followed
  # by a second indexing operation.
  distribution = os.environ.get(DIST_ENV_VAR, DIST_DEFAULT)
  if len(distribution) > 1 and distribution.startswith('_'):
    distribution = distribution[1:].lower()
  return distribution
| 69 | |
| 70 | |
def SrcInlineAsDataURL(
    src_match, base_path, distribution, inlined_files, names_only=False,
    filename_expansion_function=None):
  """regex replace function.

  Takes a regex match for src="filename", attempts to read the file
  at 'filename' and returns the src attribute with the file inlined
  as a data URI. If it finds DIST_SUBSTR string in file name, replaces
  it with distribution.

  Args:
    src_match: regex match object with 'filename' and 'quote' named capturing
               groups
    base_path: path to look for files in
    distribution: string that should replace DIST_SUBSTR
    inlined_files: set that the resolved path of the referenced file is added
                   to.
    names_only: If true, the function will not read the file but just return "".
                It will still add the filename to |inlined_files|.
    filename_expansion_function: optional function(filename) applied to the
                                 matched file name before anything else is
                                 done with it.

  Returns:
    string: the original matched text when the file name looks like a URL,
    "" when |names_only| is set, otherwise the matched text with the file
    name replaced by a base64 data URI.

  Raises:
    Exception: if the mime type of the file cannot be guessed from its name.
  """
  filename = src_match.group('filename')
  if filename_expansion_function:
    filename = filename_expansion_function(filename)

  if ':' in filename:
    # filename is probably a URL, which we don't want to bother inlining
    return src_match.group(0)

  filename = filename.replace(DIST_SUBSTR, distribution)
  filepath = os.path.normpath(os.path.join(base_path, filename))
  inlined_files.add(filepath)

  if names_only:
    return ""

  mimetype = mimetypes.guess_type(filename)[0]
  if mimetype is None:
    raise Exception('%s is of an unknown type and '
                    'cannot be stored in a data url.' % filename)
  inline_data = base64.standard_b64encode(util.ReadFile(filepath, util.BINARY))

  # Keep everything the regex matched around the file name (tag text, the
  # attribute name, quotes) and swap only the file name for the data URI.
  prefix = src_match.string[src_match.start():src_match.start('filename')]
  suffix = src_match.string[src_match.end('filename'):src_match.end()]
  return '%sdata:%s;base64,%s%s' % (prefix, mimetype, inline_data, suffix)
| 118 | |
| 119 | |
class InlinedData(object):
  """Helper class holding the results from DoInline().

  Holds the inlined data and the set of filenames of all the inlined
  files.
  """
  def __init__(self, inlined_data, inlined_files):
    # The flattened text as produced by DoInline().
    self.inlined_data = inlined_data
    # Set of paths of every file that was inlined.
    self.inlined_files = inlined_files
| 129 | |
def DoInline(
    input_filename, grd_node, allow_external_script=False, names_only=False,
    rewrite_function=None, filename_expansion_function=None):
  """Helper function that inlines the resources in a specified file.

  Reads input_filename, finds all the src attributes and attempts to
  inline the files they are referring to, then returns the result and
  the set of inlined files.

  Args:
    input_filename: name of file to read in
    grd_node: html node from the grd file for this include tag. May be None,
        in which case every <if> condition is treated as satisfied.
    allow_external_script: if true, <script src="..."> tags are left in place
        instead of being replaced by the contents of the script file.
    names_only: if true, None is returned for the inlined contents (faster);
        only the set of file names is collected.
    rewrite_function: function(filepath, text, distribution) which will be
        called to rewrite html content before inlining images.
    filename_expansion_function: function(filename) which will be called to
        rewrite filenames before attempting to read them.

  Returns:
    an InlinedData object holding the inlined data as a string (None when
    |names_only| is set) and the set of filenames of all the inlined files

  Raises:
    Exception: if an <if> or </if> tag has no matching counterpart.
  """
  if filename_expansion_function:
    input_filename = filename_expansion_function(input_filename)
  input_filepath = os.path.dirname(input_filename)
  distribution = GetDistribution()

  # Keep track of all the files we inline.
  inlined_files = set()

  def SrcReplace(src_match, filepath=input_filepath,
                 inlined_files=inlined_files):
    """Helper function to provide SrcInlineAsDataURL with the base file path"""
    return SrcInlineAsDataURL(
        src_match, filepath, distribution, inlined_files, names_only=names_only,
        filename_expansion_function=filename_expansion_function)

  def GetFilepath(src_match, base_path=input_filepath):
    """Resolves the matched file name against base_path, or returns None if
    the name looks like a URL.
    """
    filename = src_match.group('filename')

    if ':' in filename:
      # filename is probably a URL, which we don't want to bother inlining
      return None

    filename = filename.replace(DIST_SUBSTR, distribution)
    if filename_expansion_function:
      filename = filename_expansion_function(filename)
    return os.path.normpath(os.path.join(base_path, filename))

  def IsConditionSatisfied(src_match):
    """Evaluates the expr attribute of a matched <if> tag."""
    expression = src_match.group('expression')
    return grd_node is None or grd_node.EvaluateCondition(expression)

  def CheckConditionalElements(text):
    """Helper function to conditionally inline inner elements"""
    while True:
      begin_if = _BEGIN_IF_BLOCK.search(text)
      if begin_if is None:
        if _END_IF_BLOCK.search(text) is not None:
          raise Exception('Unmatched </if>')
        return text

      condition_satisfied = IsConditionSatisfied(begin_if)
      leading = text[0:begin_if.start()]
      content_start = begin_if.end()

      # Find matching "if" block end. |count| is the current nesting depth.
      count = 1
      pos = begin_if.end()
      while True:
        end_if = _END_IF_BLOCK.search(text, pos)
        if end_if is None:
          raise Exception('Unmatched <if>')

        next_if = _BEGIN_IF_BLOCK.search(text, pos)
        if next_if is None or next_if.start() >= end_if.end():
          # The next tag is a closing one: leave one nesting level.
          count = count - 1
          if count == 0:
            break
          pos = end_if.end()
        else:
          # Another <if> opens before the next </if>: enter a nesting level.
          count = count + 1
          pos = next_if.end()

      content = text[content_start:end_if.start()]
      trailing = text[end_if.end():]

      if condition_satisfied:
        text = leading + CheckConditionalElements(content) + trailing
      else:
        text = leading + trailing

  def InlineFileContents(src_match, pattern, inlined_files=inlined_files):
    """Helper function to inline external files of various types"""
    filepath = GetFilepath(src_match)
    if filepath is None:
      return src_match.group(0)
    inlined_files.add(filepath)

    if names_only:
      inlined_files.update(GetResourceFilenames(
          filepath,
          allow_external_script,
          rewrite_function,
          filename_expansion_function=filename_expansion_function))
      return ""

    # NOTE(review): rewrite_function is forwarded in the names-only branch
    # above but not here -- confirm that this asymmetry is intentional.
    return pattern % InlineToString(
        filepath, grd_node, allow_external_script,
        filename_expansion_function=filename_expansion_function)

  def InlineIncludeFiles(src_match):
    """Helper function to directly inline generic external files (without
       wrapping them with any kind of tags).
    """
    return InlineFileContents(src_match, '%s')

  def InlineScript(match):
    """Helper function to inline external script files"""
    attrs = (match.group('attrs1') + match.group('attrs2')).strip()
    if attrs:
      attrs = ' ' + attrs
    return InlineFileContents(match, '<script' + attrs + '>%s</script>')

  def InlineCSSText(text, css_filepath):
    """Helper function that inlines external resources in CSS text"""
    filepath = os.path.dirname(css_filepath)
    # Allow custom modifications before inlining images.
    if rewrite_function:
      text = rewrite_function(filepath, text, distribution)
    text = InlineCSSImages(text, filepath)
    return InlineCSSImports(text, filepath)

  def InlineCSSFile(src_match, pattern, base_path=input_filepath):
    """Helper function to inline external CSS files.

    Args:
      src_match: A regular expression match with a named group named "filename".
      pattern: The pattern to replace with the contents of the CSS file.
      base_path: The base path to use for resolving the CSS file.

    Returns:
      The text that should replace the reference to the CSS file.
    """
    filepath = GetFilepath(src_match, base_path)
    if filepath is None:
      return src_match.group(0)

    # Even if names_only is set, the CSS file needs to be opened, because it
    # can link to images that need to be added to the file set.
    inlined_files.add(filepath)
    # When resolving CSS files we need to pass in the path so that relative URLs
    # can be resolved.
    return pattern % InlineCSSText(util.ReadFile(filepath, util.BINARY),
                                   filepath)

  def InlineCSSImages(text, filepath=input_filepath):
    """Helper function that inlines external images in CSS backgrounds."""
    # Replace contents of url() for css attributes: content, background,
    # or *-image.
    return re.sub(r'(content|background|[\w-]*-image):[^;]*' +
                  r'(url\((?P<quote1>"|\'|)[^"\'()]*(?P=quote1)\)|' +
                  r'image-set\(' +
                  r'([ ]*url\((?P<quote2>"|\'|)[^"\'()]*(?P=quote2)\)' +
                  r'[ ]*[0-9.]*x[ ]*(,[ ]*)?)+\))',
                  lambda m: InlineCSSUrls(m, filepath),
                  text)

  def InlineCSSUrls(src_match, filepath=input_filepath):
    """Helper function that inlines each url on a CSS image rule match."""
    # Replace contents of url() references in matches.
    return re.sub(r'url\((?P<quote>"|\'|)(?P<filename>[^"\'()]*)(?P=quote)\)',
                  lambda m: SrcReplace(m, filepath),
                  src_match.group(0))

  def InlineCSSImports(text, filepath=input_filepath):
    """Helper function that inlines CSS files included via the @import
       directive.
    """
    return re.sub(r'@import\s+url\((?P<quote>"|\'|)(?P<filename>[^"\'()]*)' +
                  r'(?P=quote)\);',
                  lambda m: InlineCSSFile(m, '%s', filepath),
                  text)


  flat_text = util.ReadFile(input_filename, util.BINARY)

  # Check conditional elements, remove unsatisfied ones from the file. We do
  # this twice. The first pass is so that we don't even bother calling
  # InlineScript, InlineCSSFile and InlineIncludeFiles on text we're eventually
  # going to throw out anyway.
  flat_text = CheckConditionalElements(flat_text)

  if not allow_external_script:
    # We need to inline css and js before we inline images so that image
    # references gets inlined in the css and js
    flat_text = re.sub(r'<script (?P<attrs1>.*?)src="(?P<filename>[^"\']*)"' +
                       r'(?P<attrs2>.*?)></script>',
                       InlineScript,
                       flat_text)

  flat_text = _STYLESHEET_RE.sub(
      lambda m: InlineCSSFile(m, '<style>%s</style>'),
      flat_text)

  flat_text = _INCLUDE_RE.sub(InlineIncludeFiles, flat_text)

  # Check conditional elements, second pass. This catches conditionals in any
  # of the text we just inlined.
  flat_text = CheckConditionalElements(flat_text)

  # Allow custom modifications before inlining images.
  if rewrite_function:
    flat_text = rewrite_function(input_filepath, flat_text, distribution)

  flat_text = _SRC_RE.sub(SrcReplace, flat_text)

  # TODO(arv): Only do this inside <style> tags.
  flat_text = InlineCSSImages(flat_text)

  flat_text = _ICON_RE.sub(SrcReplace, flat_text)

  if names_only:
    flat_text = None  # Would contain garbage if the flag is set anyway.
  return InlinedData(flat_text, inlined_files)
| 354 | |
| 355 | |
def InlineToString(input_filename, grd_node, allow_external_script=False,
                   rewrite_function=None, filename_expansion_function=None):
  """Inlines the resources in a specified file and returns it as a string.

  Args:
    input_filename: name of file to read in
    grd_node: html node from the grd file for this include tag
    allow_external_script: passed through to DoInline()
    rewrite_function: passed through to DoInline()
    filename_expansion_function: passed through to DoInline()
  Returns:
    the inlined data as a string
  Raises:
    Exception: if an IOError occurs while flattening; the message names both
        the file that failed and |input_filename|.
  """
  try:
    return DoInline(
        input_filename,
        grd_node,
        allow_external_script=allow_external_script,
        rewrite_function=rewrite_function,
        filename_expansion_function=filename_expansion_function).inlined_data
  except IOError as e:
    # "except ... as" is accepted by Python 2.6+ and Python 3 alike.
    raise Exception("Failed to open %s while trying to flatten %s. (%s)" %
                    (e.filename, input_filename, e.strerror))
| 376 | |
| 377 | |
def InlineToFile(input_filename, output_filename, grd_node):
  """Inlines the resources in a specified file and writes it.

  Reads input_filename, finds all the src attributes and attempts to
  inline the files they are referring to, then writes the result
  to output_filename.

  Args:
    input_filename: name of file to read in
    output_filename: name of file to be written to
    grd_node: html node from the grd file for this include tag
  Returns:
    None. The flattened content is only written to |output_filename|.
  """
  inlined_data = InlineToString(input_filename, grd_node)
  with open(output_filename, 'wb') as out_file:
    # The data is one string; writelines() would iterate over it and write
    # it one character at a time.
    out_file.write(inlined_data)
| 395 | |
| 396 | |
def GetResourceFilenames(filename,
                         allow_external_script=False,
                         rewrite_function=None,
                         filename_expansion_function=None):
  """For a grd file, returns a set of all the files that would be inline.

  Runs DoInline() in names-only mode with no grd node.

  Args:
    filename: name of the file to scan
    allow_external_script: passed through to DoInline()
    rewrite_function: passed through to DoInline()
    filename_expansion_function: passed through to DoInline()
  Returns:
    the set of filenames collected by DoInline()
  Raises:
    Exception: if an IOError occurs while scanning; the message names both
        the file that failed and |filename|.
  """
  try:
    return DoInline(
        filename,
        None,
        names_only=True,
        allow_external_script=allow_external_script,
        rewrite_function=rewrite_function,
        filename_expansion_function=filename_expansion_function).inlined_files
  except IOError as e:
    # "except ... as" is accepted by Python 2.6+ and Python 3 alike.
    raise Exception("Failed to open %s while trying to flatten %s. (%s)" %
                    (e.filename, filename, e.strerror))
| 413 | |
| 414 | |
def main():
  """Command-line entry point: html_inline.py inputfile outputfile.

  Prints a usage message when fewer than two arguments are given, otherwise
  flattens the input file into the output file.
  """
  if len(sys.argv) <= 2:
    # Single-argument print(...) produces the same output whether print is a
    # statement (Python 2) or a function (Python 3).
    print("Flattens a HTML file by inlining its external resources.\n")
    print("html_inline.py inputfile outputfile")
  else:
    InlineToFile(sys.argv[1], sys.argv[2], None)

if __name__ == '__main__':
  main()
| OLD | NEW |