OLD | NEW |
1 # Copyright 2013 The Chromium Authors. All rights reserved. | 1 # Copyright 2013 The Chromium Authors. All rights reserved. |
2 # Use of this source code is governed by a BSD-style license that can be | 2 # Use of this source code is governed by a BSD-style license that can be |
3 # found in the LICENSE file. | 3 # found in the LICENSE file. |
4 | 4 |
5 import difflib | 5 import difflib |
6 import hashlib | 6 import hashlib |
| 7 import itertools |
| 8 import json |
7 import os | 9 import os |
8 import re | |
9 import sys | 10 import sys |
| 11 import zipfile |
10 | 12 |
11 | 13 |
12 # When set and a difference is detected, a diff of what changed is printed. | 14 # When set and a difference is detected, a diff of what changed is printed. |
13 _PRINT_MD5_DIFFS = int(os.environ.get('PRINT_MD5_DIFFS', 0)) | 15 _PRINT_MD5_DIFFS = int(os.environ.get('PRINT_MD5_DIFFS', 0)) |
14 | 16 |
15 # Used to strip off temp dir prefix. | |
16 _TEMP_DIR_PATTERN = re.compile(r'^/tmp/.*?/') | |
17 | |
18 | 17 |
19 def CallAndRecordIfStale( | 18 def CallAndRecordIfStale( |
20 function, record_path=None, input_paths=None, input_strings=None, | 19 function, record_path=None, input_paths=None, input_strings=None, |
21 output_paths=None, force=False): | 20 output_paths=None, force=False, pass_changes=False): |
22 """Calls function if outputs are stale. | 21 """Calls function if outputs are stale. |
23 | 22 |
24 Outputs are considered stale if: | 23 Outputs are considered stale if: |
25 - any output_paths are missing, or | 24 - any output_paths are missing, or |
26 - the contents of any file within input_paths has changed, or | 25 - the contents of any file within input_paths has changed, or |
27 - the contents of input_strings has changed. | 26 - the contents of input_strings has changed. |
28 | 27 |
29 To debug which files are out-of-date, set the environment variable: | 28 To debug which files are out-of-date, set the environment variable: |
30 PRINT_MD5_DIFFS=1 | 29 PRINT_MD5_DIFFS=1 |
31 | 30 |
32 Args: | 31 Args: |
33 function: The function to call. | 32 function: The function to call. |
34 record_path: Path to record metadata. | 33 record_path: Path to record metadata. |
35 Defaults to output_paths[0] + '.md5.stamp' | 34 Defaults to output_paths[0] + '.md5.stamp' |
37 36 input_paths: List of paths to calculate an md5 sum on. | 35 input_paths: List of paths to calculate an md5 sum on. |
37 input_strings: List of strings to record verbatim. | 36 input_strings: List of strings to record verbatim. |
38 output_paths: List of output paths. | 37 output_paths: List of output paths. |
39 force: When True, function is always called. | 38 force: Whether to treat outputs as missing regardless of whether they |
| 39 actually are. |
| 40 pass_changes: Whether to pass a Changes instance to |function|. |
40 """ | 41 """ |
41 assert record_path or output_paths | 42 assert record_path or output_paths |
42 input_paths = input_paths or [] | 43 input_paths = input_paths or [] |
43 input_strings = input_strings or [] | 44 input_strings = input_strings or [] |
44 output_paths = output_paths or [] | 45 output_paths = output_paths or [] |
45 record_path = record_path or output_paths[0] + '.md5.stamp' | 46 record_path = record_path or output_paths[0] + '.md5.stamp' |
46 md5_checker = _Md5Checker( | 47 |
47 record_path=record_path, | 48 assert record_path.endswith('.stamp'), ( |
48 input_paths=input_paths, | 49 'record paths must end in \'.stamp\' so that they are easy to find ' |
49 input_strings=input_strings) | 50 'and delete') |
50 | 51 |
51 missing_outputs = [x for x in output_paths if not os.path.exists(x)] | 52 new_metadata = _Metadata() |
52 is_stale = md5_checker.old_digest != md5_checker.new_digest | 53 new_metadata.AddStrings(input_strings) |
53 | 54 |
54 if force or missing_outputs or is_stale: | 55 for path in input_paths: |
55 if _PRINT_MD5_DIFFS: | 56 if _IsZipFile(path): |
56 print '=' * 80 | 57 entries = _ExtractZipEntries(path) |
57 print 'Difference found in %s:' % record_path | 58 new_metadata.AddZipFile(path, entries) |
58 if missing_outputs: | 59 else: |
59 print 'Outputs do not exist:\n' + '\n'.join(missing_outputs) | 60 new_metadata.AddFile(path, _Md5ForPath(path)) |
60 elif force: | 61 |
61 print 'force=True' | 62 old_metadata = None |
62 else: | 63 missing_outputs = [x for x in output_paths if force or not os.path.exists(x)] |
63 print md5_checker.DescribeDifference() | 64 # When outputs are missing, don't bother gathering change information. |
64 print '=' * 80 | 65 if not missing_outputs and os.path.exists(record_path): |
65 function() | 66 with open(record_path, 'r') as jsonfile: |
66 md5_checker.Write() | 67 try: |
| 68 old_metadata = _Metadata.FromFile(jsonfile) |
| 69 except: # pylint: disable=bare-except |
| 70 pass # Not yet using new file format. |
| 71 |
| 72 changes = Changes(old_metadata, new_metadata, force, missing_outputs) |
| 73 if not changes.HasChanges(): |
| 74 return |
| 75 |
| 76 if _PRINT_MD5_DIFFS: |
| 77 print '=' * 80 |
| 78 print 'Target is stale: %s' % record_path |
| 79 print changes.DescribeDifference() |
| 80 print '=' * 80 |
| 81 |
| 82 # Delete the old metadata beforehand since failures leave it in an |
| 83 # indeterminate state. |
| 84 if old_metadata: |
| 85 os.unlink(record_path) |
| 86 |
| 87 args = (changes,) if pass_changes else () |
| 88 function(*args) |
| 89 |
| 90 with open(record_path, 'w') as f: |
| 91 new_metadata.ToFile(f) |
| 92 |
| 93 |
| 94 class Changes(object): |
| 95 """Provides an API for querying what changed between runs.""" |
| 96 |
| 97 def __init__(self, old_metadata, new_metadata, force, missing_outputs): |
| 98 self.old_metadata = old_metadata |
| 99 self.new_metadata = new_metadata |
| 100 self.force = force |
| 101 self.missing_outputs = missing_outputs |
| 102 |
| 103 def _GetOldTag(self, path, subpath=None): |
| 104 return self.old_metadata and self.old_metadata.GetTag(path, subpath) |
| 105 |
| 106 def HasChanges(self): |
| 107 """Returns whether any changes exist.""" |
| 108 return (self.force or |
| 109 not self.old_metadata or |
| 110 self.old_metadata.StringsMd5() != self.new_metadata.StringsMd5() or |
| 111 self.old_metadata.FilesMd5() != self.new_metadata.FilesMd5()) |
| 112 |
| 113 def AddedOrModifiedOnly(self): |
| 114 """Returns whether the only changes were from added or modified (sub)files. |
| 115 |
| 116 No missing outputs, no removed paths/subpaths. |
| 117 """ |
| 118 if (self.force or |
| 119 not self.old_metadata or |
| 120 self.old_metadata.StringsMd5() != self.new_metadata.StringsMd5()): |
| 121 return False |
| 122 if any(self.IterRemovedPaths()): |
| 123 return False |
| 124 for path in self.IterModifiedPaths(): |
| 125 if any(self.IterRemovedSubpaths(path)): |
| 126 return False |
| 127 return True |
| 128 |
| 129 def IterAddedPaths(self): |
| 130 """Generator for paths that were added.""" |
| 131 for path in self.new_metadata.IterPaths(): |
| 132 if self._GetOldTag(path) is None: |
| 133 yield path |
| 134 |
| 135 def IterAddedSubpaths(self, path): |
| 136 """Generator for paths that were added within the given zip file.""" |
| 137 for subpath in self.new_metadata.IterSubpaths(path): |
| 138 if self._GetOldTag(path, subpath) is None: |
| 139 yield subpath |
| 140 |
| 141 def IterRemovedPaths(self): |
| 142 """Generator for paths that were removed.""" |
| 143 if self.old_metadata: |
| 144 for path in self.old_metadata.IterPaths(): |
| 145 if self.new_metadata.GetTag(path) is None: |
| 146 yield path |
| 147 |
| 148 def IterRemovedSubpaths(self, path): |
| 149 """Generator for paths that were removed within the given zip file.""" |
| 150 if self.old_metadata: |
| 151 for subpath in self.old_metadata.IterSubpaths(path): |
| 152 if self.new_metadata.GetTag(path, subpath) is None: |
| 153 yield subpath |
| 154 |
| 155 def IterModifiedPaths(self): |
| 156 """Generator for paths whose contents have changed.""" |
| 157 for path in self.new_metadata.IterPaths(): |
| 158 old_tag = self._GetOldTag(path) |
| 159 new_tag = self.new_metadata.GetTag(path) |
| 160 if old_tag is not None and old_tag != new_tag: |
| 161 yield path |
| 162 |
| 163 def IterModifiedSubpaths(self, path): |
| 164 """Generator for paths within a zip file whose contents have changed.""" |
| 165 for subpath in self.new_metadata.IterSubpaths(path): |
| 166 old_tag = self._GetOldTag(path, subpath) |
| 167 new_tag = self.new_metadata.GetTag(path, subpath) |
| 168 if old_tag is not None and old_tag != new_tag: |
| 169 yield subpath |
| 170 |
| 171 def IterChangedPaths(self): |
| 172 """Generator for all changed paths (added/removed/modified).""" |
| 173 return itertools.chain(self.IterRemovedPaths(), |
| 174 self.IterModifiedPaths(), |
| 175 self.IterAddedPaths()) |
| 176 |
| 177 def IterChangedSubpaths(self, path): |
| 178 """Generator for paths within a zip that were added/removed/modified.""" |
| 179 return itertools.chain(self.IterRemovedSubpaths(path), |
| 180 self.IterModifiedSubpaths(path), |
| 181 self.IterAddedSubpaths(path)) |
| 182 |
| 183 def DescribeDifference(self): |
| 184 """Returns a human-readable description of what changed.""" |
| 185 if self.force: |
| 186 return 'force=True' |
| 187 elif self.missing_outputs: |
| 188 return 'Outputs do not exist:\n ' + '\n '.join(self.missing_outputs) |
| 189 elif self.old_metadata is None: |
| 190 return 'Previous stamp file not found.' |
| 191 |
| 192 if self.old_metadata.StringsMd5() != self.new_metadata.StringsMd5(): |
| 193 ndiff = difflib.ndiff(self.old_metadata.GetStrings(), |
| 194 self.new_metadata.GetStrings()) |
| 195 changed = [s for s in ndiff if not s.startswith(' ')] |
| 196 return 'Input strings changed:\n ' + '\n '.join(changed) |
| 197 |
| 198 if self.old_metadata.FilesMd5() == self.new_metadata.FilesMd5(): |
| 199 return "There's no difference." |
| 200 |
| 201 lines = [] |
| 202 lines.extend('Added: ' + p for p in self.IterAddedPaths()) |
| 203 lines.extend('Removed: ' + p for p in self.IterRemovedPaths()) |
| 204 for path in self.IterModifiedPaths(): |
| 205 lines.append('Modified: ' + path) |
| 206 lines.extend(' -> Subpath added: ' + p |
| 207 for p in self.IterAddedSubpaths(path)) |
| 208 lines.extend(' -> Subpath removed: ' + p |
| 209 for p in self.IterRemovedSubpaths(path)) |
| 210 lines.extend(' -> Subpath modified: ' + p |
| 211 for p in self.IterModifiedSubpaths(path)) |
| 212 if lines: |
| 213 return 'Input files changed:\n ' + '\n '.join(lines) |
| 214 return 'I have no idea what changed (there is a bug).' |
| 215 |
| 216 |
| 217 class _Metadata(object): |
| 218 """Data model for tracking change metadata.""" |
| 219 # Schema: |
| 220 # { |
| 221 # "files-md5": "VALUE", |
| 222 # "strings-md5": "VALUE", |
| 223 # "input-files": [ |
| 224 # { |
| 225 # "path": "path.jar", |
| 226 # "tag": "{MD5 of entries}", |
| 227 # "entries": [ |
| 228 # { "path": "org/chromium/base/Foo.class", "tag": "{CRC32}" }, ... |
| 229 # ] |
| 230 # }, { |
| 231 # "path": "path.txt", |
| 232 # "tag": "{MD5}", |
| 233 # } |
| 234 # ], |
| 235 # "input-strings": ["a", "b", ...], |
| 236 # } |
| 237 def __init__(self): |
| 238 self._files_md5 = None |
| 239 self._strings_md5 = None |
| 240 self._files = [] |
| 241 self._strings = [] |
| 242 # Map of (path, subpath) -> entry. Created upon first call to _GetEntry(). |
| 243 self._file_map = None |
| 244 |
| 245 @classmethod |
| 246 def FromFile(cls, fileobj): |
| 247 """Returns a _Metadata initialized from a file object.""" |
| 248 ret = cls() |
| 249 obj = json.load(fileobj) |
| 250 ret._files_md5 = obj['files-md5'] |
| 251 ret._strings_md5 = obj['strings-md5'] |
| 252 ret._files = obj['input-files'] |
| 253 ret._strings = obj['input-strings'] |
| 254 return ret |
| 255 |
| 256 def ToFile(self, fileobj): |
| 257 """Serializes metadata to the given file object.""" |
| 258 obj = { |
| 259 "files-md5": self.FilesMd5(), |
| 260 "strings-md5": self.StringsMd5(), |
| 261 "input-files": self._files, |
| 262 "input-strings": self._strings, |
| 263 } |
| 264 json.dump(obj, fileobj, indent=2) |
| 265 |
| 266 def _AssertNotQueried(self): |
| 267 assert self._files_md5 is None |
| 268 assert self._strings_md5 is None |
| 269 assert self._file_map is None |
| 270 |
| 271 def AddStrings(self, values): |
| 272 self._AssertNotQueried() |
| 273 self._strings.extend(str(v) for v in values) |
| 274 |
| 275 def AddFile(self, path, tag): |
| 276 """Adds metadata for a non-zip file. |
| 277 |
| 278 Args: |
| 279 path: Path to the file. |
| 280 tag: A short string representative of the file contents. |
| 281 """ |
| 282 self._AssertNotQueried() |
| 283 self._files.append({ |
| 284 'path': path, |
| 285 'tag': tag, |
| 286 }) |
| 287 |
| 288 def AddZipFile(self, path, entries): |
| 289 """Adds metadata for a zip file. |
| 290 |
| 291 Args: |
| 292 path: Path to the file. |
| 293 entries: List of (subpath, tag) tuples for entries within the zip. |
| 294 """ |
| 295 self._AssertNotQueried() |
| 296 tag = _ComputeInlineMd5(itertools.chain((e[0] for e in entries), |
| 297 (e[1] for e in entries))) |
| 298 self._files.append({ |
| 299 'path': path, |
| 300 'tag': tag, |
| 301 'entries': [{"path": e[0], "tag": e[1]} for e in entries], |
| 302 }) |
| 303 |
| 304 def GetStrings(self): |
| 305 """Returns the list of input strings.""" |
| 306 return self._strings |
| 307 |
| 308 def FilesMd5(self): |
| 309 """Lazily computes and returns the aggregate md5 of input files.""" |
| 310 if self._files_md5 is None: |
| 311 # Omit paths from md5 since temporary files have random names. |
| 312 self._files_md5 = _ComputeInlineMd5( |
| 313 self.GetTag(p) for p in sorted(self.IterPaths())) |
| 314 return self._files_md5 |
| 315 |
| 316 def StringsMd5(self): |
| 317 """Lazily computes and returns the aggregate md5 of input strings.""" |
| 318 if self._strings_md5 is None: |
| 319 self._strings_md5 = _ComputeInlineMd5(self._strings) |
| 320 return self._strings_md5 |
| 321 |
| 322 def _GetEntry(self, path, subpath=None): |
| 323 """Returns the JSON entry for the given path / subpath.""" |
| 324 if self._file_map is None: |
| 325 self._file_map = {} |
| 326 for entry in self._files: |
| 327 self._file_map[(entry['path'], None)] = entry |
| 328 for subentry in entry.get('entries', ()): |
| 329 self._file_map[(entry['path'], subentry['path'])] = subentry |
| 330 return self._file_map.get((path, subpath)) |
| 331 |
| 332 def GetTag(self, path, subpath=None): |
| 333 """Returns the tag for the given path / subpath.""" |
| 334 ret = self._GetEntry(path, subpath) |
| 335 return ret and ret['tag'] |
| 336 |
| 337 def IterPaths(self): |
| 338 """Returns a generator for all top-level paths.""" |
| 339 return (e['path'] for e in self._files) |
| 340 |
| 341 def IterSubpaths(self, path): |
| 342 """Returns a generator for all subpaths in the given zip. |
| 343 |
| 344 If the given path is not a zip file, returns an empty generator. |
| 345 """ |
| 346 outer_entry = self._GetEntry(path) |
| 347 subentries = outer_entry.get('entries', []) |
| 348 return (entry['path'] for entry in subentries) |
67 | 349 |
68 | 350 |
69 def _UpdateMd5ForFile(md5, path, block_size=2**16): | 351 def _UpdateMd5ForFile(md5, path, block_size=2**16): |
70 with open(path, 'rb') as infile: | 352 with open(path, 'rb') as infile: |
71 while True: | 353 while True: |
72 data = infile.read(block_size) | 354 data = infile.read(block_size) |
73 if not data: | 355 if not data: |
74 break | 356 break |
75 md5.update(data) | 357 md5.update(data) |
76 | 358 |
77 | 359 |
78 def _UpdateMd5ForDirectory(md5, dir_path): | 360 def _UpdateMd5ForDirectory(md5, dir_path): |
79 for root, _, files in os.walk(dir_path): | 361 for root, _, files in os.walk(dir_path): |
80 for f in files: | 362 for f in files: |
81 _UpdateMd5ForFile(md5, os.path.join(root, f)) | 363 _UpdateMd5ForFile(md5, os.path.join(root, f)) |
82 | 364 |
83 | 365 |
84 def _UpdateMd5ForPath(md5, path): | 366 def _Md5ForPath(path): |
| 367 md5 = hashlib.md5() |
85 if os.path.isdir(path): | 368 if os.path.isdir(path): |
86 _UpdateMd5ForDirectory(md5, path) | 369 _UpdateMd5ForDirectory(md5, path) |
87 else: | 370 else: |
88 _UpdateMd5ForFile(md5, path) | 371 _UpdateMd5ForFile(md5, path) |
| 372 return md5.hexdigest() |
89 | 373 |
90 | 374 |
91 def _TrimPathPrefix(path): | 375 def _ComputeInlineMd5(iterable): |
92 """Attempts to remove temp dir prefix from the path. | 376 """Computes the md5 of the concatenated parameters.""" |
93 | 377 md5 = hashlib.md5() |
94 Use this only for extended_info (not for the actual md5). | 378 for item in iterable: |
95 """ | 379 md5.update(str(item)) |
96 return _TEMP_DIR_PATTERN.sub('{TMP}', path) | 380 return md5.hexdigest() |
97 | 381 |
98 | 382 |
99 class _Md5Checker(object): | 383 def _IsZipFile(path): |
100 def __init__(self, record_path=None, input_paths=None, input_strings=None): | 384 """Returns whether to treat the given file as a zip file.""" |
101 if not input_paths: | 385 # ijar doesn't set the CRC32 field. |
102 input_paths = [] | 386 if path.endswith('.interface.jar'): |
103 if not input_strings: | 387 return False |
104 input_strings = [] | 388 return path[-4:] in ('.zip', '.apk', '.jar') or path.endswith('.srcjar') |
105 | 389 |
106 assert record_path.endswith('.stamp'), ( | |
107 'record paths must end in \'.stamp\' so that they are easy to find ' | |
108 'and delete') | |
109 | 390 |
110 self.record_path = record_path | 391 def _ExtractZipEntries(path): |
111 | 392 """Returns a list of (path, CRC32) of all files within |path|.""" |
112 extended_info = [] | 393 entries = [] |
113 outer_md5 = hashlib.md5() | 394 with zipfile.ZipFile(path) as zip_file: |
114 for i in sorted(input_paths): | 395 for zip_info in zip_file.infolist(): |
115 inner_md5 = hashlib.md5() | 396 # Skip directories and empty files. |
116 _UpdateMd5ForPath(inner_md5, i) | 397 if zip_info.CRC: |
117 i = _TrimPathPrefix(i) | 398 entries.append((zip_info.filename, zip_info.CRC)) |
118 extended_info.append(i + '=' + inner_md5.hexdigest()) | 399 return entries |
119 # Include the digest in the overall diff, but not the path | |
120 outer_md5.update(inner_md5.hexdigest()) | |
121 | |
122 for s in (str(s) for s in input_strings): | |
123 outer_md5.update(s) | |
124 extended_info.append(s) | |
125 | |
126 self.new_digest = outer_md5.hexdigest() | |
127 self.new_extended_info = extended_info | |
128 | |
129 self.old_digest = '' | |
130 self.old_extended_info = [] | |
131 if os.path.exists(self.record_path): | |
132 with open(self.record_path, 'r') as old_record: | |
133 self.old_extended_info = [line.strip() for line in old_record] | |
134 if self.old_extended_info: | |
135 self.old_digest = self.old_extended_info.pop(0) | |
136 | |
137 def Write(self): | |
138 with open(self.record_path, 'w') as new_record: | |
139 new_record.write(self.new_digest) | |
140 new_record.write('\n' + '\n'.join(self.new_extended_info) + '\n') | |
141 | |
142 def DescribeDifference(self): | |
143 if self.old_digest == self.new_digest: | |
144 return "There's no difference." | |
145 if not self.old_digest: | |
146 return 'Previous stamp file not found.' | |
147 if not self.old_extended_info: | |
148 return 'Previous stamp file lacks extended info.' | |
149 diff = difflib.unified_diff(self.old_extended_info, self.new_extended_info) | |
150 return '\n'.join(diff) | |
OLD | NEW |