OLD | NEW |
---|---|
1 # Copyright 2013 The Chromium Authors. All rights reserved. | 1 # Copyright 2013 The Chromium Authors. All rights reserved. |
2 # Use of this source code is governed by a BSD-style license that can be | 2 # Use of this source code is governed by a BSD-style license that can be |
3 # found in the LICENSE file. | 3 # found in the LICENSE file. |
4 | 4 |
5 import difflib | 5 import difflib |
6 import hashlib | 6 import hashlib |
7 import itertools | |
8 import json | |
7 import os | 9 import os |
8 import re | |
9 import sys | 10 import sys |
11 import zipfile | |
10 | 12 |
11 | 13 |
12 # When set and a difference is detected, a diff of what changed is printed. | 14 # When set and a difference is detected, a diff of what changed is printed. |
13 _PRINT_MD5_DIFFS = int(os.environ.get('PRINT_MD5_DIFFS', 0)) | 15 _PRINT_MD5_DIFFS = int(os.environ.get('PRINT_MD5_DIFFS', 0)) |
14 | 16 |
15 # Used to strip off temp dir prefix. | |
16 _TEMP_DIR_PATTERN = re.compile(r'^/tmp/.*?/') | |
17 | |
18 | 17 |
def CallAndRecordIfStale(
    function, record_path=None, input_paths=None, input_strings=None,
    output_paths=None, force=False, pass_changes=False):
  """Calls function if outputs are stale.

  Outputs are considered stale if:
  - any output_paths are missing, or
  - the contents of any file within input_paths has changed, or
  - the contents of input_strings has changed.

  To debug which files are out-of-date, set the environment variable:
  PRINT_MD5_DIFFS=1

  Args:
    function: The function to call. Receives a Changes instance as its only
      argument when pass_changes=True; otherwise called with no arguments.
    record_path: Path to record metadata.
      Defaults to output_paths[0] + '.md5.stamp'
    input_paths: List of paths to calculate an md5 sum on.
    input_strings: List of strings to record verbatim.
    output_paths: List of output paths.
    force: Whether to treat outputs as missing regardless of whether they
      actually are.
    pass_changes: Whether to pass a Changes instance to |function|.
  """
  input_paths = input_paths or []
  input_strings = input_strings or []
  output_paths = output_paths or []
  record_path = record_path or output_paths[0] + '.md5.stamp'

  assert record_path.endswith('.stamp'), (
      'record paths must end in \'.stamp\' so that they are easy to find '
      'and delete')

  new_metadata = _Metadata()
  new_metadata.AddStrings(input_strings)

  for path in input_paths:
    if _IsZipFile(path):
      entries = _ExtractZipEntries(path)
      new_metadata.AddZipFile(path, entries)
    else:
      new_metadata.AddFile(path, _Md5ForPath(path))

  old_metadata = None
  # force=True is modeled as "every output is missing".
  missing_outputs = [x for x in output_paths if force or not os.path.exists(x)]
  # When outputs are missing, don't bother gathering change information.
  if not missing_outputs and os.path.exists(record_path):
    with open(record_path, 'r') as jsonfile:
      try:
        old_metadata = _Metadata.FromFile(jsonfile)
      except Exception:  # pylint: disable=broad-except
        # A stamp from before the JSON format (or a corrupt one) just means
        # the target is stale; best-effort, so swallow the parse error.
        pass  # Not yet using new file format.

  changes = Changes(old_metadata, new_metadata, force, missing_outputs)
  if not changes.HasChanges():
    return

  if _PRINT_MD5_DIFFS:
    print('=' * 80)
    print('Target is stale: %s' % record_path)
    print(changes.DescribeDifference())
    print('=' * 80)

  # Delete the old metadata beforehand since a failure in |function| would
  # otherwise leave it in an indeterminate state.
  if old_metadata:
    os.unlink(record_path)

  args = (changes,) if pass_changes else ()
  function(*args)

  with open(record_path, 'w') as f:
    new_metadata.Write(f)
92 | |
class Changes(object):
  """Provides an API for querying what changed between runs."""

  def __init__(self, old_metadata, new_metadata, force, missing_outputs):
    # old_metadata is None when there is no (readable) previous stamp file.
    self.old_metadata = old_metadata
    self.new_metadata = new_metadata
    self.force = force
    self.missing_outputs = missing_outputs

  def _GetOldTag(self, path, subpath=None):
    # Returns None when there is no old metadata or no such (sub)path.
    return self.old_metadata and self.old_metadata.GetTag(path, subpath)

  def HasChanges(self):
    """Returns whether any changes exist."""
    return (self.force or
            not self.old_metadata or
            self.old_metadata.StringsMd5() != self.new_metadata.StringsMd5() or
            self.old_metadata.FilesMd5() != self.new_metadata.FilesMd5())

  def AddedOrModifiedOnly(self):
    """Returns whether the only changes were from added or modified (sub)files.

    No missing outputs, no removed paths/subpaths.
    """
    if (self.force or
        not self.old_metadata or
        self.old_metadata.StringsMd5() != self.new_metadata.StringsMd5()):
      return False
    if any(self.IterRemovedPaths()):
      return False
    for path in self.IterModifiedPaths():
      if any(self.IterRemovedSubpaths(path)):
        return False
    return True

  def IterAddedPaths(self):
    """Generator for paths that were added."""
    for path in self.new_metadata.IterPaths():
      if self._GetOldTag(path) is None:
        yield path

  def IterAddedSubpaths(self, path):
    """Generator for paths that were added within the given zip file."""
    for subpath in self.new_metadata.IterSubpaths(path):
      if self._GetOldTag(path, subpath) is None:
        yield subpath

  def IterRemovedPaths(self):
    """Generator for paths that were removed."""
    if self.old_metadata:
      for path in self.old_metadata.IterPaths():
        if self.new_metadata.GetTag(path) is None:
          yield path

  def IterRemovedSubpaths(self, path):
    """Generator for paths that were removed within the given zip file."""
    if self.old_metadata:
      for subpath in self.old_metadata.IterSubpaths(path):
        if self.new_metadata.GetTag(path, subpath) is None:
          yield subpath

  def IterModifiedPaths(self):
    """Generator for paths whose contents have changed."""
    for path in self.new_metadata.IterPaths():
      old_tag = self._GetOldTag(path)
      new_tag = self.new_metadata.GetTag(path)
      if old_tag is not None and old_tag != new_tag:
        yield path

  def IterModifiedSubpaths(self, path):
    """Generator for paths within a zip file whose contents have changed."""
    for subpath in self.new_metadata.IterSubpaths(path):
      old_tag = self._GetOldTag(path, subpath)
      new_tag = self.new_metadata.GetTag(path, subpath)
      if old_tag is not None and old_tag != new_tag:
        yield subpath

  def IterChangedPaths(self):
    """Generator for all changed paths (added/removed/modified)."""
    # Note: previously took a |path| argument and forwarded it to the three
    # no-argument iterators below, which made every call raise TypeError.
    return itertools.chain(self.IterRemovedPaths(),
                           self.IterModifiedPaths(),
                           self.IterAddedPaths())

  def IterChangedSubpaths(self, path):
    """Generator for paths within a zip that were added/removed/modified."""
    return itertools.chain(self.IterRemovedSubpaths(path),
                           self.IterModifiedSubpaths(path),
                           self.IterAddedSubpaths(path))

  def DescribeDifference(self):
    """Returns a human-readable description of what changed."""
    if self.force:
      return 'force=True'
    elif self.missing_outputs:
      return 'Outputs do not exist:\n  ' + '\n  '.join(self.missing_outputs)
    elif self.old_metadata is None:
      return 'Previous stamp file not found.'

    if self.old_metadata.StringsMd5() != self.new_metadata.StringsMd5():
      ndiff = difflib.ndiff(self.old_metadata.GetStrings(),
                            self.new_metadata.GetStrings())
      changed = [s for s in ndiff if not s.startswith(' ')]
      return 'Input strings changed:\n  ' + '\n  '.join(changed)

    if self.old_metadata.FilesMd5() == self.new_metadata.FilesMd5():
      return "There's no difference."

    lines = []
    lines.extend('Added: ' + p for p in self.IterAddedPaths())
    lines.extend('Removed: ' + p for p in self.IterRemovedPaths())
    for path in self.IterModifiedPaths():
      lines.append('Modified: ' + path)
      lines.extend('  -> Subpath added: ' + p
                   for p in self.IterAddedSubpaths(path))
      lines.extend('  -> Subpath removed: ' + p
                   for p in self.IterRemovedSubpaths(path))
      lines.extend('  -> Subpath modified: ' + p
                   for p in self.IterModifiedSubpaths(path))
    if lines:
      return 'Input files changed:\n  ' + '\n  '.join(lines)
    return 'I have no idea what changed (there is a bug).'
214 | |
215 | |
216 class _Metadata(object): | |
217 """Data model for tracking change metadata.""" | |
218 # Schema: | |
219 # { | |
220 # "files-md5": "VALUE", | |
221 # "strings-md5": "VALUE", | |
222 # "input-files": [ | |
223 # { | |
224 # "path": "path.jar", | |
225 # "tag": "{MD5 of entries}", | |
226 # "entries": [ | |
227 # { "path": "org/chromium/base/Foo.class", "tag": "{CRC32}" }, ... | |
228 # ] | |
229 # }, { | |
230 # "path": "path.txt", | |
231 # "tag": "{MD5}", | |
jbudorick
2015/09/23 00:26:20
nit: missing closing }
agrieve
2015/09/23 02:07:56
Done.
| |
232 # ], | |
233 # "input-strings": ["a", "b", ...], | |
234 # } | |
235 def __init__(self): | |
236 self._files_md5 = None | |
237 self._strings_md5 = None | |
238 self._files = [] | |
239 self._strings = [] | |
240 # Map of (path, subpath) -> entry. Created upon first call to _GetEntry(). | |
241 self._file_map = None | |
242 | |
243 @classmethod | |
244 def FromFile(cls, fileobj): | |
jbudorick
2015/09/23 00:26:20
not Read/Write or FromFile/ToFile? :(
agrieve
2015/09/23 02:07:56
Done.
| |
245 """Returns a _Metadata initialized from a file object.""" | |
246 ret = cls() | |
247 obj = json.load(fileobj) | |
248 ret._files_md5 = obj['files-md5'] | |
249 ret._strings_md5 = obj['strings-md5'] | |
250 ret._files = obj['input-files'] | |
251 ret._strings = obj['input-strings'] | |
252 return ret | |
253 | |
254 def Write(self, fileobj): | |
255 """Serializes metadata to the given file object.""" | |
256 obj = { | |
257 "files-md5": self.FilesMd5(), | |
258 "strings-md5": self.StringsMd5(), | |
259 "input-files": self._files, | |
260 "input-strings": self._strings, | |
261 } | |
262 json.dump(obj, fileobj, indent=2) | |
263 | |
264 def _AssertNotQueried(self): | |
265 assert self._files_md5 is None | |
266 assert self._strings_md5 is None | |
267 assert self._file_map is None | |
268 | |
269 def AddStrings(self, values): | |
270 self._AssertNotQueried() | |
271 self._strings.extend(str(v) for v in values) | |
272 | |
273 def AddFile(self, path, tag): | |
274 """Adds metadata for a non-zip file. | |
275 | |
276 Args: | |
277 path: Path to the file. | |
278 tag: A short string representative of the file contents. | |
279 """ | |
280 self._AssertNotQueried() | |
281 self._files.append({ | |
282 'path': path, | |
283 'tag': tag, | |
284 }) | |
285 | |
286 def AddZipFile(self, path, entries): | |
287 """Adds metadata for a zip file. | |
288 | |
289 Args: | |
290 path: Path to the file. | |
291 entries: List of (subpath, tag) tuples for entries within the zip. | |
292 """ | |
293 self._AssertNotQueried() | |
294 tag = _ComputeInlineMd5(itertools.chain((e[0] for e in entries), | |
295 (e[1] for e in entries))) | |
296 self._files.append({ | |
297 'path': path, | |
298 'tag': tag, | |
299 'entries': [{"path": e[0], "tag": e[1]} for e in entries], | |
300 }) | |
301 | |
302 def GetStrings(self): | |
303 """Returns the list of input strings.""" | |
304 return self._strings | |
305 | |
306 def FilesMd5(self): | |
307 """Lazily computes and returns the aggregate md5 of input files.""" | |
308 if self._files_md5 is None: | |
309 # Omit paths from md5 since temporary files have random names. | |
310 self._files_md5 = _ComputeInlineMd5( | |
311 self.GetTag(p) for p in sorted(self.IterPaths())) | |
312 return self._files_md5 | |
313 | |
314 def StringsMd5(self): | |
315 """Lazily computes and returns the aggregate md5 of input strings.""" | |
316 if self._strings_md5 is None: | |
317 self._strings_md5 = _ComputeInlineMd5(self._strings) | |
318 return self._strings_md5 | |
319 | |
320 def _GetEntry(self, path, subpath=None): | |
321 """Returns the JSON entry for the given path / subpath.""" | |
322 if self._file_map is None: | |
323 self._file_map = {} | |
324 for entry in self._files: | |
325 self._file_map[(entry['path'], None)] = entry | |
326 for subentry in entry.get('entries', ()): | |
327 self._file_map[(entry['path'], subentry['path'])] = subentry | |
328 return self._file_map.get((path, subpath)) | |
329 | |
330 def GetTag(self, path, subpath=None): | |
331 """Returns the tag for the given path / subpath.""" | |
332 ret = self._GetEntry(path, subpath) | |
333 return ret and ret['tag'] | |
334 | |
335 def IterPaths(self): | |
336 """Returns a generator for all top-level paths.""" | |
337 return (e['path'] for e in self._files) | |
338 | |
339 def IterSubpaths(self, path): | |
340 """Returns a generator for all subpaths in the given zip. | |
341 | |
342 If the given path is not a zip file, returns an empty generator. | |
343 """ | |
344 outer_entry = self._GetEntry(path) | |
345 subentries = outer_entry.get('entries', []) | |
346 return (entry['path'] for entry in subentries) | |
66 | 347 |
67 | 348 |
68 def _UpdateMd5ForFile(md5, path, block_size=2**16): | 349 def _UpdateMd5ForFile(md5, path, block_size=2**16): |
69 with open(path, 'rb') as infile: | 350 with open(path, 'rb') as infile: |
70 while True: | 351 while True: |
71 data = infile.read(block_size) | 352 data = infile.read(block_size) |
72 if not data: | 353 if not data: |
73 break | 354 break |
74 md5.update(data) | 355 md5.update(data) |
75 | 356 |
76 | 357 |
def _UpdateMd5ForDirectory(md5, dir_path):
  """Recursively feeds every file under |dir_path| into |md5|."""
  for root, _, filenames in os.walk(dir_path):
    for filename in filenames:
      _UpdateMd5ForFile(md5, os.path.join(root, filename))
81 | 362 |
82 | 363 |
def _Md5ForPath(path):
  """Returns the hex md5 digest of the file (or directory tree) at |path|."""
  md5 = hashlib.md5()
  update = _UpdateMd5ForDirectory if os.path.isdir(path) else _UpdateMd5ForFile
  update(md5, path)
  return md5.hexdigest()
88 | 371 |
89 | 372 |
90 def _TrimPathPrefix(path): | 373 def _ComputeInlineMd5(iterable): |
91 """Attempts to remove temp dir prefix from the path. | 374 """Computes the md5 of the concatenated parameters.""" |
92 | 375 md5 = hashlib.md5() |
93 Use this only for extended_info (not for the actual md5). | 376 for item in iterable: |
94 """ | 377 md5.update(str(item)) |
95 return _TEMP_DIR_PATTERN.sub('{TMP}', path) | 378 return md5.hexdigest() |
96 | 379 |
97 | 380 |
98 class _Md5Checker(object): | 381 def _IsZipFile(path): |
99 def __init__(self, record_path=None, input_paths=None, input_strings=None): | 382 """Returns whether to treat the given file as a zip file.""" |
100 if not input_paths: | 383 # ijar doesn't set the CRC32 field. |
101 input_paths = [] | 384 if path.endswith('.interface.jar'): |
102 if not input_strings: | 385 return False |
103 input_strings = [] | 386 return path.endswith('.zip') or path.endswith('.apk') or path.endswith('.jar') |
104 | 387 |
105 assert record_path.endswith('.stamp'), ( | |
106 'record paths must end in \'.stamp\' so that they are easy to find ' | |
107 'and delete') | |
108 | 388 |
109 self.record_path = record_path | 389 def _ExtractZipEntries(path): |
110 | 390 """Returns a list of (path, CRC32) of all files within |path|.""" |
111 extended_info = [] | 391 entries = [] |
112 outer_md5 = hashlib.md5() | 392 with zipfile.ZipFile(path) as zip_file: |
113 for i in sorted(input_paths): | 393 for zip_info in zip_file.infolist(): |
114 inner_md5 = hashlib.md5() | 394 # Skip directories and empty files. |
115 _UpdateMd5ForPath(inner_md5, i) | 395 if zip_info.CRC: |
116 i = _TrimPathPrefix(i) | 396 entries.append((zip_info.filename, zip_info.CRC)) |
117 extended_info.append(i + '=' + inner_md5.hexdigest()) | 397 return entries |
118 # Include the digest in the overall diff, but not the path | |
119 outer_md5.update(inner_md5.hexdigest()) | |
120 | |
121 for s in map(str, input_strings): | |
122 outer_md5.update(s) | |
123 extended_info.append(s) | |
124 | |
125 self.new_digest = outer_md5.hexdigest() | |
126 self.new_extended_info = extended_info | |
127 | |
128 self.old_digest = '' | |
129 self.old_extended_info = [] | |
130 if os.path.exists(self.record_path): | |
131 with open(self.record_path, 'r') as old_record: | |
132 self.old_extended_info = [line.strip() for line in old_record] | |
133 if self.old_extended_info: | |
134 self.old_digest = self.old_extended_info.pop(0) | |
135 | |
136 def Write(self): | |
137 with open(self.record_path, 'w') as new_record: | |
138 new_record.write(self.new_digest) | |
139 new_record.write('\n' + '\n'.join(self.new_extended_info) + '\n') | |
140 | |
141 def DescribeDifference(self): | |
142 if self.old_digest == self.new_digest: | |
143 return "There's no difference." | |
144 if not self.old_digest: | |
145 return 'Previous stamp file not found.' | |
146 if not self.old_extended_info: | |
147 return 'Previous stamp file lacks extended info.' | |
148 diff = difflib.unified_diff(self.old_extended_info, self.new_extended_info) | |
149 return '\n'.join(diff) | |
OLD | NEW |