OLD | NEW |
(Empty) | |
| 1 #!/usr/bin/python2 |
| 2 |
| 3 # Copyright 2014 Google Inc. |
| 4 # |
| 5 # Use of this source code is governed by a BSD-style license that can be |
| 6 # found in the LICENSE file. |
| 7 |
| 8 """Skia's Chromium Codereview Comparison Script. |
| 9 |
| 10 This script takes two Codereview URLs, looks at the trybot results for |
| 11 the two codereviews and compares the results. |
| 12 |
| 13 Usage: |
| 14 compare_codereview.py CONTROL_URL ROLL_URL |
| 15 """ |
| 16 |
| 17 import collections |
| 18 import os |
| 19 import re |
| 20 import sys |
| 21 import urllib2 |
| 22 import HTMLParser |
| 23 |
| 24 |
class CodeReviewHTMLParser(HTMLParser.HTMLParser):
    """Parses a CodeReview web page.

    Use the CodeReviewHTMLParser.parse static function to make use of
    this class.

    This uses the HTMLParser class because it's the best thing in
    Python's standard library.  We need a little more power than a
    regex.  (Search for "You can't parse [X]HTML with regex." for more
    information.)
    """
    # pylint: disable=I0011,R0904

    @staticmethod
    def parse(url):
        """Parses a CodeReview web page.

        Args:
            url (string), a codereview URL like this:
                'https://codereview.chromium.org/?????????'.

        Returns:
            A dictionary; the keys are bot_name strings, the values
            are CodeReviewHTMLParser.Status objects.  Returns None
            (after printing a message to stderr) if the URL could
            not be fetched.
        """
        parser = CodeReviewHTMLParser()
        try:
            response = urllib2.urlopen(url)
            try:
                parser.feed(response.read())
            finally:
                # Always release the connection, even if parsing fails.
                response.close()
        except urllib2.URLError:
            sys.stderr.write('Error getting %s\n' % (url,))
            return None
        parser.close()
        return parser.statuses

    # namedtuples are like lightweight structs in Python.  The low
    # overhead of a tuple, but the ease of use of an object.
    Status = collections.namedtuple('Status', ['status', 'url'])

    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)
        # id of the most recent <div id="tryjobdiv*"> seen.  It is
        # never cleared, so once one such div has been seen every
        # later build-result link on the page is considered.
        self._id = None
        # status / href attributes of the anchor being captured.
        self._status = None
        self._href = None
        # Text accumulated inside the anchor being captured.
        self._anchor_data = ''
        # True while we are between <a class="build-result"> and </a>.
        self._currently_parsing_trybotdiv = False
        # statuses is a dictionary of CodeReviewHTMLParser.Status
        self.statuses = {}

    def handle_starttag(self, tag, attrs):
        """Overrides the HTMLParser method to track tryjob links.

        Args:
            tag: (string) lower-cased tag name.
            attrs: list of (name, value) pairs; value is None for an
                attribute written without a value.
        """
        attrs = dict(attrs)
        if tag == 'div':
            # We are looking for <div id="tryjobdiv*">.  "or ''"
            # guards against a valueless id attribute (value None).
            div_id = attrs.get('id') or ''
            if div_id.startswith('tryjobdiv'):
                self._id = div_id
        elif tag == 'a' and self._id:
            # If we have already seen a <div id="tryjobdiv*">, we
            # look for a link of the form
            # <a class="build-result" href="*">.  Then we save the
            # (non-standard) status attribute and the URL.
            css_classes = (attrs.get('class') or '').split()
            if 'build-result' in css_classes:
                self._status = attrs.get('status')
                self._href = attrs.get('href')
                # Start saving anchor data.
                self._currently_parsing_trybotdiv = True

    def handle_data(self, data):
        """Overrides the HTMLParser method to collect the bot name.

        Args:
            data: (string) a run of text found between tags.
        """
        # Save the text inside the <a></a> tags.  Assume <a> tags
        # aren't nested.
        if self._currently_parsing_trybotdiv:
            self._anchor_data += data

    def handle_endtag(self, tag):
        """Overrides the HTMLParser method to record a finished link.

        Args:
            tag: (string) lower-cased tag name.
        """
        if tag != 'a' or not self._currently_parsing_trybotdiv:
            return
        # We take the accumulated self._anchor_data and save it as
        # the bot name.  Links without a status attribute or without
        # any text are not recorded.
        bot = self._anchor_data.strip()
        if bot and self._status:
            # Add to accumulating dictionary.
            self.statuses[bot] = CodeReviewHTMLParser.Status(
                status=self._status, url=self._href)
        # Reset state to search for the next bot.  This must happen
        # for every captured anchor, including ones we did not record;
        # otherwise their text would be glued onto the next bot name.
        self._currently_parsing_trybotdiv = False
        self._anchor_data = ''
        self._status = None
        self._href = None
| 145 |
| 146 |
class BuilderHTMLParser(HTMLParser.HTMLParser):
    """Parses Trybot web pages.

    Use the BuilderHTMLParser.parse static function to make use of
    this class.

    This uses the HTMLParser class because it's the best thing in
    Python's standard library.  We need a little more power than a
    regex.  (Search for "You can't parse [X]HTML with regex." for more
    information.)
    """
    # pylint: disable=I0011,R0904

    @staticmethod
    def parse(url):
        """Parses a Trybot web page.

        Args:
            url (string), a trybot result URL.

        Returns:
            An array of BuilderHTMLParser.Results, each a description
            of failure results, along with an optional url.  Returns
            an empty list (after printing a message to stderr) if the
            URL could not be fetched.
        """
        parser = BuilderHTMLParser()
        try:
            response = urllib2.urlopen(url)
            try:
                parser.feed(response.read())
            finally:
                # Always release the connection, even if parsing fails.
                response.close()
        except urllib2.URLError:
            sys.stderr.write('Error getting %s\n' % (url,))
            return []
        parser.close()
        return parser.failure_results

    Result = collections.namedtuple('Result', ['text', 'url'])

    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)
        # Accumulated list of BuilderHTMLParser.Result.
        self.failure_results = []
        self._current_failure_result = None
        self._divlevel = None
        # Current <li> nesting depth.
        self._li_level = 0
        # Text collected since a failure <div> was seen.
        self._li_data = ''
        # True once a <div class="failure result"> has been seen
        # inside the current outermost <li>.
        self._current_failure = False
        # Last '/logs/stdio' link seen inside the current failure.
        self._failure_results_url = ''

    @staticmethod
    def _clean_failure_text(text):
        """Tidy up the raw text collected for one failure entry."""
        result = text.strip()
        words = result.split()
        if words:
            # Sometimes, it repeats the same thing multiple times;
            # collapse a leading run of the same word into one.  The
            # word is escaped because it is arbitrary page text, and
            # a callable replacement keeps any backslashes literal.
            first = words[0]
            repeated = r'^%s(\s+%s)+' % ((re.escape(first),) * 2)
            result = re.sub(repeated, lambda _match: first, result)
        result = re.sub(r'unexpected flaky.*', '', result)
        # Remove some extra unnecessary text.  The result is
        # deliberately not re-stripped, so callers see the same
        # spacing as before.
        result = re.sub(r'\bpreamble\b', '', result)
        result = re.sub(r'\bstdio\b', '', result)
        return result

    def handle_starttag(self, tag, attrs):
        """Overrides the HTMLParser method to spot failure entries.

        Args:
            tag: (string) lower-cased tag name.
            attrs: list of (name, value) pairs; value is None for an
                attribute written without a value.
        """
        attrs = dict(attrs)
        if tag == 'li':
            # <li> tags can be nested.  So we have to count the
            # nest-level for backing out.
            self._li_level += 1
            return
        if tag == 'div' and attrs.get('class') == 'failure result':
            # We care about this sort of thing:
            #   <li>
            #     <li>
            #       <li>
            #         <div class="failure result">...</div>
            #       </li>
            #     </li>
            #     We want this text here.
            #   </li>
            if self._li_level > 0:
                self._current_failure = True  # Tells us to keep text.
            return

        if tag == 'a' and self._current_failure:
            # Anchors without an href (or with a valueless one) are
            # simply not log links.
            href = attrs.get('href') or ''
            # Sometimes we want to keep the stdio url.  We always
            # return it, just in case.
            if href.endswith('/logs/stdio'):
                self._failure_results_url = href

    def handle_data(self, data):
        """Overrides the HTMLParser method to collect failure text.

        Args:
            data: (string) a run of text found between tags.
        """
        if self._current_failure:
            self._li_data += data

    def handle_endtag(self, tag):
        """Overrides the HTMLParser method to emit finished entries.

        When the outermost <li> closes and a failure <div> was seen
        inside it, one Result is appended to self.failure_results.

        Args:
            tag: (string) lower-cased tag name.
        """
        if tag != 'li':
            return
        # Never let a stray </li> push the counter below zero, or
        # every later entry would be measured from the wrong depth.
        if self._li_level > 0:
            self._li_level -= 1
        if self._li_level != 0:
            return
        if self._current_failure:
            self.failure_results.append(BuilderHTMLParser.Result(
                BuilderHTMLParser._clean_failure_text(self._li_data),
                self._failure_results_url))
            self._current_failure_result = None
        # Reset the state.
        self._current_failure = False
        self._li_data = ''
        self._failure_results_url = ''
| 281 |
| 282 |
def printer(indent, string):
    """Write indented, word-wrapped text to standard output.

    Each newline-separated line of `string` is wrapped independently.
    Lines that contain no words produce no output at all.

    Args:
        indent: (int) indentation level; one spacer is written per
            level, and wrapped continuation lines get one extra.
        string: (string) the text to print.
    """
    # NOTE(review): the width budget below reserves two columns per
    # indent level while the spacer is a single character; whitespace
    # in this listing may have been collapsed -- confirm against the
    # checked-in file.
    spacer = ' '
    margin = spacer * indent
    width = 68 - (2 * indent)
    stream = sys.stdout

    for paragraph in string.split('\n'):
        # Greedy word wrap: keep appending words to the current chunk
        # until the next one would push it past `width`.
        chunks = []
        current = ''
        for word in paragraph.split():
            if not current:
                current = word
            elif len(current) + 1 + len(word) > width:
                chunks.append(current)
                current = word
            else:
                current = current + ' ' + word
        if current:
            chunks.append(current)

        for index, chunk in enumerate(chunks):
            # Continuation lines are pushed in by one more spacer.
            hanging = spacer if index > 0 else ''
            stream.write(margin + hanging + chunk + '\n')
    stream.flush()
| 314 |
| 315 |
def main(control_url, roll_url, verbosity=1):
    """Compare the trybot results of two Codereview URLs.

    Only bots that appear on both pages are compared.  Nothing is
    printed if either page yields no bot statuses.

    Args:
        control_url, roll_url: (strings) URL of the format
            https://codereview.chromium.org/?????????

        verbosity: (int) verbose level.  0, 1, or 2.  Above 0 a
            summary table is printed first; above 1 successful bots
            are listed in the detail section as well.
    """
    # pylint: disable=I0011,R0914,R0912
    control = CodeReviewHTMLParser.parse(control_url)
    roll = CodeReviewHTMLParser.parse(roll_url)
    if not control or not roll:
        return

    control_label = '[control %s]' % control_url.split('/')[-1]
    roll_label = '[roll %s]' % roll_url.split('/')[-1]
    # Bots present in both reviews, in a stable order.
    shared_bots = sorted(set(control).intersection(roll))

    stream = sys.stdout
    if verbosity > 0:
        # Summary table: one row per bot, flagged when the two
        # statuses differ ('****') or agree on a non-success ('....').
        stream.write('%11s %11s %4s %s\n\n' %
                     ('CONTROL', 'ROLL', 'DIFF', 'BOT'))
        for bot in shared_bots:
            control_status = control[bot].status
            roll_status = roll[bot].status
            if control_status != roll_status:
                marker = '****'
            elif control_status == 'success' and roll_status == 'success':
                marker = ''
            else:
                marker = '....'
            stream.write('%11s %11s %4s %s\n' % (
                control_status, roll_status, marker, bot))
        stream.write('\n')
        stream.flush()

    # Detail section: one paragraph per bot whose roll did not succeed.
    for bot in shared_bots:
        heading = '==%s==' % bot
        if roll[bot].status == 'success':
            if verbosity > 1:
                printer(0, heading)
                printer(1, 'OK')
            continue
        printer(0, heading)

        sides = [(control[bot], control_label), (roll[bot], roll_label)]
        for entry, label in sides:
            printer(1, label)
            if entry.status != 'failure':
                printer(2, entry.status)
                continue
            for result in BuilderHTMLParser.parse(entry.url):
                # Put each *.html name on its own line.
                pretty = re.sub(
                    r'(\S*\.html) ', '\n__\\g<1>\n', result.text)
                printer(2, pretty)
                wants_log = ('compile' in result.text
                             or '...and more' in result.text)
                if wants_log:
                    # The log link is relative to the directory of
                    # the builder page.
                    base = re.sub('/[^/]*$', '/', entry.url)
                    printer(3, base + result.url)
        stream.write('\n')
| 379 |
| 380 |
if __name__ == '__main__':
    if len(sys.argv) < 3:
        # Not enough arguments: show the usage text from the module
        # docstring and fail.  sys.exit is used rather than the bare
        # exit() helper, which is only injected by the site module and
        # is missing when Python is started with -S.
        sys.stderr.write('%s\n' % (__doc__,))
        sys.exit(1)
    # Verbosity comes from the environment and defaults to 1.
    main(sys.argv[1], sys.argv[2],
         int(os.environ.get('COMPARE_CODEREVIEW_VERBOSITY', 1)))
| 387 |
OLD | NEW |