Chromium Code Reviews

Unified Diff: appengine/chromium_build/app.py

Issue 919733003: Fix another source of utf8 characters being lost. (Closed) Base URL: https://chromium.googlesource.com/infra/infra.git@master
Patch Set: rename vars to make clearer | Created 5 years, 10 months ago
 # Copyright (c) 2012 The Chromium Authors. All rights reserved.
 # Use of this source code is governed by a BSD-style license that can be
 # found in the LICENSE file.

 from __future__ import with_statement

 import datetime
 import json
 import logging
 import os
(...skipping 257 matching lines...)
       'comment': None,
       'details': None,
   }
   prev_rev_db = get_or_create_row('latest_rev', rev_number)
   prev_rev_db.fetch_timestamp = datetime.datetime.now()
   prev_rev_db.rev_number = rev_number
   prev_rev_db.put()
   put_data_into_cache('latest_rev', latest_rev_row)


+def utf8_convert(beautiful_soup_tag):
+  # cmp also investigated:
+  # beautiful_soup_tag.__str__(encoding='utf-8').decode('utf-8')
+  # He found that the BeautifulSoup() __str__ method when used with a 'utf-8'
+  # encoding returned effectively the same thing as str(), a Python built-in.
+  # After a handful of tests, he switched to using str() to avoid the add'l
+  # complexity of another BeautifulSoup method.
+  return str(beautiful_soup_tag).decode('utf-8')
+
+
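Note on the new helper: a minimal sketch of the behaviour it relies on, assuming Python 2 and the BeautifulSoup 3 import this file appears to use (the import and the sample markup below are illustrative, not taken from this CL). str() on a BeautifulSoup object yields UTF-8 bytes, so .decode('utf-8') returns the same markup as intact unicode.

# -*- coding: utf-8 -*-
# Hypothetical illustration only, not part of this CL.
from BeautifulSoup import BeautifulSoup  # assumed BeautifulSoup 3 import

soup = BeautifulSoup(u'<td class="DevStatus">r\u00e9ussi</td>')
tag = soup.find('td')

as_bytes = str(tag)                    # UTF-8 encoded byte string
as_unicode = str(tag).decode('utf-8')  # what utf8_convert(tag) returns

assert isinstance(as_bytes, str)
assert isinstance(as_unicode, unicode)
assert u'r\u00e9ussi' in as_unicode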
 ##########
 # ConsoleData class definition and related functions.
 ##########
 class ConsoleData(object):
   def __init__(self):
     self.row_orderedkeys = []
     self.row_data = {}

     # Retain order of observed masters.
     self.masters = []

     # Map(k,v): k=Master, v=List of categories
     self.category_order = {}
     # Map(k,v): k=Master, v=Dict of category data
     self.category_data = {}

     self.category_count = 0
     self.master = ''
     self.lastRevisionSeen = None
     self.lastMasterSeen = None

   @staticmethod
   def ContentsToHtml(contents):
-    return ''.join(unicode(content).encode('ascii', 'replace')
-                   for content in contents)
+    return ''.join(utf8_convert(content) for content in contents)
cmp 2015/02/12 18:37:16: This caused the UTF-8 characters |content| was holding to be lost.
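For context on cmp's comment above, a short repro of the old behaviour (assumed Python 2; the sample string is made up): encoding to ascii with errors='replace' turns every non-ASCII character into '?', which is exactly the loss this hunk removes.

# -*- coding: utf-8 -*-
# Hypothetical repro, not part of this CL.
content = u'D\u00e9p\u00f4t'

assert content.encode('ascii', 'replace') == 'D?p?t'        # old ContentsToHtml path
assert content.encode('utf-8').decode('utf-8') == content   # lossless UTF-8 round trip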

   @property
   def last_row(self):
     return self.row_data[self.lastRevisionSeen]

   def SawMaster(self, master):
     self.lastMasterSeen = master
     assert(self.lastMasterSeen not in self.category_order)
     self.masters.append(self.lastMasterSeen)
     self.category_order.setdefault(self.lastMasterSeen, [])
(...skipping 81 matching lines...)


 ##########
 # Heavy-lifting functions that do most of the console processing.
 # AKA postfetch and postsave functions/handlers.
 ##########
 def console_merger(localpath, remoteurl, page_data,
                    masters_to_merge=None, num_rows_to_merge=None):
   masters_to_merge = masters_to_merge or DEFAULT_MASTERS_TO_MERGE
   num_rows_to_merge = num_rows_to_merge or 25
-  mergedconsole = ConsoleData()
+  console_data = ConsoleData()
   surroundings = get_and_cache_pagedata('surroundings')
   merged_page = BeautifulSoup(surroundings['content'])
   merged_tag = merged_page.find('table', 'ConsoleData')
   if merged_tag is None:
     msg = 'console_merger("%s", "%s", "%s"): merged_tag cannot be None.' % (
         localpath, remoteurl, page_data)
     logging.error(msg)
     raise Exception(msg)
   latest_rev = int(get_and_cache_rowdata('latest_rev')['rev_number'])
   if not latest_rev:
     logging.error('console_merger(\'%s\', \'%s\', \'%s\'): cannot get latest '
                   'revision number.' % (
                       localpath, remoteurl, page_data))
     return
   fetch_timestamp = datetime.datetime.now()
   for master in masters_to_merge:
     # Fetch the summary one-box-per-builder for the master.
     # If we don't get it, something is wrong, skip the master entirely.
     master_summary = get_and_cache_pagedata('%s/console/summary' % master)
     if not master_summary['content']:
       continue
-    mergedconsole.SawMaster(master)
+    console_data.SawMaster(master)
     # Get the categories for this builder. If the builder doesn't have any
     # categories, just use the default empty-string category.
     category_list = []
     master_categories = get_and_cache_pagedata('%s/console/categories' % master)
     if not master_categories['content']:
       category_list.append('')
     else:
       category_row = BeautifulSoup(master_categories['content'])
       category_list = [c.text for c in category_row.findAll('td', 'DevStatus')]
     # Get the corresponding summary box(es).
     summary_row = BeautifulSoup(master_summary['content'])
     summary_list = summary_row.findAll('table')
     for category, summary in zip(category_list, summary_list):
-      mergedconsole.AddCategory(category, summary)
+      console_data.AddCategory(category, summary)

   # Fetch all of the rows that we need.
   rows_fetched = 0
   revs_skipped = 0
   current_rev = latest_rev
   while rows_fetched < num_rows_to_merge and current_rev >= 0:
     # Don't get stuck looping backwards forever into data we don't have.
     # How hard we try scales with how many rows the person wants.
     if revs_skipped > max(num_rows_to_merge, 10):
       break
     row_data = get_and_cache_rowdata('%s/console/%s' % (master, current_rev))
     if not row_data:
       current_rev -= 1
       revs_skipped += 1
       continue
-    mergedconsole.AddRow(row_data)
+    console_data.AddRow(row_data)
     current_rev -= 1
     revs_skipped = 0
     rows_fetched += 1

   # Convert the merged content into console content.
-  mergedconsole.Finish()
+  console_data.Finish()
   template_environment = Environment()
   template_environment.loader = FileSystemLoader('.')
   def notstarted(builder_status):
     """Convert a BeautifulSoup Tag from builder status to a notstarted line."""
     builder_status = re.sub(r'DevSlaveBox', 'DevStatusBox', str(builder_status))
     builder_status = re.sub(r'class=\'([^\']*)\' target=',
                             'class=\'DevStatusBox notstarted\' target=',
                             builder_status)
     builder_status = re.sub(r'class="([^"]*)" target=',
                             'class="DevStatusBox notstarted" target=',
                             builder_status)
     return builder_status
   template_environment.filters['notstarted'] = notstarted
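console_template itself is not part of this diff, so as a reference, here is a minimal self-contained Jinja2 sketch (the 'shout' filter, template string and value are invented stand-ins) of how a filter registered on the Environment, like notstarted above, is applied during render():

from jinja2 import Environment

env = Environment()
env.filters['shout'] = lambda s: s.upper()   # stand-in for the notstarted filter

# Filters registered on the environment are applied with the | syntax.
print env.from_string(u'<td>{{ status|shout }}</td>').render(status=u'ok')
# -> <td>OK</td>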
   merged_template = template_environment.from_string(console_template)
-  merged_content = merged_template.render(data=mergedconsole)
+  merged_console = merged_template.render(data=console_data)
   # For debugging:
-  # print merged_content
+  # logging.info('%r' % merged_console)
   # import code
   # code.interact(local=locals())

-  # Place merged data at |merged_tag|'s location in |merged_page|, and put the
-  # result in |merged_content|.
-  merged_tag.replaceWith(str(merged_content))
+  # Place merged console at |merged_tag|'s location in |merged_page|, and put
+  # the result in |merged_content|.
+  merged_tag.replaceWith(merged_console)
cmp 2015/02/12 18:37:16: And once ContentsToHtml wasn't downsampling to ?, ...
-  # .prettify() may damage the HTML but makes output nicer. However, that
-  # cost is a bunch of extra whitespace. We reduce page size by not using
-  # .prettify().
-  merged_content = merged_page.__str__(encoding=None)
+  merged_content = utf8_convert(merged_page)
cmp 2015/02/12 18:37:16: This line was causing the DTD doubling in this test.
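To illustrate the two new lines above, a hedged sketch assuming Python 2 and BeautifulSoup 3, with stand-in markup rather than the real surroundings/console pages: the rendered unicode console is spliced into the parsed page as-is, and the whole page is then serialized once through the UTF-8 round trip instead of calling str() on unicode content.

# -*- coding: utf-8 -*-
# Hypothetical illustration only; stand-in markup, not the real console page.
from BeautifulSoup import BeautifulSoup  # assumed BeautifulSoup 3 import

merged_page = BeautifulSoup(u'<html><body><table class="ConsoleData"></table></body></html>')
merged_tag = merged_page.find('table', 'ConsoleData')

merged_console = u'<table class="ConsoleData"><td>r\u00e9ussi</td></table>'
merged_tag.replaceWith(merged_console)               # unicode in, no implicit ascii str()

merged_content = str(merged_page).decode('utf-8')    # same round trip as utf8_convert()
assert u'r\u00e9ussi' in merged_content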
   merged_content = re.sub(
       r'\'\<a href="\'', '\'<a \' + attributes + \' href="\'', merged_content)
   merged_content = re.sub(
       r'\'\<table\>\'', r"'<table ' + attributes + '>'", merged_content)
   merged_content = re.sub(
       r'\'\<div\>\'', r"'<div ' + attributes + '>'", merged_content)
   merged_content = re.sub(
       r'\'\<td\>\'', r"'<td ' + attributes + '>'", merged_content)
   merged_content = re.sub(
       r'\<iframe\>\</iframe\>',
(...skipping 163 matching lines...)
   # exist more than once. Reverts are examples of commits which can contain
   # multiple Cr-Commit-Position instances. In those cases, only the last one
   # is correct, so split on the break tag and reverse the result to find the
   # last occurrence of Cr-Commit-Position.
   for line in reversed(commit_msg.split('<br />')):
     if line.startswith('Cr-Commit-Position: '):
       return filter(str.isdigit, str(line.split('@')[-1]))
   return '0'
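A small worked example of the branch above (the commit-message footer is invented; the expression is the one from the diff): in Python 2, filter() over a str returns a str, so this keeps only the digits after the last '@'.

# Hypothetical footer line, as emitted by the commit queue.
line = 'Cr-Commit-Position: refs/heads/master@{#123456}'
assert filter(str.isdigit, str(line.split('@')[-1])) == '123456'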


-def utf8_convert(bstring):
-  # cmp also investigated:
-  # bstring.__str__(encoding='utf-8').decode('utf-8')
-  # He found that the BeautifulSoup() __str__ method when used with a 'utf-8'
-  # encoding returned effectively the same thing as str(), a Python built-in.
-  # After a handful of tests, he switched to using str() to avoid the add'l
-  # complexity of another BeautifulSoup method.
-  return str(bstring).decode('utf-8')
-
-
 # W0613:600,28:parse_master: Unused argument 'remoteurl'
 # pylint: disable=W0613
 def parse_master(localpath, remoteurl, page_data=None):
   """Part of the new pipeline to store individual rows rather than
   whole pages of html. Parses the master data into a set of rows,
   and writes them out to the datastore in an easily retrievable format.

   Doesn't modify page_data dict.
   """
   ts = datetime.datetime.now()
(...skipping 558 matching lines...)
           'builds/-1?as_text=1'),
       'localpath':
           'chromium.lkgr/json/builders/Linux%20x64/builds/-1/as_text=1.json',
       'maxage': 2*60,  # 2 mins
   },

 #  # Trigger background process update.
 #  {
 #    'remoteurl': 'http://chromium-build.appspot.com/backend/update'
 ]