appengine/findit/util_scripts/crash_queries/delta_test/delta_test.py - Issue 2400283003: [Findit] Add skeleton code for delta test script.

Side by Side Diff: appengine/findit/util_scripts/crash_queries/delta_test/delta_test.py

Issue 2400283003: [Findit] Add skeleton code for delta test script. (Closed)

Patch Set: separate --date arguments and some renames. Created 4 years, 2 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

« no previous file with comments | « appengine/findit/util_scripts/crash_queries/delta_test/__init__.py ('k') | appengine/findit/util_scripts/crash_queries/delta_test/delta_util.py » ('j') | appengine/findit/util_scripts/crash_queries/delta_test/run-delta-test.py » ('J')
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
(Empty)
	1 # Copyright 2016 The Chromium Authors. All rights reserved.

	2 # Use of this source code is governed by a BSD-style license that can be

	3 # found in the LICENSE file.

	4

	5 import json

	6 import logging

	7 import os

	8 import pickle

	9 import subprocess

	10

	11 from crash_queries import crash_iterator

	12 from crash_queries.delta_test import delta_util

	13

	14 AZALEA_RESULTS_DIRECTORY = os.path.join(os.path.dirname(__file__),

	15 'azalea_results')

	16

	17

	18 class Delta(object):

	19 """Stands for delta between two results."""

	20

	21 def __init__(self, result1, result2, fields):

	22 self._result1 = result1

	23 self._result2 = result2

	24 self._fields = fields

	25 self._delta_dict = {}

	26 self._delta_str_dict = {}

	27

	28 @property

	29 def delta_dict(self):

	30 """Dict representation of delta."""

	31 if self._delta_dict:

	32 return self._delta_dict

	33

	34 for field in self._fields:

	35 value1 = getattr(self._result1, field)

	36 value2 = getattr(self._result2, field)

	37 if value1 != value2:

	38 if hasattr(value1, 'ToDict') and callable(value1.ToDict):

	39 value1 = value1.ToDict()

	40 value2 = value2.ToDict()

	41 self._delta_dict[field] = (value1, value2)

	42

	43 return self._delta_dict

	44

	45 @property

	46 def delta_str_dict(self):

	47 """Converts delta of each field to a string."""

	48 if self._delta_str_dict:

	49 return self._delta_str_dict

	50

	51 for key, (value1, value2) in self.delta_dict.iteritems():

	52 self._delta_str_dict[key] = '%s: %s, %s' % (key, value1, value2)

	53

	54 return self._delta_str_dict

	55

	56 def ToDict(self):

	57 return self.delta_dict

	58

	59 def __str__(self):

	60 return '\n'.join(self.delta_str_dict.values())

	61

	62 def __bool__(self):

	63 return bool(self.delta_dict)

	64

	65 def __nonzero__(self):

	66 return self.__bool__()

	67

	68

	69 def GetDeltasFromTwoSetsOfResults(set1, set2):

	70 """Gets delta from two sets of results.

	71

	72 Results are a list of (message, matches, component_name, cr_label)

	73 Returns a list of delta results (results1, results2).

	74 """

	75 deltas = {}

	76 for crash_id, result1 in set1.iteritems():

	77 # Even when the command are exactly the same, it's possible that one set is

	78 # loaded from local result file, another is just queried from database,

	79 # sometimes some crash results would get deleted.

	80 if crash_id not in set2:

	81 continue

	82

	83 result2 = set2[crash_id]

	84 delta = Delta(result1, result2, result1.fields)

	85 if delta:

	86 deltas[crash_id] = delta

	87

	88 return deltas

	89

	90

	91 def GetResults(crashes, git_hash, result_path, verbose=False):

	92 """Returns an evaluator function to compute delta between 2 findit githashes.

	93

	94 Args:

	95 crashes (list): A list of crash infos.

	96 git_hash (str): A git hash of findit repository.

	97 result_path (str): file path for subprocess to write results on.

	98 verbose (bool): If True, print all the findit results.

	99

	100 Return:

	101 A dict mapping crash id to culprit for every crashes analyzed by

	102 git_hash version.

	103 """

	104 if not crashes:

	105 return {}

	106

	107 if verbose:

	108 logging.info('\n\n***************************')

	109 logging.info('Switch to git %s', git_hash)

	110 logging.info('***************************\n\n')

	111

	112 dev_null_handle = open(os.devnull, 'w')

	113 subprocess.check_call(

	114 'cd %s; git checkout %s' % (os.path.dirname(__file__), git_hash),

	115 stdout=dev_null_handle,

	116 stderr=dev_null_handle,

	117 shell=True)

	118

	119 if not os.path.exists(result_path):

	120 # TODO(katesoina): Implement run-azalea.py.
	stgao 2016/10/14 01:40:40 Is the TODO still valid? Is the TODO still valid? Sharu Jiang 2016/10/15 01:24:47 run-azalea.py is not finished yet, however, right, Show quoted text On 2016/10/14 01:40:40, stgao wrote: > Is the TODO still valid? run-azalea.py is not finished yet, however, right, better add Todo there instead of here.
	121 command = 'python %s %s' % ('run-azalea.py', result_path) + (
	stgao 2016/10/14 01:40:40 what's the current working dir? Will it matter her what's the current working dir? Will it matter here? Sharu Jiang 2016/10/15 01:24:47 In line 114, we make sure the current working dir Show quoted text On 2016/10/14 01:40:40, stgao wrote: > what's the current working dir? Will it matter here? In line 114, we make sure the current working dir is the delta test dir. stgao 2016/10/20 01:40:20 Is this new process run in the same shell as that Show quoted text On 2016/10/15 01:24:47, sharu jiang wrote: > On 2016/10/14 01:40:40, stgao wrote: > > what's the current working dir? Will it matter here? > > In line 114, we make sure the current working dir is the delta test dir. Is this new process run in the same shell as that process in #115 above? Did the `cd DIR` really change the current working dir? Have you tried in a command line? Sharu Jiang 2016/10/20 22:39:06 Yes, the run-azalea.py is in the same dir as this Show quoted text On 2016/10/20 01:40:20, stgao wrote: > On 2016/10/15 01:24:47, sharu jiang wrote: > > On 2016/10/14 01:40:40, stgao wrote: > > > what's the current working dir? Will it matter here? > > > > In line 114, we make sure the current working dir is the delta test dir. > > Is this new process run in the same shell as that process in #115 above? Did the > `cd DIR` really change the current working dir? Have you tried in a command > line? Yes, the run-azalea.py is in the same dir as this file(delta test dir). I tested the skeleton delta test in command line, it works.
	122 ' -v' if verbose else '')
	stgao 2016/10/14 01:40:40 Can we use a argument list instead of a string? Can we use a argument list instead of a string? Sharu Jiang 2016/10/15 01:24:47 Done. Show quoted text On 2016/10/14 01:40:40, stgao wrote: > Can we use a argument list instead of a string? Done.
	123 # Results is a dict with testcase_id as key, and findit results as

	124 # value.

	125 p = subprocess.Popen(

	126 command,

	127 stdin=subprocess.PIPE,

	128 shell=True)

	129

	130 p.communicate(input=json.dumps(crashes))
	stgao 2016/10/14 01:40:40 Should we pass information through a file instead? Should we pass information through a file instead? Sharu Jiang 2016/10/15 01:24:47 We can, we can cache the CrashIterator results(bat Show quoted text On 2016/10/14 01:40:40, stgao wrote: > Should we pass information through a file instead? We can, we can cache the CrashIterator results(batches of crashes) and let run-azalea read those cache, but as I stated in the design doc https://docs.google.com/document/d/1kfu_HIJFCSwOWQleRpSkOH4t_D0UFY5vDSSausaZu..., this may caused some disk space since whenever args changed, we need to cache those crash_infos (it's relatively big compared to culprit result) again. I can do this in another cl. stgao 2016/10/20 01:40:20 I don't quite understand this. Maybe we could chat Show quoted text On 2016/10/15 01:24:47, sharu jiang wrote: > On 2016/10/14 01:40:40, stgao wrote: > > Should we pass information through a file instead? > > We can, we can cache the CrashIterator results(batches of crashes) and let > run-azalea read those cache, but as I stated in the design doc > https://docs.google.com/document/d/1kfu_HIJFCSwOWQleRpSkOH4t_D0UFY5vDSSausaZu..., > this may caused some disk space since whenever args changed, we need to cache > those crash_infos (it's relatively big compared to culprit result) again. I don't quite understand this. Maybe we could chat in person. Show quoted text > > I can do this in another cl. If this is the plan, let's add a TODO with a bug. Sharu Jiang 2016/10/20 22:39:06 Done. Show quoted text On 2016/10/20 01:40:20, stgao wrote: > On 2016/10/15 01:24:47, sharu jiang wrote: > > On 2016/10/14 01:40:40, stgao wrote: > > > Should we pass information through a file instead? 
> > > > We can, we can cache the CrashIterator results(batches of crashes) and let > > run-azalea read those cache, but as I stated in the design doc > > > https://docs.google.com/document/d/1kfu_HIJFCSwOWQleRpSkOH4t_D0UFY5vDSSausaZu..., > > this may caused some disk space since whenever args changed, we need to cache > > those crash_infos (it's relatively big compared to culprit result) again. > > I don't quite understand this. Maybe we could chat in person. > > > > > I can do this in another cl. > > If this is the plan, let's add a TODO with a bug. Done.
	131 else:

	132 logging.info('\nLoading results from %s', result_path)

	133

	134 if not os.path.exists(result_path):

	135 logging.info('Fail to get results.')
	stgao 2016/10/14 01:40:40 Should it be an error or info? Same for other usag Should it be an error or info? Same for other usage of logging. Sharu Jiang 2016/10/15 01:24:47 Done. Show quoted text On 2016/10/14 01:40:40, stgao wrote: > Should it be an error or info? Same for other usage of logging. Done.
	136 return {}

	137

	138 with open(result_path) as f:

	139 return pickle.load(f)
	stgao 2016/10/14 01:40:40 Just curious: why we use pickle instead of json? Just curious: why we use pickle instead of json? Sharu Jiang 2016/10/15 01:24:47 The results are a general concept, it can be a obj Show quoted text On 2016/10/14 01:40:40, stgao wrote: > Just curious: why we use pickle instead of json? The results are a general concept, it can be a object like Culprit object. stgao 2016/10/20 01:40:20 Acknowledged. Show quoted text On 2016/10/15 01:24:47, sharu jiang wrote: > On 2016/10/14 01:40:40, stgao wrote: > > Just curious: why we use pickle instead of json? > > The results are a general concept, it can be a object like Culprit object. Acknowledged.
	140

	141 return {}

	142

	143

	144 def DeltaEvaluator(git_hash1, git_hash2,

	145 client_id, start_date, end_date, batch_size,

	146 property_values=None, verbose=False, app_id=None):

	147 """Evaluates delta between git_hash1 and git_hash2 on a set of Testcases.

	148

	149 Args:

	150 git_hash1 (str): A git hash of findit repository.

	151 git_hash2 (str): A git hash of findit repository.

	152 start_date (str): Run delta test on testcases after (including)

	153 the start_date, format should be '%Y-%m-%d'.

	154 end_date (str): Run delta test on testcases before (not including)

	155 the end_date, format should be '%Y-%m-%d'.

	156 client_id (CrashClient): Possible values are 'fracas', 'cracas',

	157 'cluterfuzz'.

	158 batch_size (int): Size of a batch that can be queried at one time.

	159 property_values (dict): Property values to query.

	160 batch_size (int): The size of crashes that can be queried at one time.

	161 verbose (bool): If True, print all the findit results.

	162 app_id (str): Appengine app id to query.

	163 Return:

	164 (deltas, crash_count).

	165 deltas (dict): Mappings id to delta for each culprit value.

	166 crash_count (int): Total count of all the crashes.

	167 """

	168 head_branch_name = subprocess.check_output(

	169 ['git', 'rev-parse', '--abbrev-ref', 'HEAD']).replace('\n', '')

	170 deltas = {}

	171 try:

	172 crash_count = 0

	173 for index, crashes in enumerate(

	174 crash_iterator.IterateCrashes(client_id,

	175 property_values=property_values,

	176 start_date=start_date,

	177 end_date=end_date,

	178 batch_size=batch_size,

	179 batch_run=True,

	180 app_id=app_id)):

	181

	182 results = []

	183 for git_hash in [git_hash1, git_hash2]:
	stgao 2016/10/14 01:40:40 So for each crash, we switch the checkout twice. I So for each crash, we switch the checkout twice. If we have 100 crashes, we have to switch checkout 200 times. Can we avoid such an overhead? Sharu Jiang 2016/10/15 01:24:47 This is not for each crash, it is for each batch o Show quoted text On 2016/10/14 01:40:40, stgao wrote: > So for each crash, we switch the checkout twice. If we have 100 crashes, we have > to switch checkout 200 times. > > Can we avoid such an overhead? This is not for each crash, it is for each batch of 1000 crashes (batch_run=True). So I think the overhead should be ok. stgao 2016/10/20 01:40:20 sg Show quoted text On 2016/10/15 01:24:47, sharu jiang wrote: > On 2016/10/14 01:40:40, stgao wrote: > > So for each crash, we switch the checkout twice. If we have 100 crashes, we > have > > to switch checkout 200 times. > > > > Can we avoid such an overhead? > > This is not for each crash, it is for each batch of 1000 crashes > (batch_run=True). So I think the overhead should be ok. sg wrengr 2016/10/24 17:30:36 It'd still be good (and easy!) to avoid. I can add Show quoted text On 2016/10/15 01:24:47, sharu jiang wrote: > On 2016/10/14 01:40:40, stgao wrote: > > So for each crash, we switch the checkout twice. If we have 100 crashes, we > have > > to switch checkout 200 times. > > > > Can we avoid such an overhead? > > This is not for each crash, it is for each batch of 1000 crashes > (batch_run=True). So I think the overhead should be ok. It'd still be good (and easy!) to avoid. I can add it in another CL once this one finally lands. Sharu Jiang 2016/10/24 19:14:32 Can we discuss the solution offline then? The mai Show quoted text On 2016/10/24 17:30:36, wrengr wrote: > On 2016/10/15 01:24:47, sharu jiang wrote: > > On 2016/10/14 01:40:40, stgao wrote: > > > So for each crash, we switch the checkout twice. If we have 100 crashes, we > > have > > > to switch checkout 200 times. > > > > > > Can we avoid such an overhead? 
> > > > This is not for each crash, it is for each batch of 1000 crashes > > (batch_run=True). So I think the overhead should be ok. > > It'd still be good (and easy!) to avoid. I can add it in another CL once this > one finally lands. Can we discuss the solution offline then? The main reasons I want to get delta result batch by batch are 1) we can print intermediate delta result to users (will print that in another cl), especially when users query a long period of crash analyses, it may take long time to see the final results. 2) we can also avoid caching a very big chunk of crashes in memory (or writing to disks), especially when users query a long period of crash analyses.
	184 result_path = os.path.join(

	185 AZALEA_RESULTS_DIRECTORY, delta_util.GenerateResultFileName(

	186 client_id, property_values, start_date, end_date,

	187 batch_size, index, git_hash))

	188 results.append(GetResults(crashes, git_hash, result_path,

	189 verbose=verbose))

	190

	191 crash_count += len(crashes)

	192 deltas.update(GetDeltasFromTwoSetsOfResults(*results))

	193

	194 return deltas, crash_count

	195 finally:

	196 dev_null_handle = open(os.devnull, 'w')

	197 subprocess.check_call(['git', 'checkout', head_branch_name],

	198 stdout=dev_null_handle,

	199 stderr=dev_null_handle)

OLD	NEW