build/android/pylib/perf/setup.py - Issue 1148873007: Fix last_devices to be quieter, and improve device affinity.

Side by Side Diff: build/android/pylib/perf/setup.py

Issue 1148873007: Fix last_devices to be quieter, and improve device affinity. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Improve diagram Created 5 years, 7 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 # Copyright 2013 The Chromium Authors. All rights reserved.	1 # Copyright 2013 The Chromium Authors. All rights reserved.

2 # Use of this source code is governed by a BSD-style license that can be	2 # Use of this source code is governed by a BSD-style license that can be

3 # found in the LICENSE file.	3 # found in the LICENSE file.

4	4

5 """Generates test runner factory and tests for performance tests."""	5 """Generates test runner factory and tests for performance tests."""

6	6

7 import json	7 import json

8 import fnmatch	8 import fnmatch

	9 import hashlib

9 import logging	10 import logging

10 import os	11 import os

11 import shutil	12 import shutil

12	13

13 from pylib import constants	14 from pylib import constants

14 from pylib import forwarder	15 from pylib import forwarder

15 from pylib.device import device_list	16 from pylib.device import device_list

16 from pylib.device import device_utils	17 from pylib.device import device_utils

17 from pylib.perf import test_runner	18 from pylib.perf import test_runner

18 from pylib.utils import test_environment	19 from pylib.utils import test_environment

19	20

20	21

21 def _GetAllDevices():

22 devices_path = os.path.join(os.environ.get('CHROMIUM_OUT_DIR', 'out'),

23 device_list.LAST_DEVICES_FILENAME)

24 try:

25 devices = [device_utils.DeviceUtils(s)

26 for s in device_list.GetPersistentDeviceList(devices_path)]

27 except IOError as e:

28 logging.error('Unable to find %s [%s]', devices_path, e)

29 devices = device_utils.DeviceUtils.HealthyDevices()

30 return sorted(devices)

31

32

33 def _GetStepsDictFromSingleStep(test_options):	22 def _GetStepsDictFromSingleStep(test_options):

34 # Running a single command, build the tests structure.	23 # Running a single command, build the tests structure.

35 steps_dict = {	24 steps_dict = {

36 'version': 1,	25 'version': 1,

37 'steps': {	26 'steps': {

38 'single_step': {	27 'single_step': {

39 'device_affinity': 0,	28 'device_affinity': 0,

40 'cmd': test_options.single_step	29 'cmd': test_options.single_step

41 },	30 },

42 }	31 }

43 }	32 }

44 return steps_dict	33 return steps_dict

45	34

46	35

47 def _GetStepsDict(test_options):	36 def _GetStepsDict(test_options):

48 if test_options.single_step:	37 if test_options.single_step:

49 return _GetStepsDictFromSingleStep(test_options)	38 return _GetStepsDictFromSingleStep(test_options)

50 if test_options.steps:	39 if test_options.steps:

51 with file(test_options.steps, 'r') as f:	40 with file(test_options.steps, 'r') as f:

52 steps = json.load(f)	41 steps = json.load(f)

53	42

54 # Already using the new format.	43 # Already using the new format.

55 assert steps['version'] == 1	44 assert steps['version'] == 1

56 return steps	45 return steps

57	46

58	47

	48 def DistributeAffinities(devices):

	49 """Create a mapping from device to affinity using a round-robin hash scheme.

	50

	51 Args:

	52 devices: a list of device serials.

	53

	54 Returns: a dictionary mapping each of the given device serials to a list

	55 of affinities (numbers between 0 and NUM_DEVICE_AFFINITIES-1) it should

	56 run.

	57

	58 This function implements an algorithm to assign affinities to devices in a

	59 relatively stable way while still maximizing the use of resources.

	60

	61 We begin with all affinities unassigned. We cycle through the list of device

	62 serials, and for each one we assign a yet-unassigned affinity. We choose the

	63 affinity to assign by hashing the device serial, so that changes in the set of

	64 devices will produce minimal changes in the assignments. We resolve

	65 collisions by rehashing the hash that produced the collision (which will be

	66 unique, since it is an md5 hash which we squish down).

	67

	68 Another way to think of this: each device corresponds to a random series of

	69 affinities, then we cycle through the devices giving each device its first

	70 available choice. We expect the first few devices in the list to get their

	71 first choice (so they will definitely be stable). Stability is only in danger

	72 if a change to the device list produces a cascade of devices getting a

	73 different choice than they had before.

	74

	75 The probability that a later choice is directly disrupted by an earlier change

	76 is 1/A, since there are A different assignments to be chosen and hash

	77 functions distribute uniformly. However, there are transitive disruptions: a

	78 change in the 1st choice might cause a change in the 2nd choice, which in turn

	79 causes a change in the 3rd choice. Call the probability that a the n'th choice

	80 is disrupted by changing i'th choice D(i,n) (assuming i < n). We have the

	81 relation:

	82

	83 D(i,n) = 1 - (1 - 1/A) product from j=i+1 to n-1 of { 1 - D(i,j) D(j,n) }

	84 ^^^ ^^^^^^^^^^^^^

	85 Disrupted directly Transitive disruptions

	86

	87 From numerical solutions with A=8, the worst case D(1,8) = 26% (the

	88 probability of a device added lexicographically first disrupting the last

	89 device in the cycle). The average probability of a choice being affected by a

	90 change is 16%.

	91 """

	92 if not devices:

	93 return {}

	94 devices = sorted(devices)

	95

	96 device_cycle = []

	97 while len(device_cycle) < test_runner.NUM_DEVICE_AFFINITIES:

	98 device_cycle.extend(devices)

	99 device_cycle = device_cycle[:test_runner.NUM_DEVICE_AFFINITIES]

	100
	navabi 2015/05/27 23:36:20 Ok, so here we have something like this: device_cy Ok, so here we have something like this: device_cycle = [s1, s2, s3, s1, s2, s3, s1, s2] where the length of device_cycle is NUM_DEVICE_AFFINITIES Can you explain to me why this hashing/rehashing is better than just assigning affinity[x] = device_cycle[x]. I.e. assign device at index i to affinity i? It was my impression that the problem was just that we don't reassign device affinity when we needed to. (I.e. when devices go missing and/or when we find new devices). It looks like your code now does the DistributeAffinity when you need to, but I don't understand the need for the extra complexity of this hashing solution. Is it so that the same tests don't continue to get screwed every time a device goes missing or comes online, and this spreads out which tests get effected? luqui 2015/05/28 00:59:54 Imagine we have devices H J M, thus the affinity a Show quoted text On 2015/05/27 23:36:20, navabi wrote: > Ok, so here we have something like this: > device_cycle = [s1, s2, s3, s1, s2, s3, s1, s2] > where the length of device_cycle is NUM_DEVICE_AFFINITIES > > Can you explain to me why this hashing/rehashing is better than just assigning > affinity[x] = device_cycle[x]. I.e. assign device at index i to affinity i? Imagine we have devices H J M, thus the affinity assignments are H - 0 3 6 J - 1 4 7 M - 2 5 Now we add a new device I, which redistributes H - 0 4 # was 0 3 6 I - 1 5 # new device J - 2 6 # was 1 4 7 M - 3 7 # was 2 5 The only test which stayed on its device was 0. When the number of devices changes it is really bad like this. It's better when we replace a device with one that sorts differently -- then any tests on devices that are lexicographically between the old and new serials will be reassigned, on average 33%. Show quoted text > It was my impression that the problem was just that we don't reassign device > affinity when we needed to. (I.e. when devices go missing and/or when we find > new devices). > > It looks like your code now does the DistributeAffinity when you need to, but I > don't understand the need for the extra complexity of this hashing solution. The extra algorithmic complexity of this hashing solution decreases the complexity of the system as a whole. Previously affinity distribution depended on the entire set of devices we had previously seen -- whenever that changes, affinity distribution changes. So if the bot is clobbered all the affinities get reassigned. First I wanted it to be pure, so it only depended on the set of online devices and no historical state. That would be achieved by your simpler solution. But secondly I wanted it to be "continuous" -- so small changes in the set of online devices produced small changes in the affinity assignment. That is the reason for the complexity. shatch 2015/05/28 13:59:18 This is a really neat solution, but I'm also worri Show quoted text On 2015/05/28 00:59:54, luqui wrote: > On 2015/05/27 23:36:20, navabi wrote: > > Ok, so here we have something like this: > > device_cycle = [s1, s2, s3, s1, s2, s3, s1, s2] > > where the length of device_cycle is NUM_DEVICE_AFFINITIES > > > > Can you explain to me why this hashing/rehashing is better than just assigning > > affinity[x] = device_cycle[x]. I.e. assign device at index i to affinity i? > > > Imagine we have devices H J M, thus the affinity assignments are > > H - 0 3 6 > J - 1 4 7 > M - 2 5 > > Now we add a new device I, which redistributes > > H - 0 4 # was 0 3 6 > I - 1 5 # new device > J - 2 6 # was 1 4 7 > M - 3 7 # was 2 5 > > The only test which stayed on its device was 0. When the number of devices > changes it is really bad like this. It's better when we replace a device with > one that sorts differently -- then any tests on devices that are > lexicographically between the old and new serials will be reassigned, on average > 33%. > > > It was my impression that the problem was just that we don't reassign device > > affinity when we needed to. (I.e. when devices go missing and/or when we find > > new devices). > > > > It looks like your code now does the DistributeAffinity when you need to, but > I > > don't understand the need for the extra complexity of this hashing solution. > > The extra algorithmic complexity of this hashing solution decreases the > complexity of the system as a whole. Previously affinity distribution depended > on the entire set of devices we had previously seen -- whenever that changes, > affinity distribution changes. So if the bot is clobbered all the affinities > get reassigned. First I wanted it to be pure, so it only depended on the set of > online devices and no historical state. That would be achieved by your simpler > solution. But secondly I wanted it to be "continuous" -- so small changes in the > set of online devices produced small changes in the affinity assignment. That > is the reason for the complexity. This is a really neat solution, but I'm also worried about the extra complexity. Wouldn't it be simpler and easier to understand if we just persisted the previous devices + affinities, and new devices just fill in for missing ones? What are the downsides to that vs this solution, keeping historical state? jbudorick 2015/05/28 14:03:16 I'm going to come back to do another full review l Show quoted text On 2015/05/28 at 13:59:18, shatch wrote: > On 2015/05/28 00:59:54, luqui wrote: > > On 2015/05/27 23:36:20, navabi wrote: > > > Ok, so here we have something like this: > > > device_cycle = [s1, s2, s3, s1, s2, s3, s1, s2] > > > where the length of device_cycle is NUM_DEVICE_AFFINITIES > > > > > > Can you explain to me why this hashing/rehashing is better than just assigning > > > affinity[x] = device_cycle[x]. I.e. assign device at index i to affinity i? > > > > > > Imagine we have devices H J M, thus the affinity assignments are > > > > H - 0 3 6 > > J - 1 4 7 > > M - 2 5 > > > > Now we add a new device I, which redistributes > > > > H - 0 4 # was 0 3 6 > > I - 1 5 # new device > > J - 2 6 # was 1 4 7 > > M - 3 7 # was 2 5 > > > > The only test which stayed on its device was 0. When the number of devices > > changes it is really bad like this. It's better when we replace a device with > > one that sorts differently -- then any tests on devices that are > > lexicographically between the old and new serials will be reassigned, on average > > 33%. > > > > > It was my impression that the problem was just that we don't reassign device > > > affinity when we needed to. (I.e. when devices go missing and/or when we find > > > new devices). > > > > > > It looks like your code now does the DistributeAffinity when you need to, but > > I > > > don't understand the need for the extra complexity of this hashing solution. > > > > The extra algorithmic complexity of this hashing solution decreases the > > complexity of the system as a whole. Previously affinity distribution depended > > on the entire set of devices we had previously seen -- whenever that changes, > > affinity distribution changes. So if the bot is clobbered all the affinities > > get reassigned. First I wanted it to be pure, so it only depended on the set of > > online devices and no historical state. That would be achieved by your simpler > > solution. But secondly I wanted it to be "continuous" -- so small changes in the > > set of online devices produced small changes in the affinity assignment. That > > is the reason for the complexity. > > This is a really neat solution, but I'm also worried about the extra complexity. I'm going to come back to do another full review later, but I just want to add that I completely agree with this sentiment. Show quoted text > > Wouldn't it be simpler and easier to understand if we just persisted the previous devices + affinities, and new devices just fill in for missing ones? What are the downsides to that vs this solution, keeping historical state?
	101 affinity_to_device = {}

	102

	103 # Hash and rehash each device ID until we get a unique affinity.

	104 for device in device_cycle:

	105 cur_hash = device

	106 while True:

	107 cur_hash = hashlib.md5(cur_hash).hexdigest()

	108 affinity = int(int(cur_hash, 16) % test_runner.NUM_DEVICE_AFFINITIES)

	109 if affinity not in affinity_to_device:

	110 affinity_to_device[affinity] = device

	111 break

	112

	113 # Invert the dictionary we just built.

	114 device_to_affinity = {}

	115 for k, v in affinity_to_device.iteritems():

	116 if v not in device_to_affinity:

	117 device_to_affinity[v] = []

	118 device_to_affinity[v].append(k)

	119

	120 return device_to_affinity

	121

59 def Setup(test_options):	122 def Setup(test_options):

60 """Create and return the test runner factory and tests.	123 """Create and return the test runner factory and tests.

61	124

62 Args:	125 Args:

63 test_options: A PerformanceOptions object.	126 test_options: A PerformanceOptions object.

64	127

65 Returns:	128 Returns:

66 A tuple of (TestRunnerFactory, tests, devices).	129 A tuple of (TestRunnerFactory, tests, devices).

67 """	130 """

68 # TODO(bulach): remove this once the bot side lands. BUG=318369	131 # TODO(bulach): remove this once the bot side lands. BUG=318369

69 constants.SetBuildType('Release')	132 constants.SetBuildType('Release')

70 if os.path.exists(constants.PERF_OUTPUT_DIR):	133 if os.path.exists(constants.PERF_OUTPUT_DIR):

71 shutil.rmtree(constants.PERF_OUTPUT_DIR)	134 shutil.rmtree(constants.PERF_OUTPUT_DIR)

72 os.makedirs(constants.PERF_OUTPUT_DIR)	135 os.makedirs(constants.PERF_OUTPUT_DIR)

73	136

74 # Before running the tests, kill any leftover server.	137 # Before running the tests, kill any leftover server.

75 test_environment.CleanupLeftoverProcesses()	138 test_environment.CleanupLeftoverProcesses()

76	139

77 # We want to keep device affinity, so return all devices ever seen.	140 all_devices = [d.adb.GetDeviceSerial()

78 all_devices = _GetAllDevices()	141 for d in device_utils.DeviceUtils.HealthyDevices()]

	142 affinity_map = DistributeAffinities(all_devices)

79	143

80 steps_dict = _GetStepsDict(test_options)	144 steps_dict = _GetStepsDict(test_options)

81 sorted_step_names = sorted(steps_dict['steps'].keys())	145 sorted_step_names = sorted(steps_dict['steps'].keys())

82	146

83 if test_options.test_filter:	147 if test_options.test_filter:

84 sorted_step_names = fnmatch.filter(sorted_step_names,	148 sorted_step_names = fnmatch.filter(sorted_step_names,

85 test_options.test_filter)	149 test_options.test_filter)

86	150

87 flaky_steps = []	151 flaky_steps = []

88 if test_options.flaky_steps:	152 if test_options.flaky_steps:

89 with file(test_options.flaky_steps, 'r') as f:	153 with file(test_options.flaky_steps, 'r') as f:

90 flaky_steps = json.load(f)	154 flaky_steps = json.load(f)

91	155

92 def TestRunnerFactory(device, shard_index):	156 def TestRunnerFactory(device, _):

	157 affinities = affinity_map[device]

93 return test_runner.TestRunner(	158 return test_runner.TestRunner(

94 test_options, device, shard_index, len(all_devices),	159 test_options, device, affinities, steps_dict, flaky_steps)

95 steps_dict, flaky_steps)

96	160

97 return (TestRunnerFactory, sorted_step_names, all_devices)	161 return (TestRunnerFactory, sorted_step_names, all_devices)

OLD	NEW

« build/android/buildbot/bb_device_status_check.py ('K') | « build/android/pylib/device/device_list.py ('k') | build/android/pylib/perf/test_runner.py » ('j') | no next file with comments »