# Copyright 2015 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
Ken Rockot (use gerrit already), 2015/05/26 00:26:24:
This provides the PushData function used exclusive

import cPickle
import googledatastore as datastore
import logging

from future import Future

# N.B.: In order to use this module you should have a working cloud development
# environment configured with the googledatastore module installed.
#
# Please see https://cloud.google.com/datastore/docs/getstarted/start_python/


_DATASET_NAME = 'chrome-apps-doc'
_PERSISTENT_OBJECT_KIND = 'PersistentObjectStoreItem'
_VALUE_PROPERTY_NAME = 'pickled_value'

# The max number of entities to include in a single request. This is capped at
# 500 by the service. In practice we may send fewer due to _MAX_REQUEST_SIZE.
_MAX_BATCH_SIZE = 500


# The maximum entity size allowed by Datastore.
_MAX_ENTITY_SIZE = 1024*1024


# The maximum request size (in bytes) to send to Datastore. This is an
# approximate size based on the sum of entity blob_value sizes.
_MAX_REQUEST_SIZE = 5*1024*1024
Ken Rockot (use gerrit already), 2015/05/26 00:26:24:
Couldn't find any documentation for the max reques


def _CreateEntity(name, value):
  entity = datastore.Entity()
  path = entity.key.path_element.add()
  path.kind = _PERSISTENT_OBJECT_KIND
  path.name = name
  pickled_value_property = entity.property.add()
  pickled_value_property.name = _VALUE_PROPERTY_NAME
  pickled_value_property.value.indexed = False
  pickled_value_property.value.blob_value = value
  return entity


def _CreateBatches(data):
  '''Constructs batches of at most _MAX_BATCH_SIZE entities to cover all
  entities defined in |data| without exceeding the transaction size limit.
  This is a generator emitting lists of entities.
  '''
  def get_size(entity):
    return len(entity.property[0].value.blob_value)

  entities = [_CreateEntity(name, value) for name, value in data.iteritems()]
  batch_start = 0
  batch_end = 1
  batch_size = get_size(entities[0])
  while batch_end < len(entities):
    next_size = get_size(entities[batch_end])
    if (batch_size + next_size > _MAX_REQUEST_SIZE or
        batch_end - batch_start >= _MAX_BATCH_SIZE):
      yield entities[batch_start:batch_end], batch_end, len(entities)
      batch_start = batch_end
      batch_size = 0
    else:
      batch_size += next_size
      batch_end = batch_end + 1
  if batch_end > batch_start and batch_start < len(entities):
    yield entities[batch_start:batch_end], batch_end, len(entities)


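To illustrate the contract of the generator above (a hypothetical snippet, not part of this CL): each yielded tuple is (entities in this batch, number of entities emitted so far, total entity count), which PushData below uses for its progress log.

    sample = {name: cPickle.dumps(value)
              for name, value in {'apple': 1234, 'banana': 'yellow'}.iteritems()}
    for batch, n, total in _CreateBatches(sample):
      print 'would commit %d entities (%s/%s)' % (len(batch), n, total)
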
def PushData(data, original_data={}):
  '''Pushes a bunch of data into the datastore. The data should be a dict. Each
  key is treated as a namespace, and each value is also a dict. A new datastore
  entry is upserted for every inner key, with the value pickled into the
  |pickled_value| field.

  For example, if given the dictionary:

  {
    'fruit': {
      'apple': 1234,
      'banana': 'yellow',
      'trolling carrot': { 'arbitrarily complex': ['value', 'goes', 'here'] }
    },
    'animal': {
      'sheep': 'baaah',
      'dog': 'woof',
      'trolling cat': 'moo'
    }
  }

  this would result in a push of 6 keys in total, with the following IDs:

  Key('PersistentObjectStoreItem', 'fruit/apple')
  Key('PersistentObjectStoreItem', 'fruit/banana')
  Key('PersistentObjectStoreItem', 'fruit/trolling carrot')
  Key('PersistentObjectStoreItem', 'animal/sheep')
  Key('PersistentObjectStoreItem', 'animal/dog')
  Key('PersistentObjectStoreItem', 'animal/trolling cat')

  If given |original_data|, this will only push key-value pairs for entries that
  are either new or have changed from their original (pickled) value.
  Caveat: pickling and unpickling a dictionary can (but does not always) change
  its iteration order, so re-pickling it may produce different bytes. This means
  that objects will often be seen as changed even when they haven't changed.
  '''
  datastore.set_options(dataset=_DATASET_NAME)

  def flatten(dataset):
    flat = {}
    for namespace, items in dataset.iteritems():
      for k, v in items.iteritems():
        flat['%s/%s' % (namespace, k)] = cPickle.dumps(v)
    return flat

  logging.info('Flattening data sets...')
  data = flatten(data)
  original_data = flatten(original_data)

  logging.info('Culling new data...')
  for k in data.keys():
    if ((k in original_data and original_data[k] == data[k]) or
        (len(data[k]) > _MAX_ENTITY_SIZE)):
      del data[k]
Ken Rockot (use gerrit already), 2015/05/26 00:26:24:
This should be super awesome and get us very tiny

not at google - send to devlin, 2015/06/04 22:40:45:
What about an OrderedDict? That should pickle, and

Ken Rockot (use gerrit already), 2015/06/05 00:21:50:
No, I think that's the right approach. I didn't do
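A rough sketch of the approach discussed above, assuming a hypothetical helper (_Canonicalize is not part of this CL): recursively rebuilding dicts as OrderedDicts with sorted keys before pickling makes the pickled bytes deterministic, so the equality check in the culling loop above stops flagging unchanged objects.

    import collections

    def _Canonicalize(value):
      # Hypothetical helper: rebuild dicts with sorted keys so that pickling
      # the same logical value always yields identical bytes.
      if isinstance(value, dict):
        return collections.OrderedDict(
            (k, _Canonicalize(value[k])) for k in sorted(value))
      if isinstance(value, list):
        return [_Canonicalize(v) for v in value]
      return value

    # flatten() above would then call cPickle.dumps(_Canonicalize(v)) instead
    # of cPickle.dumps(v).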

  for batch, n, total in _CreateBatches(data):
    commit_request = datastore.CommitRequest()
    commit_request.mode = datastore.CommitRequest.NON_TRANSACTIONAL
    commit_request.mutation.upsert.extend(list(batch))

    logging.info('Committing %s/%s entities...' % (n, total))
    datastore.commit(commit_request)
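As a rough usage sketch (assuming a googledatastore environment configured for the 'chrome-apps-doc' dataset, per the N.B. near the top of the file), a caller passes the nested dict directly and PushData handles flattening, pickling, culling and batching:

    PushData({
      'fruit': {'apple': 1234, 'banana': 'yellow'},
      'animal': {'sheep': 'baaah'},
    })

This would upsert PersistentObjectStoreItem entities keyed 'fruit/apple', 'fruit/banana' and 'animal/sheep', as described in the docstring.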