OLD | NEW |
1 # Copyright 2015 The Chromium Authors. All rights reserved. | 1 # Copyright 2015 The Chromium Authors. All rights reserved. |
2 # Use of this source code is governed by a BSD-style license that can be | 2 # Use of this source code is governed by a BSD-style license that can be |
3 # found in the LICENSE file. | 3 # found in the LICENSE file. |
4 | 4 |
5 import cPickle | 5 import cPickle |
6 import googledatastore as datastore | |
7 import logging | 6 import logging |
8 | 7 |
9 from future import Future | 8 from future import Future |
| 9 from gcloud import datastore |
10 | 10 |
11 # N.B.: In order to use this module you should have a working cloud development | 11 # N.B.: In order to use this module you should have a working cloud development |
12 # environment configured with the googledatastore module installed. | 12 # environment configured with the gcloud module installed. |
13 # | 13 # |
14 # Please see https://cloud.google.com/datastore/docs/getstarted/start_python/ | 14 # Please see https://cloud.google.com/datastore/docs/getstarted/start_python/ |
15 | 15 |
16 | 16 |
17 _DATASET_NAME = 'chrome-apps-doc' | 17 _PROJECT_NAME = 'chrome-apps-doc' |
18 _PERSISTENT_OBJECT_KIND = 'PersistentObjectStoreItem' | 18 _PERSISTENT_OBJECT_KIND = 'PersistentObjectStoreItem' |
19 _VALUE_PROPERTY_NAME = 'pickled_value' | 19 _VALUE_PROPERTY_NAME = 'pickled_value' |
20 | 20 |
21 # The max number of entities to include in a single request. This is capped at | 21 # The max number of entities to include in a single request. This is capped at |
22 # 500 by the service. In practice we may send fewer due to _MAX_REQUEST_SIZE | 22 # 500 by the service. In practice we may send fewer due to _MAX_REQUEST_SIZE |
23 _MAX_BATCH_SIZE = 500 | 23 _MAX_BATCH_SIZE = 500 |
24 | 24 |
25 | 25 |
26 # The maximum entity size allowed by Datastore. | 26 # The maximum entity size allowed by Datastore. |
27 _MAX_ENTITY_SIZE = 1024*1024 | 27 _MAX_ENTITY_SIZE = 1024*1024 |
28 | 28 |
29 | 29 |
30 # The maximum request size (in bytes) to send Datastore. This is an approximate | 30 # The maximum request size (in bytes) to send Datastore. This is an approximate |
31 # size based on the sum of entity blob_value sizes. | 31 # size based on the sum of entity blob_value sizes. |
32 _MAX_REQUEST_SIZE = 5*1024*1024 | 32 _MAX_REQUEST_SIZE = 5*1024*1024 |
33 | 33 |
34 | 34 |
35 def _CreateEntity(name, value): | 35 def _CreateEntity(client, name, value): |
36 entity = datastore.Entity(exclude_from_indexes=[_VALUE_PROPERTY_NAME]) | 36 key = client.key(_PERSISTENT_OBJECT_KIND, name) |
37 path = entity.key.path.add() | 37 entity = datastore.Entity( |
38 path.kind = _PERSISTENT_OBJECT_KIND | 38 key=key, exclude_from_indexes=[_VALUE_PROPERTY_NAME]) |
39 path.name = name | 39 entity[_VALUE_PROPERTY_NAME] = value |
40 entity.update({_VALUE_PROPERTY_NAME: value}) | |
41 return entity | 40 return entity |
42 | 41 |
43 | 42 |
44 def _CreateBatches(data): | 43 def _CreateBatches(client, data): |
45 '''Constructs batches of at most _MAX_BATCH_SIZE entities to cover all | 44 '''Constructs batches of at most _MAX_BATCH_SIZE entities to cover all |
46 entities defined in |data| without exceeding the transaction size limit. | 45 entities defined in |data| without exceeding the transaction size limit. |
47 This is a generator emitting lists of entities. | 46 This is a generator emitting lists of entities. |
48 ''' | 47 ''' |
49 def get_size(entity): | 48 def get_size(entity): |
50 return len(entity.properties[_VALUE_PROPERTY_NAME].value.blob_value) | 49 return len(entity[_VALUE_PROPERTY_NAME]) |
51 | 50 |
52 entities = [_CreateEntity(name, value) for name, value in data.iteritems()] | 51 entities = [_CreateEntity(client, name, value) |
| 52 for name, value in data.iteritems()] |
53 batch_start = 0 | 53 batch_start = 0 |
54 batch_end = 1 | 54 batch_end = 1 |
55 batch_size = get_size(entities[0]) | 55 batch_size = get_size(entities[0]) |
56 while batch_end < len(entities): | 56 while batch_end < len(entities): |
57 next_size = get_size(entities[batch_end]) | 57 next_size = get_size(entities[batch_end]) |
58 if (batch_size + next_size > _MAX_REQUEST_SIZE or | 58 if (batch_size + next_size > _MAX_REQUEST_SIZE or |
59 batch_end - batch_start >= _MAX_BATCH_SIZE): | 59 batch_end - batch_start >= _MAX_BATCH_SIZE): |
60 yield entities[batch_start:batch_end], batch_end, len(entities) | 60 yield entities[batch_start:batch_end], batch_end, len(entities) |
61 batch_start = batch_end | 61 batch_start = batch_end |
62 batch_size = 0 | 62 batch_size = 0 |
(...skipping 34 matching lines...) |
97 Key('PersistentObjectStoreItem', 'animal/dog') | 97 Key('PersistentObjectStoreItem', 'animal/dog') |
98 Key('PersistentObjectStoreItem', 'animal/trolling cat') | 98 Key('PersistentObjectStoreItem', 'animal/trolling cat') |
99 | 99 |
100 If given |original_data|, this will only push key-value pairs for entries that | 100 If given |original_data|, this will only push key-value pairs for entries that |
101 are either new or have changed from their original (pickled) value. | 101 are either new or have changed from their original (pickled) value. |
102 | 102 |
103 Caveat: Pickling and unpickling a dictionary can (but does not always) change | 103 Caveat: Pickling and unpickling a dictionary can (but does not always) change |
104 its key order. This means that objects will often be seen as changed even when | 104 its key order. This means that objects will often be seen as changed even when |
105 they haven't changed. | 105 they haven't changed. |
106 ''' | 106 ''' |
107 datastore.set_options(dataset=_DATASET_NAME) | 107 client = datastore.Client(_PROJECT_NAME) |
108 | 108 |
109 def flatten(dataset): | 109 def flatten(dataset): |
110 flat = {} | 110 flat = {} |
111 for namespace, items in dataset.iteritems(): | 111 for namespace, items in dataset.iteritems(): |
112 for k, v in items.iteritems(): | 112 for k, v in items.iteritems(): |
113 flat['%s/%s' % (namespace, k)] = cPickle.dumps(v) | 113 flat['%s/%s' % (namespace, k)] = cPickle.dumps(v) |
114 return flat | 114 return flat |
115 | 115 |
116 logging.info('Flattening data sets...') | 116 logging.info('Flattening data sets...') |
117 data = flatten(data) | 117 data = flatten(data) |
118 original_data = flatten(original_data) | 118 original_data = flatten(original_data) |
119 | 119 |
120 logging.info('Culling new data...') | 120 logging.info('Culling new data...') |
121 for k in data.keys(): | 121 for k in data.keys(): |
122 if ((k in original_data and original_data[k] == data[k]) or | 122 if ((k in original_data and original_data[k] == data[k]) or |
123 (len(data[k]) > _MAX_ENTITY_SIZE)): | 123 (len(data[k]) > _MAX_ENTITY_SIZE)): |
124 del data[k] | 124 del data[k] |
125 | 125 |
126 for batch, n, total in _CreateBatches(data): | 126 for entities, n, total in _CreateBatches(client, data): |
127 commit_request = datastore.CommitRequest() | 127 batch = client.batch() |
128 commit_request.mode = datastore.CommitRequest.NON_TRANSACTIONAL | 128 for e in entities: |
129 commit_request.mutations.upsert.extend(list(batch)) | 129 batch.put(e) |
130 | |
131 logging.info('Committing %s/%s entities...' % (n, total)) | 130 logging.info('Committing %s/%s entities...' % (n, total)) |
132 datastore.commit(commit_request) | 131 batch.commit() |
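
For reference, a minimal standalone sketch (not part of the patch) of the gcloud-based write path this change switches to. The project id and payload below are hypothetical stand-ins; the entity construction, 'namespace/key' naming, and non-transactional batch commit mirror the NEW code above.

    # Sketch only: assumes the gcloud Python client is installed and
    # credentials are configured; 'my-project' and the payload are
    # hypothetical stand-ins.
    import cPickle

    from gcloud import datastore

    client = datastore.Client('my-project')

    # Keys follow the 'namespace/key' naming used by the push code above.
    key = client.key('PersistentObjectStoreItem', 'animal/dog')
    entity = datastore.Entity(key=key, exclude_from_indexes=['pickled_value'])
    entity['pickled_value'] = cPickle.dumps({'legs': 4})

    # Entities larger than 1MB would be dropped by the culling step above.
    assert len(entity['pickled_value']) <= 1024 * 1024

    # Queue the upsert on a batch and send it, as in the commit loop above.
    batch = client.batch()
    batch.put(entity)
    batch.commit()

As the patch shows, put() only queues mutations on the batch; nothing is sent until commit(), which is why the loop above logs progress once per batch.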