Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(3)

Side by Side Diff: tools/grit/grit/format/data_pack.py

Issue 2969123002: Add deduplication logic to .pak files (Closed)
Patch Set: fix resource_sizes computation Created 3 years, 5 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 #!/usr/bin/env python 1 #!/usr/bin/env python
2 # Copyright (c) 2012 The Chromium Authors. All rights reserved. 2 # Copyright (c) 2012 The Chromium Authors. All rights reserved.
3 # Use of this source code is governed by a BSD-style license that can be 3 # Use of this source code is governed by a BSD-style license that can be
4 # found in the LICENSE file. 4 # found in the LICENSE file.
5 5
6 """Support for formatting a data pack file used for platform agnostic resource 6 """Support for formatting a data pack file used for platform agnostic resource
7 files. 7 files.
8 """ 8 """
9 9
10 import collections 10 import collections
11 import exceptions 11 import exceptions
12 import os 12 import os
13 import struct 13 import struct
14 import sys 14 import sys
15 if __name__ == '__main__': 15 if __name__ == '__main__':
16 sys.path.append(os.path.join(os.path.dirname(__file__), '../..')) 16 sys.path.append(os.path.join(os.path.dirname(__file__), '../..'))
17 17
18 from grit import util 18 from grit import util
19 from grit.node import include 19 from grit.node import include
20 from grit.node import message 20 from grit.node import message
21 from grit.node import structure 21 from grit.node import structure
22 22
23 23
24 PACK_FILE_VERSION = 4 24 PACK_FILE_VERSION = 5
25 HEADER_LENGTH = 2 * 4 + 1 # Two uint32s. (file version, number of entries) and
26 # one uint8 (encoding of text resources)
27 BINARY, UTF8, UTF16 = range(3) 25 BINARY, UTF8, UTF16 = range(3)
28 26
29 27
30 class WrongFileVersion(Exception): 28 class WrongFileVersion(Exception):
31 pass 29 pass
32 30
33 31
32 class CorruptDataPack(Exception):
33 pass
34
35
34 DataPackContents = collections.namedtuple( 36 DataPackContents = collections.namedtuple(
35 'DataPackContents', 'resources encoding') 37 'DataPackContents', 'resources encoding')
36 38
37 39
38 def Format(root, lang='en', output_dir='.'): 40 def Format(root, lang='en', output_dir='.'):
39 """Writes out the data pack file format (platform agnostic resource file).""" 41 """Writes out the data pack file format (platform agnostic resource file)."""
40 data = {} 42 data = {}
41 for node in root.ActiveDescendants(): 43 for node in root.ActiveDescendants():
42 with node: 44 with node:
43 if isinstance(node, (include.IncludeNode, message.MessageNode, 45 if isinstance(node, (include.IncludeNode, message.MessageNode,
44 structure.StructureNode)): 46 structure.StructureNode)):
45 id, value = node.GetDataPackPair(lang, UTF8) 47 id, value = node.GetDataPackPair(lang, UTF8)
46 if value is not None: 48 if value is not None:
47 data[id] = value 49 data[id] = value
48 return WriteDataPackToString(data, UTF8) 50 return WriteDataPackToString(data, UTF8)
49 51
50 52
51 def ReadDataPack(input_file): 53 def ReadDataPack(input_file):
54 return ReadDataPackFromString(util.ReadFile(input_file, util.BINARY))
55
56
57 def ReadDataPackFromString(data):
52 """Reads a data pack file and returns a dictionary.""" 58 """Reads a data pack file and returns a dictionary."""
53 data = util.ReadFile(input_file, util.BINARY)
54 original_data = data 59 original_data = data
55 60
56 # Read the header. 61 # Read the header.
57 version, num_entries, encoding = struct.unpack('<IIB', data[:HEADER_LENGTH]) 62 version = struct.unpack('<H', data[:2])[0]
58 if version != PACK_FILE_VERSION: 63 if version == 4:
59 print 'Wrong file version in ', input_file 64 resource_count, encoding = struct.unpack('<IB', data[4:9])
60 raise WrongFileVersion 65 alias_count = 0
66 data = data[9:]
67 elif version == 5:
68 encoding, resource_count, alias_count = struct.unpack('<HHH', data[2:8])
69 data = data[8:]
70 else:
71 raise WrongFileVersion('Found version: ' + str(version))
61 72
62 resources = {} 73 resources = {}
63 if num_entries == 0: 74 kIndexEntrySize = 2 + 4 # Each entry is a uint16 and a uint32.
64 return DataPackContents(resources, encoding) 75 def entry_at_index(idx):
76 offset = idx * kIndexEntrySize
77 return struct.unpack('<HI', data[offset:offset + kIndexEntrySize])
65 78
66 # Read the index and data. 79 # Read the main table in reverse so that prev_offset > offset.
67 data = data[HEADER_LENGTH:] 80 prev_offset = entry_at_index(resource_count)[1]
flackr 2017/07/07 18:54:12 It's a little strange to not read in a forwards direction.
agrieve 2017/07/07 20:47:08 ah, yeah, that's nicer :)
68 kIndexEntrySize = 2 + 4 # Each entry is a uint16 and a uint32. 81 for i in xrange(resource_count - 1, -1, -1):
69 for _ in range(num_entries): 82 resource_id, offset = entry_at_index(i)
70 id, offset = struct.unpack('<HI', data[:kIndexEntrySize]) 83 resources[resource_id] = original_data[offset:prev_offset]
71 data = data[kIndexEntrySize:] 84 prev_offset = offset
72 next_id, next_offset = struct.unpack('<HI', data[:kIndexEntrySize]) 85
73 resources[id] = original_data[offset:next_offset] 86 # Read the alias table.
87 alias_data = data[(resource_count + 1) * kIndexEntrySize:]
88 kAliasEntrySize = 2 + 2 # uint16, uint16
89 def alias_at_index(idx):
90 offset = idx * kAliasEntrySize
91 return struct.unpack('<HH', alias_data[offset:offset + kAliasEntrySize])
92
93 for i in xrange(alias_count):
94 resource_id, index = alias_at_index(i)
95 aliased_id = entry_at_index(index)[0]
96 resources[resource_id] = resources[aliased_id]
74 97
75 return DataPackContents(resources, encoding) 98 return DataPackContents(resources, encoding)
76 99
77 100
78 def WriteDataPackToString(resources, encoding): 101 def WriteDataPackToString(resources, encoding):
79 """Returns a string with a map of id=>data in the data pack format.""" 102 """Returns a string with a map of id=>data in the data pack format."""
80 ids = sorted(resources.keys())
81 ret = [] 103 ret = []
82 104
105 # Compute alias map.
106 resource_ids = sorted(resources)
107 id_by_data = {resources[k]: k for k in reversed(resource_ids)}
flackr 2017/07/07 18:54:12 Comment why we use reversed - presumably want lowest id to be the canonical one?
agrieve 2017/07/07 20:47:08 Done.
108 # Map of resource_id -> resource_id, where value < key.
109 alias_map = {k: id_by_data[v] for k, v in resources.iteritems()
110 if id_by_data[v] != k}
111
83 # Write file header. 112 # Write file header.
84 ret.append(struct.pack('<IIB', PACK_FILE_VERSION, len(ids), encoding)) 113 resource_count = len(resources) - len(alias_map)
85 HEADER_LENGTH = 2 * 4 + 1 # Two uint32s and one uint8. 114 # Note: 2nd and 4th byte are always 0 since version and encoding < 256.
flackr 2017/07/07 18:54:12 Version I could see may someday be > 256, I can ju
agrieve 2017/07/07 20:47:08 Expanded the comment. I just speculated that thing
flackr 2017/07/10 14:07:49 I suppose that could help, but it's better IMO to
agrieve 2017/07/18 19:30:29 Done.
115 ret.append(struct.pack('<HHHH', PACK_FILE_VERSION, encoding,
116 resource_count, len(alias_map)))
117 HEADER_LENGTH = 2 + 2 + 2 + 2
86 118
87 # Each entry is a uint16 + a uint32s. We have one extra entry for the last 119 # Each main table entry is a uint16 + a uint32.
flackr 2017/07/07 18:54:12 Preserve the comment that we have an extra entry for the last item.
agrieve 2017/07/07 20:47:08 Done.
88 # item. 120 # Each alias table entry is a uint16 + a uint16.
89 index_length = (len(ids) + 1) * (2 + 4) 121 data_offset = HEADER_LENGTH + (resource_count + 1) * 6 + len(alias_map) * 4
90 122
91 # Write index. 123 # Write main table.
92 data_offset = HEADER_LENGTH + index_length 124 index_by_id = {}
93 for id in ids: 125 deduped_data = []
94 ret.append(struct.pack('<HI', id, data_offset)) 126 index = 0
95 data_offset += len(resources[id]) 127 for resource_id in resource_ids:
128 if resource_id in alias_map:
129 continue
130 data = resources[resource_id]
131 index_by_id[resource_id] = index
132 ret.append(struct.pack('<HI', resource_id, data_offset))
133 data_offset += len(data)
134 deduped_data.append(data)
135 index += 1
96 136
137 assert index == resource_count
138 # Add an extra entry at the end.
97 ret.append(struct.pack('<HI', 0, data_offset)) 139 ret.append(struct.pack('<HI', 0, data_offset))
98 140
141 # Write alias table.
142 for resource_id in sorted(alias_map):
143 index = index_by_id[alias_map[resource_id]]
144 ret.append(struct.pack('<HH', resource_id, index))
145
99 # Write data. 146 # Write data.
100 for id in ids: 147 ret.extend(deduped_data)
101 ret.append(resources[id])
102 return ''.join(ret) 148 return ''.join(ret)
103 149
104 150
105 def WriteDataPack(resources, output_file, encoding): 151 def WriteDataPack(resources, output_file, encoding):
106 """Writes a map of id=>data into output_file as a data pack.""" 152 """Writes a map of id=>data into output_file as a data pack."""
107 content = WriteDataPackToString(resources, encoding) 153 content = WriteDataPackToString(resources, encoding)
108 with open(output_file, 'wb') as file: 154 with open(output_file, 'wb') as file:
109 file.write(content) 155 file.write(content)
110 156
111 157
(...skipping 90 matching lines...) Expand 10 before | Expand all | Expand 10 after
202 # Write a simple file. 248 # Write a simple file.
203 data = {1: '', 4: 'this is id 4', 6: 'this is id 6', 10: ''} 249 data = {1: '', 4: 'this is id 4', 6: 'this is id 6', 10: ''}
204 WriteDataPack(data, 'datapack1.pak', UTF8) 250 WriteDataPack(data, 'datapack1.pak', UTF8)
205 data2 = {1000: 'test', 5: 'five'} 251 data2 = {1000: 'test', 5: 'five'}
206 WriteDataPack(data2, 'datapack2.pak', UTF8) 252 WriteDataPack(data2, 'datapack2.pak', UTF8)
207 print 'wrote datapack1 and datapack2 to current directory.' 253 print 'wrote datapack1 and datapack2 to current directory.'
208 254
209 255
210 if __name__ == '__main__': 256 if __name__ == '__main__':
211 main() 257 main()
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698