Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(72)

Side by Side Diff: tools/grit/grit/format/data_pack.py

Issue 2969123002: Add deduplication logic to .pak files (Closed)
Patch Set: sizeof() Created 3 years, 5 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « build/android/resource_sizes.py ('k') | tools/grit/grit/format/data_pack_unittest.py » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show/Hide Comments ('s')
OLDNEW
1 #!/usr/bin/env python 1 #!/usr/bin/env python
2 # Copyright (c) 2012 The Chromium Authors. All rights reserved. 2 # Copyright (c) 2012 The Chromium Authors. All rights reserved.
3 # Use of this source code is governed by a BSD-style license that can be 3 # Use of this source code is governed by a BSD-style license that can be
4 # found in the LICENSE file. 4 # found in the LICENSE file.
5 5
6 """Support for formatting a data pack file used for platform agnostic resource 6 """Support for formatting a data pack file used for platform agnostic resource
7 files. 7 files.
8 """ 8 """
9 9
10 import collections 10 import collections
11 import exceptions 11 import exceptions
12 import os 12 import os
13 import struct 13 import struct
14 import sys 14 import sys
15 if __name__ == '__main__': 15 if __name__ == '__main__':
16 sys.path.append(os.path.join(os.path.dirname(__file__), '../..')) 16 sys.path.append(os.path.join(os.path.dirname(__file__), '../..'))
17 17
18 from grit import util 18 from grit import util
19 from grit.node import include 19 from grit.node import include
20 from grit.node import message 20 from grit.node import message
21 from grit.node import structure 21 from grit.node import structure
22 22
23 23
# Version of the .pak file format this module writes.  Version 5 replaces the
# version-4 header and adds an alias table for deduplicating identical
# resource payloads (see WriteDataPackToString / ReadDataPackFromString).
PACK_FILE_VERSION = 5
# Encoding of text resources stored in a pack file.
BINARY, UTF8, UTF16 = range(3)
28 26
29 27
class WrongFileVersion(Exception):
  """Raised when a pack file declares a version this reader cannot parse."""
32 30
33 31
class CorruptDataPack(Exception):
  """Raised when a pack file's contents are structurally invalid."""
34
35
# Parsed pack file:
#   resources: dict mapping resource id (int) -> raw resource data.
#   encoding: one of BINARY, UTF8, UTF16 for the pack's text resources.
DataPackContents = collections.namedtuple(
    'DataPackContents', 'resources encoding')
36 38
37 39
def Format(root, lang='en', output_dir='.'):
  """Writes out the data pack file format (platform agnostic resource file)."""
  # Node types whose payloads get packed into the .pak file.
  packable_types = (include.IncludeNode, message.MessageNode,
                    structure.StructureNode)
  resources = {}
  for node in root.ActiveDescendants():
    with node:
      if not isinstance(node, packable_types):
        continue
      resource_id, value = node.GetDataPackPair(lang, UTF8)
      if value is not None:
        resources[resource_id] = value
  return WriteDataPackToString(resources, UTF8)
49 51
50 52
def ReadDataPack(input_file):
  """Reads a data pack file from disk and parses it.

  Args:
    input_file: Path of the .pak file to read.

  Returns:
    A DataPackContents namedtuple (resources dict, encoding).
  """
  return ReadDataPackFromString(util.ReadFile(input_file, util.BINARY))
56
def ReadDataPackFromString(data):
  """Reads a data pack file and returns a dictionary.

  Understands two header layouts:
    version 4: uint32 version, uint32 resource count, uint8 encoding.
    version 5: uint32 version, uint8 encoding, 3 pad bytes, uint16 resource
               count, uint16 alias count (aliases deduplicate payloads).

  Args:
    data: Raw contents of a .pak file.

  Returns:
    A DataPackContents namedtuple (resources dict, encoding).

  Raises:
    WrongFileVersion: If the version field is neither 4 nor 5.
  """
  original_data = data

  # Read the header.
  version = struct.unpack('<I', data[:4])[0]
  if version == 4:
    resource_count, encoding = struct.unpack('<IB', data[4:9])
    alias_count = 0  # Version 4 has no alias table.
    data = data[9:]
  elif version == 5:
    # The 'xxx' pad bytes keep the uint16 fields aligned.
    encoding, resource_count, alias_count = struct.unpack('<BxxxHH', data[4:12])
    data = data[12:]
  else:
    raise WrongFileVersion('Found version: ' + str(version))

  resources = {}
  kIndexEntrySize = 2 + 4  # Each entry is a uint16 and a uint32.
  def entry_at_index(idx):
    offset = idx * kIndexEntrySize
    return struct.unpack('<HI', data[offset:offset + kIndexEntrySize])

  # The index has resource_count + 1 entries; the extra sentinel entry only
  # delimits the end of the last resource, so each payload is the span
  # between consecutive offsets.
  prev_resource_id, prev_offset = entry_at_index(0)
  # NOTE: range (not Py2-only xrange) iterates identically here and keeps
  # this function portable to Python 3.
  for i in range(1, resource_count + 1):
    resource_id, offset = entry_at_index(i)
    resources[prev_resource_id] = original_data[prev_offset:offset]
    prev_resource_id, prev_offset = resource_id, offset

  # Read the alias table: each entry maps an aliased resource id to the
  # main-table *index* of the entry that owns the shared payload.
  alias_data = data[(resource_count + 1) * kIndexEntrySize:]
  kAliasEntrySize = 2 + 2  # uint16, uint16
  def alias_at_index(idx):
    offset = idx * kAliasEntrySize
    return struct.unpack('<HH', alias_data[offset:offset + kAliasEntrySize])

  for i in range(alias_count):
    resource_id, index = alias_at_index(i)
    aliased_id = entry_at_index(index)[0]
    resources[resource_id] = resources[aliased_id]

  return DataPackContents(resources, encoding)
76 98
77 99
def WriteDataPackToString(resources, encoding):
  """Returns a string with a map of id=>data in the data pack format.

  Identical payloads are stored once: every duplicate becomes an alias-table
  entry pointing at the main-table index of the canonical (lowest-id) copy.

  Args:
    resources: Dict mapping resource id (int) -> resource data (byte string).
    encoding: One of BINARY, UTF8, UTF16.

  Returns:
    The serialized version-5 pack file contents.
  """
  ret = []

  # Compute alias map.
  resource_ids = sorted(resources)
  # Use reversed() so that for duplicates lower IDs clobber higher ones.
  id_by_data = {resources[k]: k for k in reversed(resource_ids)}
  # Map of resource_id -> resource_id, where value < key.
  # (items() instead of Py2-only iteritems() keeps this Python-3 portable.)
  alias_map = {k: id_by_data[v] for k, v in resources.items()
               if id_by_data[v] != k}

  # Write file header.
  resource_count = len(resources) - len(alias_map)
  # Padding bytes ('xxx') added for alignment of the uint16 fields.
  ret.append(struct.pack('<IBxxxHH', PACK_FILE_VERSION, encoding,
                         resource_count, len(alias_map)))
  HEADER_LENGTH = 4 + 4 + 2 + 2

  # Each main table entry is: uint16 + uint32 (and an extra entry at the end).
  # Each alias table entry is: uint16 + uint16.
  data_offset = HEADER_LENGTH + (resource_count + 1) * 6 + len(alias_map) * 4

  # Write main table: only non-aliased entries get an index slot and payload.
  index_by_id = {}
  deduped_data = []
  index = 0
  for resource_id in resource_ids:
    if resource_id in alias_map:
      continue
    data = resources[resource_id]
    index_by_id[resource_id] = index
    ret.append(struct.pack('<HI', resource_id, data_offset))
    data_offset += len(data)
    deduped_data.append(data)
    index += 1

  assert index == resource_count
  # Add an extra entry at the end (sentinel delimiting the last payload).
  ret.append(struct.pack('<HI', 0, data_offset))

  # Write alias table, sorted by aliased id so readers can binary-search.
  for resource_id in sorted(alias_map):
    index = index_by_id[alias_map[resource_id]]
    ret.append(struct.pack('<HH', resource_id, index))

  # Write data.  b''.join == ''.join on Python 2 byte strings, and keeps the
  # output bytes (rather than silently promoting to unicode) elsewhere.
  ret.extend(deduped_data)
  return b''.join(ret)
103 149
104 150
def WriteDataPack(resources, output_file, encoding):
  """Writes a map of id=>data into output_file as a data pack."""
  serialized = WriteDataPackToString(resources, encoding)
  # 'out' rather than 'file' avoids shadowing the builtin.
  with open(output_file, 'wb') as out:
    out.write(serialized)
110 156
111 157
(...skipping 90 matching lines...) Expand 10 before | Expand all | Expand 10 after
202 # Write a simple file. 248 # Write a simple file.
203 data = {1: '', 4: 'this is id 4', 6: 'this is id 6', 10: ''} 249 data = {1: '', 4: 'this is id 4', 6: 'this is id 6', 10: ''}
204 WriteDataPack(data, 'datapack1.pak', UTF8) 250 WriteDataPack(data, 'datapack1.pak', UTF8)
205 data2 = {1000: 'test', 5: 'five'} 251 data2 = {1000: 'test', 5: 'five'}
206 WriteDataPack(data2, 'datapack2.pak', UTF8) 252 WriteDataPack(data2, 'datapack2.pak', UTF8)
207 print 'wrote datapack1 and datapack2 to current directory.' 253 print 'wrote datapack1 and datapack2 to current directory.'
208 254
209 255
# Script entry point: exercises the pack writer when run directly (see main()).
if __name__ == '__main__':
  main()
OLDNEW
« no previous file with comments | « build/android/resource_sizes.py ('k') | tools/grit/grit/format/data_pack_unittest.py » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698