Chromium Code Reviews

Unified Diff: tools/grit/grit/format/data_pack.py

Issue 2969123002: Add deduplication logic to .pak files (Closed)
Patch Set: fix resource_sizes computation Created 3 years, 5 months ago
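For context, the header change at the core of this patch, shown as a standalone sketch (hypothetical example, inferred from the struct format strings in the diff below):

    import struct

    # Version 4 header: uint32 version, uint32 resource count, uint8 encoding
    # (9 bytes total).
    v4_header = struct.pack('<IIB', 4, 3, 1)
    assert struct.unpack('<IB', v4_header[4:9]) == (3, 1)

    # Version 5 header: four uint16s - version, encoding, resource count,
    # alias count (8 bytes total). The new alias count is what enables
    # deduplication.
    v5_header = struct.pack('<HHHH', 5, 1, 3, 2)
    assert struct.unpack('<HHH', v5_header[2:8]) == (1, 3, 2)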
Index: tools/grit/grit/format/data_pack.py
diff --git a/tools/grit/grit/format/data_pack.py b/tools/grit/grit/format/data_pack.py
index f9bfc845f11b78ac8f69294026dd7233efa479d6..dd9ec170a16a9d7494da6a662a9fd08ae12b5c07 100755
--- a/tools/grit/grit/format/data_pack.py
+++ b/tools/grit/grit/format/data_pack.py
@@ -21,9 +21,7 @@ from grit.node import message
from grit.node import structure
-PACK_FILE_VERSION = 4
-HEADER_LENGTH = 2 * 4 + 1 # Two uint32s. (file version, number of entries) and
- # one uint8 (encoding of text resources)
+PACK_FILE_VERSION = 5
BINARY, UTF8, UTF16 = range(3)
@@ -31,6 +29,10 @@ class WrongFileVersion(Exception):
pass
+class CorruptDataPack(Exception):
+ pass
+
+
DataPackContents = collections.namedtuple(
'DataPackContents', 'resources encoding')
@@ -49,56 +51,100 @@ def Format(root, lang='en', output_dir='.'):
def ReadDataPack(input_file):
+ return ReadDataPackFromString(util.ReadFile(input_file, util.BINARY))
+
+
+def ReadDataPackFromString(data):
"""Reads a data pack file and returns a dictionary."""
- data = util.ReadFile(input_file, util.BINARY)
original_data = data
# Read the header.
- version, num_entries, encoding = struct.unpack('<IIB', data[:HEADER_LENGTH])
- if version != PACK_FILE_VERSION:
- print 'Wrong file version in ', input_file
- raise WrongFileVersion
+ version = struct.unpack('<H', data[:2])[0]
+ if version == 4:
+ resource_count, encoding = struct.unpack('<IB', data[4:9])
+ alias_count = 0
+ data = data[9:]
+ elif version == 5:
+ encoding, resource_count, alias_count = struct.unpack('<HHH', data[2:8])
+ data = data[8:]
+ else:
+ raise WrongFileVersion('Found version: ' + str(version))
resources = {}
- if num_entries == 0:
- return DataPackContents(resources, encoding)
-
- # Read the index and data.
- data = data[HEADER_LENGTH:]
kIndexEntrySize = 2 + 4 # Each entry is a uint16 and a uint32.
- for _ in range(num_entries):
- id, offset = struct.unpack('<HI', data[:kIndexEntrySize])
- data = data[kIndexEntrySize:]
- next_id, next_offset = struct.unpack('<HI', data[:kIndexEntrySize])
- resources[id] = original_data[offset:next_offset]
+ def entry_at_index(idx):
+ offset = idx * kIndexEntrySize
+ return struct.unpack('<HI', data[offset:offset + kIndexEntrySize])
+
+ # Read the main table in reverse so that prev_offset > offset.
+ prev_offset = entry_at_index(resource_count)[1]
flackr 2017/07/07 18:54:12 It's a little strange to not read in a forwards direction.
agrieve 2017/07/07 20:47:08 ah, yeah, that's nicer :)
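For reference, a forward-reading variant along the lines suggested above might look like the following. This is a hypothetical sketch and the follow-up patch set may differ; it reads each entry plus its successor so prev_offset is never needed:

    for i in xrange(resource_count):
        resource_id, offset = entry_at_index(i)
        next_offset = entry_at_index(i + 1)[1]  # end marker bounds the last item
        resources[resource_id] = original_data[offset:next_offset]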
+ for i in xrange(resource_count - 1, -1, -1):
+ resource_id, offset = entry_at_index(i)
+ resources[resource_id] = original_data[offset:prev_offset]
+ prev_offset = offset
+
+ # Read the alias table.
+ alias_data = data[(resource_count + 1) * kIndexEntrySize:]
+ kAliasEntrySize = 2 + 2 # uint16, uint16
+ def alias_at_index(idx):
+ offset = idx * kAliasEntrySize
+ return struct.unpack('<HH', alias_data[offset:offset + kAliasEntrySize])
+
+ for i in xrange(alias_count):
+ resource_id, index = alias_at_index(i)
+ aliased_id = entry_at_index(index)[0]
+ resources[resource_id] = resources[aliased_id]
return DataPackContents(resources, encoding)
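As a quick illustration of the version 5 parsing above, a hand-assembled pack with one resource and one alias (hypothetical standalone sketch, assuming ReadDataPackFromString as defined in this patch):

    import struct

    header = struct.pack('<HHHH', 5, 1, 1, 1)  # version 5, UTF8, 1 resource, 1 alias
    # Main table: one real entry plus the end marker. Data begins at
    # 8 (header) + 2 * 6 (main table) + 1 * 4 (alias table) = 24.
    main = struct.pack('<HI', 10, 24) + struct.pack('<HI', 0, 29)
    alias = struct.pack('<HH', 11, 0)          # id 11 aliases main-table index 0
    pack = header + main + alias + 'hello'

    contents = ReadDataPackFromString(pack)
    assert contents.resources[10] == contents.resources[11] == 'hello'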
def WriteDataPackToString(resources, encoding):
"""Returns a string with a map of id=>data in the data pack format."""
- ids = sorted(resources.keys())
ret = []
- # Write file header.
- ret.append(struct.pack('<IIB', PACK_FILE_VERSION, len(ids), encoding))
- HEADER_LENGTH = 2 * 4 + 1 # Two uint32s and one uint8.
-
- # Each entry is a uint16 + a uint32s. We have one extra entry for the last
- # item.
- index_length = (len(ids) + 1) * (2 + 4)
-
- # Write index.
- data_offset = HEADER_LENGTH + index_length
- for id in ids:
- ret.append(struct.pack('<HI', id, data_offset))
- data_offset += len(resources[id])
+ # Compute alias map.
+ resource_ids = sorted(resources)
+ id_by_data = {resources[k]: k for k in reversed(resource_ids)}
flackr 2017/07/07 18:54:12 Comment why we use reversed - presumably want lower ids to win.
agrieve 2017/07/07 20:47:08 Done.
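A tiny illustration of the reversed() point, as a hypothetical standalone snippet: later keys win in a dict comprehension, so iterating the sorted ids in reverse leaves the lowest id as the canonical one for each distinct payload:

    resources = {1: 'x', 2: 'x', 3: 'y'}
    id_by_data = {resources[k]: k for k in reversed(sorted(resources))}
    assert id_by_data == {'x': 1, 'y': 3}  # lowest id wins for duplicated data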
+ # Map of resource_id -> resource_id, where value < key.
+ alias_map = {k: id_by_data[v] for k, v in resources.iteritems()
+ if id_by_data[v] != k}
+ # Write file header.
+ resource_count = len(resources) - len(alias_map)
+ # Note: 2nd and 4th byte are always 0 since version and encoding < 256.
flackr 2017/07/07 18:54:12 Version I could see may someday be > 256, I can ju…
agrieve 2017/07/07 20:47:08 Expanded the comment. I just speculated that thing…
flackr 2017/07/10 14:07:49 I suppose that could help, but it's better IMO to…
agrieve 2017/07/18 19:30:29 Done.
+ ret.append(struct.pack('<HHHH', PACK_FILE_VERSION, encoding,
+ resource_count, len(alias_map)))
+ HEADER_LENGTH = 2 + 2 + 2 + 2
+
+ # Each main table entry is a uint16 + a uint32.
flackr 2017/07/07 18:54:12 Preserve the comment that we have an extra entry for the last item.
agrieve 2017/07/07 20:47:08 Done.
+ # Each alias table entry is a uint16 + a uint16.
+ data_offset = HEADER_LENGTH + (resource_count + 1) * 6 + len(alias_map) * 4
+
+ # Write main table.
+ index_by_id = {}
+ deduped_data = []
+ index = 0
+ for resource_id in resource_ids:
+ if resource_id in alias_map:
+ continue
+ data = resources[resource_id]
+ index_by_id[resource_id] = index
+ ret.append(struct.pack('<HI', resource_id, data_offset))
+ data_offset += len(data)
+ deduped_data.append(data)
+ index += 1
+
+ assert index == resource_count
+ # Add an extra entry at the end.
ret.append(struct.pack('<HI', 0, data_offset))
+ # Write alias table.
+ for resource_id in sorted(alias_map):
+ index = index_by_id[alias_map[resource_id]]
+ ret.append(struct.pack('<HH', resource_id, index))
+
# Write data.
- for id in ids:
- ret.append(resources[id])
+ ret.extend(deduped_data)
return ''.join(ret)
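A round-trip sketch of the deduplication as a whole (hypothetical example, assuming the WriteDataPackToString/ReadDataPackFromString definitions from this patch and the module-level UTF8 constant):

    resources = {1: 'shared', 2: 'shared', 3: 'unique'}
    packed = WriteDataPackToString(resources, UTF8)
    # The duplicated payload is stored once; id 2 becomes an alias of id 1.
    assert packed.count('shared') == 1
    assert ReadDataPackFromString(packed).resources == resources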
