build_tools/filter_data_for_size.py - Issue 1000163003: Generate the icu data binaries at compile time instead of checking in binaries

Unified Diff: build_tools/filter_data_for_size.py

Issue 1000163003: Generate the icu data binaries at compile time instead of checking in binaries Base URL: https://chromium.googlesource.com/chromium/deps/icu.git@master

Patch Set: Fixed warnings in cross compiling Created 5 years, 8 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: build_tools/filter_data_for_size.py

diff --git a/build_tools/filter_data_for_size.py b/build_tools/filter_data_for_size.py

new file mode 100644

index 0000000000000000000000000000000000000000..249a509c8b2c287d9871521e8bfc3960c4647f60

--- /dev/null

+++ b/build_tools/filter_data_for_size.py

@@ -0,0 +1,449 @@

+"""Rewrites data so that it becomes smaller after compilation."""

+from __future__ import print_function

+import argparse

+import os

+import shutil

+def _mergetree(source_dir, dest_dir):

+ files = os.listdir(source_dir)

+ for f in files:

+ src = os.path.join(source_dir, f)

+ dest = os.path.join(dest_dir, f)

+ if os.path.isdir(src):

+ copytree(src, dest)

+ else:

+ shutil.copy2(src, dest)

def copytree(source, dest):
    """Copy source to dest, merging into dest when it already exists.

    A non-directory source is copied with shutil.copy2. A directory source
    is merged entry by entry when dest is already a directory, and copied
    with shutil.copytree otherwise.
    """
    if not os.path.isdir(source):
        shutil.copy2(source, dest)
        return
    if os.path.isdir(dest):
        _mergetree(source, dest)
        return
    shutil.copytree(source, dest)

def main():
    """Parse the command line and write filtered or verbatim copies of the data.

    With ``--mode strip-for-size`` every word/brkitr/currency/locale/zone
    input is run through its filter. With ``--mode clean-copy`` each input
    is copied to its output unchanged.

    The lang directory is stripped only when strip-for-size is combined with
    ``--remove-data-already-existing-in-android``; in every other case it is
    copied as is.
    """
    parser = argparse.ArgumentParser(
        description=('Rewrites ICU data so that it becomes smaller after '
                     'compilation.'))
    parser.add_argument('--mode',
                        required=True,
                        choices=['clean-copy', 'strip-for-size'],
                        help='Whether the files should be changed or not.')
    parser.add_argument('--in-word-txt',
                        required=True,
                        help='The word.txt to filter')
    parser.add_argument('--out-word-txt',
                        required=True,
                        help='The filtered/copied word.txt')
    parser.add_argument('--in-brkitr-root-txt',
                        required=True,
                        help='The brkitr/root.txt to filter')
    parser.add_argument('--out-brkitr-root-txt',
                        required=True,
                        help='The filtered/copied brkitr/root.txt')
    parser.add_argument('--in-brkitr-ja-txt',
                        required=True,
                        help='The brkitr/ja.txt to filter')
    parser.add_argument('--out-brkitr-ja-txt',
                        required=True,
                        help='The filtered/copied brkitr/ja.txt')
    parser.add_argument('--currency-keep-list',
                        required=True,
                        help="The file with currencies to keep")
    parser.add_argument('--in-curr-dir',
                        required=True,
                        help="The currency data files")
    parser.add_argument('--out-curr-dir',
                        required=True,
                        help="The filtered/copied currency data files")
    parser.add_argument('--minimize-language-list',
                        required=True,
                        help=("Comma separated list of languages (locales) " +
                              "to minimize the data for."))
    parser.add_argument('--in-locales-dir',
                        required=True,
                        help="The locales data files")
    parser.add_argument('--out-locales-dir',
                        required=True,
                        help="The filtered/copied locales data files")
    parser.add_argument('--in-lang-dir',
                        required=True,
                        help="The lang data files")
    parser.add_argument('--out-lang-dir',
                        required=True,
                        help="The filtered/copied lang data files")
    parser.add_argument('--remove-data-already-existing-in-android',
                        action="store_true",
                        help=("Removes data in lang/region/... that " +
                              "can be fetched from Android APIs"))
    parser.add_argument('--in-zone-dir',
                        required=True,
                        help="The zone data files")
    parser.add_argument('--out-zone-dir',
                        required=True,
                        help="The filtered/copied zone data files")
    args = parser.parse_args()

    if args.mode == 'strip-for-size':
        fix_word_txt(args.in_word_txt, args.out_word_txt)
        fix_brkitr_root_txt(args.in_brkitr_root_txt, args.out_brkitr_root_txt)
        fix_brkitr_ja_txt(args.in_brkitr_ja_txt, args.out_brkitr_ja_txt)
        fix_currencies(args.currency_keep_list,
                       args.in_curr_dir,
                       args.out_curr_dir)
        minimize_data_for_locales(args.in_locales_dir,
                                  args.out_locales_dir,
                                  args.minimize_language_list)
        remove_zone_example_cities(args.in_zone_dir, args.out_zone_dir)
    else:
        assert args.mode == 'clean-copy'
        shutil.copyfile(args.in_word_txt, args.out_word_txt)
        shutil.copyfile(args.in_brkitr_root_txt, args.out_brkitr_root_txt)
        # The ja output must come from the ja input; copying root.txt here
        # would silently overwrite ja.txt with the wrong rules.
        shutil.copyfile(args.in_brkitr_ja_txt, args.out_brkitr_ja_txt)
        copytree(args.in_curr_dir, args.out_curr_dir)
        copytree(args.in_locales_dir, args.out_locales_dir)
        copytree(args.in_zone_dir, args.out_zone_dir)

    if (args.mode == 'strip-for-size' and
            args.remove_data_already_existing_in_android):
        strip_android_langs(args.in_lang_dir, args.out_lang_dir)
    else:
        # Reached for clean-copy and also for strip-for-size without the
        # Android flag, so no mode assertion here: the flag is optional and
        # the lang data must still be produced.
        copytree(args.in_lang_dir, args.out_lang_dir)

def fix_word_txt(source, dest):
    """Write a copy of the word.txt at source to dest with the CJK rules removed.

    The per-line rewriting (dictionaryCJK, KanaKanji and HangulSyllable
    references) is done by _process_word_txt.
    """
    with open(source) as reader, open(dest, "w") as writer:
        _process_word_txt(reader, writer)

# (pattern, replacement) pairs applied to each line of word.txt by
# _process_word_txt. The first pattern found in a line wins. A replacement
# of None means the whole line is dropped.
WORD_TXT_REPLACEMENTS = (
    ("$ALetter-$dictionaryCJK", "$ALetter"),
    ("$ComplexContext $dictionaryCJK", "$ComplexContext"),
    ("$dictionaryCJK", None),
    ("special handling for CJK", None),
    ("$HangulSyllable $HangulSyllable", None),
    ("$KanaKanji $KanaKanji", None),
)

def _process_word_txt(in_file, out):
    """Copy in_file to out, applying WORD_TXT_REPLACEMENTS line by line.

    Only the first pattern that occurs in a line is applied. A pattern whose
    replacement is None drops the line entirely.
    """
    for text in in_file:
        for pattern, substitute in WORD_TXT_REPLACEMENTS:
            if pattern not in text:
                continue
            if substitute is None:
                # Delete line.
                text = None
            else:
                text = text.replace(pattern, substitute)
            break
        if text is not None:
            out.write(text)

def fix_brkitr_root_txt(source, dest):
    """Copy source to dest, dropping every line that mentions cjdict.dict.

    Asserts that at least one such line was found, so an input that no
    longer references cjdict.dict is reported rather than silently copied.
    """
    found_cjdict = False
    with open(source) as reader, open(dest, "w") as writer:
        for text in reader:
            if "cjdict.dict" in text:
                found_cjdict = True
            else:
                writer.write(text)
    assert found_cjdict

def fix_brkitr_ja_txt(source, dest):
    """Copy source to dest with the line_ja.brk dependency rewritten.

    Each occurrence of the line_ja.brk dependency entry is replaced by the
    word_ja.brk entry. Asserts that at least one line was rewritten.
    """
    old_entry = 'line:process(dependency){"line_ja.brk"}'
    new_entry = 'word:process(dependency){"word_ja.brk"}'
    rewritten = False
    with open(source) as reader, open(dest, "w") as writer:
        for text in reader:
            if old_entry in text:
                rewritten = True
                text = text.replace(old_entry, new_entry)
            writer.write(text)
    assert rewritten

+def _print_block(line, in_file, out):

+ indentation_count = 0

+ for char in line:

+ if char != " ":

+ break

+ indentation_count += 1

+ indentation = " " * indentation_count

+ if out is not None:

+ out.write(line)

+ if "}" in line:

+ return

+ line = in_file.next()

+ while not line.startswith(indentation + "}"):

+ if out is not None:

+ out.write(line)

+ line = in_file.next()

+ if out is not None:

+ out.write(line)

def _skip_block(line, in_file):
    """Consume the block opened by line from in_file without writing it."""
    _print_block(line, in_file, out=None)

def _print_or_skip_block(line, in_file, out, keep):
    """Consume the block opened by line; copy it to out only when keep is true."""
    _print_block(line, in_file, out if keep else None)

+def _block_name(line):

+ assert "{" in line

+ return line.split("{")[0].lstrip()

+def _copy_start_of_file(in_file, out, locale_name):

+ line = in_file.next()

+ while not line.startswith("%s{" % locale_name):

+ out.write(line)

+ line = in_file.next()

+ out.write(line)

def fix_currencies(keep_list_file, in_curr_dir, out_curr_dir):
    """Write a copy of every currency file with unlisted currencies removed.

    keep_list_file holds whitespace-separated currency names to retain.
    Files that are not .txt, and supplementalData.txt, are copied verbatim.
    Every other file is filtered by _process_currency_file, with the file
    name minus its extension used as the locale name.
    """
    with open(keep_list_file) as keep_list:
        wanted = set(keep_list.read().split())

    for name in os.listdir(in_curr_dir):
        src_path = os.path.join(in_curr_dir, name)
        dst_path = os.path.join(out_curr_dir, name)
        filterable = (name.endswith(".txt") and
                      name != "supplementalData.txt")
        if filterable:
            locale = os.path.splitext(name)[0]
            with open(src_path) as reader, open(dst_path, "w") as writer:
                _process_currency_file(reader, writer, locale, wanted)
        else:
            # For instance pool.res.
            shutil.copyfile(src_path, dst_path)

def _process_currency_file(in_file, out, locale_name, currencies_to_keep):
    """Copy one currency file, keeping only the currencies in currencies_to_keep.

    Args:
        in_file: Line iterator over the source file.
        out: Object with a write() method receiving the filtered text.
        locale_name: Name used to find the "<locale_name>{" header line.
        currencies_to_keep: Container of currency names to retain.

    Inside the Currencies, Currencies%narrow and CurrencyPlurals blocks only
    the sub-blocks named in currencies_to_keep are written. Of the other
    blocks, only those on the fixed allow-list below are written. Lines
    without "{" are copied as they are. Processing ends when in_file is
    exhausted.
    """
    currency_blocks = ("Currencies",
                       "Currencies%narrow",
                       "CurrencyPlurals")
    other_blocks_to_keep = ('"%%ALIAS"',
                            "%%Parent",
                            "currencyMap",
                            "CurrencyMap",
                            "currencyMeta",
                            "CurrencyMeta",
                            "currencySpacing",
                            "CurrencySpacing",
                            "currencyUnitPatterns",
                            "CurrencyUnitPatterns",
                            "Version")
    try:
        _copy_start_of_file(in_file, out, locale_name)
        while True:  # Until StopIteration exception.
            # next() rather than the Python 2-only .next() method.
            line = next(in_file)
            if "{" not in line:
                out.write(line)
                continue
            block_name = _block_name(line)
            if block_name not in currency_blocks:
                keep = block_name in other_blocks_to_keep
                _print_or_skip_block(line, in_file, out, keep)
                continue
            # Keep only certain currencies.
            out.write(line)
            line = next(in_file)
            # NOTE(review): the " }" prefix is reproduced as it appears in
            # the reviewed text; confirm it matches the indentation the data
            # files use for the end of a top-level block.
            while not line.startswith(" }"):
                if "{" in line:
                    currency_name = _block_name(line)
                    keep_curr = currency_name in currencies_to_keep
                    _print_or_skip_block(line, in_file, out, keep_curr)
                else:
                    out.write(line)
                line = next(in_file)
            out.write(line)
    except StopIteration:
        # End of input is the normal way out of the loop above.
        pass

def is_interesting_calendar(locale_name, calendar_name):
    """Return True if calendar_name should be kept for locale_name.

    "generic" and "gregorian" are always kept. Otherwise the calendar is
    kept when it is the one mapped to the locale or to any of its
    "_"-separated prefixes (for "zh_Hant_TW": "zh", "zh_Hant", "zh_Hant_TW").
    """
    if calendar_name in ("generic", "gregorian"):
        return True

    # NOTE(review): the "ar" entry maps to "arabic"; confirm that this is
    # the calendar name actually used in the data files.
    calendar_by_locale = {
        "th": "buddhist",
        "zh": "chinese",
        "zh_Hant": "roc",
        "ko": "dangi",
        "am": "ethiopic",
        "he": "hebrew",
        "ar": "arabic",
        "fa": "persian",
        "ja": "japanese",
    }
    parts = locale_name.split("_")
    prefixes = ("_".join(parts[:end]) for end in range(1, len(parts) + 1))
    return any(calendar_by_locale.get(prefix) == calendar_name
               for prefix in prefixes)

def minimize_data_for_locales(locales_in_dir, locales_out_dir,
                              languages_to_minimize_str):
    """Write a filtered copy of every locale file in locales_in_dir.

    languages_to_minimize_str is a comma-separated list of locale names.
    Files that are not .txt are copied verbatim. Every .txt file is filtered
    by _process_locale_file, with the file name minus its extension used as
    the locale name.
    """
    to_minimize = set(languages_to_minimize_str.split(","))
    for name in os.listdir(locales_in_dir):
        src_path = os.path.join(locales_in_dir, name)
        dst_path = os.path.join(locales_out_dir, name)
        if name.endswith(".txt"):
            locale = os.path.splitext(name)[0]
            with open(src_path) as reader, open(dst_path, "w") as writer:
                _process_locale_file(reader, writer, locale, to_minimize)
        else:
            # For instance pool.res.
            shutil.copyfile(src_path, dst_path)

def _process_locale_file(in_file, out, locale_name, languages_to_minimize):
    """Copy one locale file, trimming calendars or most blocks.

    Args:
        in_file: Line iterator over the source file.
        out: Object with a write() method receiving the filtered text.
        locale_name: Name used to find the "<locale_name>{" header line.
        languages_to_minimize: Container of locale names to minimize.

    For a locale that is not minimized, every block is kept, except that
    inside "calendar" only the calendars accepted by is_interesting_calendar
    are written. For a minimized locale, only the blocks on the short
    allow-list below are written. Lines without "{" are copied as they are.
    Processing ends when in_file is exhausted.
    """
    minimize = locale_name in languages_to_minimize
    blocks_kept_when_minimized = ('"%%ALIAS"',
                                  "AuxExemplarCharacters",
                                  "ExemplarCharacters",
                                  "LocaleScript",
                                  "layout",
                                  "Version")
    try:
        _copy_start_of_file(in_file, out, locale_name)
        while True:  # Until StopIteration exception.
            # Keep all blocks after "Version" but only a few
            # blocks before that.
            # next() rather than the Python 2-only .next() method.
            line = next(in_file)
            if "{" not in line:
                out.write(line)
                continue
            block_name = _block_name(line)
            if not minimize and block_name == "calendar":
                out.write(line)
                line = next(in_file)
                # NOTE(review): the " }" prefix is reproduced as it appears
                # in the reviewed text; confirm it matches the indentation
                # the data files use for the end of a top-level block.
                while not line.startswith(" }"):
                    if "{" in line:
                        calendar_name = _block_name(line)
                        keep_cal = is_interesting_calendar(locale_name,
                                                           calendar_name)
                        _print_or_skip_block(line, in_file, out, keep_cal)
                    else:
                        out.write(line)
                    line = next(in_file)
                out.write(line)
                continue
            # Do not include '%%Parent' line on purpose.
            keep = not minimize or block_name in blocks_kept_when_minimized
            _print_or_skip_block(line, in_file, out, keep)
    except StopIteration:
        # End of input is the normal way out of the loop above.
        pass
    # Note: patch_locale.sh also has code to strip only some calendars
    # but since the calendar block is removed above that makes no
    # difference.

def strip_android_langs(in_lang_dir, out_lang_dir):
    """Write a stripped copy of every lang file in in_lang_dir.

    Files that are not .txt are copied verbatim. Every .txt file is filtered
    by _process_lang_file, with the file name minus its extension used as
    the locale name.
    """
    for name in os.listdir(in_lang_dir):
        src_path = os.path.join(in_lang_dir, name)
        dst_path = os.path.join(out_lang_dir, name)
        if name.endswith(".txt"):
            locale = os.path.splitext(name)[0]
            with open(src_path) as reader, open(dst_path, "w") as writer:
                _process_lang_file(reader, writer, locale)
        else:
            # For instance pool.res
            shutil.copyfile(src_path, dst_path)

def _process_lang_file(in_file, out, locale_name):
    """Copy one lang file, dropping most language and script display data.

    Args:
        in_file: Line iterator over the source file.
        out: Object with a write() method receiving the filtered text.
        locale_name: Name used to find the "<locale_name>{" header line.

    Inside "Languages" only lines starting with "zh{" are kept, and inside
    "Scripts" only lines starting with "Hans{" or "Hant{" (after stripping
    surrounding whitespace). The blocks listed in blocks_to_delete are
    removed completely. All other blocks, and lines without "{", are copied.
    Processing ends when in_file is exhausted.
    """
    entries_to_keep = {
        "Languages": ("zh{",),
        "Scripts": ("Hans{", "Hant{"),
    }
    blocks_to_delete = ("Keys",
                        "LanguagesShort",
                        "Types",
                        "Variants",
                        "calendar",
                        "codePatterns",
                        "localeDisplayPattern")
    try:
        _copy_start_of_file(in_file, out, locale_name)
        while True:  # Until StopIteration exception.
            # next() rather than the Python 2-only .next() method.
            line = next(in_file)
            if "{" not in line:
                out.write(line)
                continue
            block_name = _block_name(line)
            if block_name not in entries_to_keep:
                # Delete the whole block if it is on the delete list.
                keep = block_name not in blocks_to_delete
                _print_or_skip_block(line, in_file, out, keep)
                continue
            out.write(line)
            parts_to_keep = entries_to_keep[block_name]
            line = next(in_file)
            # NOTE(review): the " }" prefix is reproduced as it appears in
            # the reviewed text; confirm it matches the indentation the data
            # files use for the end of a top-level block.
            while not line.startswith(" }"):
                # str.startswith accepts a tuple of alternatives.
                if line.strip().startswith(parts_to_keep):
                    out.write(line)
                line = next(in_file)
            out.write(line)
    except StopIteration:
        # End of input is the normal way out of the loop above.
        pass

def remove_zone_example_cities(in_zone_dir, out_zone_dir):
    """Write a copy of every zone file without the pre-"meta:" zoneStrings entries.

    For each file, lines are copied up to and including the zoneStrings
    line. Lines are then dropped until the first line with a "meta:" name,
    and from that line on everything is copied again. root.txt is copied
    in full.
    """
    for name in os.listdir(in_zone_dir):
        locale = os.path.splitext(name)[0]
        src_path = os.path.join(in_zone_dir, name)
        dst_path = os.path.join(out_zone_dir, name)
        with open(src_path) as reader, open(dst_path, "w") as writer:
            # "head" = before zoneStrings (keep), "gap" = entries to drop,
            # "tail" = from the first "meta:" line onwards (keep).
            # Keep everything in root.txt.
            stage = "tail" if locale == "root" else "head"
            # NOTE(review): the " zoneStrings" and " \"meta:" prefixes are
            # reproduced as they appear in the reviewed text; confirm they
            # match the indentation used by the zone data files.
            for text in reader:
                if stage == "head":
                    writer.write(text)
                    if text.startswith(" zoneStrings"):
                        stage = "gap"
                    continue
                if stage == "gap":
                    if not text.startswith(" \"meta:"):
                        continue
                    stage = "tail"
                writer.write(text)

# Run the filter only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

« no previous file with comments | « build_tools/copy_rename_file.py ('k') | build_tools/filter_data_for_size_unittest.py » ('j') | no next file with comments »