| Index: build_tools/filter_data_for_size.py
|
| diff --git a/build_tools/filter_data_for_size.py b/build_tools/filter_data_for_size.py
|
| new file mode 100644
|
| index 0000000000000000000000000000000000000000..249a509c8b2c287d9871521e8bfc3960c4647f60
|
| --- /dev/null
|
| +++ b/build_tools/filter_data_for_size.py
|
| @@ -0,0 +1,449 @@
|
| +"""Rewrites data so that it becomes smaller after compilation."""
|
| +
|
| +from __future__ import print_function
|
| +
|
| +import argparse
|
| +import os
|
| +import shutil
|
| +
|
| +def _mergetree(source_dir, dest_dir):
|
| + files = os.listdir(source_dir)
|
| + for f in files:
|
| + src = os.path.join(source_dir, f)
|
| + dest = os.path.join(dest_dir, f)
|
| + if os.path.isdir(src):
|
| + copytree(src, dest)
|
| + else:
|
| + shutil.copy2(src, dest)
|
| +
|
def copytree(source, dest):
    """Copy source to dest, merging into dest when it already exists.

    Behaves like shutil.copytree, except that an existing dest directory
    is merged into (files with the same name are overwritten) and a
    non-directory source is copied as a single file.
    """
    if os.path.isdir(source):
        if os.path.isdir(dest):
            # Destination tree already exists: merge entry by entry.
            _mergetree(source, dest)
        else:
            shutil.copytree(source, dest)
        return
    shutil.copy2(source, dest)
|
| +
|
def _build_arg_parser():
    """Build the command-line parser used by main()."""
    parser = argparse.ArgumentParser(
        description=('Generates an icudata.lst file to be compiled by icu.'))

    parser.add_argument('--mode',
                        required=True,
                        choices=['clean-copy', 'strip-for-size'],
                        help='Whether the files should be changed or not.')

    parser.add_argument('--in-word-txt',
                        required=True,
                        help='The word.txt to filter')

    parser.add_argument('--out-word-txt',
                        required=True,
                        help='The word.txt to filter')

    parser.add_argument('--in-brkitr-root-txt',
                        required=True,
                        help='The brkitr/root.txt to filter')

    parser.add_argument('--out-brkitr-root-txt',
                        required=True,
                        help='The brkitr/root.txt to filter')

    parser.add_argument('--in-brkitr-ja-txt',
                        required=True,
                        help='The brkitr/ja.txt to filter')

    parser.add_argument('--out-brkitr-ja-txt',
                        required=True,
                        help='The brkitr/ja.txt to filter')

    parser.add_argument('--currency-keep-list',
                        required=True,
                        help="The file with currencies to keep")

    parser.add_argument('--in-curr-dir',
                        required=True,
                        help="The currency data files")

    parser.add_argument('--out-curr-dir',
                        required=True,
                        help="The filtered/copied currency data files")

    parser.add_argument('--minimize-language-list',
                        required=True,
                        help=("Comma separated list of languages (locales) " +
                              "to minimize the data for."))

    parser.add_argument('--in-locales-dir',
                        required=True,
                        help="The locales data files")

    parser.add_argument('--out-locales-dir',
                        required=True,
                        help="The filtered/copied locales data files")

    parser.add_argument('--in-lang-dir',
                        required=True,
                        help="The lang data files")

    parser.add_argument('--out-lang-dir',
                        required=True,
                        help="The filtered/copied lang data files")

    parser.add_argument('--remove-data-already-existing-in-android',
                        action="store_true",
                        help=("Removes data in lang/region/... that " +
                              "can be fetched from Android APIs"))

    parser.add_argument('--in-zone-dir',
                        required=True,
                        help="The zone data files")

    parser.add_argument('--out-zone-dir',
                        required=True,
                        help="The filtered/copied zone data files")

    return parser


def main():
    """Parse the command line and filter or copy each data input.

    In 'strip-for-size' mode every input is run through its filter; in
    'clean-copy' mode every input is copied to its output unchanged.  The
    lang directory is only filtered when 'strip-for-size' is combined with
    --remove-data-already-existing-in-android; in every other case it is
    copied unchanged.
    """
    args = _build_arg_parser().parse_args()

    # argparse restricts --mode to 'clean-copy' or 'strip-for-size'.
    strip_for_size = args.mode == 'strip-for-size'

    if strip_for_size:
        fix_word_txt(args.in_word_txt, args.out_word_txt)
        fix_brkitr_root_txt(args.in_brkitr_root_txt, args.out_brkitr_root_txt)
        fix_brkitr_ja_txt(args.in_brkitr_ja_txt, args.out_brkitr_ja_txt)
        fix_currencies(args.currency_keep_list,
                       args.in_curr_dir,
                       args.out_curr_dir)
        minimize_data_for_locales(args.in_locales_dir,
                                  args.out_locales_dir,
                                  args.minimize_language_list)
        remove_zone_example_cities(args.in_zone_dir, args.out_zone_dir)
    else:
        shutil.copyfile(args.in_word_txt, args.out_word_txt)
        shutil.copyfile(args.in_brkitr_root_txt, args.out_brkitr_root_txt)
        # The ja output must come from the ja input (not from root.txt).
        shutil.copyfile(args.in_brkitr_ja_txt, args.out_brkitr_ja_txt)
        copytree(args.in_curr_dir, args.out_curr_dir)
        copytree(args.in_locales_dir, args.out_locales_dir)
        copytree(args.in_zone_dir, args.out_zone_dir)

    if strip_for_size and args.remove_data_already_existing_in_android:
        strip_android_langs(args.in_lang_dir, args.out_lang_dir)
    else:
        # Covers clean-copy and strip-for-size without the Android flag:
        # the lang data is still needed in the output directory.
        copytree(args.in_lang_dir, args.out_lang_dir)
|
| +
|
def fix_word_txt(source, dest):
    """Write a copy of source to dest without the dictionaryCJK,
    KanaKanji and HangulSyllable rules.

    The per-line rewriting is done by _process_word_txt().
    """
    with open(source) as in_file, open(dest, "w") as out:
        _process_word_txt(in_file, out)
|
| +
|
| +WORD_TXT_REPLACEMENTS = (
|
| + ("$ALetter-$dictionaryCJK", "$ALetter"),
|
| + ("$ComplexContext $dictionaryCJK", "$ComplexContext"),
|
| + ("$dictionaryCJK", None),
|
| + ("special handling for CJK", None),
|
| + ("$HangulSyllable $HangulSyllable", None),
|
| + ("$KanaKanji $KanaKanji", None),
|
| +)
|
| +
|
| +def _process_word_txt(in_file, out):
|
| + for line in in_file:
|
| + for replacement_pattern, replace_with in WORD_TXT_REPLACEMENTS:
|
| + if replacement_pattern in line:
|
| + if replace_with is None:
|
| + # Delete line.
|
| + line = None
|
| + else:
|
| + line = line.replace(replacement_pattern,
|
| + replace_with)
|
| + break
|
| + if line is not None:
|
| + out.write(line)
|
| +
|
def fix_brkitr_root_txt(source, dest):
    """Copy source to dest, dropping every line that mentions cjdict.dict.

    Raises:
        AssertionError: if no line of source contained "cjdict.dict".
            The output file is still written in that case.
    """
    found_cjdict = False
    with open(source) as in_file, open(dest, "w") as out:
        for line in in_file:
            if "cjdict.dict" in line:
                found_cjdict = True
                continue
            out.write(line)
    # Raised explicitly (instead of a bare assert) so the sanity check
    # still runs when Python is started with -O.
    if not found_cjdict:
        raise AssertionError(
            "no 'cjdict.dict' reference found in %s" % source)
|
| +
|
def fix_brkitr_ja_txt(source, dest):
    """Copy source to dest, replacing the line_ja.brk dependency entry
    with a word_ja.brk dependency entry.

    Raises:
        AssertionError: if no line of source contained the line_ja.brk
            dependency entry.  The output file is still written in that
            case.
    """
    old_entry = 'line:process(dependency){"line_ja.brk"}'
    new_entry = 'word:process(dependency){"word_ja.brk"}'
    found_entry = False
    with open(source) as in_file, open(dest, "w") as out:
        for line in in_file:
            if old_entry in line:
                line = line.replace(old_entry, new_entry)
                found_entry = True
            out.write(line)
    # Raised explicitly (instead of a bare assert) so the sanity check
    # still runs when Python is started with -O.
    if not found_entry:
        raise AssertionError(
            "no line_ja.brk dependency found in %s" % source)
|
| +
|
| +def _print_block(line, in_file, out):
|
| + indentation_count = 0
|
| + for char in line:
|
| + if char != " ":
|
| + break
|
| + indentation_count += 1
|
| + indentation = " " * indentation_count
|
| +
|
| + if out is not None:
|
| + out.write(line)
|
| + if "}" in line:
|
| + return
|
| +
|
| + line = in_file.next()
|
| + while not line.startswith(indentation + "}"):
|
| + if out is not None:
|
| + out.write(line)
|
| + line = in_file.next()
|
| + if out is not None:
|
| + out.write(line)
|
| +
|
def _skip_block(line, in_file):
    """Consume the block starting at line from in_file without writing it."""
    _print_block(line, in_file, out=None)
|
| +
|
def _print_or_skip_block(line, in_file, out, keep):
    """Consume one block from in_file; write it to out only if keep is true."""
    _print_block(line, in_file, out if keep else None)
|
| +
|
| +def _block_name(line):
|
| + assert "{" in line
|
| + return line.split("{")[0].lstrip()
|
| +
|
| +def _copy_start_of_file(in_file, out, locale_name):
|
| + line = in_file.next()
|
| + while not line.startswith("%s{" % locale_name):
|
| + out.write(line)
|
| + line = in_file.next()
|
| + out.write(line)
|
| +
|
def fix_currencies(keep_list_file, in_curr_dir, out_curr_dir):
    """Remove currencies that are not globally important.

    keep_list_file holds the whitespace-separated names of the currencies
    to keep.  Every ".txt" file of in_curr_dir except supplementalData.txt
    is filtered into out_curr_dir; all other files are copied unchanged.
    """
    with open(keep_list_file) as keep_list:
        wanted_currencies = set(keep_list.read().split())

    for file_name in os.listdir(in_curr_dir):
        src_path = os.path.join(in_curr_dir, file_name)
        dest_path = os.path.join(out_curr_dir, file_name)
        needs_filtering = (file_name.endswith(".txt") and
                           file_name != "supplementalData.txt")
        if not needs_filtering:
            # For instance pool.res.
            shutil.copyfile(src_path, dest_path)
            continue
        # The file name without its extension is the locale name.
        locale = os.path.splitext(file_name)[0]
        with open(src_path) as in_file, open(dest_path, "w") as out:
            _process_currency_file(in_file, out, locale, wanted_currencies)
|
| +
|
def _process_currency_file(in_file, out, locale_name, currencies_to_keep):
    """Copy one currency data file, dropping unwanted currencies/blocks.

    Args:
        in_file: Iterator over the lines of the input file.
        out: Object with a write() method receiving the filtered file.
        locale_name: Name of the top-level block of the file.
        currencies_to_keep: Container of the currency names to keep.

    Inside the "Currencies", "Currencies%narrow" and "CurrencyPlurals"
    blocks only the sub-blocks named in currencies_to_keep are written.
    Any other block is written only if its name is in the fixed list
    below.  Lines without "{" are copied unchanged.
    """
    currency_blocks = ("Currencies",
                       "Currencies%narrow",
                       "CurrencyPlurals")
    always_kept_blocks = ('"%%ALIAS"',
                          "%%Parent",
                          "currencyMap",
                          "CurrencyMap",
                          "currencyMeta",
                          "CurrencyMeta",
                          "currencySpacing",
                          "CurrencySpacing",
                          "currencyUnitPatterns",
                          "CurrencyUnitPatterns",
                          "Version")
    try:
        _copy_start_of_file(in_file, out, locale_name)
        while True:  # Until StopIteration exception.
            # next() (rather than the Python 2 only .next() method) works
            # with any iterator on both Python 2 and Python 3.
            line = next(in_file)
            if "{" not in line:
                out.write(line)
                continue

            block_name = _block_name(line)
            if block_name not in currency_blocks:
                _print_or_skip_block(line, in_file, out,
                                     block_name in always_kept_blocks)
                continue

            # Keep only certain currencies.
            out.write(line)
            line = next(in_file)
            # NOTE(review): the terminator literal is one space + "}";
            # confirm it matches the indentation used by the data files.
            while not line.startswith(" }"):
                if "{" in line:
                    currency_name = _block_name(line)
                    keep_curr = currency_name in currencies_to_keep
                    _print_or_skip_block(line, in_file, out, keep_curr)
                else:
                    out.write(line)
                line = next(in_file)
            out.write(line)
    except StopIteration:
        # Running out of input is the normal way to leave the loop.
        pass
|
| +
|
def is_interesting_calendar(locale_name, calendar_name):
    """Return True if calendar_name should be kept for locale_name.

    "generic" and "gregorian" are kept for every locale.  Any other
    calendar is kept only when it is the calendar mapped to locale_name
    or to one of its "_"-separated prefixes (for "zh_Hant_TW" the
    prefixes are "zh", "zh_Hant" and "zh_Hant_TW").
    """
    if calendar_name in ("generic", "gregorian"):
        return True

    calendar_for_locale = {
        "th": "buddhist",
        "zh": "chinese",
        "zh_Hant": "roc",
        "ko": "dangi",
        "am": "ethiopic",
        "he": "hebrew",
        "ar": "arabic",
        "fa": "persian",
        "ja": "japanese",
    }
    pieces = locale_name.split("_")
    prefixes = ("_".join(pieces[:count])
                for count in range(1, len(pieces) + 1))
    return any(calendar_for_locale.get(prefix) == calendar_name
               for prefix in prefixes)
|
| +
|
def minimize_data_for_locales(locales_in_dir, locales_out_dir,
                              languages_to_minimize_str):
    """Filter every locale ".txt" file from locales_in_dir into
    locales_out_dir.

    languages_to_minimize_str is a comma separated list of locale names
    whose data should be minimized.  Files that do not end in ".txt" are
    copied unchanged.
    """
    minimized_locales = set(languages_to_minimize_str.split(","))

    for file_name in os.listdir(locales_in_dir):
        src_path = os.path.join(locales_in_dir, file_name)
        dest_path = os.path.join(locales_out_dir, file_name)
        if not file_name.endswith(".txt"):
            # For instance pool.res.
            shutil.copyfile(src_path, dest_path)
            continue
        # The file name without its extension is the locale name.
        locale = os.path.splitext(file_name)[0]
        with open(src_path) as in_file, open(dest_path, "w") as out:
            _process_locale_file(in_file, out, locale, minimized_locales)
|
| +
|
def _process_locale_file(in_file, out, locale_name, languages_to_minimize):
    """Copy one locale data file, dropping data that is not needed.

    Args:
        in_file: Iterator over the lines of the input file.
        out: Object with a write() method receiving the filtered file.
        locale_name: Name of the top-level block of the file.
        languages_to_minimize: Container of locale names to minimize.

    For a locale that is not minimized, every block is kept except that
    inside "calendar" only the calendars accepted by
    is_interesting_calendar() are written.  For a minimized locale only
    the blocks named in the fixed list below are written.  Lines without
    "{" are copied unchanged.
    """
    minimize = locale_name in languages_to_minimize
    # Do not include '%%Parent' line on purpose.
    kept_when_minimized = ('"%%ALIAS"',
                           "AuxExemplarCharacters",
                           "ExemplarCharacters",
                           "LocaleScript",
                           "layout",
                           "Version")
    try:
        _copy_start_of_file(in_file, out, locale_name)
        while True:  # Until StopIteration exception.
            # next() (rather than the Python 2 only .next() method) works
            # with any iterator on both Python 2 and Python 3.
            line = next(in_file)
            if "{" not in line:
                out.write(line)
                continue

            block_name = _block_name(line)
            if not minimize and block_name == "calendar":
                out.write(line)
                line = next(in_file)
                # NOTE(review): the terminator literal is one space +
                # "}"; confirm it matches the indentation used by the
                # data files.
                while not line.startswith(" }"):
                    if "{" in line:
                        calendar_name = _block_name(line)
                        keep_cal = is_interesting_calendar(locale_name,
                                                           calendar_name)
                        _print_or_skip_block(line, in_file, out, keep_cal)
                    else:
                        out.write(line)
                    line = next(in_file)
                out.write(line)
                continue

            keep = not minimize or block_name in kept_when_minimized
            _print_or_skip_block(line, in_file, out, keep)
    except StopIteration:
        # Running out of input is the normal way to leave the loop.
        pass

    # Note: patch_locale.sh also has code to strip only some calendars
    # but since the calendar block is removed above that makes no
    # difference.
|
| +
|
def strip_android_langs(in_lang_dir, out_lang_dir):
    """Filter every lang ".txt" file from in_lang_dir into out_lang_dir.

    Files that do not end in ".txt" are copied unchanged; the others are
    rewritten by _process_lang_file().
    """
    for file_name in os.listdir(in_lang_dir):
        src_path = os.path.join(in_lang_dir, file_name)
        dest_path = os.path.join(out_lang_dir, file_name)
        if not file_name.endswith(".txt"):
            # For instance pool.res
            shutil.copyfile(src_path, dest_path)
            continue
        # The file name without its extension is the locale name.
        locale = os.path.splitext(file_name)[0]
        with open(src_path) as in_file, open(dest_path, "w") as out:
            _process_lang_file(in_file, out, locale)
|
| +
|
def _process_lang_file(in_file, out, locale_name):
    """Copy one lang data file, dropping most display-name data.

    Args:
        in_file: Iterator over the lines of the input file.
        out: Object with a write() method receiving the filtered file.
        locale_name: Name of the top-level block of the file.

    Inside "Languages" and "Scripts" only the lines whose stripped text
    starts with one of the listed entry prefixes are written.  The blocks
    in removed_blocks are dropped completely; every other block and every
    line without "{" is copied unchanged.
    """
    # Entry prefixes that survive inside the partially kept blocks.
    kept_entries = {
        "Languages": ("zh{",),
        "Scripts": ("Hans{", "Hant{"),
    }
    removed_blocks = ("Keys",
                      "LanguagesShort",
                      "Types",
                      "Variants",
                      "calendar",
                      "codePatterns",
                      "localeDisplayPattern")
    try:
        _copy_start_of_file(in_file, out, locale_name)
        while True:  # Until StopIteration exception.
            # next() (rather than the Python 2 only .next() method) works
            # with any iterator on both Python 2 and Python 3.
            line = next(in_file)
            if "{" not in line:
                out.write(line)
                continue

            block_name = _block_name(line)
            if block_name not in kept_entries:
                # Delete the whole block if it is in removed_blocks.
                keep = block_name not in removed_blocks
                _print_or_skip_block(line, in_file, out, keep)
                continue

            out.write(line)
            parts_to_keep = kept_entries[block_name]
            line = next(in_file)
            # NOTE(review): the terminator literal is one space + "}";
            # confirm it matches the indentation used by the data files.
            while not line.startswith(" }"):
                if line.strip().startswith(parts_to_keep):
                    out.write(line)
                line = next(in_file)
            out.write(line)
    except StopIteration:
        # Running out of input is the normal way to leave the loop.
        pass
|
| +
|
def remove_zone_example_cities(in_zone_dir, out_zone_dir):
    """Copy every file of in_zone_dir to out_zone_dir, removing the lines
    between the zoneStrings line and the first "meta:" line.

    The zoneStrings line and the "meta:" line themselves are kept.  The
    file whose name without extension is "root" is copied unchanged.
    """
    # States of the per-file scan.
    keep_head, dropping, keep_tail = 0, 1, 2

    for file_name in os.listdir(in_zone_dir):
        locale = os.path.splitext(file_name)[0]
        src_path = os.path.join(in_zone_dir, file_name)
        dest_path = os.path.join(out_zone_dir, file_name)
        with open(src_path) as in_file, open(dest_path, "w") as out:
            # Keep everything in root.txt.
            state = keep_tail if locale == "root" else keep_head
            # NOTE(review): both prefix literals below start with a
            # single space; confirm they match the data files' indentation.
            for text in in_file:
                if state == keep_head:
                    out.write(text)
                    if text.startswith(" zoneStrings"):
                        state = dropping
                    continue
                if state == dropping:
                    if not text.startswith(" \"meta:"):
                        continue
                    state = keep_tail
                out.write(text)
|
| +
|
# Run main() only when this file is executed as a script.
if __name__ == "__main__":
    main()
|
| +
|
|
|