OLD | NEW |
(Empty) | |
| 1 """Rewrites data so that it becomes smaller after compilation.""" |
| 2 |
| 3 from __future__ import print_function |
| 4 |
| 5 import argparse |
| 6 import os |
| 7 import shutil |
| 8 |
def _mergetree(source_dir, dest_dir):
    """Copy every entry of source_dir into the directory dest_dir.

    Sub-directories are handed to copytree(); everything else is copied
    with shutil.copy2(), replacing a same-named file in dest_dir.
    """
    for entry in os.listdir(source_dir):
        src_path = os.path.join(source_dir, entry)
        dest_path = os.path.join(dest_dir, entry)
        if not os.path.isdir(src_path):
            shutil.copy2(src_path, dest_path)
            continue
        copytree(src_path, dest_path)
| 18 |
def copytree(source, dest):
    """Copy source to dest, merging into dest when it already exists.

    Behaves like shutil.copytree() for a fresh destination. When dest is
    already a directory the contents of source are merged into it
    (overwriting same-named files). When source is not a directory it is
    copied with shutil.copy2().
    """
    if os.path.isdir(source):
        if os.path.isdir(dest):
            _mergetree(source, dest)
        else:
            shutil.copytree(source, dest)
    else:
        shutil.copy2(source, dest)
| 28 |
def main():
    """Parse the command line and filter or plainly copy the data files.

    In 'strip-for-size' mode each input is run through its filter function.
    In 'clean-copy' mode every input is copied unchanged to its output
    location. The lang directory is only stripped when the mode is
    'strip-for-size' AND --remove-data-already-existing-in-android is
    given; in every other case it is copied unchanged.
    """
    parser = argparse.ArgumentParser(
        description=('Generates an icudata.lst file to be compiled by icu.'))

    parser.add_argument('--mode',
                        required=True,
                        choices=['clean-copy', 'strip-for-size'],
                        help='Whether the files should be changed or not.')

    parser.add_argument('--in-word-txt',
                        required=True,
                        help='The word.txt to filter')

    parser.add_argument('--out-word-txt',
                        required=True,
                        help='The filtered/copied word.txt')

    parser.add_argument('--in-brkitr-root-txt',
                        required=True,
                        help='The brkitr/root.txt to filter')

    parser.add_argument('--out-brkitr-root-txt',
                        required=True,
                        help='The filtered/copied brkitr/root.txt')

    parser.add_argument('--in-brkitr-ja-txt',
                        required=True,
                        help='The brkitr/ja.txt to filter')

    parser.add_argument('--out-brkitr-ja-txt',
                        required=True,
                        help='The filtered/copied brkitr/ja.txt')

    parser.add_argument('--currency-keep-list',
                        required=True,
                        help="The file with currencies to keep")

    parser.add_argument('--in-curr-dir',
                        required=True,
                        help="The currency data files")

    parser.add_argument('--out-curr-dir',
                        required=True,
                        help="The filtered/copied currency data files")

    parser.add_argument('--minimize-language-list',
                        required=True,
                        help=("Comma separated list of languages (locales) " +
                              "to minimize the data for."))

    parser.add_argument('--in-locales-dir',
                        required=True,
                        help="The locales data files")

    parser.add_argument('--out-locales-dir',
                        required=True,
                        help="The filtered/copied locales data files")

    parser.add_argument('--in-lang-dir',
                        required=True,
                        help="The lang data files")

    parser.add_argument('--out-lang-dir',
                        required=True,
                        help="The filtered/copied lang data files")

    parser.add_argument('--remove-data-already-existing-in-android',
                        action="store_true",
                        help=("Removes data in lang/region/... that " +
                              "can be fetched from Android APIs"))

    parser.add_argument('--in-zone-dir',
                        required=True,
                        help="The zone data files")

    parser.add_argument('--out-zone-dir',
                        required=True,
                        help="The filtered/copied zone data files")

    args = parser.parse_args()

    if args.mode == 'strip-for-size':
        fix_word_txt(args.in_word_txt, args.out_word_txt)
        fix_brkitr_root_txt(args.in_brkitr_root_txt, args.out_brkitr_root_txt)
        fix_brkitr_ja_txt(args.in_brkitr_ja_txt, args.out_brkitr_ja_txt)
        fix_currencies(args.currency_keep_list,
                       args.in_curr_dir,
                       args.out_curr_dir)
        minimize_data_for_locales(args.in_locales_dir,
                                  args.out_locales_dir,
                                  args.minimize_language_list)
        remove_zone_example_cities(args.in_zone_dir, args.out_zone_dir)
    else:
        # argparse's `choices` already guarantees this; kept as a guard in
        # case a new mode is added without updating this branch.
        assert args.mode == 'clean-copy'
        shutil.copyfile(args.in_word_txt, args.out_word_txt)
        shutil.copyfile(args.in_brkitr_root_txt, args.out_brkitr_root_txt)
        # The ja output must come from the ja input (previously the root
        # file was copied here by mistake).
        shutil.copyfile(args.in_brkitr_ja_txt, args.out_brkitr_ja_txt)
        copytree(args.in_curr_dir, args.out_curr_dir)
        copytree(args.in_locales_dir, args.out_locales_dir)
        copytree(args.in_zone_dir, args.out_zone_dir)

    if (args.mode == 'strip-for-size' and
            args.remove_data_already_existing_in_android):
        strip_android_langs(args.in_lang_dir, args.out_lang_dir)
    else:
        # Either clean-copy mode, or strip-for-size without the optional
        # Android flag: the lang data is copied unchanged. (An assert on
        # the mode used to abort the second case.)
        copytree(args.in_lang_dir, args.out_lang_dir)
| 136 |
def fix_word_txt(source, dest):
    """Write a copy of the word rules file `source` to `dest` with the
    rewrites listed in WORD_TXT_REPLACEMENTS applied (references to
    dictionaryCJK, KanaKanji and HangulSyllable are stripped)."""
    with open(source) as in_file, open(dest, "w") as out:
        _process_word_txt(in_file, out)
| 143 |
# (pattern, replacement) rules applied to each line of word.txt, tried in
# order. A replacement of None means "drop every line containing pattern".
WORD_TXT_REPLACEMENTS = (
    ("$ALetter-$dictionaryCJK", "$ALetter"),
    ("$ComplexContext $dictionaryCJK", "$ComplexContext"),
    ("$dictionaryCJK", None),
    ("special handling for CJK", None),
    ("$HangulSyllable $HangulSyllable", None),
    ("$KanaKanji $KanaKanji", None),
)


def _process_word_txt(in_file, out):
    """Copy lines from in_file to out, applying WORD_TXT_REPLACEMENTS.

    Only the first rule whose pattern occurs in a line is applied to that
    line; the line is then either rewritten or dropped.
    """
    for line in in_file:
        rewritten = line
        for pattern, substitute in WORD_TXT_REPLACEMENTS:
            if pattern not in line:
                continue
            if substitute is None:
                rewritten = None  # Drop the whole line.
            else:
                rewritten = line.replace(pattern, substitute)
            break
        if rewritten is not None:
            out.write(rewritten)
| 166 |
def fix_brkitr_root_txt(source, dest):
    """Copy `source` to `dest`, dropping every line mentioning cjdict.dict.

    Asserts that at least one such line was found, so an unexpected input
    file is noticed.
    """
    found_cjdict = False
    with open(source) as in_file, open(dest, "w") as out:
        for line in in_file:
            if "cjdict.dict" not in line:
                out.write(line)
            else:
                found_cjdict = True
    assert found_cjdict
| 178 |
def fix_brkitr_ja_txt(source, dest):
    """Copy `source` to `dest`, replacing the line_ja.brk dependency entry
    with a word_ja.brk entry.

    Asserts that the entry was found at least once.
    """
    old_entry = 'line:process(dependency){"line_ja.brk"}'
    new_entry = 'word:process(dependency){"word_ja.brk"}'
    replaced = False
    with open(source) as in_file, open(dest, "w") as out:
        for line in in_file:
            if old_entry in line:
                replaced = True
                line = line.replace(old_entry, new_entry)
            out.write(line)
    assert replaced
| 192 |
def _print_block(line, in_file, out):
    """Copy (or skip) one `name{ ... }` block.

    Args:
        line: The already-read first line of the block.
        in_file: Line iterator positioned right after `line`.
        out: Writable object that receives the block, or None to consume
            the block without writing anything.

    If `line` itself contains "}", the block is treated as a one-liner and
    nothing more is read. Otherwise lines are consumed up to and including
    the first line that starts with the same leading spaces as `line`
    followed by "}".

    Raises:
        StopIteration: if in_file ends before the closing line is seen.
    """
    # Leading spaces of the opening line identify the matching closer.
    indentation = " " * (len(line) - len(line.lstrip(" ")))

    if out is not None:
        out.write(line)
    if "}" in line:
        return

    # next() works on both Python 2 and 3 iterators; the .next() method
    # exists only on Python 2.
    line = next(in_file)
    while not line.startswith(indentation + "}"):
        if out is not None:
            out.write(line)
        line = next(in_file)
    if out is not None:
        out.write(line)
| 213 |
def _skip_block(line, in_file):
    """Consume the block opened by `line` from in_file, writing nothing."""
    _print_block(line, in_file, out=None)
| 216 |
def _print_or_skip_block(line, in_file, out, keep):
    """Copy the block opened by `line` to out when `keep` is true,
    otherwise consume it from in_file without writing it."""
    _print_block(line, in_file, out if keep else None)
| 221 |
def _block_name(line):
    """Return the text before the first "{" in line, with leading
    whitespace removed. `line` must contain a "{"."""
    assert "{" in line
    name, _, _ = line.partition("{")
    return name.lstrip()
| 225 |
def _copy_start_of_file(in_file, out, locale_name):
    """Copy lines from in_file to out up to and including the first line
    that starts with "<locale_name>{".

    Raises:
        StopIteration: if in_file ends before such a line is found (the
            lines read so far have already been written to out).
    """
    opening = "%s{" % locale_name
    # next() works on both Python 2 and 3 iterators; the .next() method
    # exists only on Python 2.
    line = next(in_file)
    while not line.startswith(opening):
        out.write(line)
        line = next(in_file)
    out.write(line)
| 232 |
def fix_currencies(keep_list_file, in_curr_dir, out_curr_dir):
    """Write a filtered copy of the currency data in in_curr_dir to
    out_curr_dir, keeping only currencies listed in keep_list_file.

    keep_list_file holds whitespace-separated currency names. Files that do
    not end in ".txt", and supplementalData.txt, are copied unchanged.
    """
    with open(keep_list_file) as list_file:
        currencies_to_keep = set(list_file.read().split())

    for file_name in os.listdir(in_curr_dir):
        src_path = os.path.join(in_curr_dir, file_name)
        dst_path = os.path.join(out_curr_dir, file_name)
        is_locale_file = (file_name.endswith(".txt") and
                          file_name != "supplementalData.txt")
        if not is_locale_file:
            # For instance pool.res.
            shutil.copyfile(src_path, dst_path)
            continue
        locale_name = os.path.splitext(file_name)[0]
        with open(src_path) as in_file, open(dst_path, "w") as out:
            _process_currency_file(in_file, out, locale_name,
                                   currencies_to_keep)
| 252 |
def _process_currency_file(in_file, out, locale_name, currencies_to_keep):
    """Filter one currency data file from in_file into out.

    The file header up to the "<locale_name>{" line is copied. After that:
      * inside the "Currencies", "Currencies%narrow" and "CurrencyPlurals"
        blocks only sub-blocks named in currencies_to_keep are copied;
      * a fixed list of other blocks is copied whole;
      * every other block is dropped;
      * lines without "{" are copied.

    Processing stops silently at end of input (StopIteration from the
    line iterator is the loop's exit condition).
    """
    try:
        _copy_start_of_file(in_file, out, locale_name)
        while True:  # Until StopIteration exception.
            # next() works on both Python 2 and 3 iterators; the .next()
            # method exists only on Python 2.
            line = next(in_file)
            if "{" in line:
                block_name = _block_name(line)
                if block_name in ("Currencies",
                                  "Currencies%narrow",
                                  "CurrencyPlurals"):
                    # Keep only certain currencies.
                    out.write(line)
                    line = next(in_file)
                    # NOTE(review): this literal marks the end of the
                    # enclosing block by its indentation; its leading
                    # whitespace may have been collapsed in the reviewed
                    # copy of the source - confirm against the data files.
                    while not line.startswith(" }"):
                        if "{" in line:
                            currency_name = _block_name(line)
                            keep_curr = currency_name in currencies_to_keep
                            _print_or_skip_block(line, in_file, out,
                                                 keep_curr)
                        else:
                            out.write(line)
                        line = next(in_file)
                    out.write(line)
                else:
                    keep = block_name in ('"%%ALIAS"',
                                          "%%Parent",
                                          "currencyMap",
                                          "CurrencyMap",
                                          "currencyMeta",
                                          "CurrencyMeta",
                                          "currencySpacing",
                                          "CurrencySpacing",
                                          "currencyUnitPatterns",
                                          "CurrencyUnitPatterns",
                                          "Version")
                    _print_or_skip_block(line, in_file, out, keep)
            else:
                out.write(line)
    except StopIteration:
        # End of input reached: nothing more to copy.
        pass
| 292 |
def is_interesting_calendar(locale_name, calendar_name):
    """Return True if calendar_name should be kept for locale_name.

    "generic" and "gregorian" are always kept. Otherwise the calendar is
    kept when it is the one mapped to locale_name or to any of its
    underscore-separated prefixes (e.g. "zh" and "zh_Hant" for
    "zh_Hant_TW").
    """
    if calendar_name in ("generic", "gregorian"):
        return True

    calendar_for_locale = {
        "th": "buddhist",
        "zh": "chinese",
        "zh_Hant": "roc",
        "ko": "dangi",
        "am": "ethiopic",
        "he": "hebrew",
        "ar": "arabic",
        "fa": "persian",
        "ja": "japanese",
    }
    parts = locale_name.split("_")
    prefixes = ("_".join(parts[:end]) for end in range(1, len(parts) + 1))
    return any(calendar_name == calendar_for_locale.get(prefix)
               for prefix in prefixes)
| 313 |
def minimize_data_for_locales(locales_in_dir, locales_out_dir,
                              languages_to_minimize_str):
    """Write a filtered copy of the locale files in locales_in_dir to
    locales_out_dir.

    languages_to_minimize_str is a comma-separated list of locale names.
    Files that do not end in ".txt" are copied unchanged; each ".txt" file
    is passed through _process_locale_file().
    """
    languages_to_minimize = set(languages_to_minimize_str.split(","))

    for file_name in os.listdir(locales_in_dir):
        src_path = os.path.join(locales_in_dir, file_name)
        dst_path = os.path.join(locales_out_dir, file_name)
        if file_name.endswith(".txt"):
            locale_name = os.path.splitext(file_name)[0]
            with open(src_path) as in_file, open(dst_path, "w") as out:
                _process_locale_file(in_file, out, locale_name,
                                     languages_to_minimize)
        else:
            # For instance pool.res.
            shutil.copyfile(src_path, dst_path)
| 332 |
def _process_locale_file(in_file, out, locale_name, languages_to_minimize):
    """Filter one locale data file from in_file into out.

    The file header up to the "<locale_name>{" line is copied. After that:
      * for locales NOT in languages_to_minimize, every block is copied,
        except that inside "calendar" only the sub-blocks accepted by
        is_interesting_calendar() are kept;
      * for locales in languages_to_minimize, only a short fixed list of
        blocks is copied and everything else is dropped;
      * lines without "{" are copied.

    Processing stops silently at end of input (StopIteration from the
    line iterator is the loop's exit condition).
    """
    try:
        _copy_start_of_file(in_file, out, locale_name)
        while True:  # Until StopIteration exception.
            # next() works on both Python 2 and 3 iterators; the .next()
            # method exists only on Python 2.
            line = next(in_file)
            if "{" in line:
                block_name = _block_name(line)
                if (locale_name not in languages_to_minimize and
                        block_name == "calendar"):
                    out.write(line)
                    line = next(in_file)
                    # NOTE(review): this literal marks the end of the
                    # calendar block by its indentation; its leading
                    # whitespace may have been collapsed in the reviewed
                    # copy of the source - confirm against the data files.
                    while not line.startswith(" }"):
                        if "{" in line:
                            calendar_name = _block_name(line)
                            keep_cal = is_interesting_calendar(locale_name,
                                                               calendar_name)
                            _print_or_skip_block(line, in_file, out,
                                                 keep_cal)
                        else:
                            out.write(line)
                        line = next(in_file)
                    out.write(line)
                    continue

                # Do not include '%%Parent' line on purpose.
                keep = (locale_name not in languages_to_minimize or
                        block_name in (
                            '"%%ALIAS"',
                            "AuxExemplarCharacters",
                            "ExemplarCharacters",
                            "LocaleScript",
                            "layout",
                            "Version"))
                _print_or_skip_block(line, in_file, out, keep)
            else:
                out.write(line)
    except StopIteration:
        # End of input reached: nothing more to copy.
        pass
| 372 |
| 373 # Note: patch_locale.sh also has code to strip only some calendars |
| 374 # but since the calendar block is removed above that makes no |
| 375 # difference. |
| 376 |
def strip_android_langs(in_lang_dir, out_lang_dir):
    """Write a filtered copy of the lang files in in_lang_dir to
    out_lang_dir.

    Files that do not end in ".txt" are copied unchanged; each ".txt" file
    is passed through _process_lang_file().
    """
    for file_name in os.listdir(in_lang_dir):
        src_path = os.path.join(in_lang_dir, file_name)
        dst_path = os.path.join(out_lang_dir, file_name)
        if file_name.endswith(".txt"):
            locale_name = os.path.splitext(file_name)[0]
            with open(src_path) as in_file, open(dst_path, "w") as out:
                _process_lang_file(in_file, out, locale_name)
        else:
            # For instance pool.res
            shutil.copyfile(src_path, dst_path)
| 390 |
def _process_lang_file(in_file, out, locale_name):
    """Filter one lang data file from in_file into out.

    The file header up to the "<locale_name>{" line is copied. After that:
      * inside "Languages" only lines starting with "zh{" are kept, and
        inside "Scripts" only lines starting with "Hans{" or "Hant{"
        (leading/trailing whitespace ignored);
      * a fixed list of blocks is dropped entirely;
      * every other block, and every line without "{", is copied.

    Processing stops silently at end of input (StopIteration from the
    line iterator is the loop's exit condition).
    """
    try:
        _copy_start_of_file(in_file, out, locale_name)
        while True:  # Until StopIteration exception.
            # next() works on both Python 2 and 3 iterators; the .next()
            # method exists only on Python 2.
            line = next(in_file)
            if "{" in line:
                block_name = _block_name(line)
                if block_name in ("Languages", "Scripts"):
                    out.write(line)
                    parts_to_keep = {
                        "Languages": ("zh{",),
                        "Scripts": ("Hans{", "Hant{")}[block_name]
                    line = next(in_file)
                    # NOTE(review): this literal marks the end of the
                    # enclosing block by its indentation; its leading
                    # whitespace may have been collapsed in the reviewed
                    # copy of the source - confirm against the data files.
                    while not line.startswith(" }"):
                        # str.startswith accepts a tuple of prefixes.
                        if line.strip().startswith(parts_to_keep):
                            out.write(line)
                        line = next(in_file)
                    out.write(line)
                else:
                    # Delete these blocks entirely; keep all others.
                    keep = block_name not in ("Keys",
                                              "LanguagesShort",
                                              "Types",
                                              "Variants",
                                              "calendar",
                                              "codePatterns",
                                              "localeDisplayPattern")
                    _print_or_skip_block(line, in_file, out, keep)
            else:
                out.write(line)
    except StopIteration:
        # End of input reached: nothing more to copy.
        pass
| 425 |
def remove_zone_example_cities(in_zone_dir, out_zone_dir):
    """Write a copy of every file in in_zone_dir to out_zone_dir, dropping
    the lines between the zoneStrings line and the first "meta:" line.

    The file whose base name is "root" is copied unchanged.
    """
    # Per-file state: copying the head, dropping the middle, copying the
    # tail.
    head, middle, tail = 0, 1, 2

    for zone_file in os.listdir(in_zone_dir):
        locale_name = os.path.splitext(zone_file)[0]
        src_path = os.path.join(in_zone_dir, zone_file)
        dst_path = os.path.join(out_zone_dir, zone_file)
        with open(src_path) as in_file, open(dst_path, "w") as out:
            # Keep everything in root.txt.
            state = tail if locale_name == "root" else head
            # NOTE(review): the two prefixes below depend on the data
            # files' indentation; their leading whitespace may have been
            # collapsed in the reviewed copy of the source - confirm.
            for line in in_file:
                if state == head:
                    out.write(line)
                    if line.startswith(" zoneStrings"):
                        state = middle
                elif state == middle:
                    if line.startswith(" \"meta:"):
                        state = tail
                        out.write(line)
                else:
                    out.write(line)
| 446 |
# Run main() only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
| 449 |
OLD | NEW |