Index: scripts/trim_data.sh |
diff --git a/scripts/trim_data.sh b/scripts/trim_data.sh |
index 83858f9f0392fcef11afc953d215fa97c3d26796..8b3abd5406d7407f98cc7f1f55f586eaeeffbef4 100755 |
--- a/scripts/trim_data.sh |
+++ b/scripts/trim_data.sh |
@@ -51,6 +51,8 @@ function abridge_locale_data_for_non_ui_languages { |
target=${localedatapath}/${lang}.txt |
[ -e ${target} ] || { echo "missing ${lang}"; continue; } |
echo Overwriting ${target} ... |
+ |
+ # Do not include '%%Parent' line on purpose. |
sed -n -r -i \ |
'1, /^'${lang}'\{$/p |
/^ "%%ALIAS"\{/p |
@@ -69,8 +71,11 @@ function abridge_locale_data_for_non_ui_languages { |
target=${langdatapath}/${lang}.txt |
[ -e ${target} ] || { echo "missing ${lang}"; continue; } |
echo Overwriting ${target} ... |
+ |
+ # Do not include '%%Parent' line on purpose. |
sed -n -r -i \ |
'1, /^'${lang}'\{$/p |
+ /^ "%%ALIAS"\{/p |
/^ Languages\{$/, /^ \}$/ { |
/^ Languages\{$/p |
/^ '${lang}'\{.*\}$/p |
@@ -80,37 +85,58 @@ function abridge_locale_data_for_non_ui_languages { |
done |
} |
-# Drop historic currencies. |
+# Keep only the currencies used by the larget 150 economies in terms of GDP. |
# TODO(jshin): Use ucurr_isAvailable in ICU to drop more currencies. |
# See also http://en.wikipedia.org/wiki/List_of_circulating_currencies |
function filter_currency_data { |
- for currency in $(grep -v '^#' currencies_to_drop.list) |
+ unset KEEPLIST |
+ for currency in $(grep -v '^#' currencies.list) |
do |
- OP=${DROPLIST:+|} |
- DROPLIST=${DROPLIST}${OP}${currency} |
+ OP=${KEEPLIST:+|} |
+ KEEPLIST=${KEEPLIST}${OP}${currency} |
done |
- DROPLIST="(${DROPLIST})\{" |
+ KEEPLIST="(${KEEPLIST})" |
- cd "${dataroot}/curr" |
- for i in *.txt |
+ for i in ${dataroot}/curr/*.txt |
do |
- [ $i != 'supplementalData.txt' ] && \ |
- sed -r -i '/^ '$DROPLIST'/, /^ }/ d' $i |
+ locale=$(basename $i .txt) |
+ [ $locale == 'supplementalData' ] && continue; |
+ echo "Overwriting $i for $locale" |
+ sed -n -r -i \ |
+ '1, /^'${locale}'\{$/ p |
+ /^ "%%ALIAS"\{/p |
+ /^ %%Parent\{/p |
+ /^ Currencies\{$/, /^ \}$/ { |
+ /^ Currencies\{$/ p |
+ /^ '$KEEPLIST'\{$/, /^ \}$/ p |
+ /^ \}$/ p |
+ } |
+ /^ Currencies%narrow\{$/, /^ \}$/ { |
+ /^ Currencies%narrow\{$/ p |
+ /^ '$KEEPLIST'\{".*\}$/ p |
+ /^ \}$/ p |
+ } |
+ /^ CurrencyPlurals\{$/, /^ \}$/ { |
+ /^ CurrencyPlurals\{$/ p |
+ /^ '$KEEPLIST'\{$/, /^ \}$/ p |
+ /^ \}$/ p |
+ } |
+ /^ [cC]urrency(Map|Meta|Spacing|UnitPatterns)\{$/, /^ \}$/ p |
+ /^ Version\{.*\}$/p |
+ /^\}$/p' $i |
done |
} |
# Remove the display names for numeric region codes other than |
# 419 (Latin America) because we don't use them. |
function filter_region_data { |
- cd "${dataroot}/region" |
- sed -i '/[0-35-9][0-9][0-9]{/ d' *.txt |
+ sed -i '/[0-35-9][0-9][0-9]{/ d' ${dataroot}/region/*.txt |
} |
function remove_exemplar_cities { |
- cd "${dataroot}/zone" |
- for i in *.txt |
+ for i in ${dataroot}/zone/*.txt |
do |
[ $i != 'root.txt' ] && \ |
sed -i '/^ zoneStrings/, /^ "meta:/ { |
@@ -122,8 +148,8 @@ function remove_exemplar_cities { |
} |
# Keep only duration and compound in units* sections. |
-function filter_locale_data { |
- for i in ${dataroot}/locales/*.txt |
+function filter_unit_data { |
+ for i in ${dataroot}/unit/*.txt |
do |
echo Overwriting $i ... |
sed -r -i \ |
@@ -138,10 +164,10 @@ function filter_locale_data { |
# big5han and gb2312han collation do not make any sense and nobody uses them. |
function remove_legacy_chinese_codepoint_collation { |
- echo "Removing Big5 / GB2312 collation data from Chinese locale" |
+ echo "Removing Big5 / GB2312 / UniHan collation data from Chinese locale" |
target="${dataroot}/coll/zh.txt" |
echo "Overwriting ${target}" |
- sed -r -i '/^ (big5|gb2312)han\{$/,/^ \}$/ d' ${target} |
+ sed -r -i '/^ (uni|big5|gb2312)han\{$/,/^ \}$/ d' ${target} |
} |
dataroot="$(dirname $0)/../source/data" |
@@ -155,7 +181,7 @@ abridge_locale_data_for_non_ui_languages |
filter_currency_data |
filter_region_data |
remove_legacy_chinese_codepoint_collation |
-filter_locale_data |
+filter_unit_data |
# Chromium OS needs exemplar cities for timezones, but not Chromium. |
# It'll save 400kB (uncompressed), but the size difference in |