store and fetch work data

This commit is contained in:
Joerg Tiedemann 2020-08-22 23:51:37 +03:00
parent d7252e32b7
commit 0e27198048
4 changed files with 647 additions and 45 deletions

View File

@ -191,6 +191,21 @@ all: ${WORKDIR}/config.mk
${MAKE} compare
#---------------------------------------------------------------------
# store and fetch workdata
# (requires module load allas && allas-conf)
#---------------------------------------------------------------------
## store and fetch are commands, not files: declare them phony so that
## a file or directory called "store" or "fetch" cannot mask them
.PHONY: store fetch

## store workdir on allas:
## runs a-put inside ${WORKHOME} and uploads ${LANGPAIRSTR} to the bucket
## OPUS-MT_<last path component of WORKHOME>
## (the local copy is kept; the rm below is intentionally left disabled)
store:
	cd ${WORKHOME} && a-put -b OPUS-MT_${notdir ${WORKHOME}} --override ${LANGPAIRSTR}
#	rm -fr ${WORKHOME}/${LANGPAIRSTR}

## fetch workdir from allas:
## runs a-get inside ${WORKHOME} for the object
## OPUS-MT_<last path component of WORKHOME>/${LANGPAIRSTR}
fetch:
	cd ${WORKHOME} && a-get OPUS-MT_${notdir ${WORKHOME}}/${LANGPAIRSTR}
#---------------------------------------------------------------------
# run everything including backtranslation of wiki-data

11
TODO.md
View File

@ -33,3 +33,14 @@
* collaboration with wikimedia
* focus languages: Tagalog (tl, tgl), Central Bikol (bcl), Malayalam (ml, mal), Bengali (bn, ben), and Mongolian (mn, mon)
## Tatoeba MT models
Labels are only taken from the test data, which is a problem when relevant training data carries labels that never occur in the test sets and is therefore left out.
* example: nor (the test data only contains nno and nob, but most of the Norwegian data is only tagged as nor_Latn)
* another example: hbs (hbs labels do not exist in the test data)
* possible solution: take all labels from the train data; problem: some noisy labels may influence the model a lot and it would be better to leave them out (data in the wrong script etc.); another issue: over-sampling data sets that only exist in the train data may damage the model

View File

@ -3,6 +3,10 @@
# Makefile for running models with data from the Tatoeba Translation Challenge
# https://github.com/Helsinki-NLP/Tatoeba-Challenge
#
# NEWS
#
# - MODELTYPE=transformer is now default for all Tatoeba models
# (no guided alignment!)
#
#---------------------------------------------------------------------
# train and evaluate a single translation pair, for example:
@ -179,8 +183,13 @@ tatoeba-labels: ${TATOEBA_DATA}/Tatoeba-train.${LANGPAIRSTR}.clean.${SRCEXT}.lab
## rebuild the result overview from scratch:
##  1. remove old result files (tatoeba-results*, tatoeba-models-all, results/*.md)
##  2. run the tatoeba-results-md target
##  3. remove and re-create the list of released models
.PHONY: tatoeba-results
tatoeba-results:
	rm -f tatoeba-results* tatoeba-models-all results/*.md
	${MAKE} tatoeba-results-md
	rm -f models-tatoeba/released-models.txt
	${MAKE} models-tatoeba/released-models.txt
## convenience alias for the file target models-tatoeba/released-models.txt
.PHONY: tatoeba-released-models
tatoeba-released-models: models-tatoeba/released-models.txt
## create result tables in various variants and for various subsets
## markdown pages are for reading on-line in the Tatoeba Challenge git
@ -405,6 +414,22 @@ tatoeba-%-train:
fi \
fi )
## prepare the data for the model named in the target (tatoeba-SRC2TRG-data)
##   s, t : source / target ID = part before / after the "2" in the target name
##   S, T : output of `langgroup <ID> | xargs iso639 -m -n` for s / t,
##          sorted and restricted to the languages listed in OPUS_LANGS3
## tatoeba-prepare is only called if the number of source languages lies within
## [MIN_SRCLANGS,MAX_SRCLANGS] and the number of target languages within
## [MIN_TRGLANGS,MAX_TRGLANGS].
## The languages are counted with `wc -w` so that an empty set counts as 0
## (`echo | tr ' ' "\n" | wc -l` reports 1 for an empty list because echo
##  always prints a newline, which let empty sets pass a minimum of 1).
## The leading "-" makes make ignore a failure of this recipe.
tatoeba-%-data:
	-( s=$(firstword $(subst 2, ,$(patsubst tatoeba-%-data,%,$@))); \
	   t=$(lastword  $(subst 2, ,$(patsubst tatoeba-%-data,%,$@))); \
	   S="$(filter ${OPUS_LANGS3},$(sort ${shell langgroup $(firstword $(subst 2, ,$(patsubst tatoeba-%-data,%,$@))) | xargs iso639 -m -n}))"; \
	   T="$(filter ${OPUS_LANGS3},$(sort ${shell langgroup $(lastword  $(subst 2, ,$(patsubst tatoeba-%-data,%,$@))) | xargs iso639 -m -n}))"; \
	   nr_src=`echo $$S | wc -w`; \
	   nr_trg=`echo $$T | wc -w`; \
	   if [ $$nr_src -ge ${MIN_SRCLANGS} ] && [ $$nr_trg -ge ${MIN_TRGLANGS} ] && \
	      [ $$nr_src -le ${MAX_SRCLANGS} ] && [ $$nr_trg -le ${MAX_TRGLANGS} ]; then \
	     ${MAKE} LANGPAIRSTR=$$s-$$t SRCLANGS="$$S" TRGLANGS="$$T" tatoeba-prepare; \
	   fi )
tatoeba-%-eval:
( s=$(firstword $(subst 2, ,$(patsubst tatoeba-%-eval,%,$@))); \
@ -646,6 +671,7 @@ tatoeba-multilingual-testsets:
LANGPAIRSTR=${LANGPAIRSTR} \
SRCLANGS="${shell cat ${word 1,$^} | sed 's/ *$$//;s/^ *//'}" \
TRGLANGS="${shell cat ${word 2,$^} | sed 's/ *$$//;s/^ *//'}" \
MODELTYPE=transformer \
SRC=${SRC} TRG=${TRG} \
EMAIL= \
${@:-tatoeba=}; \
@ -733,6 +759,23 @@ ${TATOEBA_DATA}/Tatoeba-train.${LANGPAIRSTR}.clean.${SRCEXT}.labels:
${TATOEBA_DATA}/Tatoeba-test.${LANGPAIR}.clean.${SRCEXT}.gz \
${TATOEBA_DATA}/Tatoeba-test.${LANGPAIR}.clean.${TRGEXT}.gz
##-------------------------------------------------------------
## take care of language IDs
## --> simplify some IDs from training data
## --> decide which ones to keep that do not exist in test data
##-------------------------------------------------------------
## language IDs that only occur in the training data, read from
## tatoeba/langids-train-only.txt (one ID per line)
## the list is based on Tatoeba-Challenge/data/langids-train-only.txt
## NOTE(review): recursively expanded, i.e. the file is read whenever the
## variable is expanded and not when the makefile is parsed
TRAIN_ONLY_LANGIDS = ${shell cat tatoeba/langids-train-only.txt | tr "\n" ' '}

## langids that we want to keep from the training data even if they do not
## exist in the Tatoeba test sets
## (most train-only IDs are skipped because they mostly come from wrongly
##  detected writing scripts, i.e. errors in the data)
KEEP_LANGIDS = bos_Cyrl cmn cnr cnr_Latn csb diq dnj dty fas fqs ful fur gcf got gug hbs hbs_Cyrl hmn \
               jak_Latn kam kmr kmr_Latn kom kur_Cyrl kuv_Arab kuv_Latn lld mol mrj msa_Latn mya_Cakm nep ngu \
               nor nor_Latn oss_Latn pan plt pnb_Guru pob prs qug quw quy quz qvi rmn rmy ruk san swa swc \
               syr syr_Syrc tgk_Latn thy tlh tmh toi tuk_Cyrl urd_Deva xal_Latn yid_Latn zho zlm

## all train-only IDs that are not explicitly kept
SKIP_LANGIDS = ${filter-out ${KEEP_LANGIDS},${TRAIN_ONLY_LANGIDS}}

## basic regular expression (for grep -v) that matches the skipped IDs
## The alternation is anchored to the whole line: without ^...$ a skipped ID
## like "kaz" would also remove every label that merely contains it
## (e.g. kaz_Cyrl), and an empty skip list would produce a pattern that
## matches every line. ($$ is a literal "$" after make expansion.)
SKIP_LANGIDS_PATTERN = ^\(${subst ${SPACE},\|,${SKIP_LANGIDS}}\)$$
## modify language IDs in training data to adjust them to test sets
## --> fix codes for chinese and take away script information (not reliable!)
@ -746,9 +789,16 @@ FIXLANGIDS = | sed 's/zho\(.*\)_HK/yue\1/g;s/zho\(.*\)_CN/cmn\1/g;s/zho\(.*\)_T
| sed 's/jpn_[A-Za-z]*/jpn/g' \
| sed 's/ara_Latn/ara/;s/arq_Latn/arq/;s/apc_Latn/apc/' \
| sed 's/kor_[A-Za-z]*/kor/g' \
| sed 's/nor_Latn/nor/g' \
| sed 's/syr_Syrc/syr/g' \
| sed 's/yid_Latn/yid/g' \
| perl -pe 'if (/(cjy|cmn|gan|lzh|nan|wuu|yue|zho)_([A-Za-z]{4})/){if ($$2 ne "Hans" && $$2 ne "Hant"){s/(cjy|cmn|gan|lzh|nan|wuu|yue|zho)_([A-Za-z]{4})/$$1/} }'
## debugging aid: print the pattern of language IDs that will be skipped
## (the value is passed to the shell unquoted, so the shell removes the
##  backslashes before echo prints it)
## declared phony because it is a command and never creates a file
.PHONY: print-skiplangids
print-skiplangids:
	@echo ${SKIP_LANGIDS_PATTERN}
## monolingual data from Tatoeba challenge (wiki data)
${TATOEBA_MONO}/%.labels:
@ -816,33 +866,12 @@ ${TATOEBA_MONO}/%.labels:
touch ${dir $@}Tatoeba-train.${LANGPAIR}.clean.${TRGEXT}; \
fi
#######################################
# save all labels in the data
# TODO: do we also need labels from train data?
# save all lang labels that appear in the data
#######################################
# cut -f1 ${dir $@}Tatoeba-*.${LANGPAIR}.clean.id | sort -u | tr "\n" ' ' > $(@:.${SRCEXT}.gz=.${SRCEXT}.labels)
# cut -f2 ${dir $@}Tatoeba-*.${LANGPAIR}.clean.id | sort -u | tr "\n" ' ' > $(@:.${SRCEXT}.gz=.${TRGEXT}.labels)
if [ -e ${dir $@}Tatoeba-test.${LANGPAIR}.clean.id ]; then \
cut -f1 ${dir $@}Tatoeba-test.${LANGPAIR}.clean.id | sort -u | tr "\n" ' ' > $(@:.${SRCEXT}.gz=.${SRCEXT}.labels); \
cut -f2 ${dir $@}Tatoeba-test.${LANGPAIR}.clean.id | sort -u | tr "\n" ' ' > $(@:.${SRCEXT}.gz=.${TRGEXT}.labels); \
fi
#######################################
# special treatment for Chinese: add cmn without script info
# (because it is common in train but not in test data)
#######################################
ifeq (${SRC},zho)
if [ -e $(@:.${SRCEXT}.gz=.${SRCEXT}.labels) ]; then \
if [ `grep 'cmn ' $(@:.${SRCEXT}.gz=.${SRCEXT}.labels) | wc -l` -eq 0 ]; then \
echo -n 'cmn' >> $(@:.${SRCEXT}.gz=.${SRCEXT}.labels); \
fi \
fi
endif
ifeq (${TRG},zho)
if [ -e $(@:.${SRCEXT}.gz=.${TRGEXT}.labels) ]; then \
if [ `grep 'cmn ' $(@:.${SRCEXT}.gz=.${TRGEXT}.labels) | wc -l` -eq 0 ]; then \
echo -n 'cmn' >> $(@:.${SRCEXT}.gz=.${TRGEXT}.labels); \
fi \
fi
endif
cut -f1 ${dir $@}Tatoeba-*.${LANGPAIR}.clean.id | sort -u | grep -v '${SKIP_LANGIDS_PATTERN}' | \
tr "\n" ' ' | sed 's/^ *//;s/ *$$//' > $(@:.${SRCEXT}.gz=.${SRCEXT}.labels)
cut -f2 ${dir $@}Tatoeba-*.${LANGPAIR}.clean.id | sort -u | grep -v '${SKIP_LANGIDS_PATTERN}' | \
tr "\n" ' ' | sed 's/^ *//;s/ *$$//' > $(@:.${SRCEXT}.gz=.${TRGEXT}.labels)
#######################################
# cleanup temporary data
#######################################
@ -919,24 +948,38 @@ endif
done
## old fix for Chinese: add zho and variants ...
#
# ifeq (${SRC},zho)
# if [ -e $(@:.${SRCEXT}.gz=.${SRCEXT}.labels) ]; then \
# echo -n 'zho zho_Hans zho_Hant cmn' >> $(@:.${SRCEXT}.gz=.${SRCEXT}.labels); \
# tr ' ' "\n" < $(@:.${SRCEXT}.gz=.${SRCEXT}.labels) | \
# sort -u | tr "\n" ' ' >$(@:.${SRCEXT}.gz=.${SRCEXT}.labels).tmp; \
# mv $(@:.${SRCEXT}.gz=.${SRCEXT}.labels).tmp $(@:.${SRCEXT}.gz=.${SRCEXT}.labels); \
# fi
# endif
# ifeq (${TRG},zho)
# if [ -e $(@:.${SRCEXT}.gz=.${TRGEXT}.labels) ]; then \
# echo -n 'zho zho_Hans zho_Hant cmn' >> $(@:.${SRCEXT}.gz=.${TRGEXT}.labels); \
# tr ' ' "\n" < $(@:.${SRCEXT}.gz=.${TRGEXT}.labels) | \
# sort -u | tr "\n" ' ' >$(@:.${SRCEXT}.gz=.${TRGEXT}.labels).tmp; \
# mv $(@:.${SRCEXT}.gz=.${TRGEXT}.labels).tmp $(@:.${SRCEXT}.gz=.${TRGEXT}.labels); \
# fi
# endif
## generate a shell script ($@) for moving from work-tatoeba to work-tatoeba-fixed
## for every sub-directory SRC-TRG of work-tatoeba/ with SRC < TRG (string comparison):
##   - create the label files in work-tatoeba-fixed (target tatoeba-labels)
##   - compare the old and the new labels (as sorted lists) for both languages
##   - labels identical: append "mv" commands for the existing work directories
##   - labels differ:    append commands that re-run the job(s) from scratch
## both translation directions (SRC-TRG and TRG-SRC) are handled if their
## directories exist; all lines are appended to $@
## (for testing, the outer loop can be restricted to a single pair, e.g. nor-swe)
fixlabels.sh:
	labels () { cat $$1 | tr ' ' "\n" | sort | grep . | tr "\n" ' '; }; \
	olddata=work-tatoeba/data/simple; \
	newdata=work-tatoeba-fixed/data/simple; \
	for pair in `find work-tatoeba/ -maxdepth 1 -mindepth 1 -type d -printf '%f '`; do \
	  src=`echo $$pair | cut -f1 -d'-'`; \
	  trg=`echo $$pair | cut -f2 -d'-'`; \
	  if [ "$$src" \< "$$trg" ]; then \
	    ${MAKE} TATOEBA_WORK=work-tatoeba-fixed SRCLANGS=$$src TRGLANGS=$$trg tatoeba-labels; \
	    oldsrc=`labels $$olddata/Tatoeba-train.$$pair.clean.$$src.labels`; \
	    newsrc=`labels $$newdata/Tatoeba-train.$$pair.clean.$$src.labels`; \
	    oldtrg=`labels $$olddata/Tatoeba-train.$$pair.clean.$$trg.labels`; \
	    newtrg=`labels $$newdata/Tatoeba-train.$$pair.clean.$$trg.labels`; \
	    if [ "$$oldsrc" = "$$newsrc" ] && [ "$$oldtrg" = "$$newtrg" ]; then \
	      for d in $$pair $$trg-$$src; do \
	        if [ -d work-tatoeba/$$d ]; then \
	          echo "mv work-tatoeba/$$d work-tatoeba-fixed/$$d" >> $@; \
	        fi; \
	      done; \
	    else \
	      echo "# labels in $$pair are different ($$oldsrc / $$oldtrg - $$newsrc / $$newtrg)" >> $@; \
	      if [ -d work-tatoeba/$$pair ]; then \
	        echo "# re-run $$pair from scratch!" >> $@; \
	        echo "${MAKE} TATOEBA_WORK=work-tatoeba-fixed SRCLANGS=$$src TRGLANGS=$$trg tatoeba-job" >> $@; \
	      fi; \
	      if [ -d work-tatoeba/$$trg-$$src ]; then \
	        echo "# re-run $$trg-$$src from scratch!" >> $@; \
	        echo "${MAKE} TATOEBA_WORK=work-tatoeba-fixed SRCLANGS=$$trg TRGLANGS=$$src tatoeba-job" >> $@; \
	      fi; \
	    fi; \
	  fi \
	done
@ -1036,6 +1079,21 @@ tatoeba-models-all:
rm -f $@.model $@.langpair $@.testset $@.chrF2 $@.bleu $@.bp $@.reflen
rm -f $@.modeldir $@.dataset $@.1 $@.2
## table of released models: one line per *.eval.txt file below models-tatoeba/
## columns (joined with paste):
##   download URL (${TATOEBA_DATAURL}/<name>.zip), 2nd path component of the
##   eval file (stored as iso639-3), the same converted by `iso639 -2 -k -p`
##   (stored as iso639-1), chrF2, BLEU, bp, reflen (presumably brevity penalty
##   and reference length -- TODO confirm), output of `iso639 -k` for the pair
## notes:
##   - grep -H always prints the file name; without it grep drops the prefix
##     when xargs passes a single file, and the path-based cut columns break
##   - the chrF2 and BLEU lines are collected separately ($@.1, $@.2) and are
##     assumed to line up one-to-one (one line of each kind per eval file)
##   - scores are cut out by field position relative to "=" and blanks
##   - all temporary files $@.* are removed at the end
models-tatoeba/released-models.txt:
	find models-tatoeba/ -name '*.eval.txt' | sort | xargs grep -H chrF2 > $@.1
	find models-tatoeba/ -name '*.eval.txt' | sort | xargs grep -H BLEU > $@.2
	cut -f3 -d '/' $@.1 | sed 's/\.eval\.txt.*$$/.zip/;s#^#${TATOEBA_DATAURL}/#' > $@.url
	cut -f2 -d '/' $@.1 > $@.iso639-3
	cut -f2 -d '/' $@.1 | xargs iso639 -2 -k -p | tr ' ' "\n" > $@.iso639-1
	cut -f2 -d '=' $@.1 | cut -f2 -d ' ' > $@.chrF2
	cut -f2 -d '=' $@.2 | cut -f2 -d ' ' > $@.bleu
	cut -f3 -d '=' $@.2 | cut -f2 -d ' ' > $@.bp
	cut -f6 -d '=' $@.2 | cut -f2 -d ' ' | cut -f1 -d')' > $@.reflen
	cut -f2 -d '/' $@.1 | sed 's/^\([^ \-]*\)$$/\1-\1/g' | tr '-' ' ' | \
	xargs iso639 -k | sed 's/$$/ /' |\
	sed -e 's/\" \"\([^\"]*\)\" /\t\1\n/g' | sed 's/^\"//g' > $@.langs
	paste $@.url $@.iso639-3 $@.iso639-1 $@.chrF2 $@.bleu $@.bp $@.reflen $@.langs > $@
	rm -f $@.url $@.iso639-3 $@.iso639-1 $@.chrF2 $@.bleu $@.bp $@.reflen $@.1 $@.2 $@.langs
tatoeba-results-all-subset-%: tatoeba-%.md tatoeba-results-all-sorted-langpair

View File

@ -0,0 +1,518 @@
abk_Latn
aeb_Latn
amh_Latn
ang
ang_Arab
ang_Beng
ang_Cyrl
ang_Deva
ang_Grek
ang_Gujr
ang_Guru
ang_Hang
ang_Hani
ang_Mlym
ang_Orya
ang_Taml
ara_Cyrl
ara_Deva
ara_Grek
ara_Hang
ara_Hani
ara_Hebr
ara_Latn_TN
ara_SY
ara_TN
ara_Zinh
arg_Arab
arq_Cyrl
ary_Latn
asm_Arab
asm_Cyrl
asm_Deva
asm_Latn
ast_Grek
ast_Hani
ast_Hira
aze_Arab_IR
aze_Latn_IR
bam_Telu
bel_Hani
bel_Kana
ben_Arab
ben_Arab_IN
ben_Cyrl_IN
ben_Deva_IN
ben_Gujr_IN
ben_IN
ben_Latn
ben_Latn_IN
ber_Kana
bho_Latn
bod_Latn
bos_Arab
bos_Cyrl
bos_Deva
bua_Latn
bub_Latn
bul_Arab
bul_Grek
bul_Hani
bul_Hebr
bxr
bxr_Cyrl
bxr_Latn
cat_Cyrl
cat_Grek
cdo_Hans
cdo_Hant
cdo_Latn
ces_Arab
ces_Cyrl
ces_Geor
ces_Grek
ces_Hang
ces_Hani
ces_Hebr
cha_Hang
che_Latn
chr_Latn
chv_Latn
ckb
ckb_Latn
cla
cla_Cyrl
cla_Latn
cmn_Adlm_CN
cmn_Arab_CN
cmn_Bopo_CN
cmn_Bopo_TW
cmn_CN
cmn_Cyrl_CN
cmn_Cyrl_TW
cmn_Grek_CN
cmn_Grek_TW
cmn_Hani_CN
cmn_Hani_TW
cmn_Hans_CN
cmn_Hans_TW
cmn_Hant_CN
cmn_Hant_TW
cmn_Hira_CN
cmn_Hira_TW
cmn_Kana_CN
cmn_Kana_TW
cmn_Latn_CN
cmn_Latn_TW
cmn_Mand_CN
cmn_Mani_CN
cmn_Phlp_CN
cmn_Rohg_CN
cmn_Sogd_CN
cmn_Syrc_CN
cmn_Thaa_CN
cmn_TW
cnr
cnr_Latn
csb
dan_Arab
dan_Cyrl
dan_Grek
deu_Arab
deu_AT
deu_CH
deu_Cyrl
deu_Geor
deu_Grek
deu_Hang
deu_Hani
deu_Thai
diq
dnj
dty
ell_Adlm
ell_Arab
ell_Cyrl
ell_Hani
ell_Hebr
eng_Arab
eng_AU
eng_CA
eng_Cyrl
eng_Deva
eng_Deva_CA
eng_Deva_GB
eng_GB
eng_Geor
eng_Grek
eng_Hang
eng_Hani
eng_Hani_GB
eng_Hebr
eng_Knda
eng_NZ
eng_Sinh
eng_Thai
eng_ZA
est_Cyrl
fas
fas_Cyrl
fas_Grek
fas_Hebr
fas_Latn
fas_Zinh
fin_Arab
fin_Cyrl
fin_Hebr
fin_Thai
fqs
fqs_Hani
fqs_Latn
fqs_Zinh
fra_Arab
fra_CA
fra_Cyrl
fra_Geor
fra_Grek
fra_Hang
fra_Hani
fra_Hebr
fra_Thai
ful
fur
fur_Latn
gan_Arab
gcf
glg_Grek
gom_Knda
gom_Latn
got
got_Latn
grc_Latn
gug
guj_Arab
guj_Cyrl
guj_Latn
hbs
hbs_Cyrl
hbs_Grek
hbs_Latn
hbs_Nkoo
heb_Arab
heb_Beng
heb_Cyrl
heb_Deva
heb_Grek
heb_Guru
heb_Hani
heb_Kana
heb_Mlym
heb_Orya
heb_Sinh
heb_Taml
hin_Latn
hmn
hrv_Cyrl
hun_Arab
hun_Cyrl
hun_Hebr
ido_Bopo
ido_Deva
ido_Hani
iku_Cans
iku_Latn
ind_Cyrl
ind_Grek
ind_Hang
ind_Hebr
isl_Geor
ita_Arab
ita_Cyrl
ita_Grek
ita_Hang
ita_Hani
ita_Hebr
ita_Thai
jak_Latn
jav_Hani
jav_Hira
jpn_Arab
jpn_Cyrl
jpn_Geor
jpn_Grek
jpn_Hebr
jpn_Thai
kab_Hani
kam
kam_Latn
kan_Latn
kat_Cyrl
kat_Latn
kaz
kir_Arab
kir_Deva
kir_Latn
kmr
kmr_Latn
kok_Latn
kom
kor_Bopo
kor_Cyrl
kor_Grek
kor_Hebr
kor_Hira
kor_Kana
kor_Yiii
kur_Cyrl
kuv_Arab
kuv_Latn
lao_Latn
lit_Geor
lit_Thai
lkt_Hang
lld
lzh_Grek
mai_Latn
mal_Latn
mar_Kana
mar_Latn
mhr_Latn
mkd_Latn
mlt_Grek
mol
mon_Latn
mrj
msa_Cyrl
msa_Grek
msa_Hang
msa_Latn
mya_Cakm
mya_Latn
myv_Latn
nan_Bopo
nan_Hang
nds_Arab
nds_Beng
nds_Cyrl
nds_Deva
nds_Grek
nds_Gujr
nds_Guru
nds_Hani
nds_Hira
nds_Mlym
nds_NL
nds_Orya
nds_Taml
nep
nep_Arab
nep_Cyrl
nep_Latn
ngu
nld_Arab
nld_Cyrl
nld_Geor
nld_Grek
nld_Hang
nld_Hebr
nno_Deva
nob_Arab
nob_Beng
nob_Cyrl
nob_Deva
nob_Grek
nob_Hani
nob_Hira
nob_Mlym
nob_Orya
nob_Taml
nor_Arab
nor_Cyrl
nor_Geor
nor_Grek
nor_Hang
nor_Latn
ori_Cyrl
ori_Latn
oss_Latn
pam_Latn
pan
pan_Latn
plt
pms_Hani
pnb_Guru
pob
pob_Arab
pob_Cyrl
pob_Grek
pob_Hang
pob_Hebr
pol_Arab
pol_Cyrl
pol_Geor
pol_Grek
pol_Hang
pol_Hebr
por_Arab
por_Cyrl
por_Grek
por_Hang
por_Hebr
por_PT
prs
prs_Latn
pus_Latn
qug
quw
quy
quz
qvi
rmn
rmy
ron_Arab
ron_Cyrl
ron_Hang
ron_Hebr
ruk
ruk_Latn
rus_Adlm
rus_Arab
rus_Deva
rus_Geor
rus_Grek
rus_Hang
rus_Hani
rus_Hebr
rus_Rohg
rus_Syrc
rus_Thaa
san
san_Latn
sin_Latn
slv_Cyrl
slv_Hebr
snd_Latn
spa_AR
spa_Arab
spa_CL
spa_CO
spa_CR
spa_Cyrl
spa_DO
spa_EC
spa_Grek
spa_GT
spa_Hang
spa_Hani
spa_Hebr
spa_HN
spa_MX
spa_NI
spa_PA
spa_PE
spa_PR
spa_SV
spa_Thai
spa_UY
spa_VE
srp_Arab
srp_Latn_ME
srp_Zinh
swa
swc
swe_Arab
swe_Cyrl
swe_Grek
swe_Hani
syr_Syrc
tam_Deva_LK
tam_Latn
tam_Latn_LK
tam_LK
tat_Hang
tel_Latn
tgk_Latn
tgl_Grek
tgl_Hani
tha_Latn
thy
thy_Hani
thy_Latn
tir_Latn
tlh
tmh
toi
tuk_Cyrl
tur_Arab
tur_Cyrl
tur_Geor
tur_Grek
tur_Hang
tur_Hebr
uig_Hang
ukr_Arab
ukr_Geor
ukr_Grek
ukr_Latn
umb_Cyrl
urd_Deva
urd_Latn
vie_Cyrl
vie_Geor
vie_Hebr
vie_Thai
wuu_Beng
xal_Hani
xal_Latn
yid_Latn
yue
yue_Kana
yue_Thai
zho
zho_Arab
zho_Arab_CN
zho_Armn
zho_Bopo
zho_Bopo_CN
zho_Bopo_HK
zho_Bopo_TW
zho_CN
zho_Cyrl
zho_Cyrl_CN
zho_Cyrl_TW
zho_Ethi
zho_Grek
zho_Hang
zho_Hang_CN
zho_Hang_TW
zho_Hani
zho_Hani_CN
zho_Hani_HK
zho_Hani_TW
zho_Hans
zho_Hans_CN
zho_Hans_HK
zho_Hans_TW
zho_Hant
zho_Hant_CN
zho_Hant_HK
zho_Hant_TW
zho_Hebr
zho_Hira
zho_Hira_CN
zho_Hira_TW
zho_HK
zho_Kana
zho_Kana_CN
zho_Kana_TW
zho_Latn
zho_Latn_CN
zho_Latn_HK
zho_Latn_TW
zho_Telu
zho_TW
zho_Yiii
zho_Yiii_CN
zho_Yiii_TW
zho_Zinh
zlm