bugfix in tatoeba data extraction for multilingual data files (language code clash)

This commit is contained in:
Joerg Tiedemann 2020-06-25 00:45:25 +03:00
parent 844f8bf72a
commit 9e186d82d6
6 changed files with 162 additions and 25 deletions

View File

@ -124,6 +124,12 @@ OPUSMONOCORPORA = $(filter-out ${EXCLUDE_CORPORA} ,${patsubst %/latest/mono/${LA
${shell ls ${OPUSHOME}/*/latest/mono/${LANGID}.txt.gz}}})
## Languages available in OPUS: the first line of opus-langs.txt with the
## entry "simple" removed. Only defined when opus-langs.txt already exists.
ifneq ($(wildcard opus-langs.txt),)
  OPUSLANGS = $(filter-out simple,$(shell head -1 opus-langs.txt))
endif

## Entries of WORKHOME whose name contains '-' but not the string "old"
ALL_LANG_PAIRS = $(shell ls $(WORKHOME) | grep -- '-' | grep -v old)

## The same entries, split by whether the name matches the grep pattern '\+'
ALL_BILINGUAL_MODELS    = $(shell echo '$(ALL_LANG_PAIRS)' | tr ' ' "\n" | grep -v -- '\+')
ALL_MULTILINGUAL_MODELS = $(shell echo '$(ALL_LANG_PAIRS)' | tr ' ' "\n" | grep -- '\+')
@ -403,6 +409,13 @@ endif
## list of all languages in OPUS
##
## Downloads the language list from the OPUS API into a temporary file and
## flattens it into a single line of space-separated entries stored in $@.
opus-langs.txt:
	@# quote the URL: it contains '?', which is a shell glob character
	wget -O $@.tmp 'http://opus.nlpl.eu/opusapi/?languages=true'
	@# keep the lines containing '",', blank out quotes and commas, sort,
	@# join everything on one line and squeeze runs of blanks; the sed
	@# pattern needs TWO spaces before '*' (a single one also matches the
	@# empty string and would put a blank between all characters)
	grep '",' $@.tmp | tr '",' ' ' | sort | tr "\n" ' ' | sed 's/  */ /g' > $@
	rm -f $@.tmp
## make some data size-specific configuration parameters
## TODO: is it OK to delete LOCAL_TRAIN data?

View File

@ -90,9 +90,9 @@ else
# Locations of the Marian builds below ${APPLHOME}/marian-dev and the GPU
# settings used in this branch of the surrounding conditional.
# (earlier, disabled variants kept for reference)
# MARIAN = ${APPLHOME}/marian-dev/build-spm
# MARIANCPU = ${APPLHOME}/marian-dev/build-cpu
# MARIANSPM = ${APPLHOME}/marian-dev/build-spm
# NOTE(review): MARIAN, MARIANCPU and MARIANSPM are each assigned twice
# below; the later assignment (.../build) replaces the earlier one
# (.../build-new) - confirm that the first three lines are still needed.
MARIAN = ${APPLHOME}/marian-dev/build-new
MARIANCPU = ${APPLHOME}/marian-dev/build-new
MARIANSPM = ${APPLHOME}/marian-dev/build-new
MARIAN = ${APPLHOME}/marian-dev/build
MARIANCPU = ${APPLHOME}/marian-dev/build
MARIANSPM = ${APPLHOME}/marian-dev/build
# GPU_MODULES = cuda intel-mkl
GPU = v100
GPU_MODULES = python-env

View File

@ -30,10 +30,10 @@
### ze_en - English subtitles in chinese movies
OPUSLANGS = fi sv fr es de ar he "cmn cn yue zhs zht zh ze_zh zh_cn zh_CN zh_HK zh_tw zh_TW zh_yue" "pt pt_br pt_BR pt_PT" aa ab ace ach acm acu ada ady aeb aed ae afb afh af agr aha aii ain ajg aka ake akl ak aln alt alz amh ami amu am ang an aoc aoz apc ara arc arh arn arq ary arz ase asf ast as ati atj avk av awa aym ay azb "az az_IR" bal bam ban bar bas ba bbc bbj bci bcl bem ber "be be_tarask" bfi bg bho bhw bh bin bi bjn bm bn bnt bo bpy brx br bsn bs btg bts btx bua bug bum bvl bvy bxr byn byv bzj bzs cab cac cak cat cay ca "cbk cbk_zam" cce cdo ceb ce chf chj chk cho chq chr chw chy ch cjk cjp cjy ckb ckt cku cmo cnh cni cop co "crh crh_latn" crp crs cr csb cse csf csg csl csn csr cs cto ctu cuk cu cv cycl cyo cy daf da dga dhv dik din diq dje djk dng dop dsb dtp dty dua dv dws dyu dz ecs ee efi egl el eml enm eo esn et eu ewo ext fan fat fa fcs ff fil fj fkv fon foo fo frm frp frr fse fsl fuc ful fur fuv fy gaa gag gan ga gbi gbm gcf gcr gd gil glk gl gn gom gor gos got grc gr gsg gsm gss gsw guc gug gum gur guw gu gv gxx gym hai hak hau haw ha haz hb hch hds hif hi hil him hmn hne hnj hoc ho hrx hr hsb hsh hsn ht hup hus hu hyw hy hz ia iba ibg ibo id ie ig ike ik ilo inh inl ins io iro ise ish iso is it iu izh jak jam jap ja jbo jdt jiv jmx jp jsl jv kaa kab kac kam kar kau ka kbd kbh kbp kea kek kg kha kik kin ki kjh kj kk kl kmb kmr km kn koi kok kon koo ko kpv kqn krc kri krl kr ksh kss ksw ks kum ku kvk kv kwn kwy kw kxi ky kzj lad lam la lbe lb ldn lez lfn lg lij lin liv li lkt lld lmo ln lou lo loz lrc lsp ltg lt lua lue lun luo lus luy lu lv lzh lzz mad mai mam map_bms mau max maz mco mcp mdf men me mfe mfs mgm mgr mg mhr mh mic min miq mi mk mlg ml mnc mni mnw mn moh mos mo mrj mrq mr "ms ms_MY" mt mus mvv mwl mww mxv myv my mzn mzy nah nan nap na nba "nb nn no nb_NO nn_NO no_nb" nog nch nci ncj ncs ncx ndc "nds nds_nl" nd new ne ngl ngt ngu ng nhg nhk nhn nia nij niu nlv nl nnh non nov npi nqo nrm nr nso nst nv nya nyk nyn nyu ny nzi oar oc ojb 
oj oke olo om orm orv or osx os ota ote otk pag pam pan pap pau pa pbb pcd pck pcm pdc pdt pes pfl pid pih pis pi plt pl pms pmy pnb pnt pon pot po ppk ppl prg prl prs pso psp psr ps pys quc que qug qus quw quy qu quz qvi qvz qya rap rar rcf rif rmn rms rmy rm rnd rn rom ro rsl rue run rup ru rw ry sah sat sa sbs scn sco sc sd seh se sfs sfw sgn sgs sg shi shn shs shy sh sid simple si sjn sk sl sma sml sm sna sn som son sop sot so sqk sq "sr srp sr_ME" srm srn ssp ss stq st sux su svk swa swc swg swh sw sxn syr szl "ta ta_LK" tcf tcy tc tdt tdx tet te "tg tg_TJ" thv th tig tir tiv ti tkl tk tlh tll "tl tl_PH" tly tmh tmp tmw tn tob tog toh toi toj toki top to tpi tpw trv tr tsc tss ts tsz ttj tt tum tvl tw tyv ty tzh tzl tzo udm ug uk umb urh "ur ur_PK" usp uz vec vep ve "vi vi_VN" vls vmw vo vro vsl wae wal war wa wba wes wls wlv wol wo wuu xal xho xh xmf xpe yao yap yaq ybb yi yor yo yua zab zai zam za zdj zea zib zlm zne zpa zpg zsl zsm "zul zu" zza
OPUSLANGS ?= fi sv fr es de ar he "cmn cn yue zhs zht zh ze_zh zh_cn zh_CN zh_HK zh_tw zh_TW zh_yue" "pt pt_br pt_BR pt_PT" aa ab ace ach acm acu ada ady aeb aed ae afb afh af agr aha aii ain ajg aka ake akl ak aln alt alz amh ami amu am ang an aoc aoz apc ara arc arh arn arq ary arz ase asf ast as ati atj avk av awa aym ay azb "az az_IR" bal bam ban bar bas ba bbc bbj bci bcl bem ber "be be_tarask" bfi bg bho bhw bh bin bi bjn bm bn bnt bo bpy brx br bsn bs btg bts btx bua bug bum bvl bvy bxr byn byv bzj bzs cab cac cak cat cay ca "cbk cbk_zam" cce cdo ceb ce chf chj chk cho chq chr chw chy ch cjk cjp cjy ckb ckt cku cmo cnh cni cop co "crh crh_latn" crp crs cr csb cse csf csg csl csn csr cs cto ctu cuk cu cv cycl cyo cy daf da dga dhv dik din diq dje djk dng dop dsb dtp dty dua dv dws dyu dz ecs ee efi egl el eml enm eo esn et eu ewo ext fan fat fa fcs ff fil fj fkv fon foo fo frm frp frr fse fsl fuc ful fur fuv fy gaa gag gan ga gbi gbm gcf gcr gd gil glk gl gn gom gor gos got grc gr gsg gsm gss gsw guc gug gum gur guw gu gv gxx gym hai hak hau haw ha haz hb hch hds hif hi hil him hmn hne hnj hoc ho hrx hr hsb hsh hsn ht hup hus hu hyw hy hz ia iba ibg ibo id ie ig ike ik ilo inh inl ins io iro ise ish iso is it iu izh jak jam jap ja jbo jdt jiv jmx jp jsl jv kaa kab kac kam kar kau ka kbd kbh kbp kea kek kg kha kik kin ki kjh kj kk kl kmb kmr km kn koi kok kon koo ko kpv kqn krc kri krl kr ksh kss ksw ks kum ku kvk kv kwn kwy kw kxi ky kzj lad lam la lbe lb ldn lez lfn lg lij lin liv li lkt lld lmo ln lou lo loz lrc lsp ltg lt lua lue lun luo lus luy lu lv lzh lzz mad mai mam map_bms mau max maz mco mcp mdf men me mfe mfs mgm mgr mg mhr mh mic min miq mi mk mlg ml mnc mni mnw mn moh mos mo mrj mrq mr "ms ms_MY" mt mus mvv mwl mww mxv myv my mzn mzy nah nan nap na nba "nb nn no nb_NO nn_NO no_nb" nog nch nci ncj ncs ncx ndc "nds nds_nl" nd new ne ngl ngt ngu ng nhg nhk nhn nia nij niu nlv nl nnh non nov npi nqo nrm nr nso nst nv nya nyk nyn nyu ny nzi oar oc ojb 
oj oke olo om orm orv or osx os ota ote otk pag pam pan pap pau pa pbb pcd pck pcm pdc pdt pes pfl pid pih pis pi plt pl pms pmy pnb pnt pon pot po ppk ppl prg prl prs pso psp psr ps pys quc que qug qus quw quy qu quz qvi qvz qya rap rar rcf rif rmn rms rmy rm rnd rn rom ro rsl rue run rup ru rw ry sah sat sa sbs scn sco sc sd seh se sfs sfw sgn sgs sg shi shn shs shy sh sid simple si sjn sk sl sma sml sm sna sn som son sop sot so sqk sq "sr srp sr_ME" srm srn ssp ss stq st sux su svk swa swc swg swh sw sxn syr szl "ta ta_LK" tcf tcy tc tdt tdx tet te "tg tg_TJ" thv th tig tir tiv ti tkl tk tlh tll "tl tl_PH" tly tmh tmp tmw tn tob tog toh toi toj toki top to tpi tpw trv tr tsc tss ts tsz ttj tt tum tvl tw tyv ty tzh tzl tzo udm ug uk umb urh "ur ur_PK" usp uz vec vep ve "vi vi_VN" vls vmw vo vro vsl wae wal war wa wba wes wls wlv wol wo wuu xal xho xh xmf xpe yao yap yaq ybb yi yor yo yua zab zai zam za zdj zea zib zlm zne zpa zpg zsl zsm "zul zu" zza
ALLOPUSLANGS = fi sv fr es de ar he cmn cn yue zhs zht zh ze_zh zh_cn zh_CN zh_HK zh_tw zh_TW zh_yue pt pt_br pt_BR pt_PT aa ab ace ach acm acu ada ady aeb aed ae afb afh af agr aha aii ain ajg aka ake akl ak aln alt alz amh ami amu am ang an aoc aoz apc ara arc arh arn arq ary arz ase asf ast as ati atj avk av awa aym ay azb az az_IR bal bam ban bar bas ba bbc bbj bci bcl bem ber be be_tarask bfi bg bho bhw bh bin bi bjn bm bn bnt bo bpy brx br bsn bs btg bts btx bua bug bum bvl bvy bxr byn byv bzj bzs cab cac cak cat cay ca cbk cbk_zam cce cdo ceb ce chf chj chk cho chq chr chw chy ch cjk cjp cjy ckb ckt cku cmo cnh cni cop co crh crh_latn crp crs cr csb cse csf csg csl csn csr cs cto ctu cuk cu cv cycl cyo cy daf da dga dhv dik din diq dje djk dng dop dsb dtp dty dua dv dws dyu dz ecs ee efi egl el eml enm eo esn et eu ewo ext fan fat fa fcs ff fil fj fkv fon foo fo frm frp frr fse fsl fuc ful fur fuv fy gaa gag gan ga gbi gbm gcf gcr gd gil glk gl gn gom gor gos got grc gr gsg gsm gss gsw guc gug gum gur guw gu gv gxx gym hai hak hau haw ha haz hb hch hds hif hi hil him hmn hne hnj hoc ho hrx hr hsb hsh hsn ht hup hus hu hyw hy hz ia iba ibg ibo id ie ig ike ik ilo inh inl ins io iro ise ish iso is it iu izh jak jam jap ja jbo jdt jiv jmx jp jsl jv kaa kab kac kam kar kau ka kbd kbh kbp kea kek kg kha kik kin ki kjh kj kk kl kmb kmr km kn koi kok kon koo ko kpv kqn krc kri krl kr ksh kss ksw ks kum ku kvk kv kwn kwy kw kxi ky kzj lad lam la lbe lb ldn lez lfn lg lij lin liv li lkt lld lmo ln lou lo loz lrc lsp ltg lt lua lue lun luo lus luy lu lv lzh lzz mad mai mam map_bms mau max maz mco mcp mdf men me mfe mfs mgm mgr mg mhr mh mic min miq mi mk mlg ml mnc mni mnw mn moh mos mo mrj mrq mr ms ms_MY mt mus mvv mwl mww mxv myv my mzn mzy nah nan nap na nba nb nn no nb_NO nn_NO no_nb nog nch nci ncj ncs ncx ndc nds nds_nl nd new ne ngl ngt ngu ng nhg nhk nhn nia nij niu nlv nl nnh non nov npi nqo nrm nr nso nst nv nya nyk nyn nyu ny nzi oar oc ojb oj oke olo om 
orm orv or osx os ota ote otk pag pam pan pap pau pa pbb pcd pck pcm pdc pdt pes pfl pid pih pis pi plt pl pms pmy pnb pnt pon pot po ppk ppl prg prl prs pso psp psr ps pys quc que qug qus quw quy qu quz qvi qvz qya rap rar rcf rif rmn rms rmy rm rnd rn rom ro rsl rue run rup ru rw ry sah sat sa sbs scn sco sc sd seh se sfs sfw sgn sgs sg shi shn shs shy sh sid simple si sjn sk sl sma sml sm sna sn som son sop sot so sqk sq sr srp sr_ME srm srn ssp ss stq st sux su svk swa swc swg swh sw sxn syr szl ta ta_LK tcf tcy tc tdt tdx tet te tg tg_TJ thv th tig tir tiv ti tkl tk tlh tll tl tl_PH tly tmh tmp tmw tn tob tog toh toi toj toki top to tpi tpw trv tr tsc tss ts tsz ttj tt tum tvl tw tyv ty tzh tzl tzo udm ug uk umb urh ur ur_PK usp uz vec vep ve vi vi_VN vls vmw vo vro vsl wae wal war wa wba wes wls wlv wol wo wuu xal xho xh xmf xpe yao yap yaq ybb yi yor yo yua zab zai zam za zdj zea zib zlm zne zpa zpg zsl zsm zul zu zza
ALLOPUSLANGS ?= fi sv fr es de ar he cmn cn yue zhs zht zh ze_zh zh_cn zh_CN zh_HK zh_tw zh_TW zh_yue pt pt_br pt_BR pt_PT aa ab ace ach acm acu ada ady aeb aed ae afb afh af agr aha aii ain ajg aka ake akl ak aln alt alz amh ami amu am ang an aoc aoz apc ara arc arh arn arq ary arz ase asf ast as ati atj avk av awa aym ay azb az az_IR bal bam ban bar bas ba bbc bbj bci bcl bem ber be be_tarask bfi bg bho bhw bh bin bi bjn bm bn bnt bo bpy brx br bsn bs btg bts btx bua bug bum bvl bvy bxr byn byv bzj bzs cab cac cak cat cay ca cbk cbk_zam cce cdo ceb ce chf chj chk cho chq chr chw chy ch cjk cjp cjy ckb ckt cku cmo cnh cni cop co crh crh_latn crp crs cr csb cse csf csg csl csn csr cs cto ctu cuk cu cv cycl cyo cy daf da dga dhv dik din diq dje djk dng dop dsb dtp dty dua dv dws dyu dz ecs ee efi egl el eml enm eo esn et eu ewo ext fan fat fa fcs ff fil fj fkv fon foo fo frm frp frr fse fsl fuc ful fur fuv fy gaa gag gan ga gbi gbm gcf gcr gd gil glk gl gn gom gor gos got grc gr gsg gsm gss gsw guc gug gum gur guw gu gv gxx gym hai hak hau haw ha haz hb hch hds hif hi hil him hmn hne hnj hoc ho hrx hr hsb hsh hsn ht hup hus hu hyw hy hz ia iba ibg ibo id ie ig ike ik ilo inh inl ins io iro ise ish iso is it iu izh jak jam jap ja jbo jdt jiv jmx jp jsl jv kaa kab kac kam kar kau ka kbd kbh kbp kea kek kg kha kik kin ki kjh kj kk kl kmb kmr km kn koi kok kon koo ko kpv kqn krc kri krl kr ksh kss ksw ks kum ku kvk kv kwn kwy kw kxi ky kzj lad lam la lbe lb ldn lez lfn lg lij lin liv li lkt lld lmo ln lou lo loz lrc lsp ltg lt lua lue lun luo lus luy lu lv lzh lzz mad mai mam map_bms mau max maz mco mcp mdf men me mfe mfs mgm mgr mg mhr mh mic min miq mi mk mlg ml mnc mni mnw mn moh mos mo mrj mrq mr ms ms_MY mt mus mvv mwl mww mxv myv my mzn mzy nah nan nap na nba nb nn no nb_NO nn_NO no_nb nog nch nci ncj ncs ncx ndc nds nds_nl nd new ne ngl ngt ngu ng nhg nhk nhn nia nij niu nlv nl nnh non nov npi nqo nrm nr nso nst nv nya nyk nyn nyu ny nzi oar oc ojb oj oke olo om 
orm orv or osx os ota ote otk pag pam pan pap pau pa pbb pcd pck pcm pdc pdt pes pfl pid pih pis pi plt pl pms pmy pnb pnt pon pot po ppk ppl prg prl prs pso psp psr ps pys quc que qug qus quw quy qu quz qvi qvz qya rap rar rcf rif rmn rms rmy rm rnd rn rom ro rsl rue run rup ru rw ry sah sat sa sbs scn sco sc sd seh se sfs sfw sgn sgs sg shi shn shs shy sh sid simple si sjn sk sl sma sml sm sna sn som son sop sot so sqk sq sr srp sr_ME srm srn ssp ss stq st sux su svk swa swc swg swh sw sxn syr szl ta ta_LK tcf tcy tc tdt tdx tet te tg tg_TJ thv th tig tir tiv ti tkl tk tlh tll tl tl_PH tly tmh tmp tmw tn tob tog toh toi toj toki top to tpi tpw trv tr tsc tss ts tsz ttj tt tum tvl tw tyv ty tzh tzl tzo udm ug uk umb urh ur ur_PK usp uz vec vep ve vi vi_VN vls vmw vo vro vsl wae wal war wa wba wes wls wlv wol wo wuu xal xho xh xmf xpe yao yap yaq ybb yi yor yo yua zab zai zam za zdj zea zib zlm zne zpa zpg zsl zsm zul zu zza
## Run the iso639 tool from ${HOME}/projappl/ISO639 with option -3 on every
## language code in ALLOPUSLANGS.
## The target does not create a file called "iso639-3", so it is declared
## phony to make sure the command runs even if such a file exists.
.PHONY: iso639-3
iso639-3:
	${HOME}/projappl/ISO639/iso639 -3 ${ALLOPUSLANGS}

View File

@ -56,16 +56,122 @@
## Language inventories (adapted from the Tatoeba-Challenge Makefile).
## TATOEBA_LANGS is derived from local data: the *.txt.gz files found in
## $(OPUSHOME)/Tatoeba/latest/mono.

ISO639       = iso639
GET_ISO_CODE = $(ISO639) -m

## Tatoeba: file names without the .txt.gz suffix, then the output of
## GET_ISO_CODE for them without the entry "xxx"
TATOEBA_LANGS  = $(sort $(patsubst %.txt.gz,%,$(notdir $(wildcard $(OPUSHOME)/Tatoeba/latest/mono/*.txt.gz))))
TATOEBA_LANGS3 = $(sort $(filter-out xxx,$(shell $(GET_ISO_CODE) $(TATOEBA_LANGS))))

## words printed by langgroup (-g and -G mode) that contain a '+'
TATOEBA_LANGGROUPS  = $(shell langgroup -g -n $(TATOEBA_LANGS3) 2>/dev/null | tr " " "\n" | grep '+')
TATOEBA_LANGGROUPS2 = $(shell langgroup -G -n $(TATOEBA_LANGS3) 2>/dev/null | tr " " "\n" | grep '+')

## the same for the languages listed in OPUSLANGS
OPUS_LANGS3      = $(sort $(filter-out xxx,$(shell $(GET_ISO_CODE) $(OPUSLANGS))))
OPUS_LANGGROUPS  = $(shell langgroup -g -n $(OPUS_LANGS3) 2>/dev/null | tr " " "\n" | grep '+')
OPUS_LANGGROUPS2 = $(shell langgroup -G -n $(OPUS_LANGS3) 2>/dev/null | tr " " "\n" | grep '+')

## union of both inventories (to make sure we don't miss anything)
OPUSTATOEBA_LANGS3      = $(sort $(OPUS_LANGS3) $(TATOEBA_LANGS3))
OPUSTATOEBA_LANGGROUPS  = $(shell langgroup -g -n $(OPUSTATOEBA_LANGS3) 2>/dev/null | tr " " "\n" | grep '+')
OPUSTATOEBA_LANGGROUPS2 = $(shell langgroup -G -n $(OPUSTATOEBA_LANGS3) 2>/dev/null | tr " " "\n" | grep '+')
# some special language models

## Language set of the "westgermanic" model; the same list is used on the
## source and on the target side. (sort also drops duplicate entries.)
TATOEBA_WESTGERMANIC = ${sort eng nld gos hrx swg prg yid deu ltz fry nds afr bar ang enm sco}

.PHONY: tatoeba-westgermanic tatoeba-westgermanic-eval tatoeba-westgermanice-eval

## Train the model: recursive make of tatoeba-multilingual-train.
## FIT_DATA_SIZE and LANGPAIRSTR are passed exactly once.
tatoeba-westgermanic:
	${MAKE} SRCLANGS="${TATOEBA_WESTGERMANIC}" TRGLANGS="${TATOEBA_WESTGERMANIC}" \
		FIT_DATA_SIZE=100000 LANGPAIRSTR=westgermanic \
		tatoeba-multilingual-train

## Evaluate the model (tatoeba-multilingual-eval) and afterwards run
## dist-tatoeba with the same settings.
tatoeba-westgermanic-eval:
	${MAKE} SRCLANGS="${TATOEBA_WESTGERMANIC}" TRGLANGS="${TATOEBA_WESTGERMANIC}" \
		FIT_DATA_SIZE=100000 LANGPAIRSTR=westgermanic \
		tatoeba-multilingual-eval
	${MAKE} SRCLANGS="${TATOEBA_WESTGERMANIC}" TRGLANGS="${TATOEBA_WESTGERMANIC}" \
		FIT_DATA_SIZE=100000 LANGPAIRSTR=westgermanic \
		dist-tatoeba

## Alias that keeps the previous (misspelled) target name working.
tatoeba-westgermanice-eval: tatoeba-westgermanic-eval
###########################################################################################
# language groups
###########################################################################################
## Show the computed language groups, one list per line
## (first the LANGGROUPS variant, then the LANGGROUPS2 variant).

# groups derived from the Tatoeba languages
tatoeba-langgroups:
	@echo $(TATOEBA_LANGGROUPS)
	@echo $(TATOEBA_LANGGROUPS2)

# groups derived from the combined OPUS + Tatoeba languages
opus-langgroups:
	@echo $(OPUSTATOEBA_LANGGROUPS)
	@echo $(OPUSTATOEBA_LANGGROUPS2)
## Train one multilingual model per entry of TATOEBA_LANGGROUPS, using the
## languages of the group as source and as target languages.
##   langs: the group entry with every '+' turned into a blank
##   name:  first blank-separated field printed by `langgroup -p`
tatoeba-langgroup:
	for group in $(TATOEBA_LANGGROUPS); do \
	  langs=$$(echo $$group | sed 's/\+/ /g'); \
	  name=$$(langgroup -p $$langs | cut -f1 -d' '); \
	  $(MAKE) LANGPAIRSTR="$$name-$$name" \
	          SRCLANGS="$$langs" TRGLANGS="$$langs" \
	          MODELTYPE=transformer FIT_DATA_SIZE=1000000 \
	          tatoeba-multilingual-train; \
	done
## Train one model per entry of TATOEBA_LANGGROUPS that translates from the
## languages of the group into English (TRGLANGS=eng).
##   langs: the group entry with every '+' turned into a blank
##   name:  first blank-separated field printed by `langgroup -p`
tatoeba-group2eng:
	for group in $(TATOEBA_LANGGROUPS); do \
	  langs=$$(echo $$group | sed 's/\+/ /g'); \
	  name=$$(langgroup -p $$langs | cut -f1 -d' '); \
	  $(MAKE) LANGPAIRSTR="$$name-eng" \
	          SRCLANGS="$$langs" TRGLANGS=eng \
	          MODELTYPE=transformer FIT_DATA_SIZE=1000000 \
	          tatoeba-multilingual-train; \
	done
## Train one model per entry of TATOEBA_LANGGROUPS that translates from
## English (SRCLANGS=eng) into the languages of the group.
##   langs: the group entry with every '+' turned into a blank
##   name:  first blank-separated field printed by `langgroup -p`
tatoeba-eng2group:
	for group in $(TATOEBA_LANGGROUPS); do \
	  langs=$$(echo $$group | sed 's/\+/ /g'); \
	  name=$$(langgroup -p $$langs | cut -f1 -d' '); \
	  $(MAKE) LANGPAIRSTR="eng-$$name" \
	          SRCLANGS=eng TRGLANGS="$$langs" \
	          MODELTYPE=transformer FIT_DATA_SIZE=1000000 \
	          tatoeba-multilingual-train; \
	done
##-------------------------------------------------------------------
## multilingual models
## with all OPUS data not only the languages that have Tatoeba data
##-------------------------------------------------------------------
## Train one multilingual model per entry of OPUSTATOEBA_LANGGROUPS, using
## the languages of the group as source and as target languages; the
## language-pair label carries the prefix "all-".
##   langs: the group entry with every '+' turned into a blank
##   name:  first blank-separated field printed by `langgroup -p`
tatoeba-all-langgroup:
	for group in $(OPUSTATOEBA_LANGGROUPS); do \
	  langs=$$(echo $$group | sed 's/\+/ /g'); \
	  name=$$(langgroup -p $$langs | cut -f1 -d' '); \
	  $(MAKE) LANGPAIRSTR="all-$$name-$$name" \
	          SRCLANGS="$$langs" TRGLANGS="$$langs" \
	          MODELTYPE=transformer FIT_DATA_SIZE=1000000 \
	          tatoeba-multilingual-train; \
	done
## Train one model per entry of OPUSTATOEBA_LANGGROUPS that translates from
## the languages of the group into English (TRGLANGS=eng); the
## language-pair label carries the prefix "all-".
##   langs: the group entry with every '+' turned into a blank
##   name:  first blank-separated field printed by `langgroup -p`
tatoeba-all-group2eng:
	for group in $(OPUSTATOEBA_LANGGROUPS); do \
	  langs=$$(echo $$group | sed 's/\+/ /g'); \
	  name=$$(langgroup -p $$langs | cut -f1 -d' '); \
	  $(MAKE) LANGPAIRSTR="all-$$name-eng" \
	          SRCLANGS="$$langs" TRGLANGS=eng \
	          MODELTYPE=transformer FIT_DATA_SIZE=1000000 \
	          tatoeba-multilingual-train; \
	done
## Train one model per entry of OPUSTATOEBA_LANGGROUPS that translates from
## English (SRCLANGS=eng) into the languages of the group; the
## language-pair label carries the prefix "all-".
##   langs: the group entry with every '+' turned into a blank
##   name:  first blank-separated field printed by `langgroup -p`
tatoeba-all-eng2group:
	for group in $(OPUSTATOEBA_LANGGROUPS); do \
	  langs=$$(echo $$group | sed 's/\+/ /g'); \
	  name=$$(langgroup -p $$langs | cut -f1 -d' '); \
	  $(MAKE) LANGPAIRSTR="all-eng-$$name" \
	          SRCLANGS=eng TRGLANGS="$$langs" \
	          MODELTYPE=transformer FIT_DATA_SIZE=1000000 \
	          tatoeba-multilingual-train; \
	done
###########################################################################################
@ -504,9 +610,13 @@ FIXLANGIDS = | sed 's/zho\(.*\)_HK/yue\1/g;s/zho\(.*\)_CN/cmn\1/g;s/zho\(.*\)_T
cut -f2 ${dir $@}Tatoeba-test.${LANGPAIR}.clean.id | sort -u | tr "\n" ' ' > $(@:.${SRCEXT}.gz=.${TRGEXT}.labels)
ifeq (${SRC},zho)
echo -n 'zho zho_Hans zho_Hant cmn' >> $(@:.${SRCEXT}.gz=.${SRCEXT}.labels)
tr ' ' "\n" < $(@:.${SRCEXT}.gz=.${SRCEXT}.labels) | sort -u | tr "\n" ' ' >$(@:.${SRCEXT}.gz=.${SRCEXT}.labels).tmp
mv $(@:.${SRCEXT}.gz=.${SRCEXT}.labels).tmp $(@:.${SRCEXT}.gz=.${SRCEXT}.labels)
endif
ifeq (${TRG},zho)
echo -n 'zho zho_Hans zho_Hant cmn' >> $(@:.${SRCEXT}.gz=.${TRGEXT}.labels)
tr ' ' "\n" < $(@:.${SRCEXT}.gz=.${TRGEXT}.labels) | sort -u | tr "\n" ' ' >$(@:.${SRCEXT}.gz=.${TRGEXT}.labels).tmp
mv $(@:.${SRCEXT}.gz=.${TRGEXT}.labels).tmp $(@:.${SRCEXT}.gz=.${TRGEXT}.labels)
endif
rm -f $@.d/data/${LANGPAIR}/*
rmdir $@.d/data/${LANGPAIR}
@ -556,22 +666,35 @@ endif
done \
done
#######################################
# finally, compress the big datafiles
# and cleanup
# finally, remove the big data files
# with all the different language variants
#######################################
for d in dev test train; do \
if [ ! -e ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${SRCEXT}.gz ]; then \
${GZIP} -f ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${SRCEXT}; \
${GZIP} -f ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${TRGEXT}; \
else \
rm -f ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${SRCEXT}; \
rm -f ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${TRGEXT}; \
fi; \
rm -f ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${SRCEXT}; \
rm -f ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${TRGEXT}; \
rm -f ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.id; \
done
# #######################################
# # finally, compress the big datafiles
# # and cleanup
# #######################################
# for d in dev test train; do \
# if [ ! -e ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${SRCEXT}.gz ]; then \
# ${GZIP} -f ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${SRCEXT}; \
# ${GZIP} -f ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${TRGEXT}; \
# else \
# rm -f ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${SRCEXT}; \
# rm -f ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${TRGEXT}; \
# fi; \
# rm -f ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.id; \
# done
# #######################################
# # special treatment for Chinese
# # - simplified vs traditional script
@ -643,7 +766,7 @@ results/tatoeba-results%.md: tatoeba-results% tatoeba-results-BLEU-sorted-model
echo "because the models are not yet released or their performance is too poor" >> $@
echo "to be useful for anything." >> $@
echo "" >>$@
echo "| Model | LangPair | chrF2 | BLEU |" >> $@
echo "| Model | Language Pair | chrF2 | BLEU |" >> $@
echo "|-----------------:|------------|-----------:|---------:|" >> $@
( p=`grep -P 'ref_len = 1?[0-9]?[0-9]\)' tatoeba-results-BLEU-sorted-model | cut -f2 | sort -u | tr "\n" '|' | sed 's/|$$//'`; \
grep -v -P "\t($$p)\t" $< |\
@ -654,8 +777,8 @@ results/tatoeba-results-chrF2%.md: tatoeba-results-chrF2% tatoeba-results-BLEU-s
mkdir -p ${dir $@}
echo "# Tatoeba translation results" >$@
echo "" >>$@
echo "| Model | LangPair | chrF2 |" >> $@
echo "|-----------------:|------------|-----------:|" >> $@
echo "| Model | Language Pair | chrF2 |" >> $@
echo "|-----------------:|------------|-----------:|" >> $@
( p=`grep -P 'ref_len = 1?[0-9]?[0-9]\)' tatoeba-results-BLEU-sorted-model | cut -f2 | sort -u | tr "\n" '|' | sed 's/|$$//'`; \
grep -v -P "\t($$p)\t" $< |\
sed 's/ / | /g;s/^/| /;s/$$/ |/' >> $@ )
@ -664,8 +787,8 @@ results/tatoeba-results-BLEU%.md: tatoeba-results-BLEU% tatoeba-results-BLEU-sor
mkdir -p ${dir $@}
echo "# Tatoeba translation results" >$@
echo "" >>$@
echo "| Model | LangPair | BLEU | Details |" >> $@
echo "|-----------------:|------------|-----------:|---------:|" >> $@
echo "| Model | Language Pair | BLEU | Details |" >> $@
echo "|-----------------:|------------|-----------:|---------:|" >> $@
( p=`grep -P 'ref_len = 1?[0-9]?[0-9]\)' tatoeba-results-BLEU-sorted-model | cut -f2 | sort -u | tr "\n" '|' | sed 's/|$$//'`; \
grep -v -P "\t($$p)\t" $< |\
sed 's/ / | /g;s/^/| /;s/$$/ |/' >> $@ )

View File

@ -20,9 +20,9 @@ endif
%.submit:
mkdir -p ${WORKDIR}
echo '#!/bin/bash -l' > $@
echo '#SBATCH -J "${LANGSTR}-${DATASET}-${@:.submit=}"' >>$@
echo '#SBATCH -o ${LANGSTR}-${DATASET}-${@:.submit=}.out.%j' >> $@
echo '#SBATCH -e ${LANGSTR}-${DATASET}-${@:.submit=}.err.%j' >> $@
echo '#SBATCH -J "${LANGPAIRSTR}-${DATASET}-${@:.submit=}"' >>$@
echo '#SBATCH -o ${LANGPAIRSTR}-${DATASET}-${@:.submit=}.out.%j' >> $@
echo '#SBATCH -e ${LANGPAIRSTR}-${DATASET}-${@:.submit=}.err.%j' >> $@
echo '#SBATCH --mem=${HPC_MEM}' >> $@
echo '#SBATCH --exclude=r18g08' >> $@
ifdef EMAIL
@ -35,6 +35,7 @@ endif
ifeq (${shell hostname --domain},bullx)
echo '#SBATCH --account=${CSCPROJECT}' >> $@
echo '#SBATCH --gres=gpu:${GPU}:${NR_GPUS},nvme:${HPC_DISK}' >> $@
# echo '#SBATCH --exclude=r18g02' >> $@
else
echo '#SBATCH --gres=gpu:${GPU}:${NR_GPUS}' >> $@
endif

View File

@ -26,7 +26,7 @@ eval-testsets:
## Run the goal given by the pattern stem once for every test set listed in
## TESTSETS. Prerequisites are ${TESTSETS_PRESRC} and ${TESTSETS_PRETRG}.
## The goal name is obtained by stripping the "-testsets-langpair" suffix
## from the target and is handed to a recursive ${MAKE} call together with
## TESTSET and TESTSET_NAME.
## NOTE(review): the loop body calls ${MAKE} twice per test set, first with
## TESTSET_NAME=$$t and then with TESTSET_NAME=$$t-${SRC}${TRG}; confirm
## that both calls are intended.
%-testsets-langpair: ${TESTSETS_PRESRC} ${TESTSETS_PRETRG}
@echo "testsets: ${TESTSET_DIR}/*.${SRCEXT}.gz"
for t in ${TESTSETS}; do \
${MAKE} TESTSET=$$t TESTSET_NAME=$$t ${@:-testsets-langpair=}; \
${MAKE} TESTSET=$$t TESTSET_NAME=$$t-${SRC}${TRG} ${@:-testsets-langpair=}; \
done