pivot-based translations added

This commit is contained in:
Joerg Tiedemann 2020-05-17 22:43:05 +03:00
parent 1246bcd271
commit b01b4f22c3
9 changed files with 365 additions and 34 deletions

View File

@ -190,6 +190,7 @@ all: ${WORKDIR}/config.mk
${MAKE} compare
#---------------------------------------------------------------------
# run everything including backtranslation of wiki-data
#

View File

@ -11,10 +11,6 @@ include ../lib/slurm.mk
SRC = af
TRG = en
## set to 1 if the model for backtranslation is a multi-target model
## --> need to use pre-processing script differently
MULTI_TARGET_MODEL = 0
## various sources are available
## can be general wikipedia, wikinews, wikibooks, ...
WIKISOURCE = wiki
@ -34,7 +30,7 @@ LANGPAIR = ${SRC}-${TRG}
PWD := $(shell pwd)
MODELHOME = ../models/${LANGPAIR}
MODELHOME ?= ../models/${LANGPAIR}
## standard sort is different from UTF8-based sort
## --> prefer models with augmented data sets (separated by +)
## we need the UTF8 sort order
@ -43,7 +39,6 @@ MODELHOME = ../models/${LANGPAIR}
MODELZIP = ${lastword ${shell ls ${MODELHOME}/*-20*.zip | LANG=en_US.UTF-8 sort}}
MODELNAME = ${patsubst %.zip,%,${notdir ${MODELZIP}}}
ifeq (${MODELNAME},)
MODELHOME = ../work-langid/models/${LANGPAIR}
# MODELZIP = ${lastword ${sort ${wildcard ${MODELHOME}/*-20*.zip}}}
@ -51,6 +46,15 @@ ifeq (${MODELNAME},)
MODELNAME = ${patsubst %.zip,%,${notdir ${MODELZIP}}}
endif
## set to 1 if the model for backtranslation is a multi-target model
## --> need to use pre-processing script differently
ifeq ($(words $(subst +, ,$(lastword $(subst -, ,$(notdir ${MODELHOME}))))),1)
MULTI_TARGET_MODEL = 0
else
MULTI_TARGET_MODEL = 1
endif
ifdef LOCAL_SCRATCH
TMPDIR = ${LOCAL_SCRATCH}
endif

View File

@ -45,7 +45,7 @@ OPUSREAD_ARGS =
## ELRA corpora
ELRA_CORPORA = ${patsubst %/latest/xml/${LANGPAIR}.xml.gz,%,\
${patsubst ${OPUSHOME}/%,%,\
${shell ls ${OPUSHOME}/ELRA-*/latest/xml/${LANGPAIR}.xml.gz}}}
${shell ls ${OPUSHOME}/ELRA-*/latest/xml/${LANGPAIR}.xml.gz 2>/dev/null}}}
## exclude certain data sets
## TODO: include ELRA corpora
@ -85,7 +85,10 @@ DEVMINSIZE = 250
## size of heldout data for each sub-corpus
## (only if there is at least twice as many examples in the corpus)
HELDOUTSIZE = ${DEVSIZE}
## NEW: set to 0
# HELDOUTSIZE = ${DEVSIZE}
HELDOUTSIZE = 0
##----------------------------------------------------------------------------
## train/dev/test data

View File

@ -3,7 +3,7 @@
SRCLANGS ?= ${SRC}
TRGLANGS ?= ${TRG}
THREADS ?= ${HPC_CORES}
THREADS ?= ${HPC_CORES}
## SKIP_LANGPAIRS can be used to skip certain language pairs
@ -33,6 +33,7 @@ TRAINDATA_SIZE = ${shell \
grep 'total size (${DATASET}):' ${WORKDIR}/train/README.md | cut -f2 -d':' ; \
fi }
## look for cleanup scripts and put them into a pipe
## they should be executable and should basically read STDIN and print to STDOUT
## no further arguments are supported
@ -46,6 +47,10 @@ ifneq (${wildcard scripts/cleanup/${TRG}},)
endif
##-------------------------------------------------------------
## backtranslated data and pivot-based synthetic training data
##-------------------------------------------------------------
## back translation data
## - use only the latest backtranslations
## if such a subdir exists
@ -56,15 +61,23 @@ else
BACKTRANS_DIR = backtranslate/${TRG}-${SRC}
endif
## TODO: make it possible to select only parts of the BT data
## ---> use TRAINDATA_SIZE to take max the same amount of all shuffled BT data
BACKTRANS_SRC = ${sort ${wildcard ${BACKTRANS_DIR}/*.${SRCEXT}.gz}}
ifeq (${USE_PIVOTING},1)
BACKTRANS_SRC = ${sort ${wildcard ${BACKTRANS_DIR}/*.${SRCEXT}.gz}} \
${sort ${wildcard pivoting/${SRC}-${TRG}/latest/*.${SRCEXT}.gz} \
${wildcard pivoting/${TRG}-${SRC}/latest/*.${SRCEXT}.gz}}
else
BACKTRANS_SRC = ${sort ${wildcard ${BACKTRANS_DIR}/*.${SRCEXT}.gz}}
endif
BACKTRANS_TRG = ${patsubst %.${SRCEXT}.gz,%.${TRGEXT}.gz,${BACKTRANS_SRC}}
##-------------------------------------------------------------
## data sets (train/dev/test)
##-------------------------------------------------------------
ifeq (${USE_BACKTRANS},1)
CLEAN_TRAIN_SRC = ${patsubst %,${DATADIR}/${PRE}/%.${LANGPAIR}.clean.${SRCEXT}.gz,${TRAINSET}} ${BACKTRANS_SRC}
@ -88,9 +101,11 @@ DATA_TRG := ${sort ${CLEAN_TRAIN_TRG} ${CLEAN_TUNE_TRG} ${CLEAN_DEV_TRG} ${CLEAN
##-------------------------------------------------------------
## make data in reverse direction without re-doing word alignment etc ...
## ---> this is dangerous when things run in parallel
## ---> only works for bilingual models
##-------------------------------------------------------------
REV_LANGSTR = ${subst ${SPACE},+,$(TRGLANGS)}-${subst ${SPACE},+,$(SRCLANGS)}
REV_WORKDIR = ${WORKHOME}/${REV_LANGSTR}
@ -170,6 +185,8 @@ clean-data:
clean-data-source: ${DATA_SRC} ${DATA_TRG}
## monolingual data sets (for sentence piece models)
mono-data: ${LOCAL_MONO_DATA}.${PRE}
@ -343,10 +360,10 @@ add-to-local-train-data: ${CLEAN_TRAIN_SRC} ${CLEAN_TRAIN_TRG}
echo ${CLEAN_TRAIN_TRG}; \
fi
ifneq (${CLEAN_TRAIN_SRC},)
echo -n "* ${LANGPAIR}: ${TRAINSET}, " >> ${dir ${LOCAL_TRAIN_SRC}}README.md
echo -n "* ${SRC}-${TRG}: ${TRAINSET}, " >> ${dir ${LOCAL_TRAIN_SRC}}README.md
zcat ${CLEAN_TRAIN_SRC} | wc -l >> ${dir ${LOCAL_TRAIN_SRC}}README.md
ifeq (${USE_BACKTRANS},1)
echo -n "* ${LANGPAIR} backtranslations: ${basename ${basename ${dir ${BACKTRANS_SRC}}}}, " \
echo -n "* ${SRC}-${TRG} (synthetic): ${basename ${patsubst %.${SRC}.gz,%,${notdir ${BACKTRANS_SRC}}}}, " \
>> ${dir ${LOCAL_TRAIN_SRC}}README.md
zcat ${BACKTRANS_SRC} | wc -l >> ${dir ${LOCAL_TRAIN_SRC}}README.md
endif

View File

@ -242,7 +242,7 @@ listallmodels:
## include all backtranslation data as well in training
## start from the pre-trained opus model if it exists
BT_MODEL = ${MODEL_SUBDIR}opus+bt${TRAINSIZE}.${PRE_SRC}-${PRE_TRG}
BT_MODEL = ${MODEL_SUBDIR}${DATASET}+bt${TRAINSIZE}.${PRE_SRC}-${PRE_TRG}
BT_MODEL_BASE = ${BT_MODEL}.${MODELTYPE}.model${NR}
BT_MODEL_START = ${WORKDIR}/${BT_MODEL_BASE}.npz
BT_MODEL_VOCAB = ${WORKDIR}/${BT_MODEL}.vocab.${MODEL_VOCABTYPE}
@ -265,6 +265,24 @@ endif
PIVOT_MODEL = ${MODEL_SUBDIR}${DATASET}+pivot${TRAINSIZE}.${PRE_SRC}-${PRE_TRG}
PIVOT_MODEL_BASE = ${PIVOT_MODEL}.${MODELTYPE}.model${NR}
PIVOT_MODEL_START = ${WORKDIR}/${PIVOT_MODEL_BASE}.npz
PIVOT_MODEL_VOCAB = ${WORKDIR}/${PIVOT_MODEL}.vocab.${MODEL_VOCABTYPE}
%-pivot:
ifneq (${wildcard ${MODEL_FINAL}},)
ifeq (${wildcard ${PIVOT_MODEL_START}},)
cp ${MODEL_FINAL} ${PIVOT_MODEL_START}
cp ${MODEL_VOCAB} ${PIVOT_MODEL_VOCAB}
endif
endif
rm -f ${WORKHOME}/${LANGPAIRSTR}/train.submit
${MAKE} DATASET=${DATASET}+pivot \
USE_BACKTRANS=1 USE_PIVOTING=1 \
MARIAN_EARLY_STOPPING=15 \
${@:-pivot=}

View File

@ -12,25 +12,25 @@
# NO_MEMAD = ${filter-out fi sv de fr nl,${iso639}}
#"de_AT de_CH de_DE de"
#"en_AU en_CA en_GB en_NZ en_US en_ZA en"
#"it_IT if"
#"es_AR es_CL es_CO es_CR es_DO es_EC es_ES es_GT es_HN es_MX es_NI es_PA es_PE es_PR es_SV es_UY es_VE es"
#"eu_ES eu"
#"hi_IN hi"
#"fr_BE fr_CA fr_FR fr"
#"fa_AF fa_IR fa"
#"ar_SY ar_TN ar"
#"bn_IN bn"
#da_DK
#bg_BG
#nb_NO
#nl_BE nl_NL
#tr_TR
#"de de_AT de_CH de_DE"
#"en en_AU en_CA en_GB en_NZ en_US en_ZA"
#"it it_IT"
#"es es_AR es_CL es_CO es_CR es_DO es_EC es_ES es_GT es_HN es_MX es_NI es_PA es_PE es_PR es_SV es_UY es_VE"
#"eu eu_ES"
#"hi hi_IN"
#"fr fr_BE fr_CA fr_FR"
#"fa fa_AF fa_IR"
#"ar ar_SY ar_TN"
#"bn bn_IN"
#da da_DK
#bg bg_BG
#nb nb_NO
#nl nl_BE nl_NL
#tr tr_TR
### ze_en - English subtitles in chinese movies
OPUSLANGS = fi sv fr es de ar he "cmn cn yue ze_zh zh_cn zh_CN zh_HK zh_tw zh_TW zh_yue zhs zht zh" "pt_br pt_BR pt_PT pt" aa ab ace ach acm acu ada ady aeb aed ae afb afh af agr aha aii ain ajg aka ake akl ak aln alt alz amh ami amu am ang an aoc aoz apc ara arc arh arn arq ary arz ase asf ast as ati atj avk av awa aym ay azb "az_IR az" bal bam ban bar bas ba bbc bbj bci bcl bem ber "be_tarask be" bfi bg bho bhw bh bin bi bjn bm bn bnt bo bpy brx br bsn bs btg bts btx bua bug bum bvl bvy bxr byn byv bzj bzs cab cac cak cat cay ca "cbk_zam cbk" cce cdo ceb ce chf chj chk cho chq chr chw chy ch cjk cjp cjy ckb ckt cku cmo cnh cni cop co "crh_latn crh" crp crs cr csb cse csf csg csl csn csr cs cto ctu cuk cu cv cycl cyo cy daf da dga dhv dik din diq dje djk dng dop dsb dtp dty dua dv dws dyu dz ecs ee efi egl el eml enm eo esn et eu ewo ext fan fat fa fcs ff fil fj fkv fon foo fo frm frp frr fse fsl fuc ful fur fuv fy gaa gag gan ga gbi gbm gcf gcr gd gil glk gl gn gom gor gos got grc gr gsg gsm gss gsw guc gug gum gur guw gu gv gxx gym hai hak hau haw ha haz hb hch hds hif hi hil him hmn hne hnj hoc ho hrx hr hsb hsh hsn ht hup hus hu hyw hy hz ia iba ibg ibo id ie ig ike ik ilo inh inl ins io iro ise ish iso is it iu izh jak jam jap ja jbo jdt jiv jmx jp jsl jv kaa kab kac kam kar kau ka kbd kbh kbp kea kek kg kha kik kin ki kjh kj kk kl kmb kmr km kn koi kok kon koo ko kpv kqn krc kri krl kr ksh kss ksw ks kum ku kvk kv kwn kwy kw kxi ky kzj lad lam la lbe lb ldn lez lfn lg lij lin liv li lkt lld lmo ln lou lo loz lrc lsp ltg lt lua lue lun luo lus luy lu lv lzh lzz mad mai mam map_bms mau max maz mco mcp mdf men me mfe mfs mgm mgr mg mhr mh mic min miq mi mk mlg ml mnc mni mnw mn moh mos mo mrj mrq mr "ms_MY ms" mt mus mvv mwl mww mxv myv my mzn mzy nah nan nap na nba "nb_NO nb nn_NO nn nog no_nb no" nch nci ncj ncs ncx ndc "nds_nl nds" nd new ne ngl ngt ngu ng nhg nhk nhn nia nij niu nlv nl nnh non nov npi nqo nrm nr nso nst nv nya nyk nyn nyu ny nzi oar oc ojb 
oj oke olo om orm orv or osx os ota ote otk pag pam pan pap pau pa pbb pcd pck pcm pdc pdt pes pfl pid pih pis pi plt pl pms pmy pnb pnt pon pot po ppk ppl prg prl prs pso psp psr ps pys quc que qug qus quw quy qu quz qvi qvz qya rap rar rcf rif rmn rms rmy rm rnd rn rom ro rsl rue run rup ru rw ry sah sat sa sbs scn sco sc sd seh se sfs sfw sgn sgs sg shi shn shs shy sh sid simple si sjn sk sl sma sml sm sna sn som son sop sot so sqk sq "sr_ME sr srp" srm srn ssp ss stq st sux su svk swa swc swg swh sw sxn syr szl "ta_LK ta" tcf tcy tc tdt tdx tet te "tg_TJ tg" thv th tig tir tiv ti tkl tk tlh tll "tl_PH tl" tly tmh tmp tmw tn tob tog toh toi toj toki top to tpi tpw trv tr tsc tss ts tsz ttj tt tum tvl tw tyv ty tzh tzl tzo udm ug uk umb urh "ur_PK ur" usp uz vec vep ve "vi_VN vi" vls vmw vo vro vsl wae wal war wa wba wes wls wlv wol wo wuu xal xho xh xmf xpe yao yap yaq ybb yi yor yo yua zab zai zam za zdj zea zib zlm zne zpa zpg zsl zsm "zul zu" zza
OPUSLANGS = fi sv fr es de ar he "cmn cn yue zhs zht zh ze_zh zh_cn zh_CN zh_HK zh_tw zh_TW zh_yue" "pt pt_br pt_BR pt_PT" aa ab ace ach acm acu ada ady aeb aed ae afb afh af agr aha aii ain ajg aka ake akl ak aln alt alz amh ami amu am ang an aoc aoz apc ara arc arh arn arq ary arz ase asf ast as ati atj avk av awa aym ay azb "az az_IR" bal bam ban bar bas ba bbc bbj bci bcl bem ber "be be_tarask" bfi bg bho bhw bh bin bi bjn bm bn bnt bo bpy brx br bsn bs btg bts btx bua bug bum bvl bvy bxr byn byv bzj bzs cab cac cak cat cay ca "cbk cbk_zam" cce cdo ceb ce chf chj chk cho chq chr chw chy ch cjk cjp cjy ckb ckt cku cmo cnh cni cop co "crh crh_latn" crp crs cr csb cse csf csg csl csn csr cs cto ctu cuk cu cv cycl cyo cy daf da dga dhv dik din diq dje djk dng dop dsb dtp dty dua dv dws dyu dz ecs ee efi egl el eml enm eo esn et eu ewo ext fan fat fa fcs ff fil fj fkv fon foo fo frm frp frr fse fsl fuc ful fur fuv fy gaa gag gan ga gbi gbm gcf gcr gd gil glk gl gn gom gor gos got grc gr gsg gsm gss gsw guc gug gum gur guw gu gv gxx gym hai hak hau haw ha haz hb hch hds hif hi hil him hmn hne hnj hoc ho hrx hr hsb hsh hsn ht hup hus hu hyw hy hz ia iba ibg ibo id ie ig ike ik ilo inh inl ins io iro ise ish iso is it iu izh jak jam jap ja jbo jdt jiv jmx jp jsl jv kaa kab kac kam kar kau ka kbd kbh kbp kea kek kg kha kik kin ki kjh kj kk kl kmb kmr km kn koi kok kon koo ko kpv kqn krc kri krl kr ksh kss ksw ks kum ku kvk kv kwn kwy kw kxi ky kzj lad lam la lbe lb ldn lez lfn lg lij lin liv li lkt lld lmo ln lou lo loz lrc lsp ltg lt lua lue lun luo lus luy lu lv lzh lzz mad mai mam map_bms mau max maz mco mcp mdf men me mfe mfs mgm mgr mg mhr mh mic min miq mi mk mlg ml mnc mni mnw mn moh mos mo mrj mrq mr "ms ms_MY" mt mus mvv mwl mww mxv myv my mzn mzy nah nan nap na nba "nb nn no nb_NO nn_NO no_nb" nog nch nci ncj ncs ncx ndc "nds nds_nl" nd new ne ngl ngt ngu ng nhg nhk nhn nia nij niu nlv nl nnh non nov npi nqo nrm nr nso nst nv nya nyk nyn nyu ny nzi oar oc ojb 
oj oke olo om orm orv or osx os ota ote otk pag pam pan pap pau pa pbb pcd pck pcm pdc pdt pes pfl pid pih pis pi plt pl pms pmy pnb pnt pon pot po ppk ppl prg prl prs pso psp psr ps pys quc que qug qus quw quy qu quz qvi qvz qya rap rar rcf rif rmn rms rmy rm rnd rn rom ro rsl rue run rup ru rw ry sah sat sa sbs scn sco sc sd seh se sfs sfw sgn sgs sg shi shn shs shy sh sid simple si sjn sk sl sma sml sm sna sn som son sop sot so sqk sq "sr srp sr_ME" srm srn ssp ss stq st sux su svk swa swc swg swh sw sxn syr szl "ta ta_LK" tcf tcy tc tdt tdx tet te "tg tg_TJ" thv th tig tir tiv ti tkl tk tlh tll "tl tl_PH" tly tmh tmp tmw tn tob tog toh toi toj toki top to tpi tpw trv tr tsc tss ts tsz ttj tt tum tvl tw tyv ty tzh tzl tzo udm ug uk umb urh "ur ur_PK" usp uz vec vep ve "vi vi_VN" vls vmw vo vro vsl wae wal war wa wba wes wls wlv wol wo wuu xal xho xh xmf xpe yao yap yaq ybb yi yor yo yua zab zai zam za zdj zea zib zlm zne zpa zpg zsl zsm "zul zu" zza
allopus2pivot:

272
pivoting/Makefile Normal file
View File

@ -0,0 +1,272 @@
#
# pivoting - translate training data via pivot models
#
#
# absolute path of this directory (recipes cd elsewhere and need it)
PWD := $(shell pwd)
## language (SRC->TRG) pair we need
SRC = fi
TRG = se
## pivot language
PIVOT = nb
## langpair (sorted lang id's) of the original data
## to be translated from PIVOT to SRC
ORIGINAL_LANGPAIR = ${firstword ${sort ${PIVOT} ${TRG}}}-${lastword ${sort ${PIVOT} ${TRG}}}
PIVOT_LANGPAIR = ${PIVOT}-${SRC}
include ../lib/env.mk
include ../lib/config.mk
include ../lib/slurm.mk
## location of the original clean training data
ORIGINAL_DATADIR ?= ${PWD}/../work/data
## all clean data sets whose pivot-language side exists
ORIGINAL_DATASETS_SRC = ${wildcard ${ORIGINAL_DATADIR}/${PRE}/*.${ORIGINAL_LANGPAIR}.clean.${PIVOT}.gz}
## corresponding target-language files for ALL data sets
## (fixed: derive from ORIGINAL_DATASETS_SRC, the full list — deriving from
##  ORIGINAL_DATASRC reduced this "plural" variable to a single file)
ORIGINAL_DATASETS_TRG = ${patsubst %.${PIVOT}.gz,%.${TRG}.gz,${ORIGINAL_DATASETS_SRC}}
## default: operate on the first data set
## (recursive '=' above makes these forward references work)
ORIGINAL_DATASRC ?= ${firstword ${ORIGINAL_DATASETS_SRC}}
ORIGINAL_DATATRG ?= ${firstword ${ORIGINAL_DATASETS_TRG}}
## data set base names: strip path, '.<pivot>.gz' suffix and '.clean'
DATASET_NAMES = $(patsubst %.clean,%,$(patsubst %.${PIVOT}.gz,%,${notdir ${ORIGINAL_DATASETS_SRC}}))
DATASET_NAME = $(patsubst %.clean,%,$(patsubst %.${PIVOT}.gz,%,${notdir ${ORIGINAL_DATASRC}}))
## output dir
OUTPUT_DIR = ${SRC}-${TRG}
#--------------------------------------------------------------------------
# find a model to translate the original data
#--------------------------------------------------------------------------
## if there is no model for the pivot translation pair
## ---> look for a multilingual model that includes both languages
## TODO: make sure that PIVOT and SRC do not match other lang-IDs!!!
## TODO: what do we do if there is more than one multilingual model?
## --> need to define preference mechanism
## (currently: first match of the wildcard below wins)
## TODO: this should better come from some API or at least ObjectStorage
## (not local disk)
## preference order: exact pair dir > multi-source dir > multi-target dir
## > fully multilingual dir
ifeq ($(wildcard ../models/${PIVOT_LANGPAIR}),)
ifeq ($(wildcard ../models/*${PIVOT}*-${SRC}),)
ifeq ($(wildcard ../models/${PIVOT}-*${SRC}*),)
MODELHOME = $(firstword $(wildcard ../models/*${PIVOT}*-*${SRC}*))
else
MODELHOME = $(firstword $(wildcard ../models/${PIVOT}-*${SRC}*))
endif
else
MODELHOME = $(firstword $(wildcard ../models/*${PIVOT}*-${SRC}))
endif
else
MODELHOME = ../models/${PIVOT_LANGPAIR}
endif
## select the latest NMT model (assume this is the best one)
## standard sort is different from UTF8-based sort
## --> prefer models with augmented data sets (separated by +)
## we need the UTF8 sort order
## --> use bash sort and UTF8 locale
# MODELZIP = ${lastword ${sort ${wildcard ${MODELHOME}/*-20*.zip}}}
MODELZIP = ${lastword ${shell ls ${MODELHOME}/*-20*.zip | LANG=en_US.UTF-8 sort}}
MODELNAME = ${patsubst %.zip,%,${notdir ${MODELZIP}}}
## no released model found?
## ---> find a local one in the work directory
## (same preference cascade as above, but under ../work/models)
ifeq (${MODELNAME},)
ifeq ($(wildcard ../work/models/${PIVOT_LANGPAIR}),)
ifeq ($(wildcard ../work/models/*${PIVOT}*-${SRC}),)
ifeq ($(wildcard ../work/models/${PIVOT}-*${SRC}*),)
MODELHOME = $(firstword $(wildcard ../work/models/*${PIVOT}*-*${SRC}*))
else
MODELHOME = $(firstword $(wildcard ../work/models/${PIVOT}-*${SRC}*))
endif
else
MODELHOME = $(firstword $(wildcard ../work/models/*${PIVOT}*-${SRC}))
endif
else
MODELHOME = ../work/models/${PIVOT_LANGPAIR}
endif
# MODELZIP = ${lastword ${sort ${wildcard ${MODELHOME}/*-20*.zip}}}
MODELZIP = ${lastword ${sort ${wildcard ${MODELHOME}/opus-20*.zip}}}
MODELNAME = ${patsubst %.zip,%,${notdir ${MODELZIP}}}
endif
## set to 1 if the model for backtranslation is a multi-target model
## --> need to use pre-processing script differently
## heuristic: if the target part of the model dir name contains '+'
## (several language IDs), the model is multi-target
ifeq ($(words $(subst +, ,$(lastword $(subst -, ,$(notdir ${MODELHOME}))))),1)
MULTI_TARGET_MODEL = 0
else
MULTI_TARGET_MODEL = 1
endif
## decoder config (unpacked from the model zip into OUTPUT_DIR)
DECODER = ${OUTPUT_DIR}/${MODELNAME}/decoder.yml
## use node-local scratch space for temporary files if available
ifdef LOCAL_SCRATCH
TMPDIR = ${LOCAL_SCRATCH}
endif
#--------------------------------------------------------------------------
# target files to be created
#--------------------------------------------------------------------------
## just one data set
## NOTE(review): LANGPAIR is not defined in this file — presumably set by
## ../lib/config.mk; verify it matches ${SRC}-${TRG}
TRANSLATED_SRC = ${OUTPUT_DIR}/${DATASET_NAME}.${MODELNAME}.${LANGPAIR}.${PIVOT}.gz
## sentence-piece-segmented decoder input (intermediate file)
TRANSLATED_PRE = ${OUTPUT_DIR}/${DATASET_NAME}.${MODELNAME}.${LANGPAIR}.${PIVOT}.spm.gz
TRANSLATED_TRG = ${OUTPUT_DIR}/${DATASET_NAME}.${MODELNAME}.${LANGPAIR}.${SRC}.gz
## 'latest' copies — overwritten by each new translation run
TRANSLATED_LATEST_SRC = ${OUTPUT_DIR}/latest/${DATASET_NAME}.${LANGPAIR}.${SRC}.gz
TRANSLATED_LATEST_TRG = ${OUTPUT_DIR}/latest/${DATASET_NAME}.${LANGPAIR}.${TRG}.gz
## all data sets
ALL_TRANSLATED_SRC = $(patsubst %,${OUTPUT_DIR}/%.${MODELNAME}.${LANGPAIR}.${PIVOT}.gz,${DATASET_NAMES})
ALL_TRANSLATED_TRG = $(patsubst %,${OUTPUT_DIR}/%.${MODELNAME}.${LANGPAIR}.${SRC}.gz,${DATASET_NAMES})
ALL_TRANSLATED_LATEST_SRC = $(patsubst %,${OUTPUT_DIR}/latest/%.${LANGPAIR}.${SRC}.gz,${DATASET_NAMES})
ALL_TRANSLATED_LATEST_TRG = $(patsubst %,${OUTPUT_DIR}/latest/%.${LANGPAIR}.${TRG}.gz,${DATASET_NAMES})
# .INTERMEDIATE: ${TRANSLATED_PRE}
.PHONY: all prepare translate
## default goal: fetch the model and translate all data sets
all: ${DECODER} ${ALL_TRANSLATED_LATEST_SRC} ${ALL_TRANSLATED_LATEST_TRG}
prepare: ${TRANSLATED_PRE}
translate: ${TRANSLATED_TRG}
## aux function to print the selected modelname and data sets
## (debugging helpers — no files are built)
.PHONY: print-modelname
print-modelname:
@echo ${MODELNAME}
@echo ${MODELZIP}
@echo "${sort ${wildcard ${MODELHOME}/*-20*.zip}}"
## show the files involved in translating the first/selected data set
.PHONY: print-data
print-data:
@echo ${ORIGINAL_DATASRC}
@echo ${DATASET_NAME}
@echo ${TRANSLATED_SRC}
@echo ${TRANSLATED_TRG}
@echo ${TRANSLATED_LATEST_SRC}
@echo ${TRANSLATED_LATEST_TRG}
## show the files involved in translating all data sets
.PHONY: print-all-data
print-all-data:
@echo "${ORIGINAL_DATASETS_SRC}"
@echo "${DATASET_NAMES}"
@echo "${ALL_TRANSLATED_SRC}"
@echo "${ALL_TRANSLATED_TRG}"
@echo "${ALL_TRANSLATED_LATEST_SRC}"
@echo "${ALL_TRANSLATED_LATEST_TRG}"
## fetch the latest model
## ---> TODO: should we fetch from ObjectStorage instead?
## copies the model zip next to the output and unpacks it
## (silently does nothing when no model zip was found)
${OUTPUT_DIR}/${MODELNAME}/decoder.yml:
ifneq (${MODELZIP},)
mkdir -p ${dir $@}
cp ${MODELZIP} ${dir $@}
cd ${dir $@} && unzip *.zip
endif
#--------------------------------------------------------------------------
## pre-process data
#--------------------------------------------------------------------------
## multi-target models need the target language as an extra argument
## for the pre-processing script (target-language token)
ifeq (${MULTI_TARGET_MODEL},1)
PREPROCESS_ARGS = ${PIVOT} ${SRC} ${OUTPUT_DIR}/${MODELNAME}/source.spm
else
PREPROCESS_ARGS = ${PIVOT} ${OUTPUT_DIR}/${MODELNAME}/source.spm
endif
## segment the pivot-language side with the model's sentence-piece model
${TRANSLATED_PRE}: ${ORIGINAL_DATASRC}
ifneq (${MODELZIP},)
mkdir -p ${dir $@}
${MAKE} ${DECODER}
zcat $< |\
${OUTPUT_DIR}/${MODELNAME}/preprocess.sh ${PREPROCESS_ARGS} |\
gzip -c > $@
endif
# the same in an implicit rule (makes it possible to run things in parallel)
${OUTPUT_DIR}/%.${MODELNAME}.${LANGPAIR}.${PIVOT}.spm.gz: ${ORIGINAL_DATADIR}/${PRE}/%.clean.${PIVOT}.gz
ifneq (${MODELZIP},)
mkdir -p ${dir $@}
${MAKE} ${DECODER}
zcat $< |\
${OUTPUT_DIR}/${MODELNAME}/preprocess.sh ${PREPROCESS_ARGS} |\
gzip -c > $@
endif
## overwrite the file with the latest translations
## --> this allows multiple translation iterations
## without duplicating the data we want to use in MT training
${OUTPUT_DIR}/latest/%.${LANGPAIR}.${SRC}.gz: ${OUTPUT_DIR}/%.${MODELNAME}.${LANGPAIR}.${SRC}.gz
mkdir -p ${dir $@}
cp $< $@
## copy the original target-language side next to the translated source
## (second prerequisite only forces the translation to exist first)
${OUTPUT_DIR}/latest/%.${LANGPAIR}.${TRG}.gz: ${ORIGINAL_DATADIR}/${PRE}/%.clean.${TRG}.gz \
${OUTPUT_DIR}/latest/%.${LANGPAIR}.${SRC}.gz
mkdir -p ${dir $@}
cp $< $@
## explicit variant of the rule above for the selected data set
${TRANSLATED_LATEST_TRG}: ${ORIGINAL_DATATRG} ${TRANSLATED_LATEST_SRC}
mkdir -p ${dir $@}
cp $< $@
## translate
## run marian-decoder on the sentence-piece input, then strip the
## segmentation markers (U+2581) and surrounding whitespace
%.${LANGPAIR}.${SRC}.gz: %.${LANGPAIR}.${PIVOT}.spm.gz
ifneq (${MODELZIP},)
mkdir -p ${dir $@}
${MAKE} ${OUTPUT_DIR}/${MODELNAME}/decoder.yml
${LOADMODS} && cd ${OUTPUT_DIR}/${MODELNAME} && ${MARIAN}/marian-decoder \
-i ${PWD}/$< \
-c decoder.yml \
-d ${MARIAN_GPUS} \
${MARIAN_DECODER_FLAGS} |\
sed 's/ //g;s/▁/ /g' | sed 's/^ *//;s/ *$$//' |\
gzip -c > ${PWD}/$@
endif
## sanity check: every translated file must have exactly as many lines as
## its counterpart on the other language side (a mismatch indicates an
## incomplete or broken translation run)
## fixed: the sed substitution was single-quoted, so the shell never
## expanded $$s/$$t — T stayed identical to S and the check silently
## compared each file with itself; use double quotes and escape the dots
check-length:
	for d in `find . -maxdepth 1 -type d -name '*-*' -printf "%f "`; do \
	  s=`echo $$d | cut -f1 -d'-'`; \
	  t=`echo $$d | cut -f2 -d'-'`; \
	  echo "check $$d"; \
	  for S in `ls $$d/*.$$s.gz`; do \
	    T=`echo $$S | sed "s/\.$$s\.gz$$/.$$t.gz/"`; \
	    echo "$$S -- $$T"; \
	    zcat $$S | wc -l; \
	    zcat $$T | wc -l; \
	    if [ `zcat $$S | wc -l` != `zcat $$T | wc -l` ]; then \
	      echo "$$S != $$T"; \
	    fi \
	  done \
	done

View File

@ -15,6 +15,8 @@ parser.add_argument('-l','--supported','--supported-languages', action='store_tr
help='list all supported languages')
parser.add_argument('-c','--checklang','--check-language-support', action='store_true',
help='show whether languages are supported')
parser.add_argument('-v','--verbose', action='store_true',
help='verbose output')
args = parser.parse_args()
@ -27,15 +29,21 @@ def supported_language(lang):
def is_accepted(line,accept,reject):
# isReliable, textBytesFound, details = cld2.detect(line, hintLanguage=args.lang)
isReliable, textBytesFound, details = cld2.detect(line, bestEffort=True)
# isReliable, textBytesFound, details = cld2.detect(line, bestEffort=True)
if accept:
isReliable, textBytesFound, details = cld2.detect(line, hintLanguage=accept, bestEffort=True)
if details[0][1] == accept:
if isReliable:
return True
if args.verbose:
print("language mismatch: " + details[0][1] + " != " + accept + ", " + line, file=sys.stderr)
else:
isReliable, textBytesFound, details = cld2.detect(line, bestEffort=True)
if details[0][1] != reject:
return True
if args.verbose:
print("reject because detected: " + details[0][1] + ", " + line, file=sys.stderr)
if args.supported:

View File

@ -14,6 +14,8 @@ parser.add_argument('-s','--supported','--supported-languages', action='store_tr
help='list all supported languages')
parser.add_argument('-c','--checklang','--check-language-support', action='store_true',
help='show whether languages are supported')
parser.add_argument('-v','--verbose', action='store_true',
help='verbose output')
args = parser.parse_args()
def supported_language(lang):
@ -25,9 +27,9 @@ def supported_language(lang):
def is_accepted(line,accept,reject):
# isReliable, textBytesFound, details = cld2.detect(line, hintLanguage=args.lang)
isReliable, textBytesFound, details = cld2.detect(line, bestEffort=True)
# isReliable, textBytesFound, details = cld2.detect(line, bestEffort=True)
if accept:
isReliable, textBytesFound, details = cld2.detect(line, hintLanguage=accept, bestEffort=True)
if details[0][1] == accept:
if isReliable:
# print("ACCEPT")
@ -41,7 +43,10 @@ def is_accepted(line,accept,reject):
# print("REJECT", file=sys.stderr)
# print(details, file=sys.stderr)
# print(line, file=sys.stderr)
if args.verbose:
print("language mismatch: " + details[0][1] + " != " + accept + ", " + line, file=sys.stderr)
else:
isReliable, textBytesFound, details = cld2.detect(line, bestEffort=True)
if details[0][1] != reject:
# print("ACCEPT")
# print(details)
@ -50,6 +55,9 @@ def is_accepted(line,accept,reject):
# print("REJECT", file=sys.stderr)
# print(details, file=sys.stderr)
# print(line, file=sys.stderr)
if args.verbose:
print("reject because detected: " + details[0][1] + ", " + line, file=sys.stderr)