use only latest backtranslation

Joerg Tiedemann 2020-04-01 20:18:06 +03:00
parent 24fd67cc99
commit f508bb4df6
5 changed files with 104 additions and 30 deletions

View File

@@ -39,8 +39,14 @@ endif
## back-translation data
## - use only the latest backtranslations
##   if a 'latest' subdirectory exists
BACKTRANS_DIR = backtranslate/${TRG}-${SRC}
ifneq (${wildcard backtranslate/${TRG}-${SRC}/latest},)
BACKTRANS_DIR = backtranslate/${TRG}-${SRC}/latest
else
BACKTRANS_DIR = backtranslate/${TRG}-${SRC}
endif
BACKTRANS_SRC = ${sort ${wildcard ${BACKTRANS_DIR}/*.${SRCEXT}.gz}}
BACKTRANS_TRG = ${patsubst %.${SRCEXT}.gz,%.${TRGEXT}.gz,${BACKTRANS_SRC}}
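For context, the `latest` subdirectory tested above is created by the back-translation Makefile changed later in this commit: its new WIKI_LATEST_* rules copy the newest translation of every part into `<pair>/latest/`, so pointing BACKTRANS_DIR at that subdirectory keeps older translation iterations out of the training data. A minimal shell sketch of the assumed behaviour, with a hypothetical language pair and purely illustrative paths:

```
# Hypothetical pair en-br; only the fallback logic mirrors the Makefile above,
# the directory and file names are illustrative.
BT_DIR=backtranslate/en-br
if [ -d "$BT_DIR/latest" ]; then
    BT_DIR=$BT_DIR/latest        # only the newest translation of each part
fi
ls "$BT_DIR"/*.gz 2>/dev/null    # roughly what the BACKTRANS_SRC wildcard picks up
```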

View File

@@ -129,3 +129,12 @@ SNMTPATH = ${APPLHOME}/subword-nmt/subword_nmt
## SentencePiece
SPM_HOME = ${MARIANSPM}
# SORT = sort -T ${TMPDIR} -S 50% --parallel=${THREADS}
SORT = sort -T ${TMPDIR} --parallel=${THREADS}
SHUFFLE = ${shell which terashuf 2>/dev/null}
ifeq (${SHUFFLE},)
SHUFFLE = ${SORT} --random-sort
endif
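terashuf is picked up only if it is on the PATH; otherwise shuffling falls back to GNU sort's --random-sort, which works everywhere but is slower on very large corpora. A small sketch of the same fallback in plain shell (assuming GNU coreutils sort):

```
# mirror of the SHUFFLE fallback above
SHUFFLE=$(which terashuf 2>/dev/null)
[ -n "$SHUFFLE" ] || SHUFFLE="sort --random-sort"
seq 1 10 | $SHUFFLE     # shuffled output from whichever tool is available
```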

View File

@@ -98,7 +98,7 @@ train-dynamic:
echo "${LANGPAIRSTR} bigger than 1 million"; \
${MAKE} \
MARIAN_VALID_FREQ=2500 \
HPC_CORES=1 HPC_MEM=4g train.submit; \
HPC_CORES=1 HPC_MEM=8g train.submit; \
elif [ $$s -gt 100000 ]; then \
echo "${LANGPAIRSTR} bigger than 100k"; \
${MAKE} \
@@ -278,10 +278,10 @@ fiskmo-svfi-%:
## only OPUS data
%-celtic-english-opus:
${MAKE} HELDOUTSIZE=0 SRCLANGS="ga cy br gd kv gv" TRGLANGS=en ${@:-celtic-english-opus=}
${MAKE} HELDOUTSIZE=0 SRCLANGS="ga cy br gd kw gv" TRGLANGS=en ${@:-celtic-english-opus=}
%-english-celtic-opus:
${MAKE} HELDOUTSIZE=0 TRGLANGS="ga cy br gd kv gv" SRCLANGS=en TRG=ga SRC=en ${@:-english-celtic-opus=}
${MAKE} HELDOUTSIZE=0 TRGLANGS="ga cy br gd kw gv" SRCLANGS=en TRG=ga SRC=en ${@:-english-celtic-opus=}
# more data for cy-en
@@ -289,13 +289,13 @@ fiskmo-svfi-%:
%-celtic-english: ${DATADIR}/${PRE}/dic.cy-en.clean.cy.gz
${MAKE} DATASET=opus+techiaith HELDOUTSIZE=0 \
EXTRA_TRAINSET="CofnodYCynulliad Deddfwriaeth Meddalwedd rhestr_geiriau dic" \
SRCLANGS="ga cy br gd kv gv" TRGLANGS=en \
SRCLANGS="ga cy br gd kw gv" TRGLANGS=en \
${@:-celtic-english=}
%-english-celtic: ${DATADIR}/${PRE}/dic.cy-en.clean.cy.gz
${MAKE} DATASET=opus+techiaith HELDOUTSIZE=0 \
EXTRA_TRAINSET="CofnodYCynulliad Deddfwriaeth Meddalwedd rhestr_geiriau dic" \
TRGLANGS="ga cy br gd kv gv" SRCLANGS=en TRG=ga SRC=en \
TRGLANGS="ga cy br gd kw gv" SRCLANGS=en TRG=ga SRC=en \
${@:-english-celtic=}

View File

@@ -23,7 +23,7 @@ ga gle Irish yes (ga)
cy wel/cym Welsh yes (cy)
br bre bre/xbm/obt Breton yes (br)
gd gla Scottish Gaelic yes (gd)
kw cor cor/cnx/oco Cornish yes (kv)
kw cor cor/cnx/oco Cornish yes (kw)
gv glv Manx yes (gv)
```

View File

@@ -29,9 +29,15 @@ LANGPAIR = ${SRC}-${TRG}
MODELHOME = ../models/${LANGPAIR}
MODELZIP = ${lastword ${sort ${wildcard ${MODELHOME}/*-20*.zip}}}
## make's built-in sort differs from UTF-8-based sorting
## --> to prefer models with augmented data sets (separated by +)
##     we need the UTF-8 sort order
## --> use the shell's sort command with a UTF-8 locale
# MODELZIP = ${lastword ${sort ${wildcard ${MODELHOME}/*-20*.zip}}}
MODELZIP = ${lastword ${shell ls ${MODELHOME}/*-20*.zip | LANG=en_US.UTF-8 sort}}
MODELNAME = ${patsubst %.zip,%,${notdir ${MODELZIP}}}
ifeq (${MODELNAME},)
MODELHOME = ../work-langid/models/${LANGPAIR}
# MODELZIP = ${lastword ${sort ${wildcard ${MODELHOME}/*-20*.zip}}}
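The switch from make's built-in $(sort) to a shell sort with an explicit UTF-8 locale matters because $(sort) compares words as plain byte strings (C-locale order), where '+' sorts before '-' and before digits, so an augmented archive would lose to an older plain one; a locale-aware sort gives punctuation low weight, letting the newer augmented model end up last, which is what $(lastword) then picks. A small demonstration with hypothetical archive names (exact results depend on the installed locale data):

```
printf '%s\n' opus-2020-01-01.zip opus+bt-2020-02-14.zip | LC_ALL=C sort | tail -1
# -> opus-2020-01-01.zip      ('+' < '-' in byte order: the older plain model wins)
printf '%s\n' opus-2020-01-01.zip opus+bt-2020-02-14.zip | LANG=en_US.UTF-8 sort | tail -1
# -> opus+bt-2020-02-14.zip   (punctuation weighted low: the augmented model wins)
```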
@@ -39,6 +45,10 @@ ifeq (${MODELNAME},)
MODELNAME = ${patsubst %.zip,%,${notdir ${MODELZIP}}}
endif
ifdef LOCAL_SCRATCH
TMPDIR = ${LOCAL_SCRATCH}
endif
LOAD_MODULES = module use -a /projappl/nlpl/software/modules/etc/ && \
module load nlpl-udpipe nlpl-opus &&
@@ -54,6 +64,10 @@ WIKI_SRC = ${LANGPAIR}/${WIKISOURCE}.${PART}_${MODELNAME}.${LANGPAIR}.${SRC}.gz
WIKI_PRE = ${LANGPAIR}/${WIKISOURCE}.${PART}_${MODELNAME}.${LANGPAIR}.${SRC}.spm.gz
WIKI_TRG = ${LANGPAIR}/${WIKISOURCE}.${PART}_${MODELNAME}.${LANGPAIR}.${TRG}.gz
WIKI_LATEST_SRC = ${LANGPAIR}/latest/${WIKISOURCE}.${PART}.${LANGPAIR}.${SRC}.gz
WIKI_LATEST_TRG = ${LANGPAIR}/latest/${WIKISOURCE}.${PART}.${LANGPAIR}.${TRG}.gz
## all parts of this wiki
PARTS = ${sort ${patsubst ${WIKI_DIR}/${WIKISOURCE}.${LANGID}.%.gz,%,${wildcard ${WIKI_DIR}/${WIKISOURCE}.${LANGID}.*.gz}}}
@@ -79,7 +93,7 @@ UDPIPE_MODEL = ${notdir $(shell ${LOAD_MODULES} find ${UDPIPE_MODELS}/ -name "${
all: index.html
${MAKE} ${WIKI_SRC} ${WIKI_TRG}
${MAKE} ${WIKI_LATEST_SRC} ${WIKI_LATEST_TRG}
# WIKISOURCES = wiki wikibooks wikinews wikiquote wikisource wiktionary
@@ -106,16 +120,53 @@ all-wikilangs: index.html
done
## aux target to print the selected modelname
.PHONY: print-modelname
print-modelname:
@echo ${MODELNAME}
@echo ${MODELZIP}
@echo "${sort ${wildcard ${MODELHOME}/*-20*.zip}}"
fetch-celtic:
for l in ga cy br gd kw gv; do \
${MAKE} SRC=$$l TRG=en MODELHOME=../models/ga+cy+br+gd+kw+gv-en all-wikitext; \
done
## translate celtic languages using our multilingual model
## in both directions
translate-celtic-english:
for l in ga cy br gd kw gv; do \
${MAKE} SRC=$$l TRG=en MODELHOME=../models/ga+cy+br+gd+kw+gv-en all-wikis; \
done
translate-english-celtic:
for l in ga cy br gd kw gv; do \
${MAKE} TRG=$$l SRC=en \
MODELHOME=../models/en-ga+cy+br+gd+kw+gv \
MULTI_TARGET_MODEL=1 \
HPC_MEM=4g HPC_CORES=1 WALLTIME=72 translate.submit; \
done
# test-celtic:
# for l in ga cy br gd kw gv; do \
# ${MAKE} SRC=$$l TRG=en MODELHOME=../models/ga+cy+br+gd+kw+gv-en print-modelname; \
# done
## for Breton: use the multilingual celtic model to backtranslate
breton:
${MAKE} SRC=br TRG=en MODELHOME=../models/ga+cy+br+gd+kv+gv-en all-wikis
${MAKE} SRC=br TRG=en MODELHOME=../models/ga+cy+br+gd+kw+gv-en all-wikis
## do the same for all Celtic languages in the model
celtic:
for l in ga cy br gd kv gv; do \
${MAKE} SRC=$$l TRG=en MODELHOME=../models/ga+cy+br+gd+kv+gv-en all-wikis; \
done
small-romance:
for l in wa frp oc ca rm lld fur lij lmo gl lad an mwl co nap scn vec sc la; do \
@@ -182,12 +233,9 @@ giellatekno/se: giellatekno/sme
translate-sami: translate-sami-wiki translate-sami-corp
translate-sami-corp: sami-corp giellatekno/se
d=`date "+%Y-%m-%d"`; \
for s in se sma smn sms smj; do \
for t in se sma smn sms smj et fi kv krl nb no nn ru sv en; do \
if [ "$$s" != "$$t" ]; then \
mkdir -p $$s-$$t/$$d; \
mv $$s-$$t/* $$s-$$t/$$d/; \
${MAKE} SRC=$$s TRG=$$t \
WIKI_DIR=giellatekno/$$s \
WIKISOURCE=corp \
@@ -200,12 +248,9 @@ translate-sami-corp: sami-corp giellatekno/se
done
translate-sami-wiki:
d=`date "+%Y-%m-%d"`; \
for s in se; do \
for t in se sma smn sms smj vep et fi kv krl nb no nn ru sv en; do \
if [ "$$s" != "$$t" ]; then \
mkdir -p $$s-$$t/$$d; \
mv $$s-$$t/* $$s-$$t/$$d/; \
${MAKE} SRC=$$s TRG=$$t \
MULTI_TARGET_MODEL=1 \
MODELHOME=${HOME}/research/Opus-MT-train/work/models/se+sma+smj+smn+sms+vep+et+fi+kv+krl+nb+no+nn+ru+sv+en-se+sma+smj+smn+sms+vep+et+fi+kv+krl+nb+no+nn+ru+sv+en \
@@ -214,12 +259,9 @@ translate-sami-wiki:
fi \
done \
done
d=`date "+%Y-%m-%d"`; \
for s in no nn ru sv en; do \
for t in se sma smn sms smj; do \
if [ "$$s" != "$$t" ]; then \
mkdir -p $$s-$$t/$$d; \
mv $$s-$$t/* $$s-$$t/$$d/; \
${MAKE} SRC=$$s TRG=$$t \
MULTI_TARGET_MODEL=1 \
MODELHOME=${HOME}/research/Opus-MT-train/work/models/se+sma+smj+smn+sms+vep+et+fi+kv+krl+nb+no+nn+ru+sv+en-se+sma+smj+smn+sms+vep+et+fi+kv+krl+nb+no+nn+ru+sv+en \
@@ -239,7 +281,7 @@ get-data: ${WIKI_JSON}
extract-text: ${WIKI_TXT}
prepare-model: ${LANGPAIR}/${MODELNAME}/decoder.yml
prepare-data: ${WIKI_PRE}
translate: ${WIKI_SRC} ${WIKI_TRG}
translate: ${WIKI_LATEST_SRC} ${WIKI_LATEST_TRG}
## translate all parts
translate-all-parts: ${LANGPAIR}/${MODELNAME}/decoder.yml
@@ -309,6 +351,21 @@ ifneq (${MODELZIP},)
endif
## overwrite the file with the latest translations
## --> this allows multiple translation iterations
## without duplicating the data we want to use in MT training
${WIKI_LATEST_SRC}: ${WIKI_SRC}
mkdir -p ${dir $@}
cp $< $@
${WIKI_LATEST_TRG}: ${WIKI_TRG}
mkdir -p ${dir $@}
cp $< $@
## translate
%.${LANGPAIR}.${TRG}.gz: %.${LANGPAIR}.${SRC}.spm.gz
@@ -322,11 +379,11 @@ ifneq (${MODELZIP},)
${MARIAN_DECODER_FLAGS} |\
sed 's/ //g;s/▁/ /g' | sed 's/^ *//;s/ *$$//' |\
gzip -c > ${PWD}/$@
ifneq (${LANGPAIR},)
ifneq (${MODELNAME},)
rm -fr ${LANGPAIR}/${MODELNAME}
endif
endif
#ifneq (${LANGPAIR},)
#ifneq (${MODELNAME},)
# rm -fr ${LANGPAIR}/${MODELNAME}
#endif
#endif
endif
@@ -363,10 +420,12 @@ ${WIKI_TXT}: ${WIKI_JSON}
$(TOKENIZER)/remove-non-printing-char.perl |\
sed 's/ */ /g;s/^ *//g;s/ *$$//g' |\
python3 ../scripts/filter/mono-match-lang.py -l ${LANGID} |\
${SORT} -u | ${SHUFFLE} |\
split -l ${MAX_SENTENCES} - ${patsubst %${PART}.gz,%,$@}
gzip -f ${patsubst %${PART}.gz,%,$@}*
# $(TOKENIZER)/normalize-punctuation.perl |\