translate with backtranslations

This commit is contained in:
Joerg Tiedemann 2020-05-13 00:41:07 +03:00
parent e4455e510a
commit 7ef908dcd7
6 changed files with 142 additions and 55 deletions

View File

@ -198,6 +198,42 @@ all: ${WORKDIR}/config.mk
${MAKE} eval
${MAKE} compare
## TODO: does not look good to remove index.html from backtranslation dir
## but we need to refresh the file from time to time (new wiki packages!)

## build data, train and evaluate the model, package it locally,
## then back-translate wiki text with the freshly packaged model
.PHONY: all-and-backtranslate
all-and-backtranslate: ${WORKDIR}/config.mk
${MAKE} data
${MAKE} train
${MAKE} eval
${MAKE} compare
${MAKE} local-dist
## force a refresh of the wiki package index before back-translating
rm -f backtranslate/index.html
${MAKE} -C backtranslate index.html
${MAKE} -C backtranslate SRC=${SRC} TRG=${TRG} MODELHOME=${MODELDIR} all
## like all-and-backtranslate, but back-translates text from all wiki
## sources (wiki, wikibooks, ...) instead of only the standard one
.PHONY: all-and-backtranslate-allwikis
all-and-backtranslate-allwikis: ${WORKDIR}/config.mk
${MAKE} data
${MAKE} train
${MAKE} eval
${MAKE} compare
${MAKE} local-dist
## force a refresh of the wiki package index before back-translating
rm -f backtranslate/index.html
${MAKE} -C backtranslate index.html
## '-' prefix: ignore errors from wikitext extraction and keep going
-${MAKE} -C backtranslate SRC=${SRC} TRG=${TRG} MODELHOME=${MODELDIR} all-wikitext
${MAKE} -C backtranslate SRC=${SRC} TRG=${TRG} MODELHOME=${MODELDIR} translate-all-wikis
## full pipeline with back-translation: run the forward pipeline ('all'),
## then build the reverse model and back-translate with it; 'all-bt'
## presumably retrains on the augmented data — confirm against its rule
.PHONY: all-with-bt
all-with-bt:
${MAKE} all
${MAKE} SRCLANGS="${TRGLANGS}" TRGLANGS="${SRCLANGS}" all-and-backtranslate
${MAKE} all-bt
.PHONY: all-job
all-job: ${WORKDIR}/config.mk
${MAKE} data
@ -244,8 +280,6 @@ wordalign: ${TRAIN_ALG}
## other model types
vocab: ${MODEL_VOCAB}
train: ${WORKDIR}/${MODEL}.${MODELTYPE}.model${NR}.done
train-and-eval: ${WORKDIR}/${MODEL}.${MODELTYPE}.model${NR}.done
${MAKE} ${WORKDIR}/${TESTSET}.${MODEL}${NR}.${MODELTYPE}.${SRC}.${TRG}.compare
translate: ${WORKDIR}/${TESTSET}.${MODEL}${NR}.${MODELTYPE}.${SRC}.${TRG}
eval: ${WORKDIR}/${TESTSET}.${MODEL}${NR}.${MODELTYPE}.${SRC}.${TRG}.eval
compare: ${WORKDIR}/${TESTSET}.${MODEL}${NR}.${MODELTYPE}.${SRC}.${TRG}.compare
@ -256,3 +290,18 @@ eval-ensemble: ${WORKDIR}/${TESTSET}.${MODEL}${NR}.${MODELTYPE}.ensemble.${SRC}.
## combined tasks:

## train and evaluate: the prerequisite ensures the model is trained,
## then the recipe builds the test-set comparison file for it
train-and-eval: ${WORKDIR}/${MODEL}.${MODELTYPE}.model${NR}.done
${MAKE} ${WORKDIR}/${TESTSET}.${MODEL}${NR}.${MODELTYPE}.${SRC}.${TRG}.compare
## train model and start back-translation jobs once the model is ready
## (requires creating a local dist package so that the backtranslate
## sub-make can load the model via MODELHOME=${MODELDIR})
train-and-start-bt-jobs: ${WORKDIR}/${MODEL}.${MODELTYPE}.model${NR}.done
${MAKE} ${WORKDIR}/${TESTSET}.${MODEL}${NR}.${MODELTYPE}.${SRC}.${TRG}.compare
${MAKE} local-dist
${MAKE} -C backtranslate MODELHOME=${MODELDIR} translate-all-wikis-jobs

View File

@ -27,6 +27,7 @@ PART = aa
LANGPAIR = ${SRC}-${TRG}
PWD := $(shell pwd)
MODELHOME = ../models/${LANGPAIR}
## standard sort is different from UTF8-based sort
@ -101,16 +102,27 @@ all: index.html
# WIKISOURCES = wiki wikibooks wikinews wikiquote wikisource wiktionary
WIKISOURCES = wiki wikibooks wikinews wikiquote wikisource
all-wikis: ${LANGPAIR}/${MODELNAME}/decoder.yml
## for each wiki source: extract text and, if an extracted part exists
## for ${LANGID}/${PART}, translate all its parts in this process
## NOTE(review): this rendering appears to interleave pre- and post-change
## recipe lines (both 'submit-translate-all-parts' and 'translate-all-parts'
## are invoked) — verify against the original Makefile
translate-all-wikis: ${LANGPAIR}/${MODELNAME}/decoder.yml
for w in ${WIKISOURCES}; do \
${MAKE} WIKISOURCE=$$w extract-text; \
echo "find ${WIKI_DIR} -name '$$w.${LANGID}.${PART}.gz'"; \
if [ `find ${WIKI_DIR} -name "$$w.${LANGID}.${PART}.gz" | wc -l` -gt 0 ]; then \
echo "${MAKE} WIKISOURCE=$$w HPC_CORES=1 WALLTIME=72 translate.submit"; \
${MAKE} WIKISOURCE=$$w HPC_MEM=4g HPC_CORES=1 WALLTIME=72 submit-translate-all-parts; \
echo "${MAKE} WIKISOURCE=$$w translate-all-parts"; \
${MAKE} WIKISOURCE=$$w translate-all-parts; \
fi \
done
## for each wiki source: extract text and, if an extracted part exists
## for ${LANGID}/${PART}, submit HPC translation jobs for all parts
## (one job per part via translate-all-parts-jobs)
translate-all-wikis-jobs: ${LANGPAIR}/${MODELNAME}/decoder.yml
for w in ${WIKISOURCES}; do \
${MAKE} WIKISOURCE=$$w extract-text; \
echo "find ${WIKI_DIR} -name '$$w.${LANGID}.${PART}.gz'"; \
if [ `find ${WIKI_DIR} -name "$$w.${LANGID}.${PART}.gz" | wc -l` -gt 0 ]; then \
echo "${MAKE} WIKISOURCE=$$w translate-all-parts"; \
${MAKE} WIKISOURCE=$$w HPC_MEM=4g HPC_CORES=1 WALLTIME=72 translate-all-parts-jobs; \
fi \
done
all-wikitext:
for w in ${WIKISOURCES}; do \
${MAKE} WIKISOURCE=$$w extract-text; \
@ -410,7 +422,7 @@ translate-all-parts: ${LANGPAIR}/${MODELNAME}/decoder.yml
done
## create jobs for translating all parts
submit-translate-all-parts: ${LANGPAIR}/${MODELNAME}/decoder.yml
translate-all-parts-jobs: ${LANGPAIR}/${MODELNAME}/decoder.yml
for p in ${PARTS}; do \
${MAKE} PART=$$p translate.submit; \
done

View File

@ -210,9 +210,10 @@ endif
## DATADIR = directory where the train/dev/test data are
## WORKDIR = directory used for training
DATADIR = ${WORKHOME}/data
WORKDIR = ${WORKHOME}/${LANGPAIRSTR}
SPMDIR = ${WORKHOME}/SentencePieceModels
DATADIR = ${WORKHOME}/data
WORKDIR = ${WORKHOME}/${LANGPAIRSTR}
MODELDIR = ${WORKHOME}/models/${LANGPAIRSTR}
SPMDIR = ${WORKHOME}/SentencePieceModels
## data sets
TRAIN_BASE = ${WORKDIR}/train/${DATASET}

View File

@ -91,6 +91,7 @@ ifeq (${words ${TRGLANGS}},1)
mkdir -p ${REV_WORKDIR}/train; \
ln -s ${TRAIN_SRC}.clean.${PRE_SRC}.gz ${REV_WORKDIR}/train/${notdir ${TRAIN_TRG}.clean.${PRE_TRG}.gz}; \
ln -s ${TRAIN_TRG}.clean.${PRE_TRG}.gz ${REV_WORKDIR}/train/${notdir ${TRAIN_SRC}.clean.${PRE_SRC}.gz}; \
cp ${WORKDIR}/train/README.md ${REV_WORKDIR}/train/README.md; \
fi
-if [ -e ${SPMSRCMODEL} ]; then \
ln -s ${SPMSRCMODEL} ${REV_WORKDIR}/train/${notdir ${SPMTRGMODEL}}; \
@ -115,15 +116,22 @@ ifeq (${words ${TRGLANGS}},1)
ln -s ${DEV_SRC}.shuffled.gz ${REV_WORKDIR}/val/${notdir ${DEV_SRC}.shuffled.gz}; \
ln -s ${DEV_SRC}.notused.gz ${REV_WORKDIR}/val/${notdir ${DEV_TRG}.notused.gz}; \
ln -s ${DEV_TRG}.notused.gz ${REV_WORKDIR}/val/${notdir ${DEV_SRC}.notused.gz}; \
cp ${WORKDIR}/val/README.md ${REV_WORKDIR}/val/README.md; \
fi
-if [ -e ${TEST_SRC} ]; then \
mkdir -p ${REV_WORKDIR}/test; \
ln -s ${TEST_SRC} ${REV_WORKDIR}/test/${notdir ${TEST_TRG}}; \
ln -s ${TEST_TRG} ${REV_WORKDIR}/test/${notdir ${TEST_SRC}}; \
cp ${WORKDIR}/test/README.md ${REV_WORKDIR}/test/README.md; \
fi
-if [ -e ${MODEL_VOCAB} ]; then \
ln -s ${MODEL_VOCAB} ${REV_WORKDIR}/${notdir ${MODEL_VOCAB}}; \
fi
-if [ -e ${WORKDIR}/config.mk ]; then \
if [ ! -e ${REV_WORKDIR}/config.mk ]; then \
cp ${WORKDIR}/config.mk ${REV_WORKDIR}/config.mk; \
fi \
fi
endif
endif
endif

View File

@ -11,9 +11,24 @@ DIST_PACKAGE = ${MODELSHOME}/${LANGPAIRSTR}/${DATASET}.zip
## minimum BLEU score for models to be accepted as distribution package
MIN_BLEU_SCORE = 20
.PHONY: dist
.PHONY: dist local-dist global-dist release
dist: ${DIST_PACKAGE}
## local distribution in workhome, no restrictions about BLEU
## (packages into ${WORKHOME}/models and points the download URL
## at the OPUS-MT-dev bucket)
local-dist:
${MAKE} MODELSHOME=${WORKHOME}/models \
MODELS_URL=https://object.pouta.csc.fi/OPUS-MT-dev \
dist
## global distribution in models-dir, restrictions on BLEU:
## only release if the integer part of the test-set BLEU score (taken
## from the BLEU line in ${TEST_EVALUATION}) reaches ${MIN_BLEU_SCORE}
global-dist release:
if [ `grep BLEU $(TEST_EVALUATION) | cut -f3 -d ' ' | cut -f1 -d '.'` -ge ${MIN_BLEU_SCORE} ]; then \
${MAKE} MODELSHOME=${PWD}/models \
MODELS_URL=https://object.pouta.csc.fi/OPUS-MT-models \
dist; \
fi
## FIX: added the missing line-continuation '\' after the MODELS_URL
## line; without it 'dist' was executed as a stray shell command
## instead of being passed to ${MAKE} as the target (compare local-dist)
## collect evaluation results into the global score file
## (FIND_EVAL_FILES=1 enables scanning for eval files)
.PHONY: scores
scores:
${MAKE} FIND_EVAL_FILES=1 ${WORKHOME}/eval/scores.txt
@ -22,14 +37,15 @@ scores:
## get the best model from all kind of alternative setups
## in the following sub directories (add prefix work-)
## scan various work directories - specify alternative dir's below
ALT_MODEL_BASE = work-
# ALT_MODEL_DIR = bpe-old bpe-memad bpe spm-noalign bpe-align spm
# ALT_MODEL_DIR = spm langid
ALT_MODEL_DIR = langid
best_dist_all:
.PHONY: best_dist_all best-dist-all
best-dist-all best_dist_all:
for l in $(sort ${shell ls ${ALT_MODEL_BASE}* | grep -- '-' | grep -v old | grep -v work}); do \
if [ `find work*/$$l -name '*.npz' | wc -l` -gt 0 ]; then \
d=`find work-spm/$$l -name '*.best-perplexity.npz' -exec basename {} \; | cut -f1 -d.`; \
@ -40,14 +56,6 @@ best_dist_all:
done
# best_dist_all:
# for l in $(sort ${shell ls work* | grep -- '-' | grep -v old | grep -v work}); do \
# if [ `find work*/$$l -name '${DATASET}${TRAINSIZE}.*.npz' | wc -l` -gt 0 ]; then \
# ${MAKE} SRCLANGS="`echo $$l | cut -f1 -d'-' | sed 's/\\+/ /g'`" \
# TRGLANGS="`echo $$l | cut -f2 -d'-' | sed 's/\\+/ /g'`" best_dist; \
# fi \
# done
## find the best model according to test set scores
@ -55,43 +63,12 @@ best_dist_all:
## (BLEU needs to be above MIN_BLEU_SCORE)
## NEW: don't trust models tested with GNOME test sets!
## OLD version of finding the best model
## --> this didn't properly look at different variants in the same folder
## scans the work-$$d directories listed in ALT_MODEL_DIR, reads BLEU
## scores from the *.eval files (skipping GNOME test sets), keeps a model
## if it is less than 1 BLEU point worse than the current best, and
## finally publishes the selected model (dist-$$s) if its score exceeds
## MIN_BLEU_SCORE
best_dist_old:
@m=0;\
s=''; \
echo "------------------------------------------------"; \
echo "search best model for ${LANGPAIRSTR}"; \
for d in ${ALT_MODEL_DIR}; do \
e=`ls work-$$d/${LANGPAIRSTR}/val/*.trg | xargs basename | sed 's/\.trg//'`; \
echo "evaldata = $$e"; \
if [ "$$e" != "GNOME" ]; then \
if ls work-$$d/${LANGPAIRSTR}/$$e*.eval 1> /dev/null 2>&1; then \
b=`grep 'BLEU+' work-$$d/${LANGPAIRSTR}/$$e*.eval | cut -f3 -d' '`; \
if (( $$(echo "$$m-$$b < 1" |bc -l) )); then \
echo "$$d ($$b) is better or not much worse than $$s ($$m)!"; \
m=$$b; \
s=$$d; \
else \
echo "$$d ($$b) is worse than $$s ($$m)!"; \
fi \
fi \
fi \
done; \
echo "------------------------------------------------"; \
if [ "$$s" != "" ]; then \
if (( $$(echo "$$m > ${MIN_BLEU_SCORE}" |bc -l) )); then \
${MAKE} MODELSHOME=${PWD}/models \
MODELS_URL=https://object.pouta.csc.fi/OPUS-MT-models dist-$$s; \
fi; \
fi
## new version of finding the best model
## --> look at different model variants in each work-dir
## --> take only the best one to publish
best_dist:
.PHONY: best_dist best-dist
best-dist best_dist:
@m=0;\
s=''; \
echo "------------------------------------------------"; \
@ -273,6 +250,7 @@ endif
# - make upload-eval .... benchmark tests from models in WORKHOME
# - make upload-images .. images of VMs that run OPUS-MT
.PHONY: upload
upload:
find models/ -type l | tar -cf models-links.tar -T -
find models/ -type l -delete
@ -285,6 +263,7 @@ upload:
rm -f index.txt
.PHONY: upload-models
upload-models:
find ${WORKHOME}/models -type l | tar -cf dev-models-links.tar -T -
find ${WORKHOME}/models -type l -delete
@ -296,14 +275,17 @@ upload-models:
swift upload OPUS-MT-dev index.txt
rm -f index.txt
## upload the aggregated score file (built by the 'scores' target) to
## the OPUS-MT-eval bucket and make the container world-readable
.PHONY: upload-scores
upload-scores: scores
cd ${WORKHOME} && swift upload OPUS-MT-eval --changed --skip-identical eval/scores.txt
swift post OPUS-MT-eval --read-acl ".r:*"
## upload the whole eval/ directory (only changed files) to the
## OPUS-MT-eval bucket and make the container world-readable
.PHONY: upload-eval
upload-eval: scores
cd ${WORKHOME} && swift upload OPUS-MT-eval --changed --skip-identical eval
swift post OPUS-MT-eval --read-acl ".r:*"
## upload VM images that run OPUS-MT; large files go up as segmented
## static large objects (--use-slo) in 5G segments
.PHONY: upload-images
upload-images:
cd ${WORKHOME} && swift upload OPUS-MT --changed --skip-identical \
--use-slo --segment-size 5G opusMT-images

View File

@ -35,7 +35,7 @@ OPUSLANGS = fi sv fr es de ar he "cmn cn yue ze_zh zh_cn zh_CN zh_HK zh_tw zh_TW
allopus2pivot:
for l in ${filter-out ${PIVOT},${OPUSLANGS}}; do \
${MAKE} WALLTIME=72 SRCLANGS="$$l" bilingual-dynamic; \
${MAKE} WALLTIME=72 SRCLANGS="$$l" TRGLANGS=${PIVOT} bilingual-dynamic; \
done
## this looks dangerous ....
@ -52,3 +52,38 @@ allopus:
all2en:
${MAKE} PIVOT=en allopus2pivot
## for every OPUS language except the pivot: write a local config and
## train towards the pivot only if the pair looks small
## (train-if-small uses BPESIZE as the size heuristic)
allopus2pivot-small:
for l in $(sort ${filter-out ${PIVOT},${OPUSLANGS}}); do \
${MAKE} SRCLANGS="$$l" TRGLANGS=${PIVOT} local-config; \
${MAKE} WALLTIME=72 SRCLANGS="$$l" TRGLANGS=${PIVOT} train-if-small; \
done
## train (and evaluate) models in both directions, but only when the
## vocabulary is small (BPESIZE below 12000 — presumably a proxy for
## low-resource language pairs; TODO confirm)
train-if-small:
if [ ${BPESIZE} -lt 12000 ]; then \
${MAKE} data; \
${MAKE} train-and-eval-job; \
${MAKE} reverse-data; \
${MAKE} TRGLANGS="${SRCLANGS}" SRCLANGS='${TRGLANGS}' train-and-eval-job; \
fi
## make models with backtranslations in both directions
## for English-to-other language models
##
## --> xx-to-English back-translation is done on all parts of all wikis
## --> English-to-xx back-translation uses only one part of wikipedia
##
## NOTE: this does not work for multilingual models!
opus-enxx:
${MAKE} SRCLANGS=${TRG} TRGLANGS=${SRC} all-and-backtranslate-allwikis
${MAKE} all-and-backtranslate-bt
${MAKE} SRCLANGS=${TRG} TRGLANGS=${SRC} all-bt