backtranslation data for multilingual models

This commit is contained in:
Joerg Tiedemann 2020-03-24 23:47:57 +02:00
parent 3bc480db1b
commit 93f03a1fe7
7 changed files with 58 additions and 14 deletions

View File

@ -289,9 +289,9 @@ ifeq (${GPU},p100)
else ifeq (${GPU},v100)
# MARIAN_WORKSPACE = 30000
# MARIAN_WORKSPACE = 26000
# MARIAN_WORKSPACE = 24000
MARIAN_WORKSPACE = 24000
# MARIAN_WORKSPACE = 18000
MARIAN_WORKSPACE = 16000
# MARIAN_WORKSPACE = 16000
else
MARIAN_WORKSPACE = 10000
endif

View File

@ -26,8 +26,23 @@ ifneq (${wildcard scripts/cleanup/${TRG}},)
endif
CLEAN_TRAIN_SRC = ${patsubst %,${DATADIR}/${PRE}/%.${LANGPAIR}.clean.${SRCEXT}.gz,${TRAINSET}}
CLEAN_TRAIN_TRG = ${patsubst %.${SRCEXT}.gz,%.${TRGEXT}.gz,${CLEAN_TRAIN_SRC}}
## back translation data
## - BACKTRANS_DIR: directory named after the reverse direction (${TRG}-${SRC})
## - BACKTRANS_SRC: every *.${SRCEXT}.gz file found in that directory (sorted);
##   empty if the directory does not exist or holds no such file
## - BACKTRANS_TRG: the same file names with the ${TRGEXT}.gz extension
## all three are recursively expanded (=), so the wildcard lookup is
## re-evaluated each time the variables are referenced
BACKTRANS_DIR = backtranslate/${TRG}-${SRC}
BACKTRANS_SRC = ${sort ${wildcard ${BACKTRANS_DIR}/*.${SRCEXT}.gz}}
BACKTRANS_TRG = ${patsubst %.${SRCEXT}.gz,%.${TRGEXT}.gz,${BACKTRANS_SRC}}
## data sets (train/dev/test)
## training data: one cleaned file per entry in TRAINSET; with USE_BACKTRANS=1
## the back-translated files (BACKTRANS_SRC) are appended to the source list.
## The target-side list is always derived from the source-side list by
## swapping the ${SRCEXT}.gz extension for ${TRGEXT}.gz.
CLEAN_TRAIN_SRC = ${patsubst %,${DATADIR}/${PRE}/%.${LANGPAIR}.clean.${SRCEXT}.gz,${TRAINSET}}
ifeq (${USE_BACKTRANS},1)
  CLEAN_TRAIN_SRC += ${BACKTRANS_SRC}
endif
CLEAN_TRAIN_TRG = ${patsubst %.${SRCEXT}.gz,%.${TRGEXT}.gz,${CLEAN_TRAIN_SRC}}
CLEAN_TUNE_SRC = ${patsubst %,${DATADIR}/${PRE}/%.${LANGPAIR}.clean.${SRCEXT}.gz,${TUNESET}}
CLEAN_TUNE_TRG = ${patsubst %.${SRCEXT}.gz,%.${TRGEXT}.gz,${CLEAN_TUNE_SRC}}
@ -43,11 +58,6 @@ DATA_TRG := ${sort ${CLEAN_TRAIN_TRG} ${CLEAN_TUNE_TRG} ${CLEAN_DEV_TRG} ${CLEAN
BACKTRANS_DIR = backtranslate/${TRG}-${SRC}
BACKTRANS_SRC = ${sort ${wildcard ${BACKTRANS_DIR}/*.${SRC}.gz}}
BACKTRANS_TRG = ${patsubst %.${SRC}.gz,%.${TRG}.gz,${BACKTRANS_SRC}}
## make data in reverse direction without re-doing word alignment etc ...
## ---> this is dangerous when things run in parallel
## ---> only works for bilingual models
@ -56,6 +66,10 @@ REV_LANGSTR = ${subst ${SPACE},+,$(TRGLANGS)}-${subst ${SPACE},+,$(SRCLANGS)}
REV_WORKDIR = ${WORKHOME}/${REV_LANGSTR}
## debugging aid: print the expanded CLEAN_TRAIN_SRC and CLEAN_TRAIN_TRG lists
## (declared phony so that a file called 'ttt' cannot mask the target)
.PHONY: ttt
ttt:
	echo ${CLEAN_TRAIN_SRC}
	echo ${CLEAN_TRAIN_TRG}
reverse-data:
ifeq (${PRE_SRC},${PRE_TRG})
ifeq (${words ${SRCLANGS}},1)
@ -982,6 +996,8 @@ SPMEXTRA =
.PRECIOUS: ${SPMSRCMODEL} ${SPMTRGMODEL}
GENERATE_SPM_VOC = 0
# ${SPMSRCMODEL}: ${WORKDIR}/%.spm${SRCBPESIZE:000=}k-model: ${TMPDIR}/${LANGPAIRSTR}/%
${SPMSRCMODEL}: ${LOCAL_TRAIN_SRC}
ifeq ($(wildcard ${SPMSRCMODEL}),)
@ -1002,7 +1018,9 @@ endif
--character_coverage=1.0 --hard_vocab_limit=false; \
fi
mv $@.model $@
ifeq (${GENERATE_SPM_VOC},1)
${SPM_HOME}/spm_encode --model=$@ --generate_vocabulary < $<.text > $@.voc
endif
rm -f $<.text
else
@echo "$@ already exists!"
@ -1028,7 +1046,9 @@ ifeq ($(wildcard ${SPMTRGMODEL}),)
--character_coverage=1.0 --hard_vocab_limit=false; \
fi
mv $@.model $@
ifeq (${GENERATE_SPM_VOC},1)
${SPM_HOME}/spm_encode --model=$@ --generate_vocabulary < $<.text > $@.voc
endif
rm -f $<.text
else
@echo "$@ already exists!"

View File

@ -129,12 +129,13 @@ ifeq (${wildcard ${BT_MODEL_START}},)
endif
endif
rm -f ${WORKHOME}/${LANGPAIRSTR}/train.submit
${MAKE} DATASET=${DATASET}+bt \
CLEAN_TRAIN_SRC="${CLEAN_TRAIN_SRC} ${BACKTRANS_SRC}" \
CLEAN_TRAIN_TRG="${CLEAN_TRAIN_TRG} ${BACKTRANS_TRG}" \
${MAKE} DATASET=${DATASET}+bt USE_BACKTRANS=1 \
MARIAN_EARLY_STOPPING=15 \
${@:-bt=}
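# superseded by USE_BACKTRANS=1 in the call above; the file lists used to be
# passed explicitly (no trailing backslashes here, so that these comments
# cannot swallow the line that follows them):
#   CLEAN_TRAIN_SRC="${CLEAN_TRAIN_SRC} ${BACKTRANS_SRC}"
#   CLEAN_TRAIN_TRG="${CLEAN_TRAIN_TRG} ${BACKTRANS_TRG}"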

View File

@ -333,7 +333,7 @@ fiskmo-svfi-%:
## Celtic languages <-> English: run the train-dynamic goal once per direction
##  1. ga/cy/br/gd/kv/gv -> en
##  2. en -> ga/cy/br/gd/kv/gv, with SRC=en and TRG=ga set explicitly
## HELDOUTSIZE=0 is passed to both runs.
## The en -> Celtic job is started only once: the earlier call without
## SRC/TRG was the same job and is replaced by the call below.
.PHONY: celtic-english
celtic-english:
	${MAKE} HELDOUTSIZE=0 SRCLANGS="ga cy br gd kv gv" TRGLANGS=en train-dynamic
	${MAKE} HELDOUTSIZE=0 TRGLANGS="ga cy br gd kv gv" SRCLANGS=en TRG=ga SRC=en train-dynamic
LANGS_FR_VARIANTS = fr_BE fr_CA fr_FR

View File

@ -1,5 +1,15 @@
further resources: (from http://techiaith.cymru/translation/demo/?lang=en)
contact: Dewi Jones (d.b.jones@bangor.ac.uk)
http://techiaith.cymru/corpws/Moses/CofnodYCynulliad/CofnodYCynulliad.tar.gz
http://techiaith.cymru/corpws/Moses/Deddfwriaeth/Deddfwriaeth.tar.gz
http://techiaith.cymru/corpws/Moses/Meddalwedd/Meddalwedd.tar.gz
http://techiaith.cymru/alinio/rhestr_geiriau.tsv
(see work/data/cy-en)
# Celtic languages

View File

@ -102,6 +102,19 @@ all-wikilangs: index.html
done
## for Breton: use the multilingual Celtic model to backtranslate
## (runs the all-wikis goal with SRC=br, TRG=en and MODELHOME pointing at the
##  ga+cy+br+gd+kv+gv-en model directory)
## declared phony so that a file or directory called 'breton' cannot mask it
.PHONY: breton
breton:
	${MAKE} SRC=br TRG=en MODELHOME=../models/ga+cy+br+gd+kv+gv-en all-wikis
## do the same for all Celtic languages in the model:
## one sub-make of the all-wikis goal per source language, always with TRG=en
## and the ga+cy+br+gd+kv+gv-en model directory as MODELHOME
## ($$l is the shell loop variable; the whole loop runs in a single shell)
## declared phony so that a file or directory called 'celtic' cannot mask it
.PHONY: celtic
celtic:
	for l in ga cy br gd kv gv; do \
	  ${MAKE} SRC=$$l TRG=en MODELHOME=../models/ga+cy+br+gd+kv+gv-en all-wikis; \
	done
# Tagalog (tl, tgl), Central Bikol (bcl), Malayalam (ml, mal), Bengali (bn, ben), and Mongolian (mn, mon)
focus-wikis:
for l in tl bcl ml bn mn; do \

View File

@ -134,7 +134,7 @@ tmx-tune:
mkdir -p $$s-$$t; \
paste ${TMXBASE}.*-*.$$s ${TMXBASE}.*-*.$$t | \
sort | uniq | \
python3 ../bitext-match-lang.py -s $$s -t $$t | \
python3 ../scripts/filter/bitext-match-lang.py -s $$s -t $$t | \
grep -v '[<>{}]' |\
$(TOKENIZER)/replace-unicode-punctuation.perl |\
perl -CS -pe 'tr[\x{9}\x{A}\x{D}\x{20}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}][]cd;' |\