mirror of
https://github.com/Helsinki-NLP/OPUS-MT-train.git
synced 2024-11-28 06:09:35 +03:00
backtranslation data for multilingual models
This commit is contained in:
parent
3bc480db1b
commit
93f03a1fe7
@ -289,9 +289,9 @@ ifeq (${GPU},p100)
|
||||
else ifeq (${GPU},v100)
|
||||
# MARIAN_WORKSPACE = 30000
|
||||
# MARIAN_WORKSPACE = 26000
|
||||
# MARIAN_WORKSPACE = 24000
|
||||
MARIAN_WORKSPACE = 24000
|
||||
# MARIAN_WORKSPACE = 18000
|
||||
MARIAN_WORKSPACE = 16000
|
||||
# MARIAN_WORKSPACE = 16000
|
||||
else
|
||||
MARIAN_WORKSPACE = 10000
|
||||
endif
|
||||
|
@ -26,8 +26,23 @@ ifneq (${wildcard scripts/cleanup/${TRG}},)
|
||||
endif
|
||||
|
||||
|
||||
CLEAN_TRAIN_SRC = ${patsubst %,${DATADIR}/${PRE}/%.${LANGPAIR}.clean.${SRCEXT}.gz,${TRAINSET}}
|
||||
CLEAN_TRAIN_TRG = ${patsubst %.${SRCEXT}.gz,%.${TRGEXT}.gz,${CLEAN_TRAIN_SRC}}
|
||||
## back translation data
|
||||
|
||||
BACKTRANS_DIR = backtranslate/${TRG}-${SRC}
|
||||
|
||||
BACKTRANS_SRC = ${sort ${wildcard ${BACKTRANS_DIR}/*.${SRCEXT}.gz}}
|
||||
BACKTRANS_TRG = ${patsubst %.${SRCEXT}.gz,%.${TRGEXT}.gz,${BACKTRANS_SRC}}
|
||||
|
||||
|
||||
## data sets (train/dev/test)
|
||||
|
||||
ifeq (${USE_BACKTRANS},1)
|
||||
CLEAN_TRAIN_SRC = ${patsubst %,${DATADIR}/${PRE}/%.${LANGPAIR}.clean.${SRCEXT}.gz,${TRAINSET}} ${BACKTRANS_SRC}
|
||||
CLEAN_TRAIN_TRG = ${patsubst %.${SRCEXT}.gz,%.${TRGEXT}.gz,${CLEAN_TRAIN_SRC}}
|
||||
else
|
||||
CLEAN_TRAIN_SRC = ${patsubst %,${DATADIR}/${PRE}/%.${LANGPAIR}.clean.${SRCEXT}.gz,${TRAINSET}}
|
||||
CLEAN_TRAIN_TRG = ${patsubst %.${SRCEXT}.gz,%.${TRGEXT}.gz,${CLEAN_TRAIN_SRC}}
|
||||
endif
|
||||
|
||||
CLEAN_TUNE_SRC = ${patsubst %,${DATADIR}/${PRE}/%.${LANGPAIR}.clean.${SRCEXT}.gz,${TUNESET}}
|
||||
CLEAN_TUNE_TRG = ${patsubst %.${SRCEXT}.gz,%.${TRGEXT}.gz,${CLEAN_TUNE_SRC}}
|
||||
@ -43,11 +58,6 @@ DATA_TRG := ${sort ${CLEAN_TRAIN_TRG} ${CLEAN_TUNE_TRG} ${CLEAN_DEV_TRG} ${CLEAN
|
||||
|
||||
|
||||
|
||||
BACKTRANS_DIR = backtranslate/${TRG}-${SRC}
|
||||
|
||||
BACKTRANS_SRC = ${sort ${wildcard ${BACKTRANS_DIR}/*.${SRC}.gz}}
|
||||
BACKTRANS_TRG = ${patsubst %.${SRC}.gz,%.${TRG}.gz,${BACKTRANS_SRC}}
|
||||
|
||||
## make data in reverse direction without re-doing word alignment etc ...
|
||||
## ---> this is dangerous when things run in parallel
|
||||
## ---> only works for bilingual models
|
||||
@ -56,6 +66,10 @@ REV_LANGSTR = ${subst ${SPACE},+,$(TRGLANGS)}-${subst ${SPACE},+,$(SRCLANGS)}
|
||||
REV_WORKDIR = ${WORKHOME}/${REV_LANGSTR}
|
||||
|
||||
|
||||
ttt:
|
||||
echo ${CLEAN_TRAIN_SRC}
|
||||
echo ${CLEAN_TRAIN_TRG}
|
||||
|
||||
reverse-data:
|
||||
ifeq (${PRE_SRC},${PRE_TRG})
|
||||
ifeq (${words ${SRCLANGS}},1)
|
||||
@ -982,6 +996,8 @@ SPMEXTRA =
|
||||
|
||||
.PRECIOUS: ${SPMSRCMODEL} ${SPMTRGMODEL}
|
||||
|
||||
GENERATE_SPM_VOC = 0
|
||||
|
||||
# ${SPMSRCMODEL}: ${WORKDIR}/%.spm${SRCBPESIZE:000=}k-model: ${TMPDIR}/${LANGPAIRSTR}/%
|
||||
${SPMSRCMODEL}: ${LOCAL_TRAIN_SRC}
|
||||
ifeq ($(wildcard ${SPMSRCMODEL}),)
|
||||
@ -1002,7 +1018,9 @@ endif
|
||||
--character_coverage=1.0 --hard_vocab_limit=false; \
|
||||
fi
|
||||
mv $@.model $@
|
||||
ifeq (${GENERATE_SPM_VOC},1)
|
||||
${SPM_HOME}/spm_encode --model=$@ --generate_vocabulary < $<.text > $@.voc
|
||||
endif
|
||||
rm -f $<.text
|
||||
else
|
||||
@echo "$@ already exists!"
|
||||
@ -1028,7 +1046,9 @@ ifeq ($(wildcard ${SPMTRGMODEL}),)
|
||||
--character_coverage=1.0 --hard_vocab_limit=false; \
|
||||
fi
|
||||
mv $@.model $@
|
||||
ifeq (${GENERATE_SPM_VOC},1)
|
||||
${SPM_HOME}/spm_encode --model=$@ --generate_vocabulary < $<.text > $@.voc
|
||||
endif
|
||||
rm -f $<.text
|
||||
else
|
||||
@echo "$@ already exists!"
|
||||
|
@ -129,12 +129,13 @@ ifeq (${wildcard ${BT_MODEL_START}},)
|
||||
endif
|
||||
endif
|
||||
rm -f ${WORKHOME}/${LANGPAIRSTR}/train.submit
|
||||
${MAKE} DATASET=${DATASET}+bt \
|
||||
CLEAN_TRAIN_SRC="${CLEAN_TRAIN_SRC} ${BACKTRANS_SRC}" \
|
||||
CLEAN_TRAIN_TRG="${CLEAN_TRAIN_TRG} ${BACKTRANS_TRG}" \
|
||||
${MAKE} DATASET=${DATASET}+bt USE_BACKTRANS=1 \
|
||||
MARIAN_EARLY_STOPPING=15 \
|
||||
${@:-bt=}
|
||||
|
||||
# CLEAN_TRAIN_SRC="${CLEAN_TRAIN_SRC} ${BACKTRANS_SRC}" \
|
||||
# CLEAN_TRAIN_TRG="${CLEAN_TRAIN_TRG} ${BACKTRANS_TRG}" \
|
||||
|
||||
|
||||
|
||||
|
||||
|
@ -333,7 +333,7 @@ fiskmo-svfi-%:
|
||||
|
||||
celtic-english:
|
||||
${MAKE} HELDOUTSIZE=0 SRCLANGS="ga cy br gd kv gv" TRGLANGS=en train-dynamic
|
||||
${MAKE} HELDOUTSIZE=0 TRGLANGS="ga cy br gd kv gv" SRCLANGS=en train-dynamic
|
||||
${MAKE} HELDOUTSIZE=0 TRGLANGS="ga cy br gd kv gv" SRCLANGS=en TRG=ga SRC=en train-dynamic
|
||||
|
||||
|
||||
LANGS_FR_VARIANTS = fr_BE fr_CA fr_FR
|
||||
|
10
NOTES.md
10
NOTES.md
@ -1,5 +1,15 @@
|
||||
|
||||
|
||||
further resources: (from http://techiaith.cymru/translation/demo/?lang=en)
|
||||
contact: Dewi Jones (d.b.jones@bangor.ac.uk)
|
||||
|
||||
http://techiaith.cymru/corpws/Moses/CofnodYCynulliad/CofnodYCynulliad.tar.gz
|
||||
http://techiaith.cymru/corpws/Moses/Deddfwriaeth/Deddfwriaeth.tar.gz
|
||||
http://techiaith.cymru/corpws/Moses/Meddalwedd/Meddalwedd.tar.gz
|
||||
http://techiaith.cymru/alinio/rhestr_geiriau.tsv
|
||||
|
||||
(see work/data/cy-en)
|
||||
|
||||
|
||||
|
||||
# celtic languages
|
||||
|
@ -102,6 +102,19 @@ all-wikilangs: index.html
|
||||
done
|
||||
|
||||
|
||||
|
||||
## for Breton: use the multilingual celtic model to backtranslate
|
||||
breton:
|
||||
${MAKE} SRC=br TRG=en MODELHOME=../models/ga+cy+br+gd+kv+gv-en all-wikis
|
||||
|
||||
## do the same for all Celtic languages in the model
|
||||
celtic:
|
||||
for l in ga cy br gd kv gv; do \
|
||||
${MAKE} SRC=$$l TRG=en MODELHOME=../models/ga+cy+br+gd+kv+gv-en all-wikis; \
|
||||
done
|
||||
|
||||
|
||||
|
||||
# Tagalog (tl, tgl), Central Bikol (bcl), Malayalam (ml, mal), Bengali (bn, ben), and Mongolian (mn, mon)
|
||||
focus-wikis:
|
||||
for l in tl bcl ml bn mn; do \
|
||||
|
@ -134,7 +134,7 @@ tmx-tune:
|
||||
mkdir -p $$s-$$t; \
|
||||
paste ${TMXBASE}.*-*.$$s ${TMXBASE}.*-*.$$t | \
|
||||
sort | uniq | \
|
||||
python3 ../bitext-match-lang.py -s $$s -t $$t | \
|
||||
python3 ../scripts/filter/bitext-match-lang.py -s $$s -t $$t | \
|
||||
grep -v '[<>{}]' |\
|
||||
$(TOKENIZER)/replace-unicode-punctuation.perl |\
|
||||
perl -CS -pe 'tr[\x{9}\x{A}\x{D}\x{20}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}][]cd;' |\
|
||||
|
Loading…
Reference in New Issue
Block a user