mirror of
https://github.com/Helsinki-NLP/OPUS-MT-train.git
synced 2024-07-14 15:40:33 +03:00
added option for ce-filtered forward translations
This commit is contained in:
parent
a08ad41fbd
commit
df1b4b2942
@ -740,6 +740,8 @@ ${WORKDIR}/${MODELCONFIG}:
|
||||
@echo "TESTSET = ${TESTSET}" >> $@
|
||||
@echo "PRE = ${PRE}" >> $@
|
||||
@echo "SUBWORDS = ${SUBWORDS}" >> $@
|
||||
@echo "MODEL_SRCVOCAB = ${MODEL_SRCVOCAB}" >> $@
|
||||
@echo "MODEL_TRGVOCAB = ${MODEL_TRGVOCAB}" >> $@
|
||||
ifdef SHUFFLE_DATA
|
||||
@echo "SHUFFLE_DATA = ${SHUFFLE_DATA}" >> $@
|
||||
endif
|
||||
|
@ -82,6 +82,13 @@ ifeq (${USE_FORWARDTRANS},1)
|
||||
FORWARDTRANS_TRG = ${patsubst %.${SRCEXT}.gz,%.${TRGEXT}.gz,${FORWARDTRANS_SRC}}
|
||||
endif
|
||||
|
||||
# forward-translation data (source-to-target)
|
||||
# filtered by reconstruction scores (ce filter)
# USE_FORWARDTRANS_SELECTED is the number that is part of the file name
# (*.${SRCEXT}.best<N>.gz); an empty value disables this data source.
# The target-side list has to map both naming schemes that can be in
# FORWARDTRANS_SRC: plain %.${SRCEXT}.gz and %.${SRCEXT}.best<N>.gz
# (a single %.${SRCEXT}.gz pattern does not match the .best<N>.gz files).
|
||||
ifneq (${USE_FORWARDTRANS_SELECTED},)
|
||||
FORWARDTRANS_SRC += ${sort ${wildcard ${FORWARDTRANS_HOME}/${SRC}-${TRG}/latest/*.${SRCEXT}.best${USE_FORWARDTRANS_SELECTED}.gz}}
|
||||
FORWARDTRANS_TRG = ${patsubst %.${SRCEXT}.best${USE_FORWARDTRANS_SELECTED}.gz,%.${TRGEXT}.best${USE_FORWARDTRANS_SELECTED}.gz,${patsubst %.${SRCEXT}.gz,%.${TRGEXT}.gz,${FORWARDTRANS_SRC}}}
|
||||
endif
|
||||
|
||||
# forward-translation data of monolingual data (source-to-target)
|
||||
ifeq (${USE_FORWARDTRANSMONO},1)
|
||||
FORWARDTRANSMONO_SRC = ${sort ${wildcard ${BACKTRANS_HOME}/${SRC}-${TRG}/latest/*.${SRCEXT}.gz}}
|
||||
|
@ -321,6 +321,29 @@ endif
|
||||
# MODELCONFIG=config-ft.mk \
|
||||
|
||||
|
||||
|
||||
# use a selected set of forward translations:
# for every pair of SRCLANGS x TRGLANGS that has an entry
# ${FORWARDTRANS_HOME}/<src>-<trg>/latest, run the target
# extract-best-translations in FORWARDTRANS_HOME with RETAIN=${FT_SELECTED};
# afterwards re-run make for the target name without the -ftbest suffix,
# with +ft${FT_SELECTED} appended to DATASET and
# USE_FORWARDTRANS_SELECTED set to ${FT_SELECTED}
|
||||
|
||||
# default for ce-filter (passed on as RETAIN and USE_FORWARDTRANS_SELECTED)
|
||||
FT_SELECTED ?= 95
|
||||
|
||||
%-ftbest:
|
||||
for s in ${SRCLANGS}; do \
|
||||
for t in ${TRGLANGS}; do \
|
||||
if [ -e ${FORWARDTRANS_HOME}/$$s-$$t/latest ]; then \
|
||||
${MAKE} -C ${FORWARDTRANS_HOME} SRC=$$s TRG=$$t \
|
||||
RETAIN=${FT_SELECTED} extract-best-translations; \
|
||||
fi \
|
||||
done \
|
||||
done
|
||||
${MAKE} DATASET=${DATASET}+ft${FT_SELECTED} \
|
||||
USE_FORWARDTRANS_SELECTED=${FT_SELECTED} \
|
||||
${@:-ftbest=}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
## add forward translations of monolingual data:
## re-run make for the target name without the -ftmono suffix,
## with USE_FORWARDTRANSMONO switched on and +ftmono appended to DATASET
|
||||
%-ftmono:
|
||||
${MAKE} USE_FORWARDTRANSMONO=1 DATASET=${DATASET}+ftmono ${patsubst %-ftmono,%,$@}
|
||||
@ -340,8 +363,9 @@ endif
|
||||
## don't use the regular parallel training data
|
||||
## (only makes sense if bt, ft, or pivot-based data are activated)
|
||||
%-nopar:
|
||||
${MAKE} DATASET=${DATASET}+nopar TRAINSET= TATOEBA_TRAINSET= ${@:-nopar=}
|
||||
${MAKE} DATASET=${DATASET}+nopar TRAINSET= ${@:-nopar=}
|
||||
|
||||
# TATOEBA_TRAINSET=
|
||||
|
||||
##-------------------------------------------------------------
|
||||
## default: make separate sentencepiece models
|
||||
@ -371,9 +395,10 @@ endif
|
||||
USE_FORWARDTRANS=1 \
|
||||
CONTINUE_EXISTING=1 \
|
||||
MARIAN_EARLY_STOPPING=${FT_MARIAN_EARLY_STOPPING} \
|
||||
TRAINSET= TATOEBA_TRAINSET= \
|
||||
TRAINSET= \
|
||||
${@:-ftonly=}
|
||||
|
||||
# TATOEBA_TRAINSET= \
|
||||
# MODELCONFIG=config-ft.mk \
|
||||
|
||||
|
||||
|
@ -20,6 +20,9 @@ endif
|
||||
|
||||
SLURM_JOBNAME ?= $(subst -,,${LANGPAIRSTR})
|
||||
|
||||
## comma-separated list of nodes to exclude (written as '#SBATCH --exclude=...' into the job script)
|
||||
# BROKEN_NODES = g6301
|
||||
|
||||
%.submit:
|
||||
mkdir -p ${WORKDIR}
|
||||
echo '#!/bin/bash -l' > $@
|
||||
@ -36,6 +39,9 @@ endif
|
||||
echo '#SBATCH -p ${HPC_GPUQUEUE}' >> $@
|
||||
echo '#SBATCH ${HPC_GPU_ALLOCATION}' >> $@
|
||||
echo '#SBATCH -t ${HPC_TIME}:00' >> $@
|
||||
ifdef BROKEN_NODES
|
||||
echo '#SBATCH --exclude=${BROKEN_NODES}' >> $@
|
||||
endif
|
||||
echo '${HPC_EXTRA}' >> $@
|
||||
echo '${HPC_EXTRA1}' >> $@
|
||||
echo '${HPC_EXTRA2}' >> $@
|
||||
@ -73,6 +79,9 @@ endif
|
||||
echo '#SBATCH -N ${HPC_NODES}' >> $@
|
||||
echo '#SBATCH -p ${HPC_QUEUE}' >> $@
|
||||
echo '#SBATCH -t ${HPC_TIME}:00' >> $@
|
||||
ifdef BROKEN_NODES
|
||||
echo '#SBATCH --exclude=${BROKEN_NODES}' >> $@
|
||||
endif
|
||||
echo '${HPC_EXTRA}' >> $@
|
||||
echo '${HPC_EXTRA1}' >> $@
|
||||
echo '${HPC_EXTRA2}' >> $@
|
||||
|
@ -159,12 +159,14 @@ endif
|
||||
## score translations with reverse translation model
|
||||
## normalize scores (see https://github.com/browsermt/students)
|
||||
|
||||
SCOREFILES = $(patsubst %.${SRC}.gz,%.${SRC}.scores.gz,$(wildcard ${OUTPUT_DIR}/latest/*.${SRC}.gz))
|
||||
SCOREFILES = $(patsubst %.${SRC}.gz,%.${SRC}.scores.gz,$(wildcard ${OUTPUT_DIR}/latest/*.${SRC}.gz))
|
||||
RAWSCOREFILES = $(patsubst %.${SRC}.gz,%.${SRC}.raw-scores.gz,$(wildcard ${OUTPUT_DIR}/latest/*.${SRC}.gz))
|
||||
|
||||
# Command-style targets for scoring and sorting translations.
# None of these names is a file that gets created, so all of them are
# declared phony (otherwise a stray file with the same name would
# silently disable the target).
.PHONY: score-translation score-translations sort-scored-translations sort-raw-scored-translations print-score-file
|
||||
score-translation: ${BITEXT_LATEST_SRC:.gz=.scores.gz}
|
||||
score-translations: ${SCOREFILES}
|
||||
sort-scored-translations: ${OUTPUT_DIR}/latest/Tatoeba-train.sorted.gz
|
||||
sort-raw-scored-translations: ${OUTPUT_DIR}/latest/Tatoeba-train.sorted-raw.gz
|
||||
|
||||
# print the name of the score file that score-translation would build
print-score-file:
|
||||
echo ${BITEXT_LATEST_SRC:.gz=.scores.gz}
|
||||
@ -197,13 +199,52 @@ ${OUTPUT_DIR}/latest/Tatoeba-train.sorted.gz: ${SCOREFILES}
|
||||
rm -f $@.src.gz $@.trg.gz
|
||||
|
||||
|
||||
# Merge all raw-scored translations into one file sorted by score:
# concatenate the raw-score, source and target files in ${OUTPUT_DIR}/latest,
# paste them side by side (score, source, target), sort numerically on the
# first column and drop adjacent lines that differ only in that column (uniq -f1).
# The temporary $@.src.gz and $@.trg.gz are removed afterwards; $@.raw-scores.gz is kept.
# NOTE(review): the <( ) process substitution needs a shell such as bash - confirm SHELL is set accordingly.
${OUTPUT_DIR}/latest/Tatoeba-train.sorted-raw.gz: ${RAWSCOREFILES}
|
||||
${GZCAT} ${OUTPUT_DIR}/latest/*.${SRC}.raw-scores.gz | ${GZIP} -c > $@.raw-scores.gz
|
||||
${GZCAT} ${OUTPUT_DIR}/latest/*.${SRC}.gz | ${GZIP} -c > $@.src.gz
|
||||
${GZCAT} ${OUTPUT_DIR}/latest/*.${TRG}.gz | ${GZIP} -c > $@.trg.gz
|
||||
paste <(gzip -cd $@.raw-scores.gz) <(gzip -cd $@.src.gz) <(gzip -cd $@.trg.gz) |\
|
||||
LC_ALL=C sort -n -k1,1 -S 10G | uniq -f1 | ${GZIP} -c > $@
|
||||
rm -f $@.src.gz $@.trg.gz
|
||||
|
||||
|
||||
# Part of the data to be removed (0.05 is 5%)
|
||||
# RETAIN - percentage of the data to retain (e.g. 95)
|
||||
# REMOVE - fraction of the data to remove, derived from RETAIN (e.g. .05 for RETAIN = 95)
|
||||
#
|
||||
# (see https://github.com/browsermt/students)
|
||||
#
|
||||
#scored/%.best.gz: scored/%.sorted.gz
|
||||
# $(eval STARTLINE := $(shell pigz -dc $< | wc -l | sed "s|$$|*$(REMOVE)|" | bc | cut -f1 -d.))
|
||||
# @echo Removing $(REMOVE) removes $(STARTLINE) lines
|
||||
# pigz -dc $< | tail -n +$(STARTLINE) | cut -f2,3 | pigz > $@
|
||||
|
||||
# REMOVE = 0.05
|
||||
# RETAIN = ${shell echo "100-100*${REMOVE}/1;" | bc}
|
||||
RETAIN = 95
|
||||
REMOVE = ${shell echo "scale=2; (100-${RETAIN})/100" | bc}
|
||||
|
||||
# Convenience targets: build the file with the retained part of the
# translations, taken from the sorted scores (best) or from the sorted
# raw scores (rawbest). They are aliases, not files, hence phony.
.PHONY: extract-best-translations extract-rawbest-translations
extract-best-translations: ${OUTPUT_DIR}/latest/Tatoeba-train.${SRC}.best${RETAIN}.gz
|
||||
extract-rawbest-translations: ${OUTPUT_DIR}/latest/Tatoeba-train.${SRC}.rawbest${RETAIN}.gz
|
||||
|
||||
# Drop the first part of the sorted file and keep the rest.
# STARTLINE is (number of lines in $<) * REMOVE, truncated to an integer;
# the output starts at that line of $<. Columns 2 and 3 of the remaining
# lines are written to $@ (the ${SRC} file) and to the matching ${TRG} file.
# bc prints values below 1 without a leading zero (e.g. ".50"), so cutting
# at the dot can leave an empty string for small inputs; the final sed
# turns that into 0 so that "tail -n +" always gets a number.
%.${SRC}.best${RETAIN}.gz: %.sorted.gz
|
||||
$(eval STARTLINE := $(shell ${GZIP} -dc $< | wc -l | sed "s|$$|*$(REMOVE)|" | bc | cut -f1 -d. | sed "s|^$$|0|"))
|
||||
@echo Removing $(REMOVE) removes $(STARTLINE) lines
|
||||
${GZIP} -dc $< | tail -n +$(STARTLINE) | cut -f2,3 | \
|
||||
tee >(cut -f1 | gzip -c >$@) |\
|
||||
cut -f2 | gzip -c > ${@:.${SRC}.best${RETAIN}.gz=.${TRG}.best${RETAIN}.gz}
|
||||
|
||||
# the ${TRG} file is written as a by-product of the %.${SRC}.best${RETAIN}.gz
# recipe; this rule only declares that dependency and prints a message
%.${TRG}.best${RETAIN}.gz: %.${SRC}.best${RETAIN}.gz
|
||||
@echo "done!"
|
||||
|
||||
# Same selection as for the .best files, but based on the file sorted by raw scores.
# STARTLINE is (number of lines in $<) * REMOVE, truncated to an integer;
# the output starts at that line of $<. Columns 2 and 3 of the remaining
# lines are written to $@ (the ${SRC} file) and to the matching ${TRG} file.
# bc prints values below 1 without a leading zero (e.g. ".50"), so cutting
# at the dot can leave an empty string for small inputs; the final sed
# turns that into 0 so that "tail -n +" always gets a number.
%.${SRC}.rawbest${RETAIN}.gz: %.sorted-raw.gz
|
||||
$(eval STARTLINE := $(shell ${GZIP} -dc $< | wc -l | sed "s|$$|*$(REMOVE)|" | bc | cut -f1 -d. | sed "s|^$$|0|"))
|
||||
@echo Removing $(REMOVE) removes $(STARTLINE) lines
|
||||
${GZIP} -dc $< | tail -n +$(STARTLINE) | cut -f2,3 | \
|
||||
tee >(cut -f1 | gzip -c >$@) |\
|
||||
cut -f2 | gzip -c > ${@:.${SRC}.rawbest${RETAIN}.gz=.${TRG}.rawbest${RETAIN}.gz}
|
||||
|
||||
# the ${TRG} file is written as a by-product of the %.${SRC}.rawbest${RETAIN}.gz
# recipe; this rule only declares that dependency and prints a message
# (the target name must use the same "rawbest" spelling as the file written there)
%.${TRG}.rawbest${RETAIN}.gz: %.${SRC}.rawbest${RETAIN}.gz
|
||||
@echo "done!"
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user