added option for ce-filtered forward translations

This commit is contained in:
Joerg Tiedemann 2022-01-13 00:27:02 +02:00
parent a08ad41fbd
commit df1b4b2942
5 changed files with 92 additions and 8 deletions

View File

@ -740,6 +740,8 @@ ${WORKDIR}/${MODELCONFIG}:
@echo "TESTSET = ${TESTSET}" >> $@
@echo "PRE = ${PRE}" >> $@
@echo "SUBWORDS = ${SUBWORDS}" >> $@
@echo "MODEL_SRCVOCAB = ${MODEL_SRCVOCAB}" >> $@
@echo "MODEL_TRGVOCAB = ${MODEL_TRGVOCAB}" >> $@
ifdef SHUFFLE_DATA
@echo "SHUFFLE_DATA = ${SHUFFLE_DATA}" >> $@
endif

View File

@ -82,6 +82,13 @@ ifeq (${USE_FORWARDTRANS},1)
FORWARDTRANS_TRG = ${patsubst %.${SRCEXT}.gz,%.${TRGEXT}.gz,${FORWARDTRANS_SRC}}
endif
# forward-translation data (source-to-target)
# filtered by reconstruction scores (ce filter)
#
# USE_FORWARDTRANS_SELECTED is the number that appears in the file names
# (*.${SRCEXT}.best<N>.gz); an empty value disables this data source.
# The selected files do not end in .${SRCEXT}.gz, so the target side needs
# its own substitution pattern; the inner patsubst still maps any regular
# forward-translation files that are already in FORWARDTRANS_SRC.
ifneq (${USE_FORWARDTRANS_SELECTED},)
  FORWARDTRANS_SRC += ${sort ${wildcard ${FORWARDTRANS_HOME}/${SRC}-${TRG}/latest/*.${SRCEXT}.best${USE_FORWARDTRANS_SELECTED}.gz}}
  FORWARDTRANS_TRG = ${patsubst %.${SRCEXT}.best${USE_FORWARDTRANS_SELECTED}.gz,%.${TRGEXT}.best${USE_FORWARDTRANS_SELECTED}.gz,${patsubst %.${SRCEXT}.gz,%.${TRGEXT}.gz,${FORWARDTRANS_SRC}}}
endif
# forward-translation data of monolingual data (source-to-target)
ifeq (${USE_FORWARDTRANSMONO},1)
FORWARDTRANSMONO_SRC = ${sort ${wildcard ${BACKTRANS_HOME}/${SRC}-${TRG}/latest/*.${SRCEXT}.gz}}

View File

@ -321,6 +321,29 @@ endif
# MODELCONFIG=config-ft.mk \
# train with a selected (ce-filtered) subset of the forward translations
#
# FT_SELECTED is handed to the forward-translation makefile as RETAIN and
# to the training run as USE_FORWARDTRANS_SELECTED (default: 95)

FT_SELECTED ?= 95

# step 1: for every source/target language pair that has a "latest"
#         directory in FORWARDTRANS_HOME, extract the best translations
# step 2: re-run make for the target name without the "-ftbest" suffix,
#         with the selected forward translations switched on
%-ftbest:
	for src in $(SRCLANGS); do \
	  for trg in $(TRGLANGS); do \
	    if test -e $(FORWARDTRANS_HOME)/$$src-$$trg/latest; then \
	      $(MAKE) -C $(FORWARDTRANS_HOME) SRC=$$src TRG=$$trg \
	              RETAIN=$(FT_SELECTED) extract-best-translations; \
	    fi \
	  done \
	done
	$(MAKE) DATASET=$(DATASET)+ft$(FT_SELECTED) \
	        USE_FORWARDTRANS_SELECTED=$(FT_SELECTED) \
	        $(patsubst %-ftbest,%,$@)
## add forward translation of monolingual data:
## re-run make for the target name without the "-ftmono" suffix,
## with "+ftmono" appended to DATASET and USE_FORWARDTRANSMONO switched on
%-ftmono:
	$(MAKE) DATASET=$(DATASET)+ftmono \
	        USE_FORWARDTRANSMONO=1 \
	        $(patsubst %-ftmono,%,$@)
@ -340,8 +363,9 @@ endif
## don't use the regular parallel training data
## (only makes sense if bt, ft, or pivot-based data are activated)
##
## Re-runs make for the target name without the "-nopar" suffix, with
## "+nopar" appended to DATASET and TRAINSET set to an empty value.
## NOTE(review): two ${MAKE} lines are listed here; they differ only in the
## additional empty TATOEBA_TRAINSET on the first one. Presumably they are
## the old and the new version of the same line - confirm that only one of
## them is meant to remain.
%-nopar:
${MAKE} DATASET=${DATASET}+nopar TRAINSET= TATOEBA_TRAINSET= ${@:-nopar=}
${MAKE} DATASET=${DATASET}+nopar TRAINSET= ${@:-nopar=}
# TATOEBA_TRAINSET=
##-------------------------------------------------------------
## default: make separate sentencepiece models
@ -371,9 +395,10 @@ endif
USE_FORWARDTRANS=1 \
CONTINUE_EXISTING=1 \
MARIAN_EARLY_STOPPING=${FT_MARIAN_EARLY_STOPPING} \
TRAINSET= TATOEBA_TRAINSET= \
TRAINSET= \
${@:-ftonly=}
# TATOEBA_TRAINSET= \
# MODELCONFIG=config-ft.mk \

View File

@ -20,6 +20,9 @@ endif
SLURM_JOBNAME ?= $(subst -,,${LANGPAIRSTR})
## comma separated nodes to be excluded
# BROKEN_NODES = g6301
%.submit:
mkdir -p ${WORKDIR}
echo '#!/bin/bash -l' > $@
@ -36,6 +39,9 @@ endif
echo '#SBATCH -p ${HPC_GPUQUEUE}' >> $@
echo '#SBATCH ${HPC_GPU_ALLOCATION}' >> $@
echo '#SBATCH -t ${HPC_TIME}:00' >> $@
ifdef BROKEN_NODES
echo '#SBATCH --exclude=${BROKEN_NODES}' >> $@
endif
echo '${HPC_EXTRA}' >> $@
echo '${HPC_EXTRA1}' >> $@
echo '${HPC_EXTRA2}' >> $@
@ -73,6 +79,9 @@ endif
echo '#SBATCH -N ${HPC_NODES}' >> $@
echo '#SBATCH -p ${HPC_QUEUE}' >> $@
echo '#SBATCH -t ${HPC_TIME}:00' >> $@
ifdef BROKEN_NODES
echo '#SBATCH --exclude=${BROKEN_NODES}' >> $@
endif
echo '${HPC_EXTRA}' >> $@
echo '${HPC_EXTRA1}' >> $@
echo '${HPC_EXTRA2}' >> $@

View File

@ -159,12 +159,14 @@ endif
## score translations with reverse translation model
## normalize scores (see https://github.com/browsermt/students)

## one score file / raw-score file per source-language file
## that currently exists in ${OUTPUT_DIR}/latest
SCOREFILES    = $(patsubst %.${SRC}.gz,%.${SRC}.scores.gz,$(wildcard ${OUTPUT_DIR}/latest/*.${SRC}.gz))
RAWSCOREFILES = $(patsubst %.${SRC}.gz,%.${SRC}.raw-scores.gz,$(wildcard ${OUTPUT_DIR}/latest/*.${SRC}.gz))

## convenience targets: they only name the files to be built and never
## exist as files themselves, so all of them are declared phony
.PHONY: score-translation score-translations
.PHONY: sort-scored-translations sort-raw-scored-translations

score-translation: ${BITEXT_LATEST_SRC:.gz=.scores.gz}
score-translations: ${SCOREFILES}
sort-scored-translations: ${OUTPUT_DIR}/latest/Tatoeba-train.sorted.gz
sort-raw-scored-translations: ${OUTPUT_DIR}/latest/Tatoeba-train.sorted-raw.gz
print-score-file:
echo ${BITEXT_LATEST_SRC:.gz=.scores.gz}
@ -197,13 +199,52 @@ ${OUTPUT_DIR}/latest/Tatoeba-train.sorted.gz: ${SCOREFILES}
rm -f $@.src.gz $@.trg.gz
## Merge all raw-score files with their source and target files into one
## tab-separated file (score, source, target), sorted numerically by score.
##
## - the three concatenations are written to temporary files named after
##   the target ($@.raw-scores.gz, $@.src.gz, $@.trg.gz)
## - "uniq -f1" ignores the score column, i.e. it drops adjacent lines with
##   identical source/target text
## - the process substitutions "<(...)" need a shell that supports them
##   (e.g. bash)
## - all three temporary files are removed at the end
${OUTPUT_DIR}/latest/Tatoeba-train.sorted-raw.gz: ${RAWSCOREFILES}
	${GZCAT} ${OUTPUT_DIR}/latest/*.${SRC}.raw-scores.gz | ${GZIP} -c > $@.raw-scores.gz
	${GZCAT} ${OUTPUT_DIR}/latest/*.${SRC}.gz | ${GZIP} -c > $@.src.gz
	${GZCAT} ${OUTPUT_DIR}/latest/*.${TRG}.gz | ${GZIP} -c > $@.trg.gz
	paste <(gzip -cd $@.raw-scores.gz) <(gzip -cd $@.src.gz) <(gzip -cd $@.trg.gz) |\
	LC_ALL=C sort -n -k1,1 -S 10G | uniq -f1 | ${GZIP} -c > $@
	rm -f $@.raw-scores.gz $@.src.gz $@.trg.gz
# RETAIN - percentage of the score-sorted data to retain (e.g. 95)
# REMOVE - part of the data to be removed (0.05 is 5%),
#          computed from RETAIN as (100-RETAIN)/100
#
# (see https://github.com/browsermt/students)
#
#scored/%.best.gz: scored/%.sorted.gz
# $(eval STARTLINE := $(shell pigz -dc $< | wc -l | sed "s|$$|*$(REMOVE)|" | bc | cut -f1 -d.))
# @echo Removing $(REMOVE) removes $(STARTLINE) lines
# pigz -dc $< | tail -n +$(STARTLINE) | cut -f2,3 | pigz > $@
# REMOVE = 0.05
# RETAIN = ${shell echo "100-100*${REMOVE}/1;" | bc}
## RETAIN: percentage of the sorted data to keep
##         (can be overridden on the command line, e.g. RETAIN=90)
## REMOVE: the corresponding fraction to drop, computed with bc
##         ((100-RETAIN)/100 with two decimals)
RETAIN = 95
REMOVE = ${shell echo "scale=2; (100-${RETAIN})/100" | bc}

## convenience targets; they never exist as files, so declare them phony
.PHONY: extract-best-translations extract-rawbest-translations
extract-best-translations: ${OUTPUT_DIR}/latest/Tatoeba-train.${SRC}.best${RETAIN}.gz
extract-rawbest-translations: ${OUTPUT_DIR}/latest/Tatoeba-train.${SRC}.rawbest${RETAIN}.gz
## Cut off the first part of the score-sorted file and split the remaining
## lines into a source file ($@) and the matching target-language file.
##
## - STARTLINE = (number of lines) * REMOVE, truncated at the decimal point;
##   "tail -n +STARTLINE" starts printing at that line
## - for small inputs bc prints a value below 1 without leading zero
##   (e.g. ".50"), which leaves STARTLINE empty; in that case nothing is
##   cut off (tail starts at line 1) instead of calling tail with a
##   missing number
## - fields 2 and 3 of the sorted file are written to the two output files
##   (presumably source and target text - the score is expected in field 1)
## - the target-language file is written by the same command via tee;
##   ">(...)" needs a shell with process substitution (e.g. bash)
%.${SRC}.best${RETAIN}.gz: %.sorted.gz
	$(eval STARTLINE := $(shell ${GZIP} -dc $< | wc -l | sed "s|$$|*$(REMOVE)|" | bc | cut -f1 -d.))
	@echo Removing $(REMOVE) removes $(if $(STARTLINE),$(STARTLINE),0) lines
	${GZIP} -dc $< | tail -n +$(if $(STARTLINE),$(STARTLINE),1) | cut -f2,3 | \
	tee >(cut -f1 | gzip -c >$@) |\
	cut -f2 | gzip -c > ${@:.${SRC}.best${RETAIN}.gz=.${TRG}.best${RETAIN}.gz}
## the target-language file only depends on the source-language file;
## the recipe itself just prints a message
%.${TRG}.best${RETAIN}.gz: %.${SRC}.best${RETAIN}.gz ; @echo "done!"
## Same extraction as for the .best files, but based on the file that is
## sorted by raw scores (%.sorted-raw.gz: score, source, target).
##
## - STARTLINE = (number of lines) * REMOVE, truncated at the decimal point;
##   "tail -n +STARTLINE" starts printing at that line
## - for small inputs bc prints a value below 1 without leading zero
##   (e.g. ".50"), which leaves STARTLINE empty; in that case nothing is
##   cut off (tail starts at line 1) instead of calling tail with a
##   missing number
## - field 2 goes to the source file ($@), field 3 to the matching
##   target-language file, both written by the same command via tee;
##   ">(...)" needs a shell with process substitution (e.g. bash)
%.${SRC}.rawbest${RETAIN}.gz: %.sorted-raw.gz
	$(eval STARTLINE := $(shell ${GZIP} -dc $< | wc -l | sed "s|$$|*$(REMOVE)|" | bc | cut -f1 -d.))
	@echo Removing $(REMOVE) removes $(if $(STARTLINE),$(STARTLINE),0) lines
	${GZIP} -dc $< | tail -n +$(if $(STARTLINE),$(STARTLINE),1) | cut -f2,3 | \
	tee >(cut -f1 | gzip -c >$@) |\
	cut -f2 | gzip -c > ${@:.${SRC}.rawbest${RETAIN}.gz=.${TRG}.rawbest${RETAIN}.gz}
## the target-language .rawbest file is written by the rule for the
## source-language file; the pattern has to use the same "rawbest" suffix
## as that rule, otherwise it never matches the file
%.${TRG}.rawbest${RETAIN}.gz: %.${SRC}.rawbest${RETAIN}.gz
	@echo "done!"