diff --git a/lib/config.mk b/lib/config.mk
index 8e90eec8..4cb089bc 100644
--- a/lib/config.mk
+++ b/lib/config.mk
@@ -740,6 +740,8 @@ ${WORKDIR}/${MODELCONFIG}:
 	@echo "TESTSET = ${TESTSET}" >> $@
 	@echo "PRE = ${PRE}" >> $@
 	@echo "SUBWORDS = ${SUBWORDS}" >> $@
+	@echo "MODEL_SRCVOCAB = ${MODEL_SRCVOCAB}" >> $@
+	@echo "MODEL_TRGVOCAB = ${MODEL_TRGVOCAB}" >> $@
 ifdef SHUFFLE_DATA
 	@echo "SHUFFLE_DATA = ${SHUFFLE_DATA}" >> $@
 endif
diff --git a/lib/data.mk b/lib/data.mk
index 90763871..0acedf68 100644
--- a/lib/data.mk
+++ b/lib/data.mk
@@ -82,6 +82,17 @@ ifeq (${USE_FORWARDTRANS},1)
   FORWARDTRANS_TRG = ${patsubst %.${SRCEXT}.gz,%.${TRGEXT}.gz,${FORWARDTRANS_SRC}}
 endif
 
+# forward-translation data (source-to-target)
+# filtered by reconstruction scores (ce filter)
+# USE_FORWARDTRANS_SELECTED is the retained percentage (e.g. 95) and selects
+# the files named *.${SRCEXT}.best<percentage>.gz
+# NOTE: those files do not end in .${SRCEXT}.gz, so the target-side names
+# need a second substitution for the .best<percentage>.gz suffix
+ifneq (${USE_FORWARDTRANS_SELECTED},)
+  FORWARDTRANS_SRC += ${sort ${wildcard ${FORWARDTRANS_HOME}/${SRC}-${TRG}/latest/*.${SRCEXT}.best${USE_FORWARDTRANS_SELECTED}.gz}}
+  FORWARDTRANS_TRG = ${patsubst %.${SRCEXT}.best${USE_FORWARDTRANS_SELECTED}.gz,%.${TRGEXT}.best${USE_FORWARDTRANS_SELECTED}.gz,${patsubst %.${SRCEXT}.gz,%.${TRGEXT}.gz,${FORWARDTRANS_SRC}}}
+endif
+
 # forward-translation data of monolingual data (source-to-target)
 ifeq (${USE_FORWARDTRANSMONO},1)
   FORWARDTRANSMONO_SRC = ${sort ${wildcard ${BACKTRANS_HOME}/${SRC}-${TRG}/latest/*.${SRCEXT}.gz}}
diff --git a/lib/generic.mk b/lib/generic.mk
index 86679dbc..4d193b82 100644
--- a/lib/generic.mk
+++ b/lib/generic.mk
@@ -321,6 +321,29 @@ endif
 #		MODELCONFIG=config-ft.mk \
 
 
+
+## use a selected set of forward translations:
+## extract the best translations for every language pair that has a
+## forward-translation directory, then re-run the requested target with
+## USE_FORWARDTRANS_SELECTED set
+# default for ce-filter (percentage of translations to retain)
+FT_SELECTED ?= 95
+
+%-ftbest:
+	for s in ${SRCLANGS}; do \
+	  for t in ${TRGLANGS}; do \
+	    if [ -e ${FORWARDTRANS_HOME}/$$s-$$t/latest ]; then \
+	      ${MAKE} -C ${FORWARDTRANS_HOME} SRC=$$s TRG=$$t \
+			RETAIN=${FT_SELECTED} extract-best-translations; \
+	    fi \
+	  done \
+	done
+	${MAKE} DATASET=${DATASET}+ft${FT_SELECTED} \
+		USE_FORWARDTRANS_SELECTED=${FT_SELECTED} \
+		${@:-ftbest=}
+
+
+
 ## add forward translation of monolingual data
 %-ftmono:
 	${MAKE} DATASET=${DATASET}+ftmono USE_FORWARDTRANSMONO=1 ${@:-ftmono=}
@@ -340,8 +363,9 @@ endif
 ## don't use the regular parallel training data
 ## (only makes sense if bt, ft, or pivot-based data are activated)
 %-nopar:
-	${MAKE} DATASET=${DATASET}+nopar TRAINSET= TATOEBA_TRAINSET= ${@:-nopar=}
+	${MAKE} DATASET=${DATASET}+nopar TRAINSET= ${@:-nopar=}
 
+#		TATOEBA_TRAINSET=
 
 ##-------------------------------------------------------------
 ## default: make separate sentencepiece models
@@ -371,9 +395,10 @@ endif
 		USE_FORWARDTRANS=1 \
 		CONTINUE_EXISTING=1 \
 		MARIAN_EARLY_STOPPING=${FT_MARIAN_EARLY_STOPPING} \
-		TRAINSET= TATOEBA_TRAINSET= \
+		TRAINSET= \
 		${@:-ftonly=}
 
+#		TATOEBA_TRAINSET= \
 #		MODELCONFIG=config-ft.mk \
 
 
diff --git a/lib/slurm.mk b/lib/slurm.mk
index ad965f0f..b8eb6b07 100644
--- a/lib/slurm.mk
+++ b/lib/slurm.mk
@@ -20,6 +20,9 @@ endif
 
 SLURM_JOBNAME ?= $(subst -,,${LANGPAIRSTR})
 
+## comma separated nodes to be excluded
+# BROKEN_NODES = g6301
+
 %.submit:
 	mkdir -p ${WORKDIR}
 	echo '#!/bin/bash -l' > $@
@@ -36,6 +39,9 @@ endif
 	echo '#SBATCH -p ${HPC_GPUQUEUE}' >> $@
 	echo '#SBATCH ${HPC_GPU_ALLOCATION}' >> $@
 	echo '#SBATCH -t ${HPC_TIME}:00' >> $@
+ifdef BROKEN_NODES
+	echo '#SBATCH --exclude=${BROKEN_NODES}' >> $@
+endif
 	echo '${HPC_EXTRA}' >> $@
 	echo '${HPC_EXTRA1}' >> $@
 	echo '${HPC_EXTRA2}' >> $@
@@ -73,6 +79,9 @@ endif
 	echo '#SBATCH -N ${HPC_NODES}' >> $@
 	echo '#SBATCH -p ${HPC_QUEUE}' >> $@
 	echo '#SBATCH -t ${HPC_TIME}:00' >> $@
+ifdef BROKEN_NODES
+	echo '#SBATCH --exclude=${BROKEN_NODES}' >> $@
+endif
 	echo '${HPC_EXTRA}' >> $@
 	echo '${HPC_EXTRA1}' >> $@
 	echo '${HPC_EXTRA2}' >> $@
diff --git a/tatoeba/forward-translate/Makefile b/tatoeba/forward-translate/Makefile
index 3b572925..a9568491 100644
--- a/tatoeba/forward-translate/Makefile
+++ b/tatoeba/forward-translate/Makefile
@@ -159,12 +159,14 @@ endif
 ## score translations with reverse translation model
 ## normalize scores (see https://github.com/browsermt/students)
 
-SCOREFILES = $(patsubst %.${SRC}.gz,%.${SRC}.scores.gz,$(wildcard ${OUTPUT_DIR}/latest/*.${SRC}.gz))
+SCOREFILES    = $(patsubst %.${SRC}.gz,%.${SRC}.scores.gz,$(wildcard ${OUTPUT_DIR}/latest/*.${SRC}.gz))
+RAWSCOREFILES = $(patsubst %.${SRC}.gz,%.${SRC}.raw-scores.gz,$(wildcard ${OUTPUT_DIR}/latest/*.${SRC}.gz))
 
 .PHONY: score-translation score-translations
 score-translation: ${BITEXT_LATEST_SRC:.gz=.scores.gz}
 score-translations: ${SCOREFILES}
 sort-scored-translations: ${OUTPUT_DIR}/latest/Tatoeba-train.sorted.gz
+sort-raw-scored-translations: ${OUTPUT_DIR}/latest/Tatoeba-train.sorted-raw.gz
 
 print-score-file:
 	echo ${BITEXT_LATEST_SRC:.gz=.scores.gz}
@@ -197,13 +199,62 @@ ${OUTPUT_DIR}/latest/Tatoeba-train.sorted.gz: ${SCOREFILES}
 	rm -f $@.src.gz $@.trg.gz
 
 
+## sort all translations by the scores in the *.raw-scores.gz files:
+## paste scores, source and target side by side, sort numerically by score
+## and drop adjacent lines that differ only in the score column (uniq -f1)
+${OUTPUT_DIR}/latest/Tatoeba-train.sorted-raw.gz: ${RAWSCOREFILES}
+	${GZCAT} ${OUTPUT_DIR}/latest/*.${SRC}.raw-scores.gz | ${GZIP} -c > $@.raw-scores.gz
+	${GZCAT} ${OUTPUT_DIR}/latest/*.${SRC}.gz | ${GZIP} -c > $@.src.gz
+	${GZCAT} ${OUTPUT_DIR}/latest/*.${TRG}.gz | ${GZIP} -c > $@.trg.gz
+	paste <(gzip -cd $@.raw-scores.gz) <(gzip -cd $@.src.gz) <(gzip -cd $@.trg.gz) |\
+	LC_ALL=C sort -n -k1,1 -S 10G | uniq -f1 | ${GZIP} -c > $@
+	rm -f $@.raw-scores.gz $@.src.gz $@.trg.gz
+
+
+# Part of the data to be removed (0.05 is 5%)
+# RETAIN - percentage of the sorted data to retain
+# REMOVE - fraction of the sorted data to remove (computed from RETAIN)
 #
 # (see https://github.com/browsermt/students)
-#
-#scored/%.best.gz: scored/%.sorted.gz
-#	$(eval STARTLINE := $(shell pigz -dc $< | wc -l | sed "s|$$|*$(REMOVE)|" | bc | cut -f1 -d.))
-#	@echo Removing $(REMOVE) removes $(STARTLINE) lines
-#	pigz -dc $< | tail -n +$(STARTLINE) | cut -f2,3 | pigz > $@
+
+# REMOVE = 0.05
+# RETAIN = ${shell echo "100-100*${REMOVE}/1;" | bc}
+RETAIN = 95
+REMOVE = ${shell echo "scale=2; (100-${RETAIN})/100" | bc}
+
+.PHONY: sort-raw-scored-translations extract-best-translations extract-rawbest-translations
+extract-best-translations: ${OUTPUT_DIR}/latest/Tatoeba-train.${SRC}.best${RETAIN}.gz
+extract-rawbest-translations: ${OUTPUT_DIR}/latest/Tatoeba-train.${SRC}.rawbest${RETAIN}.gz
+
+## extract source and target side of the best translations:
+## skip the first part of the sorted file (fraction given by REMOVE) and
+## write column 2 (source) and column 3 (target) to separate files.
+## Each file is written by its own pipeline so that both are complete when
+## the recipe returns. If STARTLINE comes out empty (less than one line to
+## remove) reading starts at line 1, i.e. nothing is removed.
+%.${SRC}.best${RETAIN}.gz: %.sorted.gz
+	$(eval STARTLINE := $(shell ${GZIP} -dc $< | wc -l | sed "s|$$|*$(REMOVE)|" | bc | cut -f1 -d.))
+	@echo Removing $(REMOVE) removes $(STARTLINE) lines
+	${GZIP} -dc $< | tail -n +$(or $(STARTLINE),1) | cut -f2 | ${GZIP} -c > $@
+	${GZIP} -dc $< | tail -n +$(or $(STARTLINE),1) | cut -f3 | ${GZIP} -c > ${@:.${SRC}.best${RETAIN}.gz=.${TRG}.best${RETAIN}.gz}
+
+## the target side is written by the recipe of the source side
+%.${TRG}.best${RETAIN}.gz: %.${SRC}.best${RETAIN}.gz
+	@echo "done!"
+
+## same extraction for the file sorted by raw scores
+%.${SRC}.rawbest${RETAIN}.gz: %.sorted-raw.gz
+	$(eval STARTLINE := $(shell ${GZIP} -dc $< | wc -l | sed "s|$$|*$(REMOVE)|" | bc | cut -f1 -d.))
+	@echo Removing $(REMOVE) removes $(STARTLINE) lines
+	${GZIP} -dc $< | tail -n +$(or $(STARTLINE),1) | cut -f2 | ${GZIP} -c > $@
+	${GZIP} -dc $< | tail -n +$(or $(STARTLINE),1) | cut -f3 | ${GZIP} -c > ${@:.${SRC}.rawbest${RETAIN}.gz=.${TRG}.rawbest${RETAIN}.gz}
+
+%.${TRG}.rawbest${RETAIN}.gz: %.${SRC}.rawbest${RETAIN}.gz
+	@echo "done!"
+
+
+
+
 
 
 