forward translation fixes

This commit is contained in:
Joerg Tiedemann 2021-12-13 13:39:48 +02:00
parent 88d509e9ef
commit e3cf48786e
3 changed files with 37 additions and 22 deletions

View File

@@ -16,6 +16,17 @@ include ${REPOHOME}lib/slurm.mk
SRC = fin
TRG = eng
## change decoder settings
## TODO: do we need this?
MARIAN_BEAM_SIZE=1
MARIAN_MINI_BATCH=100
MARIAN_MAXI_BATCH=100
MARIAN_MAX_LENGTH=200
MARIAN_WORKSPACE=12000
TATOEBA_VERSION ?= v2021-08-07
TATOEBA_VERSION_NOHYPHEN ?= $(subst -,,${TATOEBA_VERSION})
@@ -80,13 +91,13 @@ BITEXT_LATEST_README = ${OUTPUT_DIR}/latest/README.md
## all parts of the bitext
PARTS = ${suffix ${basename ${wildcard ${BITEXT_PRE:${PART}.gz=}??.gz}}}
PARTS = $(subst .,,${suffix ${basename ${wildcard ${BITEXT_PRE:${PART}.gz=}??.gz}}})
ALL_BITEXT_LATEST_SRC = ${patsubst %,${OUTPUT_DIR}/latest/Tatoeba-train.%.${LANGPAIR}.${SRC}.gz,${PARTS}}
ALL_BITEXT_LATEST_TRG = ${patsubst %,${OUTPUT_DIR}/latest/Tatoeba-train.%.${LANGPAIR}.${TRG}.gz,${PARTS}}
## don't delete translated text even if the process crashes
.PRECIOUS: ${BITEXT_TRG}
.PRECIOUS: ${BITEXT_BASE}.${TRG}.%.gz
.PHONY: all
all: translate
@@ -101,7 +112,11 @@ translate: ${BITEXT_LATEST_README} ${BITEXT_LATEST_TRG}
## translate all parts
.PHONY: translate-all-parts
translate-all-parts: ${ALL_BITEXT_LATEST_TRG}
${MAKE} ${ALL_BITEXT_LATEST_TRG}
${MAKE} ${ALL_BITEXT_LATEST_SRC}
.PHONY: source-all-parts
source-all-parts: ${ALL_BITEXT_LATEST_SRC}
.PHONY: print-modelinfo
print-modelinfo:
@@ -111,7 +126,6 @@ print-modelinfo:
@echo "multi-target model: ${MULTI_TARGET_MODEL}"
@echo "target language label: ${TARGET_LANG_LABEL}"
## fetch the latest model
${LANGPAIR}/${MODELNAME}/decoder.yml:
@@ -168,8 +182,8 @@ endif
## merge SentencePiece segments in the source text
## (Why? because we filter out some data from the original wiki text, see above)
${BITEXT_SRC}: ${BITEXT_PRE}
ifneq (${MODELZIP},)
${BITEXT_BASE}.${SRC}.%.gz: ${BITEXT_BASE}.${SRC}.spm.%.gz
ifneq ($(wildcard ${patsubst ${BITEXT_BASE}.${SRC}.%.gz,${BITEXT_BASE}.${TRG}.%.gz,$@}),)
mkdir -p ${dir $@}
${GZCAT} $< |\
sed 's/ //g;s/▁/ /g' | \
@@ -185,11 +199,11 @@ endif
## --> this allows multiple translation iterations
## without duplicating the data we want to use in MT training
${BITEXT_LATEST_SRC}: ${BITEXT_SRC}
${OUTPUT_DIR}/latest/Tatoeba-train.%.${LANGPAIR}.${SRC}.gz: ${BITEXT_BASE}.${SRC}.%.gz
mkdir -p ${dir $@}
cp $< $@
${BITEXT_LATEST_TRG}: ${BITEXT_TRG}
${OUTPUT_DIR}/latest/Tatoeba-train.%.${LANGPAIR}.${TRG}.gz: ${BITEXT_BASE}.${TRG}.%.gz
mkdir -p ${dir $@}
cp $< $@
@@ -204,9 +218,10 @@ ${BITEXT_BASE}.${TRG}.%.gz: ${BITEXT_BASE}.${SRC}.spm.%.gz
ifneq (${MODELZIP},)
mkdir -p ${dir $@}
${MAKE} ${LANGPAIR}/${MODELNAME}/decoder.yml
${LOAD_ENV} && cd ${LANGPAIR}/${MODELNAME} && ${MARIAN_DECODER} \
-i ${PWD}/$< \
${LOAD_ENV} && cd ${LANGPAIR}/${MODELNAME} && \
${MARIAN_DECODER} \
-c decoder.yml \
-i ${PWD}/$< \
-d ${MARIAN_GPUS} \
${MARIAN_DECODER_FLAGS} |\
sed 's/ //g;s/▁/ /g' | sed 's/^ *//;s/ *$$//' |\
@@ -215,8 +230,6 @@ endif
check-latest:
@if [ -d ${LANGPAIR}/latest ]; then \
for S in `ls ${LANGPAIR}/latest/*.${SRC}.gz`; do \
@@ -232,7 +245,7 @@ check-latest:
fi
check-translated:
@for S in `ls ${LANGPAIR}/*.${SRC}.gz`; do \
@for S in `ls ${LANGPAIR}/*.${SRC}.spm.gz`; do \
T=`echo $$S | sed 's/.${SRC}.gz/.${TRG}.gz/'`; \
a=`${GZCAT} $$S | wc -l`; \
b=`${GZCAT} $$T | wc -l`; \

View File

@@ -539,12 +539,16 @@ endif
## decoder flags (CPU and GPU variants)
MARIAN_DECODER_GPU = -b 4 -n1 -d ${MARIAN_GPUS} --quiet-translation -w ${MARIAN_WORKSPACE} \
--mini-batch 768 --maxi-batch 2048 --maxi-batch-sort src \
--max-length ${MARIAN_MAX_LENGTH} --max-length-crop --fp16
MARIAN_DECODER_CPU = -b 4 -n1 --cpu-threads ${HPC_CORES} --quiet-translation \
MARIAN_BEAM_SIZE = 4
MARIAN_MINI_BATCH = 768
MARIAN_MAXI_BATCH = 2048
MARIAN_DECODER_GPU = -b ${MARIAN_BEAM_SIZE} -n1 -d ${MARIAN_GPUS} -w ${MARIAN_WORKSPACE} \
--mini-batch ${MARIAN_MINI_BATCH} --maxi-batch ${MARIAN_MAXI_BATCH} --maxi-batch-sort src \
--max-length ${MARIAN_MAX_LENGTH} --max-length-crop --fp16 --quiet-translation
MARIAN_DECODER_CPU = -b ${MARIAN_BEAM_SIZE} -n1 --cpu-threads ${HPC_CORES} \
--mini-batch ${HPC_CORES} --maxi-batch 100 --maxi-batch-sort src \
--max-length ${MARIAN_MAX_LENGTH} --max-length-crop --fp16
--max-length ${MARIAN_MAX_LENGTH} --max-length-crop --fp16 --quiet-translation
MARIAN_DECODER_FLAGS = ${MARIAN_DECODER_GPU}

View File

@@ -1986,7 +1986,6 @@ TATOEBA_TMPDATADIR = data/release/${TATOEBA_VERSION}/${LANGPAIR}
@if [ -e $(@:.${SRCEXT}.gz=.${SRCEXT}.labels) ]; then \
for s in `cat $(@:.${SRCEXT}.gz=.${SRCEXT}.labels)`; do \
for t in `cat $(@:.${SRCEXT}.gz=.${TRGEXT}.labels)`; do \
if [[ "$$s-$$t" != "${LANGPAIR}" ]] && [[ "$$t-$$s" != "${LANGPAIR}" ]]; then \
if [ "$$s" \< "$$t" ]; then \
echo "extract $$s-$$t data"; \
for d in dev test train; do \
@@ -2018,15 +2017,14 @@ TATOEBA_TMPDATADIR = data/release/${TATOEBA_VERSION}/${LANGPAIR}
if [ -s ${dir $@}Tatoeba-$$d-${TATOEBA_VERSION}.$$t-$$s ]; then \
echo "........ make ${dir $@}Tatoeba-$$d-${TATOEBA_VERSION}.$$t-$$s.clean.*.gz"; \
cut -f1 ${dir $@}Tatoeba-$$d-${TATOEBA_VERSION}.$$t-$$s | \
${GZIP} -c > ${dir $@}Tatoeba-$$d-${TATOEBA_VERSION}.$$t-$$s.clean.$$t.gz; \
cut -f2 ${dir $@}Tatoeba-$$d-${TATOEBA_VERSION}.$$t-$$s | \
${GZIP} -c > ${dir $@}Tatoeba-$$d-${TATOEBA_VERSION}.$$t-$$s.clean.$$s.gz; \
cut -f2 ${dir $@}Tatoeba-$$d-${TATOEBA_VERSION}.$$t-$$s | \
${GZIP} -c > ${dir $@}Tatoeba-$$d-${TATOEBA_VERSION}.$$t-$$s.clean.$$t.gz; \
fi; \
rm -f ${dir $@}Tatoeba-$$d-${TATOEBA_VERSION}.$$t-$$s; \
fi \
done \
fi \
fi \
done \
done \
fi