mirror of
https://github.com/Helsinki-NLP/OPUS-MT-train.git
synced 2024-11-30 12:32:24 +03:00
forward translation fixes
This commit is contained in:
parent
88d509e9ef
commit
e3cf48786e
@ -16,6 +16,17 @@ include ${REPOHOME}lib/slurm.mk
|
||||
SRC = fin
TRG = eng


## decoder settings used by this makefile
## (plain assignments: they take the place of any value that was
##  assigned to the same variable earlier while reading the makefiles)
## TODO: check whether these overrides are really needed

MARIAN_BEAM_SIZE  = 1
MARIAN_MINI_BATCH = 100
MARIAN_MAXI_BATCH = 100
MARIAN_MAX_LENGTH = 200
MARIAN_WORKSPACE  = 12000


## Tatoeba release tag and the same tag with all hyphens removed
## (both are only set here if they do not have a value yet)

TATOEBA_VERSION          ?= v2021-08-07
TATOEBA_VERSION_NOHYPHEN ?= $(subst -,,$(TATOEBA_VERSION))
|
||||
|
||||
@ -80,13 +91,13 @@ BITEXT_LATEST_README = ${OUTPUT_DIR}/latest/README.md
|
||||
|
||||
|
||||
## all parts of the bitext
## --> the wildcard looks for existing files that are named like
##     ${BITEXT_PRE} but carry any two-character label (`??`)
##     in the place of ${PART}
## --> `suffix` returns each label with its leading dot (".xx");
##     the dot is removed because the labels are plugged into the `%`
##     of the file name patterns below, which already contain the dots
##     (keeping it would give names like "Tatoeba-train..xx.")

PARTS                 = $(subst .,,${suffix ${basename ${wildcard ${BITEXT_PRE:${PART}.gz=}??.gz}}})
ALL_BITEXT_LATEST_SRC = ${patsubst %,${OUTPUT_DIR}/latest/Tatoeba-train.%.${LANGPAIR}.${SRC}.gz,${PARTS}}
ALL_BITEXT_LATEST_TRG = ${patsubst %,${OUTPUT_DIR}/latest/Tatoeba-train.%.${LANGPAIR}.${TRG}.gz,${PARTS}}


## don't delete translated text even if the process crashes
## (covers the default target file and the per-part pattern)
.PRECIOUS: ${BITEXT_TRG}
.PRECIOUS: ${BITEXT_BASE}.${TRG}.%.gz

.PHONY: all
all: translate
|
||||
@ -101,7 +112,11 @@ translate: ${BITEXT_LATEST_README} ${BITEXT_LATEST_TRG}
|
||||
## translate all parts
## --> the translated files of all parts are prerequisites, so make
##     has brought them up to date before the recipe starts
##     (no need to request them once more from a sub-make)
## --> the source side is then made by a sub-make; the goal is given
##     by name because an empty file list would leave the sub-make
##     without a goal and make it build the default goal instead
.PHONY: translate-all-parts
translate-all-parts: ${ALL_BITEXT_LATEST_TRG}
	${MAKE} source-all-parts

## make the source side of all parts in the latest/ directory
.PHONY: source-all-parts
source-all-parts: ${ALL_BITEXT_LATEST_SRC}
|
||||
|
||||
|
||||
.PHONY: print-modelinfo
|
||||
print-modelinfo:
|
||||
@ -111,7 +126,6 @@ print-modelinfo:
|
||||
@echo "multi-target model: ${MULTI_TARGET_MODEL}"
|
||||
@echo "target language label: ${TARGET_LANG_LABEL}"
|
||||
|
||||
|
||||
## fetch the latest model
|
||||
|
||||
${LANGPAIR}/${MODELNAME}/decoder.yml:
|
||||
@ -168,8 +182,8 @@ endif
|
||||
## merge SentencePiece segments in the source text
|
||||
## (Why? because we filter out some data from the original wiki text, see above)
|
||||
|
||||
${BITEXT_SRC}: ${BITEXT_PRE}
|
||||
ifneq (${MODELZIP},)
|
||||
${BITEXT_BASE}.${SRC}.%.gz: ${BITEXT_BASE}.${SRC}.spm.%.gz
|
||||
ifneq ($(wildcard ${patsubst ${BITEXT_BASE}.${SRC}.%.gz,${BITEXT_BASE}.${TRG}.%.gz,$@}),)
|
||||
mkdir -p ${dir $@}
|
||||
${GZCAT} $< |\
|
||||
sed 's/ //g;s/▁/ /g' | \
|
||||
@ -185,11 +199,11 @@ endif
|
||||
## --> this allows multiple translation iterations
|
||||
## without duplicating the data we want to use in MT training
|
||||
|
||||
## source side: ${BITEXT_LATEST_SRC} depends on ${BITEXT_SRC};
## the pattern rule copies ${BITEXT_BASE}.${SRC}.<part>.gz to
## ${OUTPUT_DIR}/latest/ for any part label matched by `%`
## (the target directory is created first)
${BITEXT_LATEST_SRC}: ${BITEXT_SRC}
|
||||
${OUTPUT_DIR}/latest/Tatoeba-train.%.${LANGPAIR}.${SRC}.gz: ${BITEXT_BASE}.${SRC}.%.gz
|
||||
mkdir -p ${dir $@}
|
||||
cp $< $@
|
||||
|
||||
## target side: same scheme as for the source side, with ${TRG}
## in the file names and ${BITEXT_TRG} as the explicit prerequisite
${BITEXT_LATEST_TRG}: ${BITEXT_TRG}
|
||||
${OUTPUT_DIR}/latest/Tatoeba-train.%.${LANGPAIR}.${TRG}.gz: ${BITEXT_BASE}.${TRG}.%.gz
|
||||
mkdir -p ${dir $@}
|
||||
cp $< $@
|
||||
|
||||
@ -204,9 +218,10 @@ ${BITEXT_BASE}.${TRG}.%.gz: ${BITEXT_BASE}.${SRC}.spm.%.gz
|
||||
ifneq (${MODELZIP},)
|
||||
mkdir -p ${dir $@}
|
||||
${MAKE} ${LANGPAIR}/${MODELNAME}/decoder.yml
|
||||
${LOAD_ENV} && cd ${LANGPAIR}/${MODELNAME} && ${MARIAN_DECODER} \
|
||||
-i ${PWD}/$< \
|
||||
${LOAD_ENV} && cd ${LANGPAIR}/${MODELNAME} && \
|
||||
${MARIAN_DECODER} \
|
||||
-c decoder.yml \
|
||||
-i ${PWD}/$< \
|
||||
-d ${MARIAN_GPUS} \
|
||||
${MARIAN_DECODER_FLAGS} |\
|
||||
sed 's/ //g;s/▁/ /g' | sed 's/^ *//;s/ *$$//' |\
|
||||
@ -215,8 +230,6 @@ endif
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
check-latest:
|
||||
@if [ -d ${LANGPAIR}/latest ]; then \
|
||||
for S in `ls ${LANGPAIR}/latest/*.${SRC}.gz`; do \
|
||||
@ -232,7 +245,7 @@ check-latest:
|
||||
fi
|
||||
|
||||
check-translated:
|
||||
@for S in `ls ${LANGPAIR}/*.${SRC}.gz`; do \
|
||||
@for S in `ls ${LANGPAIR}/*.${SRC}.spm.gz`; do \
|
||||
T=`echo $$S | sed 's/.${SRC}.gz/.${TRG}.gz/'`; \
|
||||
a=`${GZCAT} $$S | wc -l`; \
|
||||
b=`${GZCAT} $$T | wc -l`; \
|
||||
|
@ -539,12 +539,16 @@ endif
|
||||
|
||||
## decoder flags (CPU and GPU variants)
##
## beam size and batch sizes are kept in variables of their own;
## the flag variables below are recursively expanded (`=`), so a value
## that is assigned to one of those variables later on is still picked up
## when the flags are finally used

MARIAN_BEAM_SIZE  = 4
MARIAN_MINI_BATCH = 768
MARIAN_MAXI_BATCH = 2048

## GPU variant: devices from ${MARIAN_GPUS}, workspace from ${MARIAN_WORKSPACE}
MARIAN_DECODER_GPU = -b ${MARIAN_BEAM_SIZE} -n1 -d ${MARIAN_GPUS} -w ${MARIAN_WORKSPACE} \
	--mini-batch ${MARIAN_MINI_BATCH} --maxi-batch ${MARIAN_MAXI_BATCH} --maxi-batch-sort src \
	--max-length ${MARIAN_MAX_LENGTH} --max-length-crop --fp16 --quiet-translation

## CPU variant: thread count and mini-batch size both follow ${HPC_CORES};
## the maxi-batch size is fixed and does not use ${MARIAN_MAXI_BATCH}
MARIAN_DECODER_CPU = -b ${MARIAN_BEAM_SIZE} -n1 --cpu-threads ${HPC_CORES} \
	--mini-batch ${HPC_CORES} --maxi-batch 100 --maxi-batch-sort src \
	--max-length ${MARIAN_MAX_LENGTH} --max-length-crop --fp16 --quiet-translation

## flags that are used by default: the GPU variant
MARIAN_DECODER_FLAGS = ${MARIAN_DECODER_GPU}
|
||||
|
||||
|
||||
|
@ -1986,7 +1986,6 @@ TATOEBA_TMPDATADIR = data/release/${TATOEBA_VERSION}/${LANGPAIR}
|
||||
@if [ -e $(@:.${SRCEXT}.gz=.${SRCEXT}.labels) ]; then \
|
||||
for s in `cat $(@:.${SRCEXT}.gz=.${SRCEXT}.labels)`; do \
|
||||
for t in `cat $(@:.${SRCEXT}.gz=.${TRGEXT}.labels)`; do \
|
||||
if [[ "$$s-$$t" != "${LANGPAIR}" ]] && [[ "$$t-$$s" != "${LANGPAIR}" ]]; then \
|
||||
if [ "$$s" \< "$$t" ]; then \
|
||||
echo "extract $$s-$$t data"; \
|
||||
for d in dev test train; do \
|
||||
@ -2018,15 +2017,14 @@ TATOEBA_TMPDATADIR = data/release/${TATOEBA_VERSION}/${LANGPAIR}
|
||||
if [ -s ${dir $@}Tatoeba-$$d-${TATOEBA_VERSION}.$$t-$$s ]; then \
|
||||
echo "........ make ${dir $@}Tatoeba-$$d-${TATOEBA_VERSION}.$$t-$$s.clean.*.gz"; \
|
||||
cut -f1 ${dir $@}Tatoeba-$$d-${TATOEBA_VERSION}.$$t-$$s | \
|
||||
${GZIP} -c > ${dir $@}Tatoeba-$$d-${TATOEBA_VERSION}.$$t-$$s.clean.$$t.gz; \
|
||||
cut -f2 ${dir $@}Tatoeba-$$d-${TATOEBA_VERSION}.$$t-$$s | \
|
||||
${GZIP} -c > ${dir $@}Tatoeba-$$d-${TATOEBA_VERSION}.$$t-$$s.clean.$$s.gz; \
|
||||
cut -f2 ${dir $@}Tatoeba-$$d-${TATOEBA_VERSION}.$$t-$$s | \
|
||||
${GZIP} -c > ${dir $@}Tatoeba-$$d-${TATOEBA_VERSION}.$$t-$$s.clean.$$t.gz; \
|
||||
fi; \
|
||||
rm -f ${dir $@}Tatoeba-$$d-${TATOEBA_VERSION}.$$t-$$s; \
|
||||
fi \
|
||||
done \
|
||||
fi \
|
||||
fi \
|
||||
done \
|
||||
done \
|
||||
fi
|
||||
|
Loading…
Reference in New Issue
Block a user