finetune branch downloads models from object storage

This commit is contained in:
Joerg Tiedemann 2020-02-15 23:40:55 +02:00
parent 325b4c1903
commit 1a7cbbb13e
5 changed files with 230157 additions and 50 deletions

View File

@ -697,6 +697,24 @@ add-to-local-mono-data:
$(TOKENIZER)/deescape-special-chars.perl |\ $(TOKENIZER)/deescape-special-chars.perl |\
sed 's/ */ /g;s/^ *//g;s/ *$$//g' | gzip -c > $@ sed 's/ */ /g;s/^ *//g;s/ *$$//g' | gzip -c > $@
## Remove all spaces (treat everything as one long string).
##
## Builds %.nospace from %.raw: the text is passed through the tokenizer
## clean-up scripts (unicode punctuation, non-printing characters, de-escaping),
## whitespace is normalised, and every remaining space is replaced by "▁".
##
## The first sed must use '  *' (space, space, star = one or more spaces).
## A single-space ' *' also matches the empty string and would insert a blank
## between every character before the spaces are turned into "▁".
## $(LOAD_MOSES) and $(TOKENIZER) are defined elsewhere in this makefile.
%.nospace: %.raw
	$(LOAD_MOSES) cat $< |\
	$(TOKENIZER)/replace-unicode-punctuation.perl |\
	$(TOKENIZER)/remove-non-printing-char.perl |\
	$(TOKENIZER)/deescape-special-chars.perl |\
	sed 's/  */ /g;s/^ *//g;s/ *$$//g' |\
	sed 's/ /▁/g' > $@
## Gzipped variant of the no-space conversion: builds %.nospace.gz from %.gz.
## The input is decompressed with zcat, cleaned by the tokenizer scripts,
## whitespace-normalised, every remaining space is replaced by "▁", and the
## result is compressed again.
##
## The first sed must use '  *' (space, space, star = one or more spaces).
## A single-space ' *' also matches the empty string and would insert a blank
## between every character before the spaces are turned into "▁".
%.nospace.gz: %.gz
	$(LOAD_MOSES) zcat $< |\
	$(TOKENIZER)/replace-unicode-punctuation.perl |\
	$(TOKENIZER)/remove-non-printing-char.perl |\
	$(TOKENIZER)/deescape-special-chars.perl |\
	sed 's/  */ /g;s/^ *//g;s/ *$$//g' |\
	sed 's/ /▁/g' |\
	gzip -c > $@
## increase max number of tokens to 250 ## increase max number of tokens to 250
@ -840,6 +858,8 @@ endif
## if a new data set is used ## if a new data set is used
SPMSRCMODEL = ${WORKDIR}/train/opus.src.spm${SRCBPESIZE:000=}k-model SPMSRCMODEL = ${WORKDIR}/train/opus.src.spm${SRCBPESIZE:000=}k-model
SPMTRGMODEL = ${WORKDIR}/train/opus.trg.spm${TRGBPESIZE:000=}k-model SPMTRGMODEL = ${WORKDIR}/train/opus.trg.spm${TRGBPESIZE:000=}k-model
## SPMEXTRA: additional command-line options inserted into the spm_train
## calls of the SentencePiece training recipes; empty by default.
## Example value (kept commented out) that switches off splitting on
## whitespace:
# SPMEXTRA = --split_by_whitespace=false
SPMEXTRA =
.PRECIOUS: ${SPMSRCMODEL} ${SPMTRGMODEL} .PRECIOUS: ${SPMSRCMODEL} ${SPMTRGMODEL}
@ -854,11 +874,11 @@ else
endif endif
${MAKE} ${LOCAL_TRAIN_SRC}.charfreq ${MAKE} ${LOCAL_TRAIN_SRC}.charfreq
if [ `cat ${LOCAL_TRAIN_SRC}.charfreq | wc -l` -gt 1000 ]; then \ if [ `cat ${LOCAL_TRAIN_SRC}.charfreq | wc -l` -gt 1000 ]; then \
${SPM_HOME}/spm_train \ ${SPM_HOME}/spm_train ${SPMEXTRA} \
--model_prefix=$@ --vocab_size=$(SRCBPESIZE) --input=$<.text \ --model_prefix=$@ --vocab_size=$(SRCBPESIZE) --input=$<.text \
--character_coverage=0.9995 --hard_vocab_limit=false; \ --character_coverage=0.9995 --hard_vocab_limit=false; \
else \ else \
${SPM_HOME}/spm_train \ ${SPM_HOME}/spm_train ${SPMEXTRA} \
--model_prefix=$@ --vocab_size=$(SRCBPESIZE) --input=$<.text \ --model_prefix=$@ --vocab_size=$(SRCBPESIZE) --input=$<.text \
--character_coverage=1.0 --hard_vocab_limit=false; \ --character_coverage=1.0 --hard_vocab_limit=false; \
fi fi
@ -880,11 +900,11 @@ ifeq ($(wildcard ${SPMTRGMODEL}),)
grep . $< | shuf > $<.text grep . $< | shuf > $<.text
${MAKE} ${LOCAL_TRAIN_TRG}.charfreq ${MAKE} ${LOCAL_TRAIN_TRG}.charfreq
if [ `cat ${LOCAL_TRAIN_TRG}.charfreq | wc -l` -gt 1000 ]; then \ if [ `cat ${LOCAL_TRAIN_TRG}.charfreq | wc -l` -gt 1000 ]; then \
${SPM_HOME}/spm_train \ ${SPM_HOME}/spm_train ${SPMEXTRA} \
--model_prefix=$@ --vocab_size=$(TRGBPESIZE) --input=$<.text \ --model_prefix=$@ --vocab_size=$(TRGBPESIZE) --input=$<.text \
--character_coverage=0.9995 --hard_vocab_limit=false; \ --character_coverage=0.9995 --hard_vocab_limit=false; \
else \ else \
${SPM_HOME}/spm_train \ ${SPM_HOME}/spm_train ${SPMEXTRA} \
--model_prefix=$@ --vocab_size=$(TRGBPESIZE) --input=$<.text \ --model_prefix=$@ --vocab_size=$(TRGBPESIZE) --input=$<.text \
--character_coverage=1.0 --hard_vocab_limit=false; \ --character_coverage=1.0 --hard_vocab_limit=false; \
fi fi
@ -962,11 +982,11 @@ ifeq ($(wildcard ${SPMMODEL}),)
grep . $< | shuf > $<.text grep . $< | shuf > $<.text
${MAKE} ${LOCAL_MONO_DATA}.${PRE}.charfreq ${MAKE} ${LOCAL_MONO_DATA}.${PRE}.charfreq
if [ `cat ${LOCAL_MONO_DATA}.${PRE}.charfreq | wc -l` -gt 1000 ]; then \ if [ `cat ${LOCAL_MONO_DATA}.${PRE}.charfreq | wc -l` -gt 1000 ]; then \
${SPM_HOME}/spm_train \ ${SPM_HOME}/spm_train ${SPMEXTRA} \
--model_prefix=$@ --vocab_size=$(TRGBPESIZE) --input=$<.text \ --model_prefix=$@ --vocab_size=$(TRGBPESIZE) --input=$<.text \
--character_coverage=0.9995 --hard_vocab_limit=false; \ --character_coverage=0.9995 --hard_vocab_limit=false; \
else \ else \
${SPM_HOME}/spm_train \ ${SPM_HOME}/spm_train ${SPMEXTRA} \
--model_prefix=$@ --vocab_size=$(TRGBPESIZE) --input=$<.text \ --model_prefix=$@ --vocab_size=$(TRGBPESIZE) --input=$<.text \
--character_coverage=1.0 --hard_vocab_limit=false; \ --character_coverage=1.0 --hard_vocab_limit=false; \
fi fi

View File

@ -133,6 +133,16 @@ endif
PRE_TRG=spm${TRGBPESIZE:000=}k \ PRE_TRG=spm${TRGBPESIZE:000=}k \
${@:-spm=} ${@:-spm=}
## SentencePiece models trained with --split_by_whitespace=false.
## Any goal with the suffix "-nospace" re-invokes make for the same goal
## without that suffix, using work-nospace as the work directory, PRE=simple,
## and the spm<size>k labels for source and target pre-processing.
%-nospace:
	$(MAKE) \
		WORKHOME=$(shell realpath $(PWD)/work-nospace) \
		PRE=simple \
		PRE_SRC=spm$(SRCBPESIZE:000=)k \
		PRE_TRG=spm$(TRGBPESIZE:000=)k \
		SPMEXTRA=--split_by_whitespace=false \
		$(@:-nospace=)
## with SPM models trained on monolingual data ## with SPM models trained on monolingual data
%-monospm: ${SPMSRCMONO} ${SPMTRGMONO} %-monospm: ${SPMSRCMONO} ${SPMTRGMONO}
${MAKE} WORKHOME=${shell realpath ${PWD}/work-monospm} \ ${MAKE} WORKHOME=${shell realpath ${PWD}/work-monospm} \

View File

@ -339,14 +339,17 @@ memad:
for s in fi en sv de fr nl; do \ for s in fi en sv de fr nl; do \
for t in en fi sv de fr nl; do \ for t in en fi sv de fr nl; do \
if [ "$$s" != "$$t" ]; then \ if [ "$$s" != "$$t" ]; then \
if ! grep -q 'stalled ${MARIAN_EARLY_STOPPING} times' ${WORKHOME}/$$s-$$t/*.valid${NR.log}; then\ if ! grep -q 'stalled ${MARIAN_EARLY_STOPPING} times' ${WORKHOME}/$$s-$$t/${DATASET}.*.valid${NR.log}; then\
${MAKE} SRCLANGS=$$s TRGLANGS=$$t traindata devdata wordalign; \ ${MAKE} SRCLANGS=$$s TRGLANGS=$$t bilingual-dynamic; \
${MAKE} SRCLANGS=$$s TRGLANGS=$$t HPC_CORES=1 HPC_MEM=4g train.submit-multigpu; \
fi \ fi \
fi \ fi \
done \ done \
done done
# ${MAKE} SRCLANGS=$$s TRGLANGS=$$t data; \
# ${MAKE} SRCLANGS=$$s TRGLANGS=$$t HPC_CORES=1 HPC_MEM=4g train.submit-multigpu; \
doclevel: doclevel:
${MAKE} ost-datasets ${MAKE} ost-datasets

View File

@ -46,17 +46,23 @@ TRAIN_TRG = ${patsubst %.${SRC},%.${TRG},${TRAIN_SRC}}
DEV_TRG = ${patsubst %.${SRC},%.${TRG},${DEV_SRC}} DEV_TRG = ${patsubst %.${SRC},%.${TRG},${DEV_SRC}}
TEST_TRG = ${patsubst %.${SRC},%.${TRG},${TEST_SRC}} TEST_TRG = ${patsubst %.${SRC},%.${TRG},${TEST_SRC}}
## Remote location of the released models:
##   OBJECTSTORAGE  - base URL of the object storage service
##   MODELCONTAINER - name of the container that holds the models
##   MODELINDEX     - URL of the container's index.txt (downloaded by the
##                    model-index.txt rule)
OBJECTSTORAGE = https://object.pouta.csc.fi
MODELCONTAINER = OPUS-MT-models
MODELINDEX = ${OBJECTSTORAGE}/${MODELCONTAINER}/index.txt
BASEMODELHOME = ../models/${LANGPAIR} # BASEMODELHOME = ../models/${LANGPAIR}
BASEMODELZIP = ${lastword ${sort ${wildcard ${BASEMODELHOME}/*-20*.zip}}} # BASEMODELZIP = ${lastword ${sort ${wildcard ${BASEMODELHOME}/*-20*.zip}}}
BASEMODELNAME = ${patsubst %.zip,%,${notdir ${BASEMODELZIP}}} # BASEMODELNAME = ${patsubst %.zip,%,${notdir ${BASEMODELZIP}}}
## Base model selection from the downloaded index:
##   BASEMODELHOME - URL prefix under which the models of ${LANGPAIR} are stored
##   BASEMODELZIP  - file name (directory part stripped) of the last entry, in
##                   sort order, among the words of model-index.txt lines that
##                   match the regex '${LANGPAIR}/opus-.*\.zip'; empty when
##                   nothing matches
##                   NOTE(review): presumably this selects the newest package
##                   because the names seem to end in a date - confirm
##   BASEMODELNAME - BASEMODELZIP without the .zip suffix
## NOTE(review): BASEMODELZIP is a recursive (=) variable, so the grep is
## re-run on every expansion and needs model-index.txt to exist at that time.
BASEMODELHOME = ${OBJECTSTORAGE}/${MODELCONTAINER}/${LANGPAIR}
BASEMODELZIP = ${lastword ${sort ${notdir ${shell grep '${LANGPAIR}/opus-.*\.zip' model-index.txt}}}}
BASEMODELNAME = ${BASEMODELZIP:.zip=}
TUNED_MODEL = ${LANGPAIR}/${MODEL}/model/${BASEMODELNAME}_${MODEL}.transformer.model TUNED_MODEL = ${LANGPAIR}/${MODEL}/model/${BASEMODELNAME}_${MODEL}.transformer.model
TUNED_MODEL_VOCAB = ${LANGPAIR}/${MODEL}/model/${BASEMODELNAME}_${MODEL}.vocab.yml TUNED_MODEL_VOCAB = ${LANGPAIR}/${MODEL}/model/${BASEMODELNAME}_${MODEL}.vocab.yml
MARIAN_WORKSPACE = 5000 MARIAN_WORKSPACE = 5000
MARIAN_VALID_FREQ = 100 MARIAN_VALID_FREQ = 100
MARIAN_SAVE_FREQ = ${MARIAN_VALID_FREQ} MARIAN_SAVE_FREQ = ${MARIAN_VALID_FREQ}
MARIAN_DISP_FREQ = ${MARIAN_VALID_FREQ} MARIAN_DISP_FREQ = ${MARIAN_VALID_FREQ}
@ -64,11 +70,15 @@ MARIAN_EARLY_STOPPING = 5
.PHONY: all .PHONY: all
all: ${TEST_SRC}.${BASEMODELNAME}.${TRG}.compare ${TEST_SRC}.${BASEMODELNAME}.baseline.${TRG}.compare all: model-index.txt
${MAKE} ${TEST_SRC}.${BASEMODELNAME}.${TRG}.compare
${MAKE} ${TEST_SRC}.${BASEMODELNAME}.baseline.${TRG}.compare
## Fetch the list of available models from object storage.
##
## The index is downloaded to a temporary file and renamed only when wget
## succeeds. Writing straight to $@ with `wget -O` would leave an empty or
## partial model-index.txt behind after a failed download, and make would
## then treat that file as up to date on the next run.
model-index.txt:
	wget -nv -O $@.tmp ${MODELINDEX} || { rm -f $@.tmp; exit 1; }
	mv -f $@.tmp $@
## convert a TMX file to create dev-test-train data ## convert a TMX file to create dev-test-train data
## and start fine-tuning in the direction of sorted lang-IDs ## and start fine-tuning in the direction of sorted lang-IDs
@ -79,52 +89,54 @@ all: ${TEST_SRC}.${BASEMODELNAME}.${TRG}.compare ${TEST_SRC}.${BASEMODELNAME}.ba
## - it assumes that ${TMX} points to a valid TMX file ## - it assumes that ${TMX} points to a valid TMX file
## - it assumes that there are only 2 languages in the TMX (it will only use 2) ## - it assumes that there are only 2 languages in the TMX (it will only use 2)
## Input selection for the tmx-tune target:
##   TMXFILE - TMX file that tmx-tune reads with cat and pipes into tmx2moses
##   TMXBASE - TMXFILE without the .tmx suffix; used as the output prefix for
##             tmx2moses, as the data sub-directory name and as MODEL
## NOTE(review): TMX (a gzipped TMX file name) is only referenced by the
## old-side recipe lines visible here - confirm whether it is still needed.
TMX = vero-20200123.tmx.gz
TMXFILE = VNK-Hallituksen_vuosikertomus.tmx
TMXBASE = ${TMXFILE:.tmx=}
REVERSE = 0 REVERSE = 0
tmx-tune: tmx-tune:
zcat ${TMX} |\ cat ${TMXFILE} |\
tmx2moses -r -o ${TMX:.tmx.gz=} tmx2moses -r -o ${TMXBASE}
if [ ${REVERSE} -gt 0 ]; then \ if [ ${REVERSE} -gt 0 ]; then \
t=`ls ${TMX:.tmx.gz=}.*-* | sort | sed 's/^.*\.\([a-z]*\)$$/\1/' | head -1`; \ t=`ls ${TMXBASE}.*-* | sort | sed 's/^.*\.\([a-z]*\)$$/\1/' | head -1`; \
s=`ls ${TMX:.tmx.gz=}.*-* | sort | sed 's/^.*\.\([a-z]*\)$$/\1/' | tail -1`; \ s=`ls ${TMXBASE}.*-* | sort | sed 's/^.*\.\([a-z]*\)$$/\1/' | tail -1`; \
else \ else \
s=`ls ${TMX:.tmx.gz=}.*-* | sort | sed 's/^.*\.\([a-z]*\)$$/\1/' | head -1`; \ s=`ls ${TMXBASE}.*-* | sort | sed 's/^.*\.\([a-z]*\)$$/\1/' | head -1`; \
t=`ls ${TMX:.tmx.gz=}.*-* | sort | sed 's/^.*\.\([a-z]*\)$$/\1/' | tail -1`; \ t=`ls ${TMXBASE}.*-* | sort | sed 's/^.*\.\([a-z]*\)$$/\1/' | tail -1`; \
fi; \ fi; \
echo $$s; echo $$t; \ echo $$s; echo $$t; \
mkdir -p $$s-$$t; \ mkdir -p $$s-$$t; \
paste ${TMX:.tmx.gz=}.*-*.$$s ${TMX:.tmx.gz=}.*-*.$$t | \ paste ${TMXBASE}.*-*.$$s ${TMXBASE}.*-*.$$t | \
sort | uniq | \ sort | uniq | \
python3 ../bitext-match-lang.py -s $$s -t $$t | \ python3 ../bitext-match-lang.py -s $$s -t $$t | \
grep -v '[<>{}]' |\ grep -v '[<>{}]' |\
$(TOKENIZER)/replace-unicode-punctuation.perl |\ $(TOKENIZER)/replace-unicode-punctuation.perl |\
perl -CS -pe 'tr[\x{9}\x{A}\x{D}\x{20}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}][]cd;' |\ perl -CS -pe 'tr[\x{9}\x{A}\x{D}\x{20}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}][]cd;' |\
sed 's/ */ /g;s/^ *//g;s/ *$$//g' |\ sed 's/ */ /g;s/^ *//g;s/ *$$//g' |\
shuf > ${TMX:.tmx.gz=}.$$s-$$t.shuffled; \ shuf > ${TMXBASE}.$$s-$$t.shuffled; \
mkdir -p $$s-$$t/${TMX:.tmx.gz=}/dev; \ mkdir -p $$s-$$t/${TMXBASE}/dev; \
mkdir -p $$s-$$t/${TMX:.tmx.gz=}/test; \ mkdir -p $$s-$$t/${TMXBASE}/test; \
mkdir -p $$s-$$t/${TMX:.tmx.gz=}/train; \ mkdir -p $$s-$$t/${TMXBASE}/train; \
head -1000 ${TMX:.tmx.gz=}.$$s-$$t.shuffled | cut -f1 | gzip -c \ head -1000 ${TMXBASE}.$$s-$$t.shuffled | cut -f1 | gzip -c \
> $$s-$$t/${TMX:.tmx.gz=}/test/${TMX:.tmx.gz=}.$$s.gz; \ > $$s-$$t/${TMXBASE}/test/${TMXBASE}.$$s.gz; \
head -1000 ${TMX:.tmx.gz=}.$$s-$$t.shuffled | cut -f2 | gzip -c \ head -1000 ${TMXBASE}.$$s-$$t.shuffled | cut -f2 | gzip -c \
> $$s-$$t/${TMX:.tmx.gz=}/test/${TMX:.tmx.gz=}.$$t.gz; \ > $$s-$$t/${TMXBASE}/test/${TMXBASE}.$$t.gz; \
head -2001 ${TMX:.tmx.gz=}.$$s-$$t.shuffled | tail -1000 | cut -f1 | gzip -c \ head -2001 ${TMXBASE}.$$s-$$t.shuffled | tail -1000 | cut -f1 | gzip -c \
> $$s-$$t/${TMX:.tmx.gz=}/dev/${TMX:.tmx.gz=}.$$s.gz; \ > $$s-$$t/${TMXBASE}/dev/${TMXBASE}.$$s.gz; \
head -2001 ${TMX:.tmx.gz=}.$$s-$$t.shuffled | tail -1000 | cut -f2 | gzip -c \ head -2001 ${TMXBASE}.$$s-$$t.shuffled | tail -1000 | cut -f2 | gzip -c \
> $$s-$$t/${TMX:.tmx.gz=}/dev/${TMX:.tmx.gz=}.$$t.gz; \ > $$s-$$t/${TMXBASE}/dev/${TMXBASE}.$$t.gz; \
tail -n +2002 ${TMX:.tmx.gz=}.$$s-$$t.shuffled | cut -f1 | gzip -c \ tail -n +2002 ${TMXBASE}.$$s-$$t.shuffled | cut -f1 | gzip -c \
> $$s-$$t/${TMX:.tmx.gz=}/train/${TMX:.tmx.gz=}.$$s.gz; \ > $$s-$$t/${TMXBASE}/train/${TMXBASE}.$$s.gz; \
tail -n +2002 ${TMX:.tmx.gz=}.$$s-$$t.shuffled | cut -f2 | gzip -c \ tail -n +2002 ${TMXBASE}.$$s-$$t.shuffled | cut -f2 | gzip -c \
> $$s-$$t/${TMX:.tmx.gz=}/train/${TMX:.tmx.gz=}.$$t.gz; \ > $$s-$$t/${TMXBASE}/train/${TMXBASE}.$$t.gz; \
mv ${TMX:.tmx.gz=}.*-* $$s-$$t/; \ mv ${TMXBASE}.*-* $$s-$$t/; \
${MAKE} SRC=$$s TRG=$$t MODEL=${TMX:.tmx.gz=} \ ${MAKE} SRC=$$s TRG=$$t MODEL=${TMXBASE} \
TRAIN_SRC=$$s-$$t/${TMX:.tmx.gz=}/train/${TMX:.tmx.gz=}.$$s \ TRAIN_SRC=$$s-$$t/${TMXBASE}/train/${TMXBASE}.$$s \
TRAIN_TRG=$$s-$$t/${TMX:.tmx.gz=}/train/${TMX:.tmx.gz=}.$$t \ TRAIN_TRG=$$s-$$t/${TMXBASE}/train/${TMXBASE}.$$t \
DEV_SRC=$$s-$$t/${TMX:.tmx.gz=}/dev/${TMX:.tmx.gz=}.$$s \ DEV_SRC=$$s-$$t/${TMXBASE}/dev/${TMXBASE}.$$s \
DEV_TRG=$$s-$$t/${TMX:.tmx.gz=}/dev/${TMX:.tmx.gz=}.$$t \ DEV_TRG=$$s-$$t/${TMXBASE}/dev/${TMXBASE}.$$t \
TEST_SRC=$$s-$$t/${TMX:.tmx.gz=}/test/${TMX:.tmx.gz=}.$$s \ TEST_SRC=$$s-$$t/${TMXBASE}/test/${TMXBASE}.$$s \
TEST_TRG=$$s-$$t/${TMX:.tmx.gz=}/test/${TMX:.tmx.gz=}.$$t \ TEST_TRG=$$s-$$t/${TMXBASE}/test/${TMXBASE}.$$t \
all all
@ -244,11 +256,15 @@ endif
.PHONY: data .PHONY: data
data: ${TRAIN_SRC}.pre.gz ${TRAIN_TRG}.pre.gz ${DEV_SRC}.pre.gz ${DEV_TRG}.pre.gz data: ${TRAIN_SRC}.pre.gz ${TRAIN_TRG}.pre.gz ${DEV_SRC}.pre.gz ${DEV_TRG}.pre.gz
.INTERMEDIATE: ${LANGPAIR}/${BASEMODELNAME}/decoder.yml
## Download and unpack the base model for ${LANGPAIR}.
##
## If a model package was found in the index (BASEMODELZIP is non-empty), it
## is fetched from ${BASEMODELHOME} into the target directory and unzipped
## there. Otherwise only a message is printed and the target is not created.
## The message names ${LANGPAIR}; the former ${LANGAIR} was a typo that
## expanded to an empty string.
${LANGPAIR}/${BASEMODELNAME}/decoder.yml:
	mkdir -p ${dir $@}
ifneq (${BASEMODELZIP},)
	wget -nv -O ${dir $@}/${BASEMODELZIP} ${BASEMODELHOME}/${BASEMODELZIP}
	cd ${dir $@} && unzip -u ${BASEMODELZIP}
else
	@echo "no model found for ${LANGPAIR}!"
endif
.INTERMEDIATE: ${TRAIN_SRC}.pre.gz ${DEV_SRC}.pre.gz ${TEST_SRC}.pre.gz .INTERMEDIATE: ${TRAIN_SRC}.pre.gz ${DEV_SRC}.pre.gz ${TEST_SRC}.pre.gz
${TRAIN_SRC}.pre.gz ${DEV_SRC}.pre.gz ${TEST_SRC}.pre.gz: %.pre.gz: %.gz ${LANGPAIR}/${BASEMODELNAME}/decoder.yml ${TRAIN_SRC}.pre.gz ${DEV_SRC}.pre.gz ${TEST_SRC}.pre.gz: %.pre.gz: %.gz ${LANGPAIR}/${BASEMODELNAME}/decoder.yml

File diff suppressed because it is too large Load Diff