mirror of
https://github.com/Helsinki-NLP/OPUS-MT-train.git
synced 2024-11-30 22:14:14 +03:00
finetune branch downloads models from object storage
This commit is contained in:
parent
325b4c1903
commit
1a7cbbb13e
@ -697,6 +697,24 @@ add-to-local-mono-data:
|
||||
$(TOKENIZER)/deescape-special-chars.perl |\
|
||||
sed 's/  */ /g;s/^ *//g;s/ *$$//g' | gzip -c > $@
|
||||
|
||||
## replace all spaces with '▁' (treat each line as one long string)
|
||||
%.nospace: %.raw
|
||||
$(LOAD_MOSES) cat $< |\
|
||||
$(TOKENIZER)/replace-unicode-punctuation.perl |\
|
||||
$(TOKENIZER)/remove-non-printing-char.perl |\
|
||||
$(TOKENIZER)/deescape-special-chars.perl |\
|
||||
sed 's/  */ /g;s/^ *//g;s/ *$$//g' |\
|
||||
sed 's/ /▁/g' > $@
|
||||
|
||||
%.nospace.gz: %.gz
|
||||
$(LOAD_MOSES) zcat $< |\
|
||||
$(TOKENIZER)/replace-unicode-punctuation.perl |\
|
||||
$(TOKENIZER)/remove-non-printing-char.perl |\
|
||||
$(TOKENIZER)/deescape-special-chars.perl |\
|
||||
sed 's/  */ /g;s/^ *//g;s/ *$$//g' |\
|
||||
sed 's/ /▁/g' |\
|
||||
gzip -c > $@
|
||||
|
||||
|
||||
|
||||
## increase max number of tokens to 250
|
||||
@ -840,6 +858,8 @@ endif
|
||||
## if a new data set is used
|
||||
SPMSRCMODEL = ${WORKDIR}/train/opus.src.spm${SRCBPESIZE:000=}k-model
|
||||
SPMTRGMODEL = ${WORKDIR}/train/opus.trg.spm${TRGBPESIZE:000=}k-model
|
||||
# SPMEXTRA = --split_by_whitespace=false
|
||||
SPMEXTRA =
|
||||
|
||||
.PRECIOUS: ${SPMSRCMODEL} ${SPMTRGMODEL}
|
||||
|
||||
@ -854,11 +874,11 @@ else
|
||||
endif
|
||||
${MAKE} ${LOCAL_TRAIN_SRC}.charfreq
|
||||
if [ `cat ${LOCAL_TRAIN_SRC}.charfreq | wc -l` -gt 1000 ]; then \
|
||||
${SPM_HOME}/spm_train \
|
||||
${SPM_HOME}/spm_train ${SPMEXTRA} \
|
||||
--model_prefix=$@ --vocab_size=$(SRCBPESIZE) --input=$<.text \
|
||||
--character_coverage=0.9995 --hard_vocab_limit=false; \
|
||||
else \
|
||||
${SPM_HOME}/spm_train \
|
||||
${SPM_HOME}/spm_train ${SPMEXTRA} \
|
||||
--model_prefix=$@ --vocab_size=$(SRCBPESIZE) --input=$<.text \
|
||||
--character_coverage=1.0 --hard_vocab_limit=false; \
|
||||
fi
|
||||
@ -880,11 +900,11 @@ ifeq ($(wildcard ${SPMTRGMODEL}),)
|
||||
grep . $< | shuf > $<.text
|
||||
${MAKE} ${LOCAL_TRAIN_TRG}.charfreq
|
||||
if [ `cat ${LOCAL_TRAIN_TRG}.charfreq | wc -l` -gt 1000 ]; then \
|
||||
${SPM_HOME}/spm_train \
|
||||
${SPM_HOME}/spm_train ${SPMEXTRA} \
|
||||
--model_prefix=$@ --vocab_size=$(TRGBPESIZE) --input=$<.text \
|
||||
--character_coverage=0.9995 --hard_vocab_limit=false; \
|
||||
else \
|
||||
${SPM_HOME}/spm_train \
|
||||
${SPM_HOME}/spm_train ${SPMEXTRA} \
|
||||
--model_prefix=$@ --vocab_size=$(TRGBPESIZE) --input=$<.text \
|
||||
--character_coverage=1.0 --hard_vocab_limit=false; \
|
||||
fi
|
||||
@ -962,11 +982,11 @@ ifeq ($(wildcard ${SPMMODEL}),)
|
||||
grep . $< | shuf > $<.text
|
||||
${MAKE} ${LOCAL_MONO_DATA}.${PRE}.charfreq
|
||||
if [ `cat ${LOCAL_MONO_DATA}.${PRE}.charfreq | wc -l` -gt 1000 ]; then \
|
||||
${SPM_HOME}/spm_train \
|
||||
${SPM_HOME}/spm_train ${SPMEXTRA} \
|
||||
--model_prefix=$@ --vocab_size=$(TRGBPESIZE) --input=$<.text \
|
||||
--character_coverage=0.9995 --hard_vocab_limit=false; \
|
||||
else \
|
||||
${SPM_HOME}/spm_train \
|
||||
${SPM_HOME}/spm_train ${SPMEXTRA} \
|
||||
--model_prefix=$@ --vocab_size=$(TRGBPESIZE) --input=$<.text \
|
||||
--character_coverage=1.0 --hard_vocab_limit=false; \
|
||||
fi
|
||||
|
@ -133,6 +133,16 @@ endif
|
||||
PRE_TRG=spm${TRGBPESIZE:000=}k \
|
||||
${@:-spm=}
|
||||
|
||||
## sentence-piece models that do not split on whitespace (pieces may span space-separated strings)
|
||||
%-nospace:
|
||||
${MAKE} WORKHOME=${shell realpath ${PWD}/work-nospace} \
|
||||
PRE=simple \
|
||||
SPMEXTRA=--split_by_whitespace=false \
|
||||
PRE_SRC=spm${SRCBPESIZE:000=}k \
|
||||
PRE_TRG=spm${TRGBPESIZE:000=}k \
|
||||
${@:-nospace=}
|
||||
|
||||
|
||||
## with SPM models trained on monolingual data
|
||||
%-monospm: ${SPMSRCMONO} ${SPMTRGMONO}
|
||||
${MAKE} WORKHOME=${shell realpath ${PWD}/work-monospm} \
|
||||
|
@ -339,14 +339,17 @@ memad:
|
||||
for s in fi en sv de fr nl; do \
|
||||
for t in en fi sv de fr nl; do \
|
||||
if [ "$$s" != "$$t" ]; then \
|
||||
if ! grep -q 'stalled ${MARIAN_EARLY_STOPPING} times' ${WORKHOME}/$$s-$$t/*.valid${NR}.log; then\
|
||||
${MAKE} SRCLANGS=$$s TRGLANGS=$$t traindata devdata wordalign; \
|
||||
${MAKE} SRCLANGS=$$s TRGLANGS=$$t HPC_CORES=1 HPC_MEM=4g train.submit-multigpu; \
|
||||
if ! grep -q 'stalled ${MARIAN_EARLY_STOPPING} times' ${WORKHOME}/$$s-$$t/${DATASET}.*.valid${NR}.log; then\
|
||||
${MAKE} SRCLANGS=$$s TRGLANGS=$$t bilingual-dynamic; \
|
||||
fi \
|
||||
fi \
|
||||
done \
|
||||
done
|
||||
|
||||
# ${MAKE} SRCLANGS=$$s TRGLANGS=$$t data; \
|
||||
# ${MAKE} SRCLANGS=$$s TRGLANGS=$$t HPC_CORES=1 HPC_MEM=4g train.submit-multigpu; \
|
||||
|
||||
|
||||
|
||||
doclevel:
|
||||
${MAKE} ost-datasets
|
||||
|
@ -46,17 +46,23 @@ TRAIN_TRG = ${patsubst %.${SRC},%.${TRG},${TRAIN_SRC}}
|
||||
DEV_TRG = ${patsubst %.${SRC},%.${TRG},${DEV_SRC}}
|
||||
TEST_TRG = ${patsubst %.${SRC},%.${TRG},${TEST_SRC}}
|
||||
|
||||
OBJECTSTORAGE = https://object.pouta.csc.fi
|
||||
MODELCONTAINER = OPUS-MT-models
|
||||
MODELINDEX = ${OBJECTSTORAGE}/${MODELCONTAINER}/index.txt
|
||||
|
||||
BASEMODELHOME = ../models/${LANGPAIR}
|
||||
BASEMODELZIP = ${lastword ${sort ${wildcard ${BASEMODELHOME}/*-20*.zip}}}
|
||||
BASEMODELNAME = ${patsubst %.zip,%,${notdir ${BASEMODELZIP}}}
|
||||
# BASEMODELHOME = ../models/${LANGPAIR}
|
||||
# BASEMODELZIP = ${lastword ${sort ${wildcard ${BASEMODELHOME}/*-20*.zip}}}
|
||||
# BASEMODELNAME = ${patsubst %.zip,%,${notdir ${BASEMODELZIP}}}
|
||||
|
||||
BASEMODELHOME = ${OBJECTSTORAGE}/${MODELCONTAINER}/${LANGPAIR}
|
||||
BASEMODELZIP = ${lastword ${sort ${notdir ${shell grep '${LANGPAIR}/opus-.*\.zip' model-index.txt}}}}
|
||||
BASEMODELNAME = ${BASEMODELZIP:.zip=}
|
||||
|
||||
TUNED_MODEL = ${LANGPAIR}/${MODEL}/model/${BASEMODELNAME}_${MODEL}.transformer.model
|
||||
TUNED_MODEL_VOCAB = ${LANGPAIR}/${MODEL}/model/${BASEMODELNAME}_${MODEL}.vocab.yml
|
||||
|
||||
|
||||
MARIAN_WORKSPACE = 5000
|
||||
MARIAN_WORKSPACE = 5000
|
||||
MARIAN_VALID_FREQ = 100
|
||||
MARIAN_SAVE_FREQ = ${MARIAN_VALID_FREQ}
|
||||
MARIAN_DISP_FREQ = ${MARIAN_VALID_FREQ}
|
||||
@ -64,11 +70,15 @@ MARIAN_EARLY_STOPPING = 5
|
||||
|
||||
|
||||
|
||||
|
||||
.PHONY: all
|
||||
all: ${TEST_SRC}.${BASEMODELNAME}.${TRG}.compare ${TEST_SRC}.${BASEMODELNAME}.baseline.${TRG}.compare
|
||||
all: model-index.txt
|
||||
${MAKE} ${TEST_SRC}.${BASEMODELNAME}.${TRG}.compare
|
||||
${MAKE} ${TEST_SRC}.${BASEMODELNAME}.baseline.${TRG}.compare
|
||||
|
||||
|
||||
model-index.txt:
|
||||
wget -nv -O $@ ${MODELINDEX}
|
||||
|
||||
|
||||
## convert a TMX file to create dev-test-train data
|
||||
## and start fine-tuning in the direction of sorted lang-IDs
|
||||
@ -79,52 +89,54 @@ all: ${TEST_SRC}.${BASEMODELNAME}.${TRG}.compare ${TEST_SRC}.${BASEMODELNAME}.ba
|
||||
## - it assumes that ${TMX} points to a valid TMX file
|
||||
## - it assumes that there are only 2 languages in the TMX (it will only use 2)
|
||||
|
||||
TMX = vero-20200123.tmx.gz
|
||||
|
||||
TMXFILE = VNK-Hallituksen_vuosikertomus.tmx
|
||||
TMXBASE = ${TMXFILE:.tmx=}
|
||||
REVERSE = 0
|
||||
|
||||
tmx-tune:
|
||||
zcat ${TMX} |\
|
||||
tmx2moses -r -o ${TMX:.tmx.gz=}
|
||||
cat ${TMXFILE} |\
|
||||
tmx2moses -r -o ${TMXBASE}
|
||||
if [ ${REVERSE} -gt 0 ]; then \
|
||||
t=`ls ${TMX:.tmx.gz=}.*-* | sort | sed 's/^.*\.\([a-z]*\)$$/\1/' | head -1`; \
|
||||
s=`ls ${TMX:.tmx.gz=}.*-* | sort | sed 's/^.*\.\([a-z]*\)$$/\1/' | tail -1`; \
|
||||
t=`ls ${TMXBASE}.*-* | sort | sed 's/^.*\.\([a-z]*\)$$/\1/' | head -1`; \
|
||||
s=`ls ${TMXBASE}.*-* | sort | sed 's/^.*\.\([a-z]*\)$$/\1/' | tail -1`; \
|
||||
else \
|
||||
s=`ls ${TMX:.tmx.gz=}.*-* | sort | sed 's/^.*\.\([a-z]*\)$$/\1/' | head -1`; \
|
||||
t=`ls ${TMX:.tmx.gz=}.*-* | sort | sed 's/^.*\.\([a-z]*\)$$/\1/' | tail -1`; \
|
||||
s=`ls ${TMXBASE}.*-* | sort | sed 's/^.*\.\([a-z]*\)$$/\1/' | head -1`; \
|
||||
t=`ls ${TMXBASE}.*-* | sort | sed 's/^.*\.\([a-z]*\)$$/\1/' | tail -1`; \
|
||||
fi; \
|
||||
echo $$s; echo $$t; \
|
||||
mkdir -p $$s-$$t; \
|
||||
paste ${TMX:.tmx.gz=}.*-*.$$s ${TMX:.tmx.gz=}.*-*.$$t | \
|
||||
paste ${TMXBASE}.*-*.$$s ${TMXBASE}.*-*.$$t | \
|
||||
sort | uniq | \
|
||||
python3 ../bitext-match-lang.py -s $$s -t $$t | \
|
||||
grep -v '[<>{}]' |\
|
||||
$(TOKENIZER)/replace-unicode-punctuation.perl |\
|
||||
perl -CS -pe 'tr[\x{9}\x{A}\x{D}\x{20}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}][]cd;' |\
|
||||
sed 's/  */ /g;s/^ *//g;s/ *$$//g' |\
|
||||
shuf > ${TMX:.tmx.gz=}.$$s-$$t.shuffled; \
|
||||
mkdir -p $$s-$$t/${TMX:.tmx.gz=}/dev; \
|
||||
mkdir -p $$s-$$t/${TMX:.tmx.gz=}/test; \
|
||||
mkdir -p $$s-$$t/${TMX:.tmx.gz=}/train; \
|
||||
head -1000 ${TMX:.tmx.gz=}.$$s-$$t.shuffled | cut -f1 | gzip -c \
|
||||
> $$s-$$t/${TMX:.tmx.gz=}/test/${TMX:.tmx.gz=}.$$s.gz; \
|
||||
head -1000 ${TMX:.tmx.gz=}.$$s-$$t.shuffled | cut -f2 | gzip -c \
|
||||
> $$s-$$t/${TMX:.tmx.gz=}/test/${TMX:.tmx.gz=}.$$t.gz; \
|
||||
head -2001 ${TMX:.tmx.gz=}.$$s-$$t.shuffled | tail -1000 | cut -f1 | gzip -c \
|
||||
> $$s-$$t/${TMX:.tmx.gz=}/dev/${TMX:.tmx.gz=}.$$s.gz; \
|
||||
head -2001 ${TMX:.tmx.gz=}.$$s-$$t.shuffled | tail -1000 | cut -f2 | gzip -c \
|
||||
> $$s-$$t/${TMX:.tmx.gz=}/dev/${TMX:.tmx.gz=}.$$t.gz; \
|
||||
tail -n +2002 ${TMX:.tmx.gz=}.$$s-$$t.shuffled | cut -f1 | gzip -c \
|
||||
> $$s-$$t/${TMX:.tmx.gz=}/train/${TMX:.tmx.gz=}.$$s.gz; \
|
||||
tail -n +2002 ${TMX:.tmx.gz=}.$$s-$$t.shuffled | cut -f2 | gzip -c \
|
||||
> $$s-$$t/${TMX:.tmx.gz=}/train/${TMX:.tmx.gz=}.$$t.gz; \
|
||||
mv ${TMX:.tmx.gz=}.*-* $$s-$$t/; \
|
||||
${MAKE} SRC=$$s TRG=$$t MODEL=${TMX:.tmx.gz=} \
|
||||
TRAIN_SRC=$$s-$$t/${TMX:.tmx.gz=}/train/${TMX:.tmx.gz=}.$$s \
|
||||
TRAIN_TRG=$$s-$$t/${TMX:.tmx.gz=}/train/${TMX:.tmx.gz=}.$$t \
|
||||
DEV_SRC=$$s-$$t/${TMX:.tmx.gz=}/dev/${TMX:.tmx.gz=}.$$s \
|
||||
DEV_TRG=$$s-$$t/${TMX:.tmx.gz=}/dev/${TMX:.tmx.gz=}.$$t \
|
||||
TEST_SRC=$$s-$$t/${TMX:.tmx.gz=}/test/${TMX:.tmx.gz=}.$$s \
|
||||
TEST_TRG=$$s-$$t/${TMX:.tmx.gz=}/test/${TMX:.tmx.gz=}.$$t \
|
||||
shuf > ${TMXBASE}.$$s-$$t.shuffled; \
|
||||
mkdir -p $$s-$$t/${TMXBASE}/dev; \
|
||||
mkdir -p $$s-$$t/${TMXBASE}/test; \
|
||||
mkdir -p $$s-$$t/${TMXBASE}/train; \
|
||||
head -1000 ${TMXBASE}.$$s-$$t.shuffled | cut -f1 | gzip -c \
|
||||
> $$s-$$t/${TMXBASE}/test/${TMXBASE}.$$s.gz; \
|
||||
head -1000 ${TMXBASE}.$$s-$$t.shuffled | cut -f2 | gzip -c \
|
||||
> $$s-$$t/${TMXBASE}/test/${TMXBASE}.$$t.gz; \
|
||||
head -2001 ${TMXBASE}.$$s-$$t.shuffled | tail -1000 | cut -f1 | gzip -c \
|
||||
> $$s-$$t/${TMXBASE}/dev/${TMXBASE}.$$s.gz; \
|
||||
head -2001 ${TMXBASE}.$$s-$$t.shuffled | tail -1000 | cut -f2 | gzip -c \
|
||||
> $$s-$$t/${TMXBASE}/dev/${TMXBASE}.$$t.gz; \
|
||||
tail -n +2002 ${TMXBASE}.$$s-$$t.shuffled | cut -f1 | gzip -c \
|
||||
> $$s-$$t/${TMXBASE}/train/${TMXBASE}.$$s.gz; \
|
||||
tail -n +2002 ${TMXBASE}.$$s-$$t.shuffled | cut -f2 | gzip -c \
|
||||
> $$s-$$t/${TMXBASE}/train/${TMXBASE}.$$t.gz; \
|
||||
mv ${TMXBASE}.*-* $$s-$$t/; \
|
||||
${MAKE} SRC=$$s TRG=$$t MODEL=${TMXBASE} \
|
||||
TRAIN_SRC=$$s-$$t/${TMXBASE}/train/${TMXBASE}.$$s \
|
||||
TRAIN_TRG=$$s-$$t/${TMXBASE}/train/${TMXBASE}.$$t \
|
||||
DEV_SRC=$$s-$$t/${TMXBASE}/dev/${TMXBASE}.$$s \
|
||||
DEV_TRG=$$s-$$t/${TMXBASE}/dev/${TMXBASE}.$$t \
|
||||
TEST_SRC=$$s-$$t/${TMXBASE}/test/${TMXBASE}.$$s \
|
||||
TEST_TRG=$$s-$$t/${TMXBASE}/test/${TMXBASE}.$$t \
|
||||
all
|
||||
|
||||
|
||||
@ -244,11 +256,15 @@ endif
|
||||
.PHONY: data
|
||||
data: ${TRAIN_SRC}.pre.gz ${TRAIN_TRG}.pre.gz ${DEV_SRC}.pre.gz ${DEV_TRG}.pre.gz
|
||||
|
||||
.INTERMEDIATE: ${LANGPAIR}/${BASEMODELNAME}/decoder.yml
|
||||
${LANGPAIR}/${BASEMODELNAME}/decoder.yml:
|
||||
mkdir -p ${dir $@}
|
||||
cp ${BASEMODELZIP} ${dir $@}
|
||||
cd ${dir $@} && unzip -u *.zip
|
||||
ifneq (${BASEMODELZIP},)
|
||||
wget -nv -O ${dir $@}/${BASEMODELZIP} ${BASEMODELHOME}/${BASEMODELZIP}
|
||||
cd ${dir $@} && unzip -u ${BASEMODELZIP}
|
||||
else
|
||||
@echo "no model found for ${LANGPAIR}!"
|
||||
endif
|
||||
|
||||
|
||||
.INTERMEDIATE: ${TRAIN_SRC}.pre.gz ${DEV_SRC}.pre.gz ${TEST_SRC}.pre.gz
|
||||
${TRAIN_SRC}.pre.gz ${DEV_SRC}.pre.gz ${TEST_SRC}.pre.gz: %.pre.gz: %.gz ${LANGPAIR}/${BASEMODELNAME}/decoder.yml
|
||||
|
230058
finetune/VNK-Hallituksen_vuosikertomus.tmx
Normal file
230058
finetune/VNK-Hallituksen_vuosikertomus.tmx
Normal file
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user