mirror of
https://github.com/Helsinki-NLP/OPUS-MT-train.git
synced 2024-09-11 20:27:19 +03:00
finetune branch downloads models from object storage
This commit is contained in:
parent
325b4c1903
commit
1a7cbbb13e
@ -697,6 +697,24 @@ add-to-local-mono-data:
|
|||||||
$(TOKENIZER)/deescape-special-chars.perl |\
|
$(TOKENIZER)/deescape-special-chars.perl |\
|
||||||
sed 's/ */ /g;s/^ *//g;s/ *$$//g' | gzip -c > $@
|
sed 's/ */ /g;s/^ *//g;s/ *$$//g' | gzip -c > $@
|
||||||
|
|
||||||
|
## replace all spaces with '▁' (treat everything as one long string)
|
||||||
|
%.nospace: %.raw
|
||||||
|
$(LOAD_MOSES) cat $< |\
|
||||||
|
$(TOKENIZER)/replace-unicode-punctuation.perl |\
|
||||||
|
$(TOKENIZER)/remove-non-printing-char.perl |\
|
||||||
|
$(TOKENIZER)/deescape-special-chars.perl |\
|
||||||
|
sed 's/ */ /g;s/^ *//g;s/ *$$//g' |\
|
||||||
|
sed 's/ /▁/g' > $@
|
||||||
|
|
||||||
|
%.nospace.gz: %.gz
|
||||||
|
$(LOAD_MOSES) zcat $< |\
|
||||||
|
$(TOKENIZER)/replace-unicode-punctuation.perl |\
|
||||||
|
$(TOKENIZER)/remove-non-printing-char.perl |\
|
||||||
|
$(TOKENIZER)/deescape-special-chars.perl |\
|
||||||
|
sed 's/ */ /g;s/^ *//g;s/ *$$//g' |\
|
||||||
|
sed 's/ /▁/g' |\
|
||||||
|
gzip -c > $@
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
## increase max number of tokens to 250
|
## increase max number of tokens to 250
|
||||||
@ -840,6 +858,8 @@ endif
|
|||||||
## if a new data set is used
|
## if a new data set is used
|
||||||
SPMSRCMODEL = ${WORKDIR}/train/opus.src.spm${SRCBPESIZE:000=}k-model
|
SPMSRCMODEL = ${WORKDIR}/train/opus.src.spm${SRCBPESIZE:000=}k-model
|
||||||
SPMTRGMODEL = ${WORKDIR}/train/opus.trg.spm${TRGBPESIZE:000=}k-model
|
SPMTRGMODEL = ${WORKDIR}/train/opus.trg.spm${TRGBPESIZE:000=}k-model
|
||||||
|
# SPMEXTRA = --split_by_whitespace=false
|
||||||
|
SPMEXTRA =
|
||||||
|
|
||||||
.PRECIOUS: ${SPMSRCMODEL} ${SPMTRGMODEL}
|
.PRECIOUS: ${SPMSRCMODEL} ${SPMTRGMODEL}
|
||||||
|
|
||||||
@ -854,11 +874,11 @@ else
|
|||||||
endif
|
endif
|
||||||
${MAKE} ${LOCAL_TRAIN_SRC}.charfreq
|
${MAKE} ${LOCAL_TRAIN_SRC}.charfreq
|
||||||
if [ `cat ${LOCAL_TRAIN_SRC}.charfreq | wc -l` -gt 1000 ]; then \
|
if [ `cat ${LOCAL_TRAIN_SRC}.charfreq | wc -l` -gt 1000 ]; then \
|
||||||
${SPM_HOME}/spm_train \
|
${SPM_HOME}/spm_train ${SPMEXTRA} \
|
||||||
--model_prefix=$@ --vocab_size=$(SRCBPESIZE) --input=$<.text \
|
--model_prefix=$@ --vocab_size=$(SRCBPESIZE) --input=$<.text \
|
||||||
--character_coverage=0.9995 --hard_vocab_limit=false; \
|
--character_coverage=0.9995 --hard_vocab_limit=false; \
|
||||||
else \
|
else \
|
||||||
${SPM_HOME}/spm_train \
|
${SPM_HOME}/spm_train ${SPMEXTRA} \
|
||||||
--model_prefix=$@ --vocab_size=$(SRCBPESIZE) --input=$<.text \
|
--model_prefix=$@ --vocab_size=$(SRCBPESIZE) --input=$<.text \
|
||||||
--character_coverage=1.0 --hard_vocab_limit=false; \
|
--character_coverage=1.0 --hard_vocab_limit=false; \
|
||||||
fi
|
fi
|
||||||
@ -880,11 +900,11 @@ ifeq ($(wildcard ${SPMTRGMODEL}),)
|
|||||||
grep . $< | shuf > $<.text
|
grep . $< | shuf > $<.text
|
||||||
${MAKE} ${LOCAL_TRAIN_TRG}.charfreq
|
${MAKE} ${LOCAL_TRAIN_TRG}.charfreq
|
||||||
if [ `cat ${LOCAL_TRAIN_TRG}.charfreq | wc -l` -gt 1000 ]; then \
|
if [ `cat ${LOCAL_TRAIN_TRG}.charfreq | wc -l` -gt 1000 ]; then \
|
||||||
${SPM_HOME}/spm_train \
|
${SPM_HOME}/spm_train ${SPMEXTRA} \
|
||||||
--model_prefix=$@ --vocab_size=$(TRGBPESIZE) --input=$<.text \
|
--model_prefix=$@ --vocab_size=$(TRGBPESIZE) --input=$<.text \
|
||||||
--character_coverage=0.9995 --hard_vocab_limit=false; \
|
--character_coverage=0.9995 --hard_vocab_limit=false; \
|
||||||
else \
|
else \
|
||||||
${SPM_HOME}/spm_train \
|
${SPM_HOME}/spm_train ${SPMEXTRA} \
|
||||||
--model_prefix=$@ --vocab_size=$(TRGBPESIZE) --input=$<.text \
|
--model_prefix=$@ --vocab_size=$(TRGBPESIZE) --input=$<.text \
|
||||||
--character_coverage=1.0 --hard_vocab_limit=false; \
|
--character_coverage=1.0 --hard_vocab_limit=false; \
|
||||||
fi
|
fi
|
||||||
@ -962,11 +982,11 @@ ifeq ($(wildcard ${SPMMODEL}),)
|
|||||||
grep . $< | shuf > $<.text
|
grep . $< | shuf > $<.text
|
||||||
${MAKE} ${LOCAL_MONO_DATA}.${PRE}.charfreq
|
${MAKE} ${LOCAL_MONO_DATA}.${PRE}.charfreq
|
||||||
if [ `cat ${LOCAL_MONO_DATA}.${PRE}.charfreq | wc -l` -gt 1000 ]; then \
|
if [ `cat ${LOCAL_MONO_DATA}.${PRE}.charfreq | wc -l` -gt 1000 ]; then \
|
||||||
${SPM_HOME}/spm_train \
|
${SPM_HOME}/spm_train ${SPMEXTRA} \
|
||||||
--model_prefix=$@ --vocab_size=$(TRGBPESIZE) --input=$<.text \
|
--model_prefix=$@ --vocab_size=$(TRGBPESIZE) --input=$<.text \
|
||||||
--character_coverage=0.9995 --hard_vocab_limit=false; \
|
--character_coverage=0.9995 --hard_vocab_limit=false; \
|
||||||
else \
|
else \
|
||||||
${SPM_HOME}/spm_train \
|
${SPM_HOME}/spm_train ${SPMEXTRA} \
|
||||||
--model_prefix=$@ --vocab_size=$(TRGBPESIZE) --input=$<.text \
|
--model_prefix=$@ --vocab_size=$(TRGBPESIZE) --input=$<.text \
|
||||||
--character_coverage=1.0 --hard_vocab_limit=false; \
|
--character_coverage=1.0 --hard_vocab_limit=false; \
|
||||||
fi
|
fi
|
||||||
|
@ -133,6 +133,16 @@ endif
|
|||||||
PRE_TRG=spm${TRGBPESIZE:000=}k \
|
PRE_TRG=spm${TRGBPESIZE:000=}k \
|
||||||
${@:-spm=}
|
${@:-spm=}
|
||||||
|
|
||||||
|
## sentence-piece models trained with --split_by_whitespace=false (run in work-nospace)
|
||||||
|
%-nospace:
|
||||||
|
${MAKE} WORKHOME=${shell realpath ${PWD}/work-nospace} \
|
||||||
|
PRE=simple \
|
||||||
|
SPMEXTRA=--split_by_whitespace=false \
|
||||||
|
PRE_SRC=spm${SRCBPESIZE:000=}k \
|
||||||
|
PRE_TRG=spm${TRGBPESIZE:000=}k \
|
||||||
|
${@:-nospace=}
|
||||||
|
|
||||||
|
|
||||||
## with SPM models trained on monolingual data
|
## with SPM models trained on monolingual data
|
||||||
%-monospm: ${SPMSRCMONO} ${SPMTRGMONO}
|
%-monospm: ${SPMSRCMONO} ${SPMTRGMONO}
|
||||||
${MAKE} WORKHOME=${shell realpath ${PWD}/work-monospm} \
|
${MAKE} WORKHOME=${shell realpath ${PWD}/work-monospm} \
|
||||||
|
@ -339,14 +339,17 @@ memad:
|
|||||||
for s in fi en sv de fr nl; do \
|
for s in fi en sv de fr nl; do \
|
||||||
for t in en fi sv de fr nl; do \
|
for t in en fi sv de fr nl; do \
|
||||||
if [ "$$s" != "$$t" ]; then \
|
if [ "$$s" != "$$t" ]; then \
|
||||||
if ! grep -q 'stalled ${MARIAN_EARLY_STOPPING} times' ${WORKHOME}/$$s-$$t/*.valid${NR.log}; then\
|
if ! grep -q 'stalled ${MARIAN_EARLY_STOPPING} times' ${WORKHOME}/$$s-$$t/${DATASET}.*.valid${NR.log}; then\
|
||||||
${MAKE} SRCLANGS=$$s TRGLANGS=$$t traindata devdata wordalign; \
|
${MAKE} SRCLANGS=$$s TRGLANGS=$$t bilingual-dynamic; \
|
||||||
${MAKE} SRCLANGS=$$s TRGLANGS=$$t HPC_CORES=1 HPC_MEM=4g train.submit-multigpu; \
|
|
||||||
fi \
|
fi \
|
||||||
fi \
|
fi \
|
||||||
done \
|
done \
|
||||||
done
|
done
|
||||||
|
|
||||||
|
# ${MAKE} SRCLANGS=$$s TRGLANGS=$$t data; \
|
||||||
|
# ${MAKE} SRCLANGS=$$s TRGLANGS=$$t HPC_CORES=1 HPC_MEM=4g train.submit-multigpu; \
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
doclevel:
|
doclevel:
|
||||||
${MAKE} ost-datasets
|
${MAKE} ost-datasets
|
||||||
|
@ -46,17 +46,23 @@ TRAIN_TRG = ${patsubst %.${SRC},%.${TRG},${TRAIN_SRC}}
|
|||||||
DEV_TRG = ${patsubst %.${SRC},%.${TRG},${DEV_SRC}}
|
DEV_TRG = ${patsubst %.${SRC},%.${TRG},${DEV_SRC}}
|
||||||
TEST_TRG = ${patsubst %.${SRC},%.${TRG},${TEST_SRC}}
|
TEST_TRG = ${patsubst %.${SRC},%.${TRG},${TEST_SRC}}
|
||||||
|
|
||||||
|
OBJECTSTORAGE = https://object.pouta.csc.fi
|
||||||
|
MODELCONTAINER = OPUS-MT-models
|
||||||
|
MODELINDEX = ${OBJECTSTORAGE}/${MODELCONTAINER}/index.txt
|
||||||
|
|
||||||
BASEMODELHOME = ../models/${LANGPAIR}
|
# BASEMODELHOME = ../models/${LANGPAIR}
|
||||||
BASEMODELZIP = ${lastword ${sort ${wildcard ${BASEMODELHOME}/*-20*.zip}}}
|
# BASEMODELZIP = ${lastword ${sort ${wildcard ${BASEMODELHOME}/*-20*.zip}}}
|
||||||
BASEMODELNAME = ${patsubst %.zip,%,${notdir ${BASEMODELZIP}}}
|
# BASEMODELNAME = ${patsubst %.zip,%,${notdir ${BASEMODELZIP}}}
|
||||||
|
|
||||||
|
BASEMODELHOME = ${OBJECTSTORAGE}/${MODELCONTAINER}/${LANGPAIR}
|
||||||
|
BASEMODELZIP = ${lastword ${sort ${notdir ${shell grep '${LANGPAIR}/opus-.*\.zip' model-index.txt}}}}
|
||||||
|
BASEMODELNAME = ${BASEMODELZIP:.zip=}
|
||||||
|
|
||||||
TUNED_MODEL = ${LANGPAIR}/${MODEL}/model/${BASEMODELNAME}_${MODEL}.transformer.model
|
TUNED_MODEL = ${LANGPAIR}/${MODEL}/model/${BASEMODELNAME}_${MODEL}.transformer.model
|
||||||
TUNED_MODEL_VOCAB = ${LANGPAIR}/${MODEL}/model/${BASEMODELNAME}_${MODEL}.vocab.yml
|
TUNED_MODEL_VOCAB = ${LANGPAIR}/${MODEL}/model/${BASEMODELNAME}_${MODEL}.vocab.yml
|
||||||
|
|
||||||
|
|
||||||
MARIAN_WORKSPACE = 5000
|
MARIAN_WORKSPACE = 5000
|
||||||
MARIAN_VALID_FREQ = 100
|
MARIAN_VALID_FREQ = 100
|
||||||
MARIAN_SAVE_FREQ = ${MARIAN_VALID_FREQ}
|
MARIAN_SAVE_FREQ = ${MARIAN_VALID_FREQ}
|
||||||
MARIAN_DISP_FREQ = ${MARIAN_VALID_FREQ}
|
MARIAN_DISP_FREQ = ${MARIAN_VALID_FREQ}
|
||||||
@ -64,11 +70,15 @@ MARIAN_EARLY_STOPPING = 5
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
.PHONY: all
|
.PHONY: all
|
||||||
all: ${TEST_SRC}.${BASEMODELNAME}.${TRG}.compare ${TEST_SRC}.${BASEMODELNAME}.baseline.${TRG}.compare
|
all: model-index.txt
|
||||||
|
${MAKE} ${TEST_SRC}.${BASEMODELNAME}.${TRG}.compare
|
||||||
|
${MAKE} ${TEST_SRC}.${BASEMODELNAME}.baseline.${TRG}.compare
|
||||||
|
|
||||||
|
|
||||||
|
model-index.txt:
|
||||||
|
wget -nv -O $@ ${MODELINDEX}
|
||||||
|
|
||||||
|
|
||||||
## convert a TMX file to create dev-test-train data
|
## convert a TMX file to create dev-test-train data
|
||||||
## and start fine-tuning in the direction of sorted lang-IDs
|
## and start fine-tuning in the direction of sorted lang-IDs
|
||||||
@ -79,52 +89,54 @@ all: ${TEST_SRC}.${BASEMODELNAME}.${TRG}.compare ${TEST_SRC}.${BASEMODELNAME}.ba
|
|||||||
## - it assumes that ${TMX} points to a valid TMX files
|
## - it assumes that ${TMX} points to a valid TMX files
|
||||||
## - it assumes that there are only 2 languages in the TMX (it will only use 2)
|
## - it assumes that there are only 2 languages in the TMX (it will only use 2)
|
||||||
|
|
||||||
TMX = vero-20200123.tmx.gz
|
|
||||||
|
TMXFILE = VNK-Hallituksen_vuosikertomus.tmx
|
||||||
|
TMXBASE = ${TMXFILE:.tmx=}
|
||||||
REVERSE = 0
|
REVERSE = 0
|
||||||
|
|
||||||
tmx-tune:
|
tmx-tune:
|
||||||
zcat ${TMX} |\
|
cat ${TMXFILE} |\
|
||||||
tmx2moses -r -o ${TMX:.tmx.gz=}
|
tmx2moses -r -o ${TMXBASE}
|
||||||
if [ ${REVERSE} -gt 0 ]; then \
|
if [ ${REVERSE} -gt 0 ]; then \
|
||||||
t=`ls ${TMX:.tmx.gz=}.*-* | sort | sed 's/^.*\.\([a-z]*\)$$/\1/' | head -1`; \
|
t=`ls ${TMXBASE}.*-* | sort | sed 's/^.*\.\([a-z]*\)$$/\1/' | head -1`; \
|
||||||
s=`ls ${TMX:.tmx.gz=}.*-* | sort | sed 's/^.*\.\([a-z]*\)$$/\1/' | tail -1`; \
|
s=`ls ${TMXBASE}.*-* | sort | sed 's/^.*\.\([a-z]*\)$$/\1/' | tail -1`; \
|
||||||
else \
|
else \
|
||||||
s=`ls ${TMX:.tmx.gz=}.*-* | sort | sed 's/^.*\.\([a-z]*\)$$/\1/' | head -1`; \
|
s=`ls ${TMXBASE}.*-* | sort | sed 's/^.*\.\([a-z]*\)$$/\1/' | head -1`; \
|
||||||
t=`ls ${TMX:.tmx.gz=}.*-* | sort | sed 's/^.*\.\([a-z]*\)$$/\1/' | tail -1`; \
|
t=`ls ${TMXBASE}.*-* | sort | sed 's/^.*\.\([a-z]*\)$$/\1/' | tail -1`; \
|
||||||
fi; \
|
fi; \
|
||||||
echo $$s; echo $$t; \
|
echo $$s; echo $$t; \
|
||||||
mkdir -p $$s-$$t; \
|
mkdir -p $$s-$$t; \
|
||||||
paste ${TMX:.tmx.gz=}.*-*.$$s ${TMX:.tmx.gz=}.*-*.$$t | \
|
paste ${TMXBASE}.*-*.$$s ${TMXBASE}.*-*.$$t | \
|
||||||
sort | uniq | \
|
sort | uniq | \
|
||||||
python3 ../bitext-match-lang.py -s $$s -t $$t | \
|
python3 ../bitext-match-lang.py -s $$s -t $$t | \
|
||||||
grep -v '[<>{}]' |\
|
grep -v '[<>{}]' |\
|
||||||
$(TOKENIZER)/replace-unicode-punctuation.perl |\
|
$(TOKENIZER)/replace-unicode-punctuation.perl |\
|
||||||
perl -CS -pe 'tr[\x{9}\x{A}\x{D}\x{20}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}][]cd;' |\
|
perl -CS -pe 'tr[\x{9}\x{A}\x{D}\x{20}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}][]cd;' |\
|
||||||
sed 's/ */ /g;s/^ *//g;s/ *$$//g' |\
|
sed 's/ */ /g;s/^ *//g;s/ *$$//g' |\
|
||||||
shuf > ${TMX:.tmx.gz=}.$$s-$$t.shuffled; \
|
shuf > ${TMXBASE}.$$s-$$t.shuffled; \
|
||||||
mkdir -p $$s-$$t/${TMX:.tmx.gz=}/dev; \
|
mkdir -p $$s-$$t/${TMXBASE}/dev; \
|
||||||
mkdir -p $$s-$$t/${TMX:.tmx.gz=}/test; \
|
mkdir -p $$s-$$t/${TMXBASE}/test; \
|
||||||
mkdir -p $$s-$$t/${TMX:.tmx.gz=}/train; \
|
mkdir -p $$s-$$t/${TMXBASE}/train; \
|
||||||
head -1000 ${TMX:.tmx.gz=}.$$s-$$t.shuffled | cut -f1 | gzip -c \
|
head -1000 ${TMXBASE}.$$s-$$t.shuffled | cut -f1 | gzip -c \
|
||||||
> $$s-$$t/${TMX:.tmx.gz=}/test/${TMX:.tmx.gz=}.$$s.gz; \
|
> $$s-$$t/${TMXBASE}/test/${TMXBASE}.$$s.gz; \
|
||||||
head -1000 ${TMX:.tmx.gz=}.$$s-$$t.shuffled | cut -f2 | gzip -c \
|
head -1000 ${TMXBASE}.$$s-$$t.shuffled | cut -f2 | gzip -c \
|
||||||
> $$s-$$t/${TMX:.tmx.gz=}/test/${TMX:.tmx.gz=}.$$t.gz; \
|
> $$s-$$t/${TMXBASE}/test/${TMXBASE}.$$t.gz; \
|
||||||
head -2001 ${TMX:.tmx.gz=}.$$s-$$t.shuffled | tail -1000 | cut -f1 | gzip -c \
|
head -2001 ${TMXBASE}.$$s-$$t.shuffled | tail -1000 | cut -f1 | gzip -c \
|
||||||
> $$s-$$t/${TMX:.tmx.gz=}/dev/${TMX:.tmx.gz=}.$$s.gz; \
|
> $$s-$$t/${TMXBASE}/dev/${TMXBASE}.$$s.gz; \
|
||||||
head -2001 ${TMX:.tmx.gz=}.$$s-$$t.shuffled | tail -1000 | cut -f2 | gzip -c \
|
head -2001 ${TMXBASE}.$$s-$$t.shuffled | tail -1000 | cut -f2 | gzip -c \
|
||||||
> $$s-$$t/${TMX:.tmx.gz=}/dev/${TMX:.tmx.gz=}.$$t.gz; \
|
> $$s-$$t/${TMXBASE}/dev/${TMXBASE}.$$t.gz; \
|
||||||
tail -n +2002 ${TMX:.tmx.gz=}.$$s-$$t.shuffled | cut -f1 | gzip -c \
|
tail -n +2002 ${TMXBASE}.$$s-$$t.shuffled | cut -f1 | gzip -c \
|
||||||
> $$s-$$t/${TMX:.tmx.gz=}/train/${TMX:.tmx.gz=}.$$s.gz; \
|
> $$s-$$t/${TMXBASE}/train/${TMXBASE}.$$s.gz; \
|
||||||
tail -n +2002 ${TMX:.tmx.gz=}.$$s-$$t.shuffled | cut -f2 | gzip -c \
|
tail -n +2002 ${TMXBASE}.$$s-$$t.shuffled | cut -f2 | gzip -c \
|
||||||
> $$s-$$t/${TMX:.tmx.gz=}/train/${TMX:.tmx.gz=}.$$t.gz; \
|
> $$s-$$t/${TMXBASE}/train/${TMXBASE}.$$t.gz; \
|
||||||
mv ${TMX:.tmx.gz=}.*-* $$s-$$t/; \
|
mv ${TMXBASE}.*-* $$s-$$t/; \
|
||||||
${MAKE} SRC=$$s TRG=$$t MODEL=${TMX:.tmx.gz=} \
|
${MAKE} SRC=$$s TRG=$$t MODEL=${TMXBASE} \
|
||||||
TRAIN_SRC=$$s-$$t/${TMX:.tmx.gz=}/train/${TMX:.tmx.gz=}.$$s \
|
TRAIN_SRC=$$s-$$t/${TMXBASE}/train/${TMXBASE}.$$s \
|
||||||
TRAIN_TRG=$$s-$$t/${TMX:.tmx.gz=}/train/${TMX:.tmx.gz=}.$$t \
|
TRAIN_TRG=$$s-$$t/${TMXBASE}/train/${TMXBASE}.$$t \
|
||||||
DEV_SRC=$$s-$$t/${TMX:.tmx.gz=}/dev/${TMX:.tmx.gz=}.$$s \
|
DEV_SRC=$$s-$$t/${TMXBASE}/dev/${TMXBASE}.$$s \
|
||||||
DEV_TRG=$$s-$$t/${TMX:.tmx.gz=}/dev/${TMX:.tmx.gz=}.$$t \
|
DEV_TRG=$$s-$$t/${TMXBASE}/dev/${TMXBASE}.$$t \
|
||||||
TEST_SRC=$$s-$$t/${TMX:.tmx.gz=}/test/${TMX:.tmx.gz=}.$$s \
|
TEST_SRC=$$s-$$t/${TMXBASE}/test/${TMXBASE}.$$s \
|
||||||
TEST_TRG=$$s-$$t/${TMX:.tmx.gz=}/test/${TMX:.tmx.gz=}.$$t \
|
TEST_TRG=$$s-$$t/${TMXBASE}/test/${TMXBASE}.$$t \
|
||||||
all
|
all
|
||||||
|
|
||||||
|
|
||||||
@ -244,11 +256,15 @@ endif
|
|||||||
.PHONY: data
|
.PHONY: data
|
||||||
data: ${TRAIN_SRC}.pre.gz ${TRAIN_TRG}.pre.gz ${DEV_SRC}.pre.gz ${DEV_TRG}.pre.gz
|
data: ${TRAIN_SRC}.pre.gz ${TRAIN_TRG}.pre.gz ${DEV_SRC}.pre.gz ${DEV_TRG}.pre.gz
|
||||||
|
|
||||||
.INTERMEDIATE: ${LANGPAIR}/${BASEMODELNAME}/decoder.yml
|
|
||||||
${LANGPAIR}/${BASEMODELNAME}/decoder.yml:
|
${LANGPAIR}/${BASEMODELNAME}/decoder.yml:
|
||||||
mkdir -p ${dir $@}
|
mkdir -p ${dir $@}
|
||||||
cp ${BASEMODELZIP} ${dir $@}
|
ifneq (${BASEMODELZIP},)
|
||||||
cd ${dir $@} && unzip -u *.zip
|
wget -nv -O ${dir $@}/${BASEMODELZIP} ${BASEMODELHOME}/${BASEMODELZIP}
|
||||||
|
cd ${dir $@} && unzip -u ${BASEMODELZIP}
|
||||||
|
else
|
||||||
|
## BASEMODELZIP is empty (no zip listed for this language pair): only report it, nothing is downloaded
@echo "no model found for ${LANGPAIR}!"
|
||||||
|
endif
|
||||||
|
|
||||||
|
|
||||||
.INTERMEDIATE: ${TRAIN_SRC}.pre.gz ${DEV_SRC}.pre.gz ${TEST_SRC}.pre.gz
|
.INTERMEDIATE: ${TRAIN_SRC}.pre.gz ${DEV_SRC}.pre.gz ${TEST_SRC}.pre.gz
|
||||||
${TRAIN_SRC}.pre.gz ${DEV_SRC}.pre.gz ${TEST_SRC}.pre.gz: %.pre.gz: %.gz ${LANGPAIR}/${BASEMODELNAME}/decoder.yml
|
${TRAIN_SRC}.pre.gz ${DEV_SRC}.pre.gz ${TEST_SRC}.pre.gz: %.pre.gz: %.gz ${LANGPAIR}/${BASEMODELNAME}/decoder.yml
|
||||||
|
230058
finetune/VNK-Hallituksen_vuosikertomus.tmx
Normal file
230058
finetune/VNK-Hallituksen_vuosikertomus.tmx
Normal file
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user