mirror of
https://github.com/Helsinki-NLP/OPUS-MT-train.git
synced 2024-10-26 21:19:02 +03:00
linking latest release is now correct
This commit is contained in:
parent
45cb8329ee
commit
e94c43062a
@ -480,6 +480,7 @@ MODEL_BASENAME = ${MODEL}.${MODELTYPE}.model${NR}
|
||||
MODEL_VALIDLOG = ${MODEL}.${MODELTYPE}.valid${NR}.log
|
||||
MODEL_TRAINLOG = ${MODEL}.${MODELTYPE}.train${NR}.log
|
||||
MODEL_START = ${WORKDIR}/${MODEL_BASENAME}.npz
|
||||
MODEL_DONE = ${WORKDIR}/${MODEL_BASENAME}.done
|
||||
MODEL_FINAL = ${WORKDIR}/${MODEL_BASENAME}.npz.best-perplexity.npz
|
||||
MODEL_DECODER = ${MODEL_FINAL}.decoder.yml
|
||||
|
||||
@ -652,8 +653,10 @@ endif
|
||||
## decoder flags (CPU and GPU variants)
|
||||
|
||||
MARIAN_BEAM_SIZE = 4
|
||||
MARIAN_MINI_BATCH = 512
|
||||
MARIAN_MAXI_BATCH = 1024
|
||||
MARIAN_MINI_BATCH = 256
|
||||
MARIAN_MAXI_BATCH = 512
|
||||
# MARIAN_MINI_BATCH = 512
|
||||
# MARIAN_MAXI_BATCH = 1024
|
||||
# MARIAN_MINI_BATCH = 768
|
||||
# MARIAN_MAXI_BATCH = 2048
|
||||
|
||||
|
17
lib/dist.mk
17
lib/dist.mk
@ -88,6 +88,19 @@ endif
|
||||
fi
|
||||
|
||||
|
||||
## only create the release if the model has converged (done-flag exists)
|
||||
.PHONY: release-if-done
|
||||
release-if-done:
|
||||
ifneq (${wildcard ${MODEL_DONE}},)
|
||||
@${MAKE} release
|
||||
else
|
||||
@echo "... not ready yet (${MODEL_DONE})"
|
||||
endif
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
.PHONY: scores
|
||||
scores:
|
||||
${MAKE} FIND_EVAL_FILES=1 ${WORKHOME}/eval/scores.txt
|
||||
@ -423,10 +436,10 @@ endif
|
||||
|
||||
|
||||
link-latest-model:
|
||||
if [ `ls ${patsubst %.zip,%-*,${DIST_PACKAGE}} 2>/dev/null | wc -l` -gt 0 ]; then \
|
||||
if [ `ls ${patsubst %.zip,%_*,${DIST_PACKAGE}} 2>/dev/null | wc -l` -gt 0 ]; then \
|
||||
rm -f ${DIST_PACKAGE}; \
|
||||
cd ${dir ${DIST_PACKAGE}}; \
|
||||
ln -s `ls -t ${patsubst %.zip,%-*.zip,$(notdir ${DIST_PACKAGE})} | head -1` \
|
||||
ln -s `ls -t ${patsubst %.zip,%_*.zip,$(notdir ${DIST_PACKAGE})} | head -1` \
|
||||
${notdir ${DIST_PACKAGE}}; \
|
||||
fi
|
||||
|
||||
|
@ -29,14 +29,35 @@
|
||||
|
||||
ELG_EU_LANGIDS = eng deu swe fin nld dan spa ces fra pol por lav ron est bul ell slk ita mlt slv hrv lit gle hun
|
||||
|
||||
ELG_EU_SELECTED = gmq nld pol por lav ron est bul ell ita mlt slv hbs lit cel hun glg eus zle zls zlw tur ara heb sqi fin
|
||||
ELG_EU_SELECTED = nld pol por lav ron est bul ell ita mlt slv hbs lit cel hun glg eus tur ara heb sqi fin
|
||||
ELG_EU_SELECTED_MULTILANG = "ces slk" "cat oci spa" "por glg"
|
||||
ELG_EU_SELECTED_BIG = spa fra deu
|
||||
ELG_EU_SELECTED_BIG = gmq zle zls zlw spa fra deu
|
||||
|
||||
# "fry ltz nds afr"
|
||||
# "cat oci"
|
||||
|
||||
|
||||
|
||||
elg-eval:
|
||||
for l in ${ELG_EU_SELECTED} ${ELG_EU_SELECTED_BIG}; do \
|
||||
if [ -e ${wildcard work/eng-$$l/*.npz} ]; then \
|
||||
${MAKE} MODELTYPE=transformer-big tatoeba-eng2$${l}-eval-bt; \
|
||||
${MAKE} MODELTYPE=transformer-big tatoeba-eng2$${l}-multieval-bt; \
|
||||
${MAKE} MODELTYPE=transformer-big tatoeba-eng2$${l}-eval-testsets-bt; \
|
||||
fi; \
|
||||
if [ -e ${wildcard work/$${l}-eng/*.npz} ]; then \
|
||||
${MAKE} MODELTYPE=transformer-big tatoeba-$${l}2eng-eval-bt; \
|
||||
${MAKE} MODELTYPE=transformer-big tatoeba-$${l}2eng-multieval-bt; \
|
||||
${MAKE} MODELTYPE=transformer-big tatoeba-$${l}2eng-eval-testsets-bt; \
|
||||
fi; \
|
||||
done
|
||||
for l in ${ELG_EU_SELECTED_MULTILANG}; do \
|
||||
${MAKE} MODELTYPE=transformer-big SRCLANGS="$$l" TRGLANGS=eng eval-bt-tatoeba; \
|
||||
${MAKE} MODELTYPE=transformer-big SRCLANGS="$$l" TRGLANGS=eng eval-testsets-bt-tatoeba; \
|
||||
done
|
||||
|
||||
|
||||
|
||||
elg-eng2all:
|
||||
for l in ${ELG_EU_SELECTED_MULTILANG}; do \
|
||||
${MAKE} MODELTYPE=transformer-big SRCLANGS=eng TRGLANGS="$$l" \
|
||||
@ -71,6 +92,7 @@ elg-all2eng:
|
||||
|
||||
|
||||
|
||||
|
||||
elg-eng2all-eval1:
|
||||
for l in ${ELG_EU_SELECTED_MULTILANG}; do \
|
||||
${MAKE} WALLTIME=1 MODELTYPE=transformer-big SRCLANGS=eng TRGLANGS="$$l" tatoeba-sublang-eval-bt.submit; \
|
||||
@ -95,6 +117,68 @@ elg-all2eng-eval:
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
elg-eng2cel:
|
||||
${MAKE} MODELTYPE=transformer-big \
|
||||
CLEAN_TRAINDATA_TYPE=clean \
|
||||
CLEAN_DEVDATA_TYPE=clean \
|
||||
tatoeba-eng2cel-trainjob-bt
|
||||
|
||||
|
||||
elg-ara2eng:
|
||||
${MAKE} MODELTYPE=transformer-big \
|
||||
MARIAN_EXTRA=--no-restore-corpus \
|
||||
DATA_PREPARE_HPCPARAMS='CPUJOB_HPC_CORES=2 CPUJOB_HPC_MEM=16g CPUJOB_HPC_DISK=1000' \
|
||||
tatoeba-ara2eng-trainjob-bt
|
||||
|
||||
elg-zle2eng:
|
||||
${MAKE} MODELTYPE=transformer-big \
|
||||
MARIAN_EXTRA=--no-restore-corpus \
|
||||
DATA_PREPARE_HPCPARAMS='CPUJOB_HPC_CORES=2 CPUJOB_HPC_MEM=16g CPUJOB_HPC_DISK=1000' \
|
||||
tatoeba-zle2eng-trainjob-bt
|
||||
|
||||
elg-multi2eng:
|
||||
${MAKE} MODELTYPE=transformer-big \
|
||||
MARIAN_EXTRA=--no-restore-corpus \
|
||||
DATA_PREPARE_HPCPARAMS='CPUJOB_HPC_CORES=2 CPUJOB_HPC_MEM=16g CPUJOB_HPC_DISK=1000' \
|
||||
SRCLANGS=eng TRGLANGS="cat oci spa" \
|
||||
tatoeba-job-bt
|
||||
${MAKE} MODELTYPE=transformer-big \
|
||||
MARIAN_EXTRA=--no-restore-corpus \
|
||||
DATA_PREPARE_HPCPARAMS='CPUJOB_HPC_CORES=2 CPUJOB_HPC_MEM=16g CPUJOB_HPC_DISK=1000' \
|
||||
SRCLANGS=eng TRGLANGS="por glg" \
|
||||
tatoeba-job-bt
|
||||
${MAKE} MODELTYPE=transformer-big \
|
||||
MARIAN_EXTRA=--no-restore-corpus \
|
||||
DATA_PREPARE_HPCPARAMS='CPUJOB_HPC_CORES=2 CPUJOB_HPC_MEM=16g CPUJOB_HPC_DISK=1000' \
|
||||
TRGLANGS=eng SRCLANGS="cat oci spa" \
|
||||
tatoeba-job-bt
|
||||
${MAKE} MODELTYPE=transformer-big \
|
||||
MARIAN_EXTRA=--no-restore-corpus \
|
||||
DATA_PREPARE_HPCPARAMS='CPUJOB_HPC_CORES=2 CPUJOB_HPC_MEM=16g CPUJOB_HPC_DISK=1000' \
|
||||
TRGLANGS=eng SRCLANGS="por glg" \
|
||||
tatoeba-job-bt
|
||||
|
||||
|
||||
elg-ces2eng:
|
||||
${MAKE} MODELTYPE=transformer-big \
|
||||
MARIAN_EXTRA=--no-restore-corpus \
|
||||
TRGLANGS=eng SRCLANGS="ces slk" \
|
||||
tatoeba-job-bt
|
||||
|
||||
elg-eng2ces:
|
||||
${MAKE} MODELTYPE=transformer-big \
|
||||
MARIAN_EXTRA=--no-restore-corpus \
|
||||
SRCLANGS=eng TRGLANGS="ces slk" \
|
||||
tatoeba-job-bt
|
||||
|
||||
|
||||
|
||||
|
||||
## test with separate vocabs
|
||||
elg-eng2slv:
|
||||
${MAKE} MODELTYPE=transformer-big tatoeba-eng2slv-trainjob-bt-separate-spm; \
|
||||
|
@ -109,7 +109,7 @@ ifdef SLURM_JOBID
|
||||
echo "reached maximum number of repeated slurm jobs: ${SLURM_REPEAT}"; \
|
||||
fi
|
||||
endif
|
||||
${MAKE} ${WORKDIR}/${MODEL}.${MODELTYPE}.model${NR}.done
|
||||
${MAKE} ${MODEL_DONE}
|
||||
|
||||
vocab: ${MODEL_SRCVOCAB} ${MODEL_TRGVOCAB}
|
||||
translate: ${WORKDIR}/${TESTSET_NAME}.${MODEL}${NR}.${MODELTYPE}.${SRC}.${TRG}
|
||||
|
30
lib/test.mk
30
lib/test.mk
@ -126,11 +126,16 @@ print-bleu-scores:
|
||||
|
||||
LEADERBOARD_DIR = ${REPOHOME}scores
|
||||
|
||||
## manipulating test set names is really messy
|
||||
## - need to remove language ID pairs
|
||||
## - could be different variants (2-letter codes, 3-letter codes)
|
||||
## - newstest sometimes has additional langpair-IDs in their names
|
||||
|
||||
compare-bleu-score-table:
|
||||
@grep BLEU ${WORKHOME}/*/*.eval |\
|
||||
perl -pe 's#^${WORKHOME}/([^/]*)/([^\.]+)\.(.*?-.*?\.)?([^\.]+\.[^\.]+\.[^\.]+)\.([^\.]+)\.([^\.]+)\.eval:.*? = ([0-9\.]+) .*$$#$$5-$$6\t$$7\t$$2\t$$1\t$$4#' |\
|
||||
perl -pe '@a=split(/\t/);if($$a[0]=~/multi/){$$a[0]=$$a[3];};$$_=join("\t",@a);' |\
|
||||
perl -pe '@a=split(/\t/);$$a[2]=lc($$a[2]);$$a[2]=~s/^(news.*)\-[a-z]{6}/$$1/;$$a[2]=~s/^(news.*)\-[a-z]{4}/$$1/;if (-e "${LEADERBOARD_DIR}/$$a[0]/$$a[2]/bleu-scores.txt"){$$b=`head -1 ${LEADERBOARD_DIR}/$$a[0]/$$a[2]/bleu-scores.txt | cut -f1`;$$b+=0;}else{$$b=0;}$$d=$$a[1]-$$b;splice(@a,2,0,$$b,$$d);$$_=join("\t",@a);' |\
|
||||
perl -pe '@a=split(/\t/);$$a[2]=lc($$a[2]);$$a[2]=~s/^(.*)\-[a-z]{4}$$/$$1/;$$a[2]=~s/^(.*)\-[a-z]{6}$$/$$1/;$$a[2]=~s/^(news.*)\-[a-z]{4}/$$1/;if (-e "${LEADERBOARD_DIR}/$$a[0]/$$a[2]/bleu-scores.txt"){$$b=`head -1 ${LEADERBOARD_DIR}/$$a[0]/$$a[2]/bleu-scores.txt | cut -f1`;$$b+=0;}else{$$b=0;}$$d=$$a[1]-$$b;splice(@a,2,0,$$b,$$d);$$_=join("\t",@a);' |\
|
||||
sort -k5,5 -k1,1 -k2,2nr
|
||||
|
||||
compare-bleu-scores:
|
||||
@ -145,3 +150,26 @@ print-decreased-models:
|
||||
@make -s compare-bleu-scores |\
|
||||
grep ' -[0-9]'
|
||||
|
||||
|
||||
|
||||
|
||||
## compare BLEU scores for the current model
|
||||
|
||||
compare-model-bleu-score-table:
|
||||
@grep BLEU ${WORKDIR}/*.eval |\
|
||||
perl -pe 's#^${WORKHOME}/([^/]*)/([^\.]+)\.(.*?-.*?\.)?([^\.]+\.[^\.]+\.[^\.]+)\.([^\.]+)\.([^\.]+)\.eval:.*? = ([0-9\.]+) .*$$#$$5-$$6\t$$7\t$$2\t$$1\t$$4#' |\
|
||||
perl -pe '@a=split(/\t/);if($$a[0]=~/multi/){$$a[0]=$$a[3];};$$_=join("\t",@a);' |\
|
||||
perl -pe '@a=split(/\t/);$$a[2]=lc($$a[2]);$$a[2]=~s/^(.*)\-[a-z]{4}$$/$$1/;$$a[2]=~s/^(.*)\-[a-z]{6}$$/$$1/;$$a[2]=~s/^(news.*)\-[a-z]{4}$$/$$1/;if (-e "${LEADERBOARD_DIR}/$$a[0]/$$a[2]/bleu-scores.txt"){$$b=`head -1 ${LEADERBOARD_DIR}/$$a[0]/$$a[2]/bleu-scores.txt | cut -f1`;$$b+=0;}else{$$b=0;}$$d=$$a[1]-$$b;splice(@a,2,0,$$b,$$d);$$_=join("\t",@a);' |\
|
||||
sort -k5,5 -k1,1 -k2,2nr
|
||||
|
||||
compare-model-bleu-scores:
|
||||
make -s compare-model-bleu-score-table |\
|
||||
perl -e 'printf "%15s %5s %5s %6s %-25s %-15s %s","langpair","BLEU","best","diff","testset","dir","model\n";while (<>){@a=split(/\t/);printf "%15s %5.2f %5.2f %6.2f %-25s %-15s %s",@a;}'
|
||||
|
||||
print-improved-bleu:
|
||||
@make -s compare-model-bleu-scores |\
|
||||
grep -v ' 0.00' | grep -v ' -[0-9]'
|
||||
|
||||
print-decreased-bleu:
|
||||
@make -s compare-model-bleu-scores |\
|
||||
grep ' -[0-9]'
|
||||
|
37
models/de-sk/README.md
Normal file
37
models/de-sk/README.md
Normal file
@ -0,0 +1,37 @@
|
||||
# opus_transformer-align_2022-02-19.zip
|
||||
|
||||
* dataset: opus
|
||||
* model: transformer-align
|
||||
* source language(s): de
|
||||
* target language(s): sk
|
||||
* raw source language(s): de
|
||||
* raw target language(s): sk
|
||||
* model: transformer-align
|
||||
* pre-processing: normalization + SentencePiece (spm32k,spm32k)
|
||||
* download: [opus_transformer-align_2022-02-19.zip](https://object.pouta.csc.fi/OPUS-MT-models/de-sk/opus_transformer-align_2022-02-19.zip)
|
||||
## Training data: opus
|
||||
|
||||
* de-sk: CCMatrix.de-sk.strict (24693931) DGT.de-sk.strict (3897093) ECB.de-sk.strict (86068) ELITR-ECA.de-sk.strict (43365) EMEA.de-sk.strict (747856) EUbookshop.de-sk.strict (335128) EUconst.de-sk.strict (6115) GNOME.de-sk.strict (127) JRC-Acquis.de-sk.strict (28389) KDE4.de-sk.strict (72205) KDEdoc.de-sk.strict (10801) MultiCCAligned.de-sk.strict (2415997) MultiParaCrawl.de-sk.strict (5281070) OpenSubtitles.de-sk.strict (3287374) PHP.de-sk.strict (25834) QED.de-sk.strict (130746) TED2020.de-sk.strict (93826) TildeMODEL.de-sk.strict (2011415) Ubuntu.de-sk.strict (1859) WikiMatrix.de-sk.strict (91079) XLEnt.de-sk.strict (229529) bible-uedin.de-sk.strict (30627) wikimedia.de-sk.strict (410)
|
||||
* de-sk: total size = 43520844
|
||||
* unused dev/test data is added to training data
|
||||
* total size (opus): 43548640
|
||||
|
||||
|
||||
## Validation data
|
||||
|
||||
* de-sk: Europarl, 563387
|
||||
* total-size-shuffled: 550326
|
||||
|
||||
* devset-selected: top 2500 lines of Europarl.src.shuffled
|
||||
* testset-selected: next 2500 lines of Europarl.src.shuffled
|
||||
* devset-unused: added to traindata
|
||||
|
||||
* test set translations: [opus_transformer-align_2022-02-19.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/de-sk/opus_transformer-align_2022-02-19.test.txt)
|
||||
* test set scores: [opus_transformer-align_2022-02-19.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/de-sk/opus_transformer-align_2022-02-19.eval.txt)
|
||||
|
||||
## Benchmarks
|
||||
|
||||
| testset | BLEU | chr-F | #sent | #words | BP |
|
||||
|---------|-------|-------|-------|--------|----|
|
||||
| Europarl.de-sk | 29.2 | 0.56574 | 2500 | 59265 | 0.977 |
|
||||
|
@ -202,6 +202,25 @@ print-langgroups:
|
||||
|
||||
|
||||
|
||||
|
||||
## release all improved models
|
||||
## - check leaderboard scores
|
||||
## - get all models that have at least one improved BLEU score
|
||||
## - make a release if the model is done
|
||||
##
|
||||
## caveat: does not check for model parameters/types etc!
|
||||
|
||||
release-improved-models:
|
||||
( p=`make -s compare-bleu-score-table-tatoeba | \
|
||||
grep -v ' 0 ' | grep -v ' -[0-9]' | \
|
||||
cut -f6 | sort -u | xargs`; \
|
||||
for l in $$p; do \
|
||||
s=`echo "$$l" | cut -f1 -d-`; \
|
||||
t=`echo "$$l" | cut -f2 -d-`; \
|
||||
make SRCLANGS="$$s" TRGLANGS="$$t" release-if-done-tatoeba; \
|
||||
done )
|
||||
|
||||
|
||||
###############################################################################
|
||||
## generic targets for evaluating multilingual models (all supported lang-pairs)
|
||||
###############################################################################
|
||||
|
Loading…
Reference in New Issue
Block a user