diff --git a/lib/config.mk b/lib/config.mk
index c690927e..9b6aa8b6 100644
--- a/lib/config.mk
+++ b/lib/config.mk
@@ -480,6 +480,7 @@ MODEL_BASENAME = ${MODEL}.${MODELTYPE}.model${NR}
 MODEL_VALIDLOG = ${MODEL}.${MODELTYPE}.valid${NR}.log
 MODEL_TRAINLOG = ${MODEL}.${MODELTYPE}.train${NR}.log
 MODEL_START = ${WORKDIR}/${MODEL_BASENAME}.npz
+MODEL_DONE = ${WORKDIR}/${MODEL_BASENAME}.done
 MODEL_FINAL = ${WORKDIR}/${MODEL_BASENAME}.npz.best-perplexity.npz
 MODEL_DECODER = ${MODEL_FINAL}.decoder.yml
 
@@ -652,8 +653,10 @@ endif
 ## decoder flags (CPU and GPU variants)
 
 MARIAN_BEAM_SIZE = 4
-MARIAN_MINI_BATCH = 512
-MARIAN_MAXI_BATCH = 1024
+MARIAN_MINI_BATCH = 256
+MARIAN_MAXI_BATCH = 512
+# MARIAN_MINI_BATCH = 512
+# MARIAN_MAXI_BATCH = 1024
 # MARIAN_MINI_BATCH = 768
 # MARIAN_MAXI_BATCH = 2048
 
diff --git a/lib/dist.mk b/lib/dist.mk
index 24158da0..10bb439b 100644
--- a/lib/dist.mk
+++ b/lib/dist.mk
@@ -88,6 +88,19 @@ endif
 	fi
 
 
+## only create the release if the model has converged (done-flag exists)
+## NOTE: the flag is tested when the makefile is parsed, not when the recipe runs
+.PHONY: release-if-done
+release-if-done:
+ifneq (${wildcard ${MODEL_DONE}},)
+	@${MAKE} release
+else
+	@echo "... not ready yet (${MODEL_DONE})"
+endif
+
+
+
+
 .PHONY: scores
 scores:
 	${MAKE} FIND_EVAL_FILES=1 ${WORKHOME}/eval/scores.txt
@@ -423,10 +436,10 @@ endif
 
 
 link-latest-model:
-	if [ `ls ${patsubst %.zip,%-*,${DIST_PACKAGE}} 2>/dev/null | wc -l` -gt 0 ]; then \
+	if [ `ls ${patsubst %.zip,%_*,${DIST_PACKAGE}} 2>/dev/null | wc -l` -gt 0 ]; then \
 	  rm -f ${DIST_PACKAGE}; \
 	  cd ${dir ${DIST_PACKAGE}}; \
-	  ln -s `ls -t ${patsubst %.zip,%-*.zip,$(notdir ${DIST_PACKAGE})} | head -1` \
+	  ln -s `ls -t ${patsubst %.zip,%_*.zip,$(notdir ${DIST_PACKAGE})} | head -1` \
 		${notdir ${DIST_PACKAGE}}; \
 	fi
 
diff --git a/lib/projects/elg.mk b/lib/projects/elg.mk
index 807e4d4c..025f95f2 100644
--- a/lib/projects/elg.mk
+++ b/lib/projects/elg.mk
@@ -29,14 +29,35 @@
 ELG_EU_LANGIDS = eng deu swe fin nld dan spa ces fra pol por lav ron est bul ell slk ita mlt slv hrv lit gle hun
 
 
-ELG_EU_SELECTED = gmq nld pol por lav ron est bul ell ita mlt slv hbs lit cel hun glg eus zle zls zlw tur ara heb sqi fin
+ELG_EU_SELECTED = nld pol por lav ron est bul ell ita mlt slv hbs lit cel hun glg eus tur ara heb sqi fin
 ELG_EU_SELECTED_MULTILANG = "ces slk" "cat oci spa" "por glg"
-ELG_EU_SELECTED_BIG = spa fra deu
+ELG_EU_SELECTED_BIG = gmq zle zls zlw spa fra deu
 
 # "fry ltz nds afr"
 # "cat oci"
 
 
+
+## run the evaluation targets for every selected language with *.npz files in its work dir
+## (the check is a shell glob: the loop variable is not visible to make's wildcard function)
+elg-eval:
+	for l in ${ELG_EU_SELECTED} ${ELG_EU_SELECTED_BIG}; do \
+	  if ls work/eng-$$l/*.npz >/dev/null 2>&1; then \
+	    ${MAKE} MODELTYPE=transformer-big tatoeba-eng2$${l}-eval-bt; \
+	    ${MAKE} MODELTYPE=transformer-big tatoeba-eng2$${l}-multieval-bt; \
+	    ${MAKE} MODELTYPE=transformer-big tatoeba-eng2$${l}-eval-testsets-bt; \
+	  fi; \
+	  if ls work/$${l}-eng/*.npz >/dev/null 2>&1; then \
+	    ${MAKE} MODELTYPE=transformer-big tatoeba-$${l}2eng-eval-bt; \
+	    ${MAKE} MODELTYPE=transformer-big tatoeba-$${l}2eng-multieval-bt; \
+	    ${MAKE} MODELTYPE=transformer-big tatoeba-$${l}2eng-eval-testsets-bt; \
+	  fi; \
+	done
+	for l in ${ELG_EU_SELECTED_MULTILANG}; do \
+	  ${MAKE} MODELTYPE=transformer-big SRCLANGS="$$l" TRGLANGS=eng eval-bt-tatoeba; \
+	  ${MAKE} MODELTYPE=transformer-big SRCLANGS="$$l" TRGLANGS=eng eval-testsets-bt-tatoeba; \
+	done
+
 elg-eng2all:
 	for l in ${ELG_EU_SELECTED_MULTILANG}; do \
 	  ${MAKE} MODELTYPE=transformer-big SRCLANGS=eng TRGLANGS="$$l" \
@@ -71,6 +92,7 @@ elg-all2eng:
 
 
 
+
 elg-eng2all-eval1:
 	for l in ${ELG_EU_SELECTED_MULTILANG}; do \
 	  ${MAKE} WALLTIME=1 MODELTYPE=transformer-big SRCLANGS=eng TRGLANGS="$$l" tatoeba-sublang-eval-bt.submit; \
@@ -95,6 +117,68 @@ elg-all2eng-eval:
 
 
 
+
+
+
+## job targets for individual models (all of them set MODELTYPE=transformer-big)
+elg-eng2cel:
+	${MAKE} MODELTYPE=transformer-big \
+		CLEAN_TRAINDATA_TYPE=clean \
+		CLEAN_DEVDATA_TYPE=clean \
+	tatoeba-eng2cel-trainjob-bt
+
+
+elg-ara2eng:
+	${MAKE} MODELTYPE=transformer-big \
+		MARIAN_EXTRA=--no-restore-corpus \
+		DATA_PREPARE_HPCPARAMS='CPUJOB_HPC_CORES=2 CPUJOB_HPC_MEM=16g CPUJOB_HPC_DISK=1000' \
+	tatoeba-ara2eng-trainjob-bt
+
+elg-zle2eng:
+	${MAKE} MODELTYPE=transformer-big \
+		MARIAN_EXTRA=--no-restore-corpus \
+		DATA_PREPARE_HPCPARAMS='CPUJOB_HPC_CORES=2 CPUJOB_HPC_MEM=16g CPUJOB_HPC_DISK=1000' \
+	tatoeba-zle2eng-trainjob-bt
+
+## NOTE: despite its name, this target runs tatoeba-job-bt in both directions
+## (eng -> "cat oci spa" / "por glg" and the same two groups -> eng)
+elg-multi2eng:
+	${MAKE} MODELTYPE=transformer-big \
+		MARIAN_EXTRA=--no-restore-corpus \
+		DATA_PREPARE_HPCPARAMS='CPUJOB_HPC_CORES=2 CPUJOB_HPC_MEM=16g CPUJOB_HPC_DISK=1000' \
+		SRCLANGS=eng TRGLANGS="cat oci spa" \
+	tatoeba-job-bt
+	${MAKE} MODELTYPE=transformer-big \
+		MARIAN_EXTRA=--no-restore-corpus \
+		DATA_PREPARE_HPCPARAMS='CPUJOB_HPC_CORES=2 CPUJOB_HPC_MEM=16g CPUJOB_HPC_DISK=1000' \
+		SRCLANGS=eng TRGLANGS="por glg" \
+	tatoeba-job-bt
+	${MAKE} MODELTYPE=transformer-big \
+		MARIAN_EXTRA=--no-restore-corpus \
+		DATA_PREPARE_HPCPARAMS='CPUJOB_HPC_CORES=2 CPUJOB_HPC_MEM=16g CPUJOB_HPC_DISK=1000' \
+		TRGLANGS=eng SRCLANGS="cat oci spa" \
+	tatoeba-job-bt
+	${MAKE} MODELTYPE=transformer-big \
+		MARIAN_EXTRA=--no-restore-corpus \
+		DATA_PREPARE_HPCPARAMS='CPUJOB_HPC_CORES=2 CPUJOB_HPC_MEM=16g CPUJOB_HPC_DISK=1000' \
+		TRGLANGS=eng SRCLANGS="por glg" \
+	tatoeba-job-bt
+
+
+elg-ces2eng:
+	${MAKE} MODELTYPE=transformer-big \
+		MARIAN_EXTRA=--no-restore-corpus \
+		TRGLANGS=eng SRCLANGS="ces slk" \
+	tatoeba-job-bt
+
+elg-eng2ces:
+	${MAKE} MODELTYPE=transformer-big \
+		MARIAN_EXTRA=--no-restore-corpus \
+		SRCLANGS=eng TRGLANGS="ces slk" \
+	tatoeba-job-bt
+
+
+
 ## test with separate vocabs
 elg-eng2slv:
 	${MAKE} MODELTYPE=transformer-big tatoeba-eng2slv-trainjob-bt-separate-spm; \
diff --git a/lib/tasks.mk b/lib/tasks.mk
index fe17c2df..1a4668bc 100644
--- a/lib/tasks.mk
+++ b/lib/tasks.mk
@@ -109,7 +109,7 @@ ifdef SLURM_JOBID
 	  echo "reached maximum number of repeated slurm jobs: ${SLURM_REPEAT}"; \
 	fi
 endif
-	${MAKE} ${WORKDIR}/${MODEL}.${MODELTYPE}.model${NR}.done
+	${MAKE} ${MODEL_DONE}
 
 vocab: ${MODEL_SRCVOCAB} ${MODEL_TRGVOCAB}
 translate: ${WORKDIR}/${TESTSET_NAME}.${MODEL}${NR}.${MODELTYPE}.${SRC}.${TRG}
diff --git a/lib/test.mk b/lib/test.mk
index 3f7dc871..41f54500 100644
--- a/lib/test.mk
+++ b/lib/test.mk
@@ -126,11 +126,16 @@ print-bleu-scores:
 
 LEADERBOARD_DIR = ${REPOHOME}scores
 
+## manipulating test set names is really messy
+## - need to remove language ID pairs
+## - could be different variants (2-letter codes, 3-letter codes)
+## - newstest sometimes has additional langpair-IDs in their names
+
 compare-bleu-score-table:
 	@grep BLEU ${WORKHOME}/*/*.eval |\
 	perl -pe 's#^${WORKHOME}/([^/]*)/([^\.]+)\.(.*?-.*?\.)?([^\.]+\.[^\.]+\.[^\.]+)\.([^\.]+)\.([^\.]+)\.eval:.*? = ([0-9\.]+) .*$$#$$5-$$6\t$$7\t$$2\t$$1\t$$4#' |\
 	perl -pe '@a=split(/\t/);if($$a[0]=~/multi/){$$a[0]=$$a[3];};$$_=join("\t",@a);' |\
-	perl -pe '@a=split(/\t/);$$a[2]=lc($$a[2]);$$a[2]=~s/^(news.*)\-[a-z]{6}/$$1/;$$a[2]=~s/^(news.*)\-[a-z]{4}/$$1/;if (-e "${LEADERBOARD_DIR}/$$a[0]/$$a[2]/bleu-scores.txt"){$$b=`head -1 ${LEADERBOARD_DIR}/$$a[0]/$$a[2]/bleu-scores.txt | cut -f1`;$$b+=0;}else{$$b=0;}$$d=$$a[1]-$$b;splice(@a,2,0,$$b,$$d);$$_=join("\t",@a);' |\
+	perl -pe '@a=split(/\t/);$$a[2]=lc($$a[2]);$$a[2]=~s/^(.*)\-[a-z]{4}$$/$$1/;$$a[2]=~s/^(.*)\-[a-z]{6}$$/$$1/;$$a[2]=~s/^(news.*)\-[a-z]{4}/$$1/;if (-e "${LEADERBOARD_DIR}/$$a[0]/$$a[2]/bleu-scores.txt"){$$b=`head -1 ${LEADERBOARD_DIR}/$$a[0]/$$a[2]/bleu-scores.txt | cut -f1`;$$b+=0;}else{$$b=0;}$$d=$$a[1]-$$b;splice(@a,2,0,$$b,$$d);$$_=join("\t",@a);' |\
 	sort -k5,5 -k1,1 -k2,2nr
 
 compare-bleu-scores:
@@ -145,3 +150,26 @@ print-decreased-models:
 	@make -s compare-bleu-scores |\
 	grep ' -[0-9]'
 
+
+## compare BLEU scores for the current model
+## - /dev/null is passed to grep so that matches carry the file name even for a single *.eval file
+## - NOTE(review): the perl pattern assumes WORKDIR is a direct subdirectory of WORKHOME - confirm
+.PHONY: compare-model-bleu-score-table compare-model-bleu-scores print-improved-bleu print-decreased-bleu
+compare-model-bleu-score-table:
+	@grep BLEU ${WORKDIR}/*.eval /dev/null |\
+	perl -pe 's#^${WORKHOME}/([^/]*)/([^\.]+)\.(.*?-.*?\.)?([^\.]+\.[^\.]+\.[^\.]+)\.([^\.]+)\.([^\.]+)\.eval:.*? = ([0-9\.]+) .*$$#$$5-$$6\t$$7\t$$2\t$$1\t$$4#' |\
+	perl -pe '@a=split(/\t/);if($$a[0]=~/multi/){$$a[0]=$$a[3];};$$_=join("\t",@a);' |\
+	perl -pe '@a=split(/\t/);$$a[2]=lc($$a[2]);$$a[2]=~s/^(.*)\-[a-z]{4}$$/$$1/;$$a[2]=~s/^(.*)\-[a-z]{6}$$/$$1/;$$a[2]=~s/^(news.*)\-[a-z]{4}$$/$$1/;if (-e "${LEADERBOARD_DIR}/$$a[0]/$$a[2]/bleu-scores.txt"){$$b=`head -1 ${LEADERBOARD_DIR}/$$a[0]/$$a[2]/bleu-scores.txt | cut -f1`;$$b+=0;}else{$$b=0;}$$d=$$a[1]-$$b;splice(@a,2,0,$$b,$$d);$$_=join("\t",@a);' |\
+	sort -k5,5 -k1,1 -k2,2nr
+
+compare-model-bleu-scores:
+	${MAKE} -s compare-model-bleu-score-table |\
+	perl -e 'printf "%15s %5s %5s %6s %-25s %-15s %s","langpair","BLEU","best","diff","testset","dir","model\n";while (<>){@a=split(/\t/);printf "%15s %5.2f %5.2f %6.2f %-25s %-15s %s",@a;}'
+
+print-improved-bleu:
+	@${MAKE} -s compare-model-bleu-scores |\
+	grep -v ' 0.00' | grep -v ' -[0-9]'
+
+print-decreased-bleu:
+	@${MAKE} -s compare-model-bleu-scores |\
+	grep ' -[0-9]'
diff --git a/models/de-sk/README.md b/models/de-sk/README.md
new file mode 100644
index 00000000..cc8d2bd0
--- /dev/null
+++ b/models/de-sk/README.md
@@ -0,0 +1,37 @@
+# opus_transformer-align_2022-02-19.zip
+
+* dataset: opus
+* model: transformer-align
+* source language(s): de
+* target language(s): sk
+* raw source language(s): de
+* raw target language(s): sk
+* model: transformer-align
+* pre-processing: normalization + SentencePiece (spm32k,spm32k)
+* download: [opus_transformer-align_2022-02-19.zip](https://object.pouta.csc.fi/OPUS-MT-models/de-sk/opus_transformer-align_2022-02-19.zip)
+## Training data: opus
+
+* de-sk: CCMatrix.de-sk.strict (24693931) DGT.de-sk.strict (3897093) ECB.de-sk.strict (86068) ELITR-ECA.de-sk.strict (43365) EMEA.de-sk.strict (747856) EUbookshop.de-sk.strict (335128) EUconst.de-sk.strict (6115) GNOME.de-sk.strict (127) JRC-Acquis.de-sk.strict (28389) KDE4.de-sk.strict (72205) KDEdoc.de-sk.strict (10801) MultiCCAligned.de-sk.strict (2415997) MultiParaCrawl.de-sk.strict (5281070) OpenSubtitles.de-sk.strict (3287374) PHP.de-sk.strict (25834) QED.de-sk.strict (130746) TED2020.de-sk.strict (93826) TildeMODEL.de-sk.strict (2011415) Ubuntu.de-sk.strict (1859) WikiMatrix.de-sk.strict (91079) XLEnt.de-sk.strict (229529) bible-uedin.de-sk.strict (30627) wikimedia.de-sk.strict (410)
+* de-sk: total size = 43520844
+* unused dev/test data is added to training data
+* total size (opus): 43548640
+
+
+## Validation data
+
+* de-sk: Europarl, 563387
+* total-size-shuffled: 550326
+
+* devset-selected: top 2500 lines of Europarl.src.shuffled
+* testset-selected: next 2500 lines of Europarl.src.shuffled
+* devset-unused: added to traindata
+
+* test set translations: [opus_transformer-align_2022-02-19.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/de-sk/opus_transformer-align_2022-02-19.test.txt)
+* test set scores: [opus_transformer-align_2022-02-19.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/de-sk/opus_transformer-align_2022-02-19.eval.txt)
+
+## Benchmarks
+
+| testset | BLEU | chr-F | #sent | #words | BP |
+|---------|-------|-------|-------|--------|----|
+| Europarl.de-sk | 29.2 | 0.56574 | 2500 | 59265 | 0.977 |
+
diff --git a/tatoeba/Makefile b/tatoeba/Makefile
index 32eee9fe..4eafb0c3 100644
--- a/tatoeba/Makefile
+++ b/tatoeba/Makefile
@@ -202,6 +202,25 @@ print-langgroups:
 
 
 
+
+## release all improved models
+## - check leaderboard scores
+## - get all models that have at least one improved BLEU score
+## - make a release if the model is done
+##
+## caveat: does not check for model parameters/types etc!
+.PHONY: release-improved-models
+release-improved-models:
+	( p=`${MAKE} -s compare-bleu-score-table-tatoeba | \
+	     grep -v ' 0 ' | grep -v ' -[0-9]' | \
+	     cut -f6 | sort -u | xargs`; \
+	  for l in $$p; do \
+	    s=`echo "$$l" | cut -f1 -d-`; \
+	    t=`echo "$$l" | cut -f2 -d-`; \
+	    ${MAKE} SRCLANGS="$$s" TRGLANGS="$$t" release-if-done-tatoeba; \
+	  done )
+
+
 ###############################################################################
 ## generic targets for evaluating multilingual models (all supported lang-pairs)
 ###############################################################################