From 6f2c9d8bd53c70da52a20d03ec61245250c4976d Mon Sep 17 00:00:00 2001
From: Joerg Tiedemann
Date: Fri, 25 Mar 2022 13:53:57 +0200
Subject: [PATCH 1/2] fixed bug in filtering forward translation recipe

---
Review notes (free-form commentary after the "---" marker; it is outside the
commit message and outside the diff):

* lib/generic.mk: the removed test was `[ ! -e <output of ls ... | head -1> ]`.
  When no *.best${FT_SELECTED}.gz file exists the substitution is empty and the
  test collapses to `[ ! -e ]`, which the two-argument form of test(1) treats
  as "negate the non-empty string -e", i.e. false. The extraction step was
  therefore skipped exactly when it was needed. The new test counts matching
  directory entries and runs the extraction when the count is 0.
* The new grep pattern "best${FT_SELECTED}.gz" is an unanchored regular
  expression, so "." matches any character.
  NOTE(review): probably harmless for these file names - confirm.
* lib/projects/elg.mk: elg-zle2fin-tiny11 passes LANGPAIRSTR="zle-ukr" together
  with TRGLANGS=fin. Patch 2/2 renames the target to elg-zle2fin-student and
  changes the value to "zle-fin-tiny".
* NOTE(review): the line structure of this series was restored from the hunk
  headers. The position of blank context lines, the intra-line indentation and
  the diffstat alignment are best-effort; check them against the repository
  before applying.

 lib/generic.mk      |  3 ++-
 lib/projects/elg.mk | 33 ++++++++++++++++++++++++++++++++-
 2 files changed, 34 insertions(+), 2 deletions(-)

diff --git a/lib/generic.mk b/lib/generic.mk
index bda8633a..75263504 100644
--- a/lib/generic.mk
+++ b/lib/generic.mk
@@ -285,7 +285,8 @@ FT_SELECTED ?= 95
 	@for s in ${SRCLANGS}; do \
 	  for t in ${TRGLANGS}; do \
 	    if [ -e ${FORWARDTRANS_HOME}/$$s-$$t/latest ]; then \
-	      if [ ! -e `ls ${FORWARDTRANS_HOME}/$$s-$$t/latest/*.best${FT_SELECTED}.gz | head -1` ]; then \
+	      if [ `ls ${FORWARDTRANS_HOME}/$$s-$$t/latest/ | grep "best${FT_SELECTED}.gz" | wc -l` -eq 0 ]; then \
+		echo "... extract best translations from $$s-$$t forward translations"; \
 		${MAKE} -C ${FORWARDTRANS_HOME} SRC=$$s TRG=$$t \
 			RETAIN=${FT_SELECTED} extract-best-translations; \
 	      fi \
diff --git a/lib/projects/elg.mk b/lib/projects/elg.mk
index 4a176d5d..8f94e1fa 100644
--- a/lib/projects/elg.mk
+++ b/lib/projects/elg.mk
@@ -86,9 +86,14 @@ elg-ukr-students:
 elg-test-tiny2:
 	${MAKE} EMAIL= SRCLANGS=ukr TRGLANGS=eng test-tiny11-student
 	${MAKE} EMAIL= SRCLANGS=eng TRGLANGS=ukr test-tiny11-student
-	${MAKE} EMAIL= SRCLANGS=ukr TRGLANGS=deu STUDENT_DATA=ftbest-bt-nopar test-tiny11-student
+	${MAKE} EMAIL= SRCLANGS=eng TRGLANGS=ukr STUDENT_DATA=ftbest-bt-nopar test-tiny11-student
+	${MAKE} EMAIL= SRCLANGS=deu TRGLANGS=ukr test-tiny11-student
 	${MAKE} EMAIL= SRCLANGS=deu TRGLANGS=ukr STUDENT_DATA=ftbest-bt-nopar test-tiny11-student
 	${MAKE} EMAIL= SRCLANGS=deu TRGLANGS=ukr STUDENT_DATA=ftbest-ftmono-nopar test-tiny11-student
+	${MAKE} EMAIL= SRCLANGS=deu TRGLANGS=ukr STUDENT_DATA=pft-pbt-bt test-tiny11-student
+	${MAKE} EMAIL= SRCLANGS=ukr TRGLANGS=deu test-tiny11-student
+	${MAKE} EMAIL= SRCLANGS=ukr TRGLANGS=deu STUDENT_DATA=ftbest-bt-nopar test-tiny11-student
+	${MAKE} EMAIL= SRCLANGS=ukr TRGLANGS=deu STUDENT_DATA=ftbest-ftmono-nopar test-tiny11-student
 	${MAKE} EMAIL= SRCLANGS="ces slk" TRGLANGS=ukr STUDENT_DATA=pft-pbt-bt test-tiny11-student
 	${MAKE} EMAIL= SRCLANGS=gmq TRGLANGS=ukr STUDENT_DATA=pft-pbt-bt test-tiny11-student
 
@@ -229,6 +234,24 @@ elg-ukr2deu-student3:
 
 
 
+elg-fin2ukr-student:
+	${MAKE} SRCLANGS=fin TRGLANGS=ukr CHECK_TRAINDATA_SIZE=1 CLEAN_CORPUS_TRAINING_DATA=1 STUDENT_DATA=ftbest-ftmono-nopar train-tiny11-student
+
+elg-ukr2fin-student:
+	${MAKE} SRCLANGS=ukr TRGLANGS=fin CHECK_TRAINDATA_SIZE=1 CLEAN_CORPUS_TRAINING_DATA=1 STUDENT_DATA=ftbest-ftmono-nopar train-tiny11-student
+
+elg-zle2fin-tiny11:
+	${MAKE} MARIAN_EXTRA=--no-restore-corpus \
+		DATA_PREPARE_HPCPARAMS='${DATA_PREPARE_HPCPARAMS} CPUJOB_HPC_DISK=1000' \
+		DATA_ALIGN_HPCPARAMS="${DATA_ALIGN_HPCPARAMS} CPUJOB_HPC_DISK=1000" \
+		CHECK_TRAINDATA_SIZE=1 CLEAN_CORPUS_TRAINING_DATA=1 \
+		STUDENT_DATA=ftbest-ftmono-nopar SRCLANGS="ukr rus" TRGLANGS=fin \
+		LANGPAIRSTR="zle-ukr" train-tiny11-student
+
+
+
+
+
 elg-spa2ukr-student:
 	${MAKE} MARIAN_EARLY_STOPPING=15 SRCLANGS=spa TRGLANGS=ukr train-tiny11-student
 
@@ -252,6 +275,9 @@ elg-ukr2eng-student:
 elg-eng2ukr-student2:
 	${MAKE} MARIAN_EARLY_STOPPING=15 STUDENT_DATA=ftbest-bt-nopar SRCLANGS=eng TRGLANGS=ukr train-tiny11-student
 
+elg-ukr2eng-student2:
+	${MAKE} CONTINUE_EXISTING=1 MARIAN_EARLY_STOPPING=15 SRCLANGS=ukr TRGLANGS=eng STUDENT_DATA=ftbest-ftmono-nopar train-tiny11-student
+
 
 ## missing evaluations and dist packages
 
@@ -282,6 +308,11 @@ elg-dist-missing:
 
 
 
+elg-zle2fin-pivot:
+	${MAKE} MODELTYPE=transformer-big tatoeba-zle2fin-trainjob-pbt-pft-bt
+
+
+
 elg-continue-missing:
 	for l in deu fra ita por spa; do \
 	  ${MAKE} MODELTYPE=transformer-big tatoeba-$${l}2zle-trainjob; \

From d45e7aaf7c5f1eaef70a43b4fa0ff0de08517630 Mon Sep 17 00:00:00 2001
From: Joerg Tiedemann
Date: Sat, 9 Apr 2022 20:12:03 +0300
Subject: [PATCH 2/2] changes on mahti

---
Review notes (free-form commentary after the "---" marker; it is outside the
commit message and outside the diff):

* lib/config.mk: with CONTINUE_EXISTING=1, MODEL_LATEST now globs
  *.model[0-9].npz.best-perplexity.npz instead of *.best-perplexity.npz.
* lib/tasks.mk: the new print-info target echoes MODEL_LATEST but its own
  `ls -t` still lists the old *.best-perplexity.npz globs, so the listing can
  differ from the files MODEL_LATEST was chosen from. It also runs ls without
  `2>/dev/null`, so a glob that matches nothing makes the recipe line fail;
  tatoeba-%-info wraps the call in a "-" prefixed sub-shell, a direct
  `make print-info` does not.
  NOTE(review): confirm whether the old globs are intentional for debugging.
* lib/data.mk: EXTRA_BITEXTS_SRC/TRG are only defined when USE_EXTRA_BITEXTS=1
  and are appended to CLEAN_TRAIN_SRC/TRG. The new %-xb pattern rule in
  lib/generic.mk sets that flag and appends "+xb" to DATASET.
* lib/projects/elg.mk: elg-zle2fin-tiny11 is renamed to elg-zle2fin-student
  and its LANGPAIRSTR changes from "zle-ukr" to "zle-fin-tiny".
* NOTE(review): the line structure of this series was restored from the hunk
  headers. The position of blank context lines, the intra-line indentation and
  the diffstat alignment are best-effort; check them against the repository
  before applying.

 lib/config.mk       |  4 ++--
 lib/data.mk         | 16 ++++++++++++++--
 lib/env/mahti.mk    |  1 +
 lib/generic.mk      |  5 +++++
 lib/projects/elg.mk | 38 ++++++++++++++++++++++++++++++++------
 lib/tasks.mk        | 12 ++++++++++++
 tatoeba/Makefile    | 12 +++++++++++-
 7 files changed, 77 insertions(+), 11 deletions(-)

diff --git a/lib/config.mk b/lib/config.mk
index 48db8578..75ecfc86 100644
--- a/lib/config.mk
+++ b/lib/config.mk
@@ -557,8 +557,8 @@ ifeq (${CONTINUE_EXISTING},1)
   MODEL_LATEST = $(firstword \
 	${shell ls -t ${WORKDIR}/*${MODEL_VARIANT}.${PRE_SRC}-${PRE_TRG}.${subst -align,,${MODELTYPE}}.model[0-9].npz \
 		${WORKDIR}/*${MODEL_VARIANT}.${PRE_SRC}-${PRE_TRG}.${subst -align,,${MODELTYPE}}-align.model[0-9].npz \
-		${WORKDIR}/*${MODEL_VARIANT}.${PRE_SRC}-${PRE_TRG}.${subst -align,,${MODELTYPE}}.best-perplexity.npz \
-		${WORKDIR}/*${MODEL_VARIANT}.${PRE_SRC}-${PRE_TRG}.${subst -align,,${MODELTYPE}}-align.best-perplexity.npz \
+		${WORKDIR}/*${MODEL_VARIANT}.${PRE_SRC}-${PRE_TRG}.${subst -align,,${MODELTYPE}}.model[0-9].npz.best-perplexity.npz \
+		${WORKDIR}/*${MODEL_VARIANT}.${PRE_SRC}-${PRE_TRG}.${subst -align,,${MODELTYPE}}-align.model[0-9].npz.best-perplexity.npz \
 		2>/dev/null | grep -v 'tuned4' })
   MODEL_LATEST_VOCAB = $(shell echo "${MODEL_LATEST}" | \
 		sed 's|\.${PRE_SRC}-${PRE_TRG}\..*$$|.${PRE_SRC}-${PRE_TRG}.vocab.yml|')
diff --git a/lib/data.mk b/lib/data.mk
index 9d5fc265..4f9af99c 100644
--- a/lib/data.mk
+++ b/lib/data.mk
@@ -119,6 +119,14 @@ ifeq (${USE_PIVOTING},1)
 endif
 
 
+# additional data sets that might be available ...
+ifeq (${USE_EXTRA_BITEXTS},1)
+  EXTRA_BITEXTS_SRC = ${sort ${wildcard ${DATADIR}/extra/${SRC}-${TRG}/*.${SRCEXT}.gz}}
+  EXTRA_BITEXTS_TRG = ${patsubst %.${SRCEXT}.gz,%.${TRGEXT}.gz,${EXTRA_BITEXTS_SRC}}
+endif
+
+
+
 print-datasets:
 	-@for s in ${SRCLANGS}; do \
 	  for t in ${TRGLANGS}; do \
@@ -144,6 +152,10 @@ print-datasets-current-langpair:
 	@echo "pivot-based translation data:"
 	@echo ${PIVOTING_SRC}
 	@echo ${PIVOTING_TRG}
+	@echo "extra bitexts:"
+	@echo ${EXTRA_BITEXTS_SRC}
+	@echo ${EXTRA_BITEXTS_TRG}
+
 
 ##-------------------------------------------------------------
 ## data sets (train/dev/test)
@@ -153,9 +165,9 @@ print-datasets-current-langpair:
 ## with some basic pre-processing (see lib/preprocess.mk)
 
 CLEAN_TRAIN_SRC = ${patsubst %,${DATADIR}/${PRE}/%.${LANGPAIR}.${CLEAN_TRAINDATA_TYPE}.${SRCEXT}.gz,${TRAINSET}} \
-		${BACKTRANS_SRC} ${FORWARDTRANS_SRC} ${FORWARDTRANSMONO_SRC} ${PIVOTING_SRC}
+		${BACKTRANS_SRC} ${FORWARDTRANS_SRC} ${FORWARDTRANSMONO_SRC} ${PIVOTING_SRC} ${EXTRA_BITEXTS_SRC}
 CLEAN_TRAIN_TRG = ${patsubst %,${DATADIR}/${PRE}/%.${LANGPAIR}.${CLEAN_TRAINDATA_TYPE}.${TRGEXT}.gz,${TRAINSET}} \
-		${BACKTRANS_TRG} ${FORWARDTRANS_TRG} ${FORWARDTRANSMONO_TRG} ${PIVOTING_TRG}
+		${BACKTRANS_TRG} ${FORWARDTRANS_TRG} ${FORWARDTRANSMONO_TRG} ${PIVOTING_TRG} ${EXTRA_BITEXTS_TRG}
 
 CLEAN_DEV_SRC = ${patsubst %,${DATADIR}/${PRE}/%.${LANGPAIR}.${CLEAN_DEVDATA_TYPE}.${SRCEXT}.gz,${DEVSET}}
 CLEAN_DEV_TRG = ${patsubst %.${SRCEXT}.gz,%.${TRGEXT}.gz,${CLEAN_DEV_SRC}}
diff --git a/lib/env/mahti.mk b/lib/env/mahti.mk
index 98ecb8d2..a752e711 100644
--- a/lib/env/mahti.mk
+++ b/lib/env/mahti.mk
@@ -8,6 +8,7 @@ DATA_PREPARE_HPCPARAMS = CPUJOB_HPC_CORES=2 CPUJOB_HPC_MEM=16g
 DATA_ALIGN_HPCPARAMS = CPUJOB_HPC_CORES=128 CPUJOB_HPC_JOBS=20 CPUJOB_HPC_MEM=128g
 
 
+# CSCPROJECT = project_2002982
 CSCPROJECT = project_2002688
 # CSCPROJECT = project_2005625
 WORKHOME = ${shell realpath ${PWD}/work}
diff --git a/lib/generic.mk b/lib/generic.mk
index 75263504..0d1df079 100644
--- a/lib/generic.mk
+++ b/lib/generic.mk
@@ -245,6 +245,11 @@ listallmodels:
 	${MAKE} DATASET=${DATASET}+bt USE_BACKTRANS=1 SHUFFLE_TRAINING_DATA=1 ${@:-bt=}
 
 
+## include additional bitexts
+%-xb:
+	${MAKE} DATASET=${DATASET}+xb USE_EXTRA_BITEXTS=1 SHUFFLE_TRAINING_DATA=1 ${@:-xb=}
+
+
 ## adding a pivot language to the model
 ## --> add pivot language to each side (source and target)
 ## --> only start the task if the pivot language adds anything on either side
diff --git a/lib/projects/elg.mk b/lib/projects/elg.mk
index 8f94e1fa..234ea76f 100644
--- a/lib/projects/elg.mk
+++ b/lib/projects/elg.mk
@@ -190,6 +190,14 @@ elg-ukr2lit-tiny11:
 	${MAKE} STUDENT_DATA=pft-pbt-bt SRCLANGS=ukr TRGLANGS=lit train-tiny11-student
 
 
+elg-pol2ukr-student2:
+	${MAKE} MARIAN_EXTRA=--no-restore-corpus MARIAN_EARLY_STOPPING=15 CHECK_TRAINDATA_SIZE=1 CLEAN_CORPUS_TRAINING_DATA=1 STUDENT_DATA=pft-pbt-bt-xb SRCLANGS=pol TRGLANGS=ukr train-tiny11-student
+
+elg-ukr2pol-student2:
+	${MAKE} MARIAN_EXTRA=--no-restore-corpus MARIAN_EARLY_STOPPING=15 CHECK_TRAINDATA_SIZE=1 CLEAN_CORPUS_TRAINING_DATA=1 STUDENT_DATA=pft-pbt-bt-xb SRCLANGS=ukr TRGLANGS=pol train-tiny11-student
+
+
+
 
 
 elg-ces_slk2ukr-tiny11:
@@ -233,22 +241,31 @@ elg-ukr2deu-student3:
 
 
 
+elg-fin2ukr-student2:
+	${MAKE} SUBWORD_VOCAB_SIZE=16000 MARIAN_EARLY_STOPPING=15 SRCLANGS=fin TRGLANGS=ukr CHECK_TRAINDATA_SIZE=1 CLEAN_CORPUS_TRAINING_DATA=1 STUDENT_DATA=ftbest-ftmono-nopar train-tiny11-student
+
 
 elg-fin2ukr-student:
-	${MAKE} SRCLANGS=fin TRGLANGS=ukr CHECK_TRAINDATA_SIZE=1 CLEAN_CORPUS_TRAINING_DATA=1 STUDENT_DATA=ftbest-ftmono-nopar train-tiny11-student
+	${MAKE} MARIAN_EARLY_STOPPING=15 SRCLANGS=fin TRGLANGS=ukr CHECK_TRAINDATA_SIZE=1 CLEAN_CORPUS_TRAINING_DATA=1 STUDENT_DATA=ftbest-ftmono-nopar train-tiny11-student
 
 elg-ukr2fin-student:
-	${MAKE} SRCLANGS=ukr TRGLANGS=fin CHECK_TRAINDATA_SIZE=1 CLEAN_CORPUS_TRAINING_DATA=1 STUDENT_DATA=ftbest-ftmono-nopar train-tiny11-student
+	${MAKE} MARIAN_EXTRA=--no-restore-corpus MARIAN_EARLY_STOPPING=15 SRCLANGS=ukr TRGLANGS=fin CHECK_TRAINDATA_SIZE=1 CLEAN_CORPUS_TRAINING_DATA=1 STUDENT_DATA=ftbest-ftmono-nopar train-tiny11-student
 
-elg-zle2fin-tiny11:
-	${MAKE} MARIAN_EXTRA=--no-restore-corpus \
+elg-zle2fin-student:
+	${MAKE} MARIAN_EXTRA=--no-restore-corpus MARIAN_EARLY_STOPPING=15 \
 		DATA_PREPARE_HPCPARAMS='${DATA_PREPARE_HPCPARAMS} CPUJOB_HPC_DISK=1000' \
 		DATA_ALIGN_HPCPARAMS="${DATA_ALIGN_HPCPARAMS} CPUJOB_HPC_DISK=1000" \
 		CHECK_TRAINDATA_SIZE=1 CLEAN_CORPUS_TRAINING_DATA=1 \
 		STUDENT_DATA=ftbest-ftmono-nopar SRCLANGS="ukr rus" TRGLANGS=fin \
-		LANGPAIRSTR="zle-ukr" train-tiny11-student
+		LANGPAIRSTR="zle-fin-tiny" train-tiny11-student
 
 
+elg-fin2rus-student:
+	${MAKE} MARIAN_EARLY_STOPPING=15 SRCLANGS=fin TRGLANGS=rus CHECK_TRAINDATA_SIZE=1 CLEAN_CORPUS_TRAINING_DATA=1 STUDENT_DATA=ftbest-ftmono-nopar train-tiny11-student
+
+elg-rus2fin-student:
+	${MAKE} MARIAN_EARLY_STOPPING=15 SRCLANGS=rus TRGLANGS=fin CHECK_TRAINDATA_SIZE=1 CLEAN_CORPUS_TRAINING_DATA=1 STUDENT_DATA=ftbest-ftmono-nopar train-tiny11-student
+
 
 
 
@@ -309,7 +326,10 @@ elg-dist-missing:
 
 
 elg-zle2fin-pivot:
-	${MAKE} MODELTYPE=transformer-big tatoeba-zle2fin-trainjob-pbt-pft-bt
+	${MAKE} MARIAN_EXTRA=--no-restore-corpus MODELTYPE=transformer-big tatoeba-zle2fin-trainjob-pbt-pft-bt
+
+elg-fin2zle-pivot:
+	${MAKE} MARIAN_EXTRA=--no-restore-corpus MODELTYPE=transformer-big tatoeba-fin2zle-trainjob-pbt-pft-bt
 
 
 
@@ -404,6 +424,12 @@ elg-eval-big2zle:
 	  ${MAKE} MODELTYPE=transformer-big tatoeba-$${l}2zle-eval-testsets; \
 	done
 
+elg-eng2zle-xb:
+	${MAKE} MARIAN_EXTRA=--no-restore-corpus MODELTYPE=transformer-big CONTINUE_EXISTING=1 tatoeba-eng2zle-trainjob-bt-xb
+
+elg-zle2eng-xb:
+	${MAKE} MARIAN_EARLY_STOPPING=25 MARIAN_EXTRA=--no-restore-corpus MODELTYPE=transformer-big CONTINUE_EXISTING=1 tatoeba-zle2eng-trainjob-bt-xb
+
 
 elg-pivot-eval:
 	for l in dan swe fin deu ron tur; do \
diff --git a/lib/tasks.mk b/lib/tasks.mk
index 1a4668bc..b191059a 100644
--- a/lib/tasks.mk
+++ b/lib/tasks.mk
@@ -345,3 +345,15 @@ job1-step3:
 	${MAKE} all-bt
 
 
+
+
+print-info:
+	@echo "model file: ${MODEL_START}"
+	@echo "source vocab: ${MODEL_SRCVOCAB}"
+	@echo "target vocab: ${MODEL_TRGVOCAB}"
+	@echo "final model file: ${MODEL_FINAL}"
+	@echo "latest compatible model: ${MODEL_LATEST}"
+	ls -t ${WORKDIR}/*${MODEL_VARIANT}.${PRE_SRC}-${PRE_TRG}.${subst -align,,${MODELTYPE}}.model[0-9].npz \
+		${WORKDIR}/*${MODEL_VARIANT}.${PRE_SRC}-${PRE_TRG}.${subst -align,,${MODELTYPE}}-align.model[0-9].npz \
+		${WORKDIR}/*${MODEL_VARIANT}.${PRE_SRC}-${PRE_TRG}.${subst -align,,${MODELTYPE}}.best-perplexity.npz \
+		${WORKDIR}/*${MODEL_VARIANT}.${PRE_SRC}-${PRE_TRG}.${subst -align,,${MODELTYPE}}-align.best-perplexity.npz
diff --git a/tatoeba/Makefile b/tatoeba/Makefile
index 695308fe..87162f7b 100644
--- a/tatoeba/Makefile
+++ b/tatoeba/Makefile
@@ -348,7 +348,6 @@ tatoeba-%-train:
 		TATOEBA_TRGLANG_GROUP="`langgroup -n $$t`" \
 	  all-groupsize-limits; )
 
-
 ## start the training job
 ## - create config file
 ## - create data sets
@@ -383,6 +382,17 @@ tatoeba-%-pivotlang:
 	fi
 
 
+tatoeba-%-info:
+	-( s=$(firstword $(subst 2, ,$(patsubst tatoeba-%-info,%,$@))); \
+	   t=$(lastword $(subst 2, ,$(patsubst tatoeba-%-info,%,$@))); \
+	   S="${call find-srclanggroup,${patsubst tatoeba-%-info,%,$@},${PIVOT}}"; \
+	   T="${call find-trglanggroup,${patsubst tatoeba-%-info,%,$@},${PIVOT}}"; \
+	   ${MAKE} LANGPAIRSTR=$$s-$$t SRCLANGS="$$S" TRGLANGS="$$T" \
+		TATOEBA_SRCLANG_GROUP="`langgroup -n $$s`" \
+		TATOEBA_TRGLANG_GROUP="`langgroup -n $$t`" \
+	   print-info; )
+
+
 ## evaluate with the model-specific test set
 tatoeba-%-eval:
 	( s=$(firstword $(subst 2, ,$(patsubst tatoeba-%-eval,%,$@))); \