mirror of
https://github.com/Helsinki-NLP/OPUS-MT-train.git
synced 2024-11-27 11:03:13 +03:00
Merge remote-tracking branch 'origin/elg-mahti' into puhti
This commit is contained in:
commit
1d802e13f9
@ -502,10 +502,9 @@ ifeq (${CLEAN_CORPUS_TRAINING_DATA},1)
|
||||
${LOCAL_TRAIN_SRC} $(SRCEXT) $(TRGEXT) \
|
||||
${LOCAL_TRAIN_SRC}.clean \
|
||||
${MIN_NR_TOKENS} ${MAX_NR_TOKENS}
|
||||
@mv -f ${LOCAL_TRAIN_SRC}.clean,${SRCEXT} ${LOCAL_TRAIN_SRC}
|
||||
@mv -f ${LOCAL_TRAIN_SRC}.clean,${TRGEXT} ${LOCAL_TRAIN_TRG}
|
||||
@rm -f ${LOCAL_TRAIN_SRC} ${LOCAL_TRAIN_SRC}.${SRCEXT}
|
||||
@rm -f ${LOCAL_TRAIN_SRC} ${LOCAL_TRAIN_SRC}.${TRGEXT}
|
||||
@mv -f ${LOCAL_TRAIN_SRC}.clean.${SRCEXT} ${LOCAL_TRAIN_SRC}
|
||||
@mv -f ${LOCAL_TRAIN_SRC}.clean.${TRGEXT} ${LOCAL_TRAIN_TRG}
|
||||
@rm -f ${LOCAL_TRAIN_SRC}.${SRCEXT} ${LOCAL_TRAIN_SRC}.${TRGEXT}
|
||||
endif
|
||||
ifeq (${SHUFFLE_TRAINING_DATA},1)
|
||||
@echo ".... shuffle complete training data"
|
||||
|
@ -125,7 +125,7 @@ elg-ukr2fin-tiny11:
|
||||
|
||||
|
||||
elg-gmq2ukr-tiny11:
|
||||
${MAKE} STUDENT_DATA=pft-pbt-bt SRCLANGS="dan isl nno nob nor swe" TRGLANGS=ukr LANGPAIRSTR="gmq-ukr" train-tiny11-student
|
||||
${MAKE} MARIAN_EXTRA=--no-restore-corpus CHECK_TRAINDATA_SIZE=1 CLEAN_CORPUS_TRAINING_DATA=1 STUDENT_DATA=pft-pbt-bt SRCLANGS="dan isl nno nob nor swe" TRGLANGS=ukr LANGPAIRSTR="gmq-ukr" train-tiny11-student
|
||||
|
||||
|
||||
|
||||
@ -166,10 +166,10 @@ elg-ukr2lit-tiny11:
|
||||
|
||||
|
||||
elg-deu2ukr-tiny11:
|
||||
${MAKE} MARIAN_EXTRA=--no-restore-corpus STUDENT_DATA=pft-pbt-bt SRCLANGS=deu TRGLANGS=ukr train-tiny11-student
|
||||
${MAKE} MARIAN_EXTRA=--no-restore-corpus CHECK_TRAINDATA_SIZE=1 CLEAN_CORPUS_TRAINING_DATA=1 STUDENT_DATA=pft-pbt-bt SRCLANGS=deu TRGLANGS=ukr train-tiny11-student
|
||||
|
||||
elg-ukr2deu-tiny11:
|
||||
${MAKE} STUDENT_DATA=pft-pbt-bt SRCLANGS=ukr TRGLANGS=deu train-tiny11-student
|
||||
${MAKE} MARIAN_EXTRA=--no-restore-corpus STUDENT_DATA=pft-pbt-bt SRCLANGS=ukr CHECK_TRAINDATA_SIZE=1 CLEAN_CORPUS_TRAINING_DATA=1 TRGLANGS=deu train-tiny11-student
|
||||
|
||||
|
||||
|
||||
|
@ -68,11 +68,32 @@ SORTED_LANGPAIR = ${firstword ${SORTLANGS}}-${lastword ${SORTLANGS}}
|
||||
PWD := $(shell pwd)
|
||||
|
||||
|
||||
## new style of finding the best model for a given language pair:
|
||||
## use the leaderboard in this repository!
|
||||
## - use all benchmarks
|
||||
## - take the model that is mentioned the most
|
||||
## - count results from Tatoeba only once
|
||||
## Problem: only the first tatoeba benchmark will be used
|
||||
## (but this will be the oldest one)
|
||||
|
||||
LEADERBOARD_HOME = ../../scores
|
||||
|
||||
## best-opusmt-model: print the URL of the model that is listed first on the
## largest number of leaderboard benchmarks.
##
##   ${1} = leaderboard sub-directory (called with a language pair, e.g. ${LANGPAIR})
##   ${2} = basename of the score files to read (e.g. bleu-scores)
##
## usage: ${call best-opusmt-model,${LANGPAIR},bleu-scores}
##
## pipeline:
##   - grep -H . : prefix every non-empty line of each */${2}.txt with its file name
##   - sed       : drop the "txt:<score>" part and fold all dated
##                 tatoeba-test-vYYYY-MM-DD names into plain "tatoeba-test"
##   - rev | uniq -f1 | rev : of adjacent lines that differ only in their last
##                 field, keep the first one, i.e. one line per benchmark
##                 (presumably the top-scoring entry; confirm that the score
##                 files are sorted best-first)
##   - cut -f2   : keep the second tab-separated field (expected: the model URL)
##   - sort | uniq -c | sort -nr | head -1 : most frequent URL; the sort in
##                 front of uniq is required because uniq only merges ADJACENT
##                 duplicates and the URLs arrive in benchmark order
##   - sed       : strip the leading count, leaving only the http... URL
##
## must stay a recursively expanded variable (=) so that ${1}/${2} are bound by ${call ...}

best-opusmt-model = ${shell grep -H . ${LEADERBOARD_HOME}/${1}/*/${2}.txt | \
    sed 's/txt:[0-9\.]*//' | sed -r 's/tatoeba-test-v[0-9]{4}-[0-9]{2}-[0-9]{2}/tatoeba-test/' | \
    rev | uniq -f1 | rev | cut -f2 | sort | uniq -c | sort -nr | head -1 | sed 's/^.*http/http/'}
|
||||
MODELZIP := ${call best-opusmt-model,${LANGPAIR},bleu-scores}
|
||||
MODELINFO := ${MODELZIP:.zip=.yml}
|
||||
MODELNAME := ${patsubst %.zip,%,${notdir ${MODELZIP}}}
|
||||
|
||||
|
||||
## old style: query the list of released models
|
||||
## problem with that: the first one on the list might not be the best one
|
||||
|
||||
# MODELZIP := ${shell ${WGET} -qq -O - ${TATOEBA_RELEASED} | grep '^${LANGPAIR}' | head -1 | cut -f4}
|
||||
# MODELINFO = ${patsubst ${TATOEBA_MODEL_STORAGE}/%.zip,${TATOEBA_GITRAW}/models/%.yml,${MODELZIP}}
|
||||
# MODELNAME = ${patsubst %.zip,%,${notdir ${MODELZIP}}}
|
||||
|
||||
|
||||
# MODELZIP = https://object.pouta.csc.fi/Tatoeba-Challenge/ang.tar
|
||||
MODELZIP := ${shell ${WGET} -qq -O - ${TATOEBA_RELEASED} | grep '^${LANGPAIR}' | head -1 | cut -f4}
|
||||
MODELINFO = ${patsubst ${TATOEBA_MODEL_STORAGE}/%.zip,${TATOEBA_GITRAW}/models/%.yml,${MODELZIP}}
|
||||
MODELNAME = ${patsubst %.zip,%,${notdir ${MODELZIP}}}
|
||||
|
||||
MULTI_TARGET_MODEL := ${shell ${WGET} -qq -O - ${MODELINFO} | grep 'use-target-labels' | wc -l}
|
||||
ifneq (${MULTI_TARGET_MODEL},0)
|
||||
@ -154,10 +175,14 @@ print-modelname:
|
||||
##-------------------------------------------
|
||||
|
||||
REV_LANGPAIR = ${TRG}-${SRC}
|
||||
REV_MODELZIP := ${shell ${WGET} -qq -O - ${TATOEBA_RELEASED} | grep '^${REV_LANGPAIR}' | head -1 | cut -f4}
|
||||
REV_MODELINFO = ${patsubst ${TATOEBA_MODEL_STORAGE}/%.zip,${TATOEBA_GITRAW}/models/%.yml,${REV_MODELZIP}}
|
||||
REV_MODELZIP := ${call best-opusmt-model,${REV_LANGPAIR},bleu-scores}
|
||||
REV_MODELINFO := ${REV_MODELZIP:.zip=.yml}
|
||||
REV_MODELNAME = ${patsubst %.zip,%,${notdir ${REV_MODELZIP}}}
|
||||
|
||||
# REV_MODELZIP := ${shell ${WGET} -qq -O - ${TATOEBA_RELEASED} | grep '^${REV_LANGPAIR}' | head -1 | cut -f4}
|
||||
# REV_MODELINFO = ${patsubst ${TATOEBA_MODEL_STORAGE}/%.zip,${TATOEBA_GITRAW}/models/%.yml,${REV_MODELZIP}}
|
||||
# REV_MODELNAME = ${patsubst %.zip,%,${notdir ${REV_MODELZIP}}}
|
||||
|
||||
REV_MULTI_TARGET_MODEL := ${shell ${WGET} -qq -O - ${REV_MODELINFO} | grep 'use-target-labels' | wc -l}
|
||||
ifeq (${REV_MULTI_TARGET_MODEL},1)
|
||||
REV_SRC_PREPROCESS_ARGS = ${TRG} ${SRC} ${REV_LANGPAIR}/${REV_MODELNAME}/source.spm
|
||||
|
Loading…
Reference in New Issue
Block a user