mirror of
https://github.com/Helsinki-NLP/OPUS-MT-train.git
synced 2024-11-30 12:32:24 +03:00
language group jobs with some more documentation
This commit is contained in:
parent
62c9414122
commit
7df91a9eaa
@ -117,6 +117,52 @@ Note that this can be quite a lot of language pairs!
|
||||
|
||||
|
||||
|
||||
## Working with language groups
|
||||
|
||||
Language groups are defined according to ISO639-5. The Perl module ISO::639::5 needs to be installed
|
||||
to retrieve the language group hierarchy. Various combinations of language groups and English can be
|
||||
trained using the following commands (note that this starts all combinations, see below for individual jobs):
|
||||
|
||||
```
|
||||
make tatoeba-group2eng # start train jobs for all language groups to English
|
||||
make tatoeba-eng2group # start train jobs for English to all language groups
|
||||
make tatoeba-langgroup # start train jobs for bi-directional models for all language groups
|
||||
```
|
||||
|
||||
Combine all jobs above:
|
||||
|
||||
```
|
||||
make tatoeba-langgroups
|
||||
```
|
||||
|
||||
|
||||
Create release packages from the language group models
|
||||
|
||||
```
|
||||
make tatoeba-group2eng-dist # make package for all trained group2eng models
|
||||
make tatoeba-eng2group-dist # make package for all trained eng2group models
|
||||
make tatoeba-langgroup-dist # make package for all trained langgroup models
|
||||
```
|
||||
|
||||
|
||||
Jobs for specific tasks and language groups; example task: `gmw2eng`:
|
||||
|
||||
```
|
||||
make tatoeba-gmw2eng-train # make data and start training job
|
||||
make tatoeba-gmw2eng-eval # evaluate model with multilingual test data
|
||||
make tatoeba-gmw2eng-evalall # evaluate model with all individual language pairs
|
||||
make tatoeba-gmw2eng-dist # create release package
|
||||
```
|
||||
|
||||
Similar jobs can be started for any supported language group from and to English
|
||||
and also as a bidirectional model for all languages in the given language group.
|
||||
Replace `gmw2eng` with, for example, `eng2gem` (English to Germanic) or
|
||||
`gmq` (multilingual model for North Germanic languages).
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
## Generate evaluation tables
|
||||
|
||||
Various lists and tables can be generated from the evaluated model files. Remove old files and generate new ones by running:
|
||||
|
@ -393,21 +393,28 @@ else
|
||||
MARIAN_WORKSPACE = 10000
|
||||
endif
|
||||
|
||||
|
||||
## check whether we have GPUs available
|
||||
## if not: use CPU mode for decoding
|
||||
## path of nvidia-smi, or empty if it is not installed
## - simple assignment (:=) so that the ifdef below tests the *result* of
##   the lookup; with a recursive (=) assignment ifdef only sees the
##   unexpanded, never-empty text and the CPU fallback is unreachable
## - stderr goes to /dev/null (absolute path); a relative dev/null makes
##   the redirection fail unless a local ./dev directory exists
NVIDIA_SMI := ${shell which nvidia-smi 2>/dev/null}
|
||||
ifdef NVIDIA_SMI
|
||||
ifeq (${shell nvidia-smi | grep failed | wc -l},1)
|
||||
MARIAN = ${MARIANCPU}
|
||||
MARIAN_DECODER_FLAGS = ${MARIAN_DECODER_CPU}
|
||||
MARIAN_EXTRA = --cpu-threads ${HPC_CORES}
|
||||
endif
|
||||
else
|
||||
MARIAN = ${MARIANCPU}
|
||||
MARIAN_DECODER_FLAGS = ${MARIAN_DECODER_CPU}
|
||||
MARIAN_EXTRA = --cpu-threads ${HPC_CORES}
|
||||
endif
|
||||
|
||||
|
||||
|
||||
|
||||
## weights associated with training examples
|
||||
ifneq ("$(wildcard ${TRAIN_WEIGHTS})","")
|
||||
MARIAN_TRAIN_WEIGHTS = --data-weighting ${TRAIN_WEIGHTS}
|
||||
endif
|
||||
|
||||
|
||||
|
||||
### training a model with Marian NMT
|
||||
##
|
||||
## NR allows training several models for proper ensembling
|
||||
|
@ -47,6 +47,32 @@
|
||||
# make tatoeba-multilingual-distsubset-medium .... create release files
|
||||
# make tatoeba-multilingual-evalsubset-medium .... eval all langpairs
|
||||
#---------------------------------------------------------------------
|
||||
# jobs for multilingual language group models
|
||||
#
|
||||
# make tatoeba-group2eng ...... start train jobs for all language groups to English
|
||||
# make tatoeba-eng2group ...... start train jobs for English to all language groups
|
||||
# make tatoeba-langgroup ...... start train jobs for bi-directional models for all language groups
|
||||
#
|
||||
# make tatoeba-langgroups ..... make all jobs from above
|
||||
#
|
||||
#
|
||||
# make tatoeba-group2eng-dist . make package for all trained group2eng models
|
||||
# make tatoeba-eng2group-dist . make package for all trained eng2group models
|
||||
# make tatoeba-langgroup-dist . make package for all trained langgroup models
|
||||
#
|
||||
#
|
||||
# jobs for specific tasks and language groups, example task: "gmw2eng"
|
||||
#
|
||||
# make tatoeba-gmw2eng-train .. make data and start training job
|
||||
# make tatoeba-gmw2eng-eval ... evaluate model with multilingual test data
|
||||
# make tatoeba-gmw2eng-evalall evaluate model with all individual language pairs
|
||||
# make tatoeba-gmw2eng-dist ... create release package
|
||||
#
|
||||
# Similar jobs can be started for any supported language group from and to English
|
||||
# and also as a bidirectional model for all languages in the given language group.
|
||||
# Replace "gmw2eng" with, for example, "eng2gem" (English to Germanic) or
|
||||
# "gmq" (multilingual model for North Germanic languages).
|
||||
#---------------------------------------------------------------------
|
||||
#
|
||||
# generate evaluation tables
|
||||
#
|
||||
@ -57,14 +83,14 @@
|
||||
|
||||
## general parameters for Tatoeba models
|
||||
|
||||
TATOEBA_DATAURL = https://object.pouta.csc.fi/Tatoeba-Challenge
|
||||
TATOEBA_RAWGIT = https://raw.githubusercontent.com/Helsinki-NLP/Tatoeba-Challenge/master
|
||||
TATOEBA_WORK = ${PWD}/work-tatoeba
|
||||
TATOEBA_DATA = ${TATOEBA_WORK}/data/${PRE}
|
||||
TATOEBA_DATAURL := https://object.pouta.csc.fi/Tatoeba-Challenge
|
||||
TATOEBA_RAWGIT := https://raw.githubusercontent.com/Helsinki-NLP/Tatoeba-Challenge/master
|
||||
TATOEBA_WORK := ${PWD}/work-tatoeba
|
||||
TATOEBA_DATA := ${TATOEBA_WORK}/data/${PRE}
|
||||
|
||||
TATOEBA_MODEL_CONTAINER = Tatoeba-MT-models
|
||||
TATOEBA_MODEL_CONTAINER := Tatoeba-MT-models
|
||||
|
||||
TATOEBA_PARAMS = TRAINSET=Tatoeba-train \
|
||||
TATOEBA_PARAMS := TRAINSET=Tatoeba-train \
|
||||
DEVSET=Tatoeba-dev \
|
||||
TESTSET=Tatoeba-test \
|
||||
TESTSET_NAME=Tatoeba-test \
|
||||
@ -80,72 +106,42 @@ TATOEBA_PARAMS = TRAINSET=Tatoeba-train \
|
||||
MODEL_CONTAINER=${TATOEBA_MODEL_CONTAINER} \
|
||||
ALT_MODEL_DIR=tatoeba \
|
||||
SKIP_DATA_DETAILS=1 \
|
||||
MIN_BLEU_SCORE=10 \
|
||||
MIN_BLEU_SCORE=10
|
||||
|
||||
|
||||
|
||||
|
||||
ISO639 := iso639
|
||||
GET_ISO_CODE := ${ISO639} -m
|
||||
|
||||
## taken from the Tatoeba-Challenge Makefile
|
||||
## requires local data for setting TATOEBA_LANGS
|
||||
|
||||
# EXCLUDE_LANGGROUPS = afa
|
||||
|
||||
ISO639 = iso639
|
||||
GET_ISO_CODE = ${ISO639} -m
|
||||
|
||||
# TATOEBA_LANGS = ${sort ${patsubst %.txt.gz,%,${notdir ${wildcard ${OPUSHOME}/Tatoeba/latest/mono/*.txt.gz}}}}
|
||||
# TATOEBA_LANGS3 = ${sort ${filter-out xxx,${shell ${GET_ISO_CODE} ${TATOEBA_LANGS}}}}
|
||||
# TATOEBA_LANGGROUPS = ${sort ${shell langgroup -p -n ${TATOEBA_LANGS3} 2>/dev/null}}
|
||||
# TATOEBA_LANGGROUPS1 = ${shell langgroup -g -n ${TATOEBA_LANGS3} 2>/dev/null | tr " " "\n" | grep '+'}
|
||||
# TATOEBA_LANGGROUPS2 = ${shell langgroup -G -n ${TATOEBA_LANGS3} 2>/dev/null | tr " " "\n" | grep '+'}
|
||||
|
||||
OPUS_LANGS3 = ${sort ${filter-out xxx,${shell ${GET_ISO_CODE} ${OPUSLANGS}}}}
|
||||
OPUS_LANG_PARENTS = ${sort ${shell langgroup -p -n ${OPUS_LANGS3} 2>/dev/null}}
|
||||
OPUS_LANG_GRANDPARENTS = ${sort ${shell langgroup -p -n ${OPUS_LANG_PARENTS} 2>/dev/null}}
|
||||
OPUS_LANG_GROUPS = ${sort ${OPUS_LANG_PARENTS} ${OPUS_LANG_GRANDPARENTS}}
|
||||
|
||||
# OPUS_LANGGROUPS1 = ${shell langgroup -g -n ${OPUS_LANGS3} 2>/dev/null | tr " " "\n" | grep '+'}
|
||||
# OPUS_LANGGROUPS2 = ${shell langgroup -G -n ${OPUS_LANGS3} 2>/dev/null | tr " " "\n" | grep '+'}
|
||||
|
||||
OPUS_LANGS3 := ${sort ${filter-out xxx,${shell ${GET_ISO_CODE} ${OPUSLANGS}}}}
|
||||
OPUS_LANG_PARENTS := ${sort ${shell langgroup -p -n ${OPUS_LANGS3} 2>/dev/null}}
|
||||
OPUS_LANG_GRANDPARENTS := ${sort ${shell langgroup -p -n ${OPUS_LANG_PARENTS} 2>/dev/null}}
|
||||
OPUS_LANG_GROUPS := ${sort ${OPUS_LANG_PARENTS} ${OPUS_LANG_GRANDPARENTS}}
|
||||
|
||||
|
||||
###########################################################################################
|
||||
# language groups
|
||||
###########################################################################################
|
||||
|
||||
## print language groups
|
||||
opus-langgroups:
|
||||
@echo ${OPUS_LANG_PARENTS}
|
||||
@echo ${OPUS_LANG_GRANDPARENTS}
|
||||
print-langgroups:
|
||||
@echo ${OPUS_LANG_GROUPS}
|
||||
|
||||
# ## multilingual models for language groups
|
||||
# tatoeba-langgroup:
|
||||
# for g in ${TATOEBA_LANGGROUPS1}; do \
|
||||
# l=`echo $$g | sed 's/\+/ /g'`; \
|
||||
# n=`langgroup -p $$l | cut -f1 -d' '`; \
|
||||
# ${MAKE} LANGPAIRSTR="$$n-$$n" TRGLANGS="$$l" SRCLANGS="$$l" \
|
||||
# MODELTYPE=transformer FIT_DATA_SIZE=1000000 tatoeba-multilingual-train; \
|
||||
# done
|
||||
|
||||
# ## models for language groups to English
|
||||
# tatoeba-group2eng:
|
||||
# for g in ${TATOEBA_LANGGROUPS1}; do \
|
||||
# l=`echo $$g | sed 's/\+/ /g'`; \
|
||||
# n=`langgroup -p $$l | cut -f1 -d' '`; \
|
||||
# ${MAKE} LANGPAIRSTR="$$n-eng" SRCLANGS="$$l" TRGLANGS=eng \
|
||||
# MODELTYPE=transformer FIT_DATA_SIZE=1000000 tatoeba-multilingual-train; \
|
||||
# done
|
||||
|
||||
# ## models for English to language groups
|
||||
# tatoeba-eng2group:
|
||||
# for g in ${TATOEBA_LANGGROUPS1}; do \
|
||||
# l=`echo $$g | sed 's/\+/ /g'`; \
|
||||
# n=`langgroup -p $$l | cut -f1 -d' '`; \
|
||||
# ${MAKE} LANGPAIRSTR="eng-$$n" TRGLANGS="$$l" SRCLANGS=eng \
|
||||
# MODELTYPE=transformer FIT_DATA_SIZE=1000000 tatoeba-multilingual-train; \
|
||||
# done
|
||||
|
||||
|
||||
## start all jobs for all combinations of
|
||||
## - language groups and English (separate in both directions)
|
||||
## - languages in language groups (bi-directional)
|
||||
##
|
||||
## language groups include parents and grandparents
|
||||
|
||||
tatoeba-langgroups:
|
||||
${MAKE} tatoeba-group2eng
|
||||
@ -155,15 +151,15 @@ tatoeba-langgroups:
|
||||
|
||||
#### language-group to English
|
||||
|
||||
GROUP2ENG_JOB = $(patsubst %,tatoeba-%2eng-job,${OPUS_LANG_GROUPS})
|
||||
GROUP2ENG_TRAIN = $(patsubst %,tatoeba-%2eng-train,${OPUS_LANG_GROUPS})
|
||||
GROUP2ENG_EVAL = $(patsubst %,tatoeba-%2eng-eval,${OPUS_LANG_GROUPS})
|
||||
GROUP2ENG_EVALALL = $(patsubst %,tatoeba-%2eng-evalall,${OPUS_LANG_GROUPS})
|
||||
GROUP2ENG_DIST = $(patsubst %,tatoeba-%2eng-dist,${OPUS_LANG_GROUPS})
|
||||
GROUP2ENG_TRAIN := $(patsubst %,tatoeba-%2eng-train,${OPUS_LANG_GROUPS})
|
||||
GROUP2ENG_EVAL := $(patsubst %,tatoeba-%2eng-eval,${OPUS_LANG_GROUPS})
|
||||
GROUP2ENG_EVALALL := $(patsubst %,tatoeba-%2eng-evalall,${OPUS_LANG_GROUPS})
|
||||
GROUP2ENG_DIST := $(patsubst %,tatoeba-%2eng-dist,${OPUS_LANG_GROUPS})
|
||||
|
||||
tatoeba-group2eng: ${GROUP2ENG_JOB}
|
||||
# tatoeba-group2eng-dist: ${GROUP2ENG_EVAL} ${GROUP2ENG_EVALALL}
|
||||
# ${MAKE} ${GROUP2ENG_DIST}
|
||||
## start all jobs for language group to English translation
|
||||
tatoeba-group2eng: ${GROUP2ENG_TRAIN}
|
||||
|
||||
## only start this if there is a model
|
||||
tatoeba-group2eng-dist:
|
||||
for g in ${OPUS_LANG_GROUPS}; do \
|
||||
if [ `find ${TATOEBA_WORK}/$$g-eng -name '*.npz' | wc -l` -gt 0 ]; then \
|
||||
@ -173,21 +169,22 @@ tatoeba-group2eng-dist:
|
||||
fi \
|
||||
done
|
||||
|
||||
${GROUP2ENG_JOB}:
|
||||
${MAKE} $(patsubst %-job,%-train,$@)
|
||||
${MAKE} $(patsubst %-job,%-eval,$@)
|
||||
${MAKE} $(patsubst %-job,%-evalall,$@)
|
||||
${MAKE} $(patsubst %-job,%-dist,$@)
|
||||
## this would be easier but does not check whether a model exists
|
||||
## --> tries to build it if there is no model
|
||||
|
||||
# tatoeba-group2eng-dist: ${GROUP2ENG_EVAL} ${GROUP2ENG_EVALALL}
|
||||
# ${MAKE} ${GROUP2ENG_DIST}
|
||||
|
||||
|
||||
${GROUP2ENG_TRAIN}:
|
||||
${MAKE} LANGPAIRSTR=$(patsubst tatoeba-%2eng-train,%,$@)-eng \
|
||||
SRCLANGS="${shell langgroup -n $(patsubst tatoeba-%2eng-train,%,$@)}" \
|
||||
SRCLANGS="$(filter ${OPUS_LANGS3},${shell langgroup -n $(patsubst tatoeba-%2eng-train,%,$@)})" \
|
||||
TRGLANGS=eng MODELTYPE=transformer FIT_DATA_SIZE=1000000 \
|
||||
tatoeba-multilingual-train
|
||||
|
||||
${GROUP2ENG_EVAL}:
|
||||
${MAKE} LANGPAIRSTR=$(patsubst tatoeba-%2eng-eval,%,$@)-eng \
|
||||
SRCLANGS="${shell langgroup -n $(patsubst tatoeba-%2eng-eval,%,$@)}" \
|
||||
SRCLANGS="$(filter ${OPUS_LANGS3},${shell langgroup -n $(patsubst tatoeba-%2eng-eval,%,$@)})" \
|
||||
TRGLANGS=eng \
|
||||
MODELTYPE=transformer \
|
||||
${TATOEBA_PARAMS} \
|
||||
@ -195,13 +192,13 @@ ${GROUP2ENG_EVAL}:
|
||||
|
||||
${GROUP2ENG_EVALALL}:
|
||||
${MAKE} LANGPAIRSTR=$(patsubst tatoeba-%2eng-evalall,%,$@)-eng \
|
||||
SRCLANGS="${shell langgroup -n $(patsubst tatoeba-%2eng-evalall,%,$@)}" \
|
||||
SRCLANGS="$(filter ${OPUS_LANGS3},${shell langgroup -n $(patsubst tatoeba-%2eng-evalall,%,$@)})" \
|
||||
TRGLANGS=eng MODELTYPE=transformer FIT_DATA_SIZE=1000000 \
|
||||
tatoeba-multilingual-eval
|
||||
|
||||
${GROUP2ENG_DIST}:
|
||||
${MAKE} LANGPAIRSTR=$(patsubst tatoeba-%2eng-dist,%,$@)-eng \
|
||||
SRCLANGS="${shell langgroup -n $(patsubst tatoeba-%2eng-dist,%,$@)}" \
|
||||
SRCLANGS="$(filter ${OPUS_LANGS3},${shell langgroup -n $(patsubst tatoeba-%2eng-dist,%,$@)})" \
|
||||
TRGLANGS=eng \
|
||||
MODELTYPE=transformer \
|
||||
${TATOEBA_PARAMS} \
|
||||
@ -210,15 +207,13 @@ ${GROUP2ENG_DIST}:
|
||||
|
||||
#### English to language group
|
||||
|
||||
ENG2GROUP_JOB = $(patsubst %,tatoeba-eng2%-job,${OPUS_LANG_GROUPS})
|
||||
ENG2GROUP_TRAIN = $(patsubst %,tatoeba-eng2%-train,${OPUS_LANG_GROUPS})
|
||||
ENG2GROUP_EVAL = $(patsubst %,tatoeba-eng2%-eval,${OPUS_LANG_GROUPS})
|
||||
ENG2GROUP_EVALALL = $(patsubst %,tatoeba-eng2%-evalall,${OPUS_LANG_GROUPS})
|
||||
ENG2GROUP_DIST = $(patsubst %,tatoeba-eng2%-dist,${OPUS_LANG_GROUPS})
|
||||
ENG2GROUP_TRAIN := $(patsubst %,tatoeba-eng2%-train,${OPUS_LANG_GROUPS})
|
||||
ENG2GROUP_EVAL := $(patsubst %,tatoeba-eng2%-eval,${OPUS_LANG_GROUPS})
|
||||
ENG2GROUP_EVALALL := $(patsubst %,tatoeba-eng2%-evalall,${OPUS_LANG_GROUPS})
|
||||
ENG2GROUP_DIST := $(patsubst %,tatoeba-eng2%-dist,${OPUS_LANG_GROUPS})
|
||||
|
||||
tatoeba-eng2group: ${ENG2GROUP_TRAIN}
|
||||
|
||||
tatoeba-eng2group: ${ENG2GROUP_JOB}
|
||||
# tatoeba-eng2group-dist: ${ENG2GROUP_EVAL} ${ENG2GROUP_EVALALL}
|
||||
# ${MAKE} ${ENG2GROUP_DIST}
|
||||
tatoeba-eng2group-dist:
|
||||
for g in ${OPUS_LANG_GROUPS}; do \
|
||||
if [ `find ${TATOEBA_WORK}/eng-$$g -name '*.npz' | wc -l` -gt 0 ]; then \
|
||||
@ -228,45 +223,34 @@ tatoeba-eng2group-dist:
|
||||
fi \
|
||||
done
|
||||
|
||||
tatoeba-eng2group-dist2:
|
||||
for g in ${OPUS_LANG_GROUPS}; do \
|
||||
if [ `find ${TATOEBA_WORK}/eng-$$g -name '*.npz' | wc -l` -gt 0 ]; then \
|
||||
mv models-tatoeba/eng-$$g models-tatoeba/eng-$$g-old2; \
|
||||
${MAKE} tatoeba-eng2$${g}-dist; \
|
||||
fi \
|
||||
done
|
||||
# tatoeba-eng2group-dist: ${ENG2GROUP_EVAL} ${ENG2GROUP_EVALALL}
|
||||
# ${MAKE} ${ENG2GROUP_DIST}
|
||||
|
||||
|
||||
${ENG2GROUP_JOB}:
|
||||
${MAKE} $(patsubst %-job,%-train,$@)
|
||||
${MAKE} $(patsubst %-job,%-eval,$@)
|
||||
${MAKE} $(patsubst %-job,%-evalall,$@)
|
||||
${MAKE} $(patsubst %-job,%-dist,$@)
|
||||
|
||||
${ENG2GROUP_TRAIN}:
|
||||
${MAKE} LANGPAIRSTR=eng-$(patsubst tatoeba-eng2%-train,%,$@) \
|
||||
TRGLANGS="${shell langgroup -n $(patsubst tatoeba-eng2%-train,%,$@)}" \
|
||||
TRGLANGS="$(filter ${OPUS_LANGS3},${shell langgroup -n $(patsubst tatoeba-eng2%-train,%,$@)})" \
|
||||
SRCLANGS=eng MODELTYPE=transformer FIT_DATA_SIZE=1000000 \
|
||||
tatoeba-multilingual-train
|
||||
|
||||
${ENG2GROUP_EVAL}:
|
||||
${MAKE} LANGPAIRSTR=eng-$(patsubst tatoeba-eng2%-eval,%,$@) \
|
||||
SRCLANGS=eng \
|
||||
TRGLANGS="${shell langgroup -n $(patsubst tatoeba-eng2%-eval,%,$@)}" \
|
||||
TRGLANGS="$(filter ${OPUS_LANGS3},${shell langgroup -n $(patsubst tatoeba-eng2%-eval,%,$@)})" \
|
||||
MODELTYPE=transformer \
|
||||
${TATOEBA_PARAMS} \
|
||||
compare
|
||||
|
||||
${ENG2GROUP_EVALALL}:
|
||||
${MAKE} LANGPAIRSTR=eng-$(patsubst tatoeba-eng2%-evalall,%,$@) \
|
||||
TRGLANGS="${shell langgroup -n $(patsubst tatoeba-eng2%-evalall,%,$@)}" \
|
||||
TRGLANGS="$(filter ${OPUS_LANGS3},${shell langgroup -n $(patsubst tatoeba-eng2%-evalall,%,$@)})" \
|
||||
SRCLANGS=eng MODELTYPE=transformer FIT_DATA_SIZE=1000000 \
|
||||
tatoeba-multilingual-eval
|
||||
|
||||
${ENG2GROUP_DIST}:
|
||||
${MAKE} LANGPAIRSTR=eng-$(patsubst tatoeba-eng2%-dist,%,$@) \
|
||||
SRCLANGS=eng \
|
||||
TRGLANGS="${shell langgroup -n $(patsubst tatoeba-eng2%-dist,%,$@)}" \
|
||||
TRGLANGS="$(filter ${OPUS_LANGS3},${shell langgroup -n $(patsubst tatoeba-eng2%-dist,%,$@)})" \
|
||||
MODELTYPE=transformer \
|
||||
${TATOEBA_PARAMS} \
|
||||
best-dist
|
||||
@ -275,15 +259,13 @@ ${ENG2GROUP_DIST}:
|
||||
|
||||
#### multilingual language-group (bi-directional)
|
||||
|
||||
LANGGROUP_JOB = $(patsubst %,tatoeba-%-job,${OPUS_LANG_GROUPS})
|
||||
LANGGROUP_TRAIN = $(patsubst %,tatoeba-%-train,${OPUS_LANG_GROUPS})
|
||||
LANGGROUP_EVAL = $(patsubst %,tatoeba-%-eval,${OPUS_LANG_GROUPS})
|
||||
LANGGROUP_EVALALL = $(patsubst %,tatoeba-%-evalall,${OPUS_LANG_GROUPS})
|
||||
LANGGROUP_DIST = $(patsubst %,tatoeba-%-dist,${OPUS_LANG_GROUPS})
|
||||
LANGGROUP_TRAIN := $(patsubst %,tatoeba-%-train,${OPUS_LANG_GROUPS})
|
||||
LANGGROUP_EVAL := $(patsubst %,tatoeba-%-eval,${OPUS_LANG_GROUPS})
|
||||
LANGGROUP_EVALALL := $(patsubst %,tatoeba-%-evalall,${OPUS_LANG_GROUPS})
|
||||
LANGGROUP_DIST := $(patsubst %,tatoeba-%-dist,${OPUS_LANG_GROUPS})
|
||||
|
||||
tatoeba-langgroup: ${LANGGROUP_TRAIN}
|
||||
|
||||
tatoeba-langgroup: ${LANGGROUP_JOB}
|
||||
# tatoeba-langgroup-dist: ${LANGGROUP_EVAL} ${LANGGROUP_EVALALL}
|
||||
# ${MAKE} ${LANGGROUP_DIST}
|
||||
tatoeba-langgroup-dist:
|
||||
for g in ${OPUS_LANG_GROUPS}; do \
|
||||
if [ `find ${TATOEBA_WORK}/$$g-$$g -name '*.npz' | wc -l` -gt 0 ]; then \
|
||||
@ -293,40 +275,36 @@ tatoeba-langgroup-dist:
|
||||
fi \
|
||||
done
|
||||
|
||||
|
||||
${LANGGROUP_JOB}:
|
||||
${MAKE} $(patsubst %-job,%-train,$@)
|
||||
${MAKE} $(patsubst %-job,%-eval,$@)
|
||||
${MAKE} $(patsubst %-job,%-evalall,$@)
|
||||
${MAKE} $(patsubst %-job,%-dist,$@)
|
||||
# tatoeba-langgroup-dist: ${LANGGROUP_EVAL} ${LANGGROUP_EVALALL}
|
||||
# ${MAKE} ${LANGGROUP_DIST}
|
||||
|
||||
|
||||
${LANGGROUP_TRAIN}:
|
||||
${MAKE} LANGPAIRSTR=$(patsubst tatoeba-%-train,%,$@)-$(patsubst tatoeba-%-train,%,$@) \
|
||||
TRGLANGS="${shell langgroup -n $(patsubst tatoeba-%-train,%,$@)}" \
|
||||
SRCLANGS="${shell langgroup -n $(patsubst tatoeba-%-train,%,$@)}" \
|
||||
TRGLANGS="$(filter ${OPUS_LANGS3},${shell langgroup -n $(patsubst tatoeba-%-train,%,$@)})" \
|
||||
SRCLANGS="$(filter ${OPUS_LANGS3},${shell langgroup -n $(patsubst tatoeba-%-train,%,$@)})" \
|
||||
MODELTYPE=transformer FIT_DATA_SIZE=1000000 \
|
||||
tatoeba-multilingual-train
|
||||
|
||||
## evaluate a bi-directional language-group model (sub-make goal: compare)
## the group code is the stem of the target name tatoeba-<group>-eval, so
## every patsubst below has to use the -eval suffix of this rule; a pattern
## with a different suffix does not match and patsubst returns $@ unchanged
${LANGGROUP_EVAL}:
|
||||
${MAKE} LANGPAIRSTR=$(patsubst tatoeba-%-eval,%,$@)-$(patsubst tatoeba-%-eval,%,$@) \
|
||||
SRCLANGS="${shell langgroup -n $(patsubst tatoeba-%-eval,%,$@)}" \
|
||||
TRGLANGS="${shell langgroup -n $(patsubst tatoeba-%-eval,%,$@)}" \
|
||||
TRGLANGS="$(filter ${OPUS_LANGS3},${shell langgroup -n $(patsubst tatoeba-%-eval,%,$@)})" \
|
||||
SRCLANGS="$(filter ${OPUS_LANGS3},${shell langgroup -n $(patsubst tatoeba-%-eval,%,$@)})" \
|
||||
MODELTYPE=transformer \
|
||||
${TATOEBA_PARAMS} \
|
||||
compare
|
||||
|
||||
## evaluate a bi-directional language-group model via the sub-make goal
## tatoeba-multilingual-eval
## the group code is the stem of the target name tatoeba-<group>-evalall, so
## every patsubst below has to use the -evalall suffix of this rule; a pattern
## with a different suffix does not match and patsubst returns $@ unchanged
${LANGGROUP_EVALALL}:
|
||||
${MAKE} LANGPAIRSTR=$(patsubst tatoeba-%-evalall,%,$@)-$(patsubst tatoeba-%-evalall,%,$@) \
|
||||
TRGLANGS="${shell langgroup -n $(patsubst tatoeba-%-evalall,%,$@)}" \
|
||||
SRCLANGS="${shell langgroup -n $(patsubst tatoeba-%-evalall,%,$@)}" \
|
||||
TRGLANGS="$(filter ${OPUS_LANGS3},${shell langgroup -n $(patsubst tatoeba-%-evalall,%,$@)})" \
|
||||
SRCLANGS="$(filter ${OPUS_LANGS3},${shell langgroup -n $(patsubst tatoeba-%-evalall,%,$@)})" \
|
||||
MODELTYPE=transformer FIT_DATA_SIZE=1000000 \
|
||||
tatoeba-multilingual-eval
|
||||
|
||||
## create the release package for a bi-directional language-group model
## (sub-make goal: best-dist)
## the group code is the stem of the target name tatoeba-<group>-dist, so
## every patsubst below has to use the -dist suffix of this rule; a pattern
## with a different suffix does not match and patsubst returns $@ unchanged
${LANGGROUP_DIST}:
|
||||
${MAKE} LANGPAIRSTR=$(patsubst tatoeba-%-dist,%,$@)-$(patsubst tatoeba-%-dist,%,$@) \
|
||||
SRCLANGS="${shell langgroup -n $(patsubst tatoeba-%-dist,%,$@)}" \
|
||||
TRGLANGS="${shell langgroup -n $(patsubst tatoeba-%-dist,%,$@)}" \
|
||||
TRGLANGS="$(filter ${OPUS_LANGS3},${shell langgroup -n $(patsubst tatoeba-%-dist,%,$@)})" \
|
||||
SRCLANGS="$(filter ${OPUS_LANGS3},${shell langgroup -n $(patsubst tatoeba-%-dist,%,$@)})" \
|
||||
MODELTYPE=transformer \
|
||||
${TATOEBA_PARAMS} \
|
||||
best-dist
|
||||
@ -624,22 +602,7 @@ tatoeba-multilingual-testsets:
|
||||
## generic target for tatoeba challenge jobs
|
||||
%-tatoeba: ${TATOEBA_DATA}/Tatoeba-train.${LANGPAIRSTR}.clean.${SRCEXT}.labels \
|
||||
${TATOEBA_DATA}/Tatoeba-train.${LANGPAIRSTR}.clean.${TRGEXT}.labels
|
||||
${MAKE} TRAINSET=Tatoeba-train \
|
||||
DEVSET=Tatoeba-dev \
|
||||
TESTSET=Tatoeba-test \
|
||||
TESTSET_NAME=Tatoeba-test \
|
||||
SMALLEST_TRAINSIZE=1000 \
|
||||
USE_REST_DEVDATA=0 \
|
||||
HELDOUTSIZE=0 \
|
||||
DEVSIZE=5000 \
|
||||
TESTSIZE=10000 \
|
||||
DEVMINSIZE=200 \
|
||||
WORKHOME=${TATOEBA_WORK} \
|
||||
MODELSHOME=${PWD}/models-tatoeba \
|
||||
MODELS_URL=https://object.pouta.csc.fi/${TATOEBA_MODEL_CONTAINER} \
|
||||
MODEL_CONTAINER=${TATOEBA_MODEL_CONTAINER} \
|
||||
ALT_MODEL_DIR=tatoeba \
|
||||
SKIP_DATA_DETAILS=1 \
|
||||
${MAKE} ${TATOEBA_PARAMS} \
|
||||
LANGPAIRSTR=${LANGPAIRSTR} \
|
||||
SRCLANGS="${shell cat ${word 1,$^} | sed 's/ *$$//;s/^ *//'}" \
|
||||
TRGLANGS="${shell cat ${word 2,$^} | sed 's/ *$$//;s/^ *//'}" \
|
||||
|
Loading…
Reference in New Issue
Block a user