more details in model config

This commit is contained in:
Joerg Tiedemann 2020-06-18 20:50:22 +03:00
parent 4e18da6e4c
commit b7f45e2a74
3 changed files with 77 additions and 17 deletions

View File

@ -433,18 +433,29 @@ ${WORKDIR}/config.mk:
echo "# ${LANGPAIRSTR} training data bigger than 10 million" > $@; \
echo "GPUJOB_HPC_MEM = 8g" >> $@; \
echo "GPUJOB_SUBMIT = -multigpu" >> $@; \
echo "BPESIZE = ${BPESIZE}" >> $@; \
echo "DEVSIZE = ${DEVSIZE}" >> $@; \
echo "TESTSIZE = ${TESTSIZE}" >> $@; \
echo "DEVMINSIZE = ${DEVMINSIZE}" >> $@; \
elif [ $$s -gt ${LARGE_TRAINSIZE} ]; then \
echo "# ${LANGPAIRSTR} training data bigger than 1 million" > $@; \
echo "GPUJOB_HPC_MEM = 8g" >> $@; \
echo "GPUJOB_SUBMIT = " >> $@; \
echo "MARIAN_VALID_FREQ = 2500" >> $@; \
echo "BPESIZE = ${BPESIZE}" >> $@; \
echo "DEVSIZE = ${DEVSIZE}" >> $@; \
echo "TESTSIZE = ${TESTSIZE}" >> $@; \
echo "DEVMINSIZE = ${DEVMINSIZE}" >> $@; \
elif [ $$s -gt ${MEDIUM_TRAINSIZE} ]; then \
echo "# ${LANGPAIRSTR} training data bigger than 500k" > $@; \
echo "GPUJOB_HPC_MEM = 4g" >> $@; \
echo "GPUJOB_SUBMIT = " >> $@; \
echo "MARIAN_VALID_FREQ = 2500" >> $@; \
echo "MARIAN_WORKSPACE = 10000" >> $@; \
echo "BPESIZE = 12000" >> $@; \
echo "BPESIZE = 12000" >> $@; \
echo "DEVSIZE = ${DEVSIZE}" >> $@; \
echo "TESTSIZE = ${TESTSIZE}" >> $@; \
echo "DEVMINSIZE = ${DEVMINSIZE}" >> $@; \
elif [ $$s -gt ${SMALL_TRAINSIZE} ]; then \
echo "# ${LANGPAIRSTR} training data bigger than 100k" > $@; \
echo "GPUJOB_HPC_MEM = 4g" >> $@; \
@ -479,5 +490,25 @@ ${WORKDIR}/config.mk:
echo "TRGBPESIZE = 32000" >> $@; \
fi; \
fi
echo "SRCLANGS = ${SRCLANGS}" >> $@
echo "TRGLANGS = ${TRGLANGS}" >> $@
echo "SKIPLANGS = ${SKIPLANGS}" >> $@
echo "LANGPAIRSTR = ${LANGPAIRSTR}" >> $@
echo "DATASET = ${DATASET}" >> $@
echo "TRAINSET = ${TRAINSET}" >> $@
echo "DEVSET = ${DEVSET}" >> $@
echo "TESTSET = ${TESTSET}" >> $@
echo "PRE = ${PRE}" >> $@
echo "PRE_SRC = ${PRE_SRC}" >> $@
echo "PRE_TRG = ${PRE_TRG}" >> $@
ifdef SHUFFLE_DATA
echo "SHUFFLE_DATA = ${SHUFFLE_DATA}" >> $@
endif
ifdef FIT_DATA_SIZE
echo "FIT_DATA_SIZE = ${FIT_DATA_SIZE}" >> $@
endif
echo "MAX_OVER_SAMPLING = ${MAX_OVER_SAMPLING}" >> $@
echo "USE_REST_DEVDATA = ${USE_REST_DEVDATA}" >> $@
ifdef USE_TARGET_LABELS
echo "USE_TARGET_LABELS = ${USE_TARGET_LABELS}" >> $@
endif

View File

@ -76,7 +76,8 @@ else ifneq ($(wildcard /wrk/tiedeman/research),)
else
# CSCPROJECT = project_2001194
# CSCPROJECT = project_2000309
CSCPROJECT = project_2002688
# CSCPROJECT = project_2002688
CSCPROJECT = project_2002982
DATAHOME = ${HOME}/work/opentrans/data/${LANGPAIR}
WORKHOME = ${shell realpath ${PWD}/work-langid}
APPLHOME = ${HOME}/projappl

View File

@ -81,11 +81,13 @@ TATOEBA_MODEL_CONTAINER = Tatoeba-MT-models
## launch a training job for a single translation direction:
##   1. prepare the data via the tatoeba-prepare target
##   2. submit the job via the all-job-tatoeba target
.PHONY: tatoeba-job
tatoeba-job:
	$(MAKE) tatoeba-prepare
	$(MAKE) all-job-tatoeba
## start jobs in both translation directions
.PHONY: tatoeba-bidirectional-job
tatoeba-bidirectional-job:
${MAKE} tatoeba-prepare
${MAKE} all-job-tatoeba
@ -97,26 +99,37 @@ endif
## data preparation (config, train/dev/test data, labels);
## depends on the cleaned Tatoeba training data file of the language pair,
## then runs the local-config-tatoeba and data-tatoeba targets in sub-makes
.PHONY: tatoeba-prepare
tatoeba-prepare: $(TATOEBA_DATA)/Tatoeba-train.$(LANGPAIR).clean.$(SRCEXT).gz
	$(MAKE) local-config-tatoeba
	$(MAKE) data-tatoeba
## model training: delegates to the train-tatoeba target in a sub-make
.PHONY: tatoeba-train
tatoeba-train:
	$(MAKE) train-tatoeba
## model evaluation: delegates to the compare-tatoeba target in a sub-make
.PHONY: tatoeba-eval
tatoeba-eval:
	$(MAKE) compare-tatoeba
## convenience aliases for the essential data files:
##   tatoeba-data   - the cleaned training data of the language pair
##   tatoeba-labels - the label file for language variants
## (the data targets above produce these files as well)
.PHONY: tatoeba-data
tatoeba-data: $(TATOEBA_DATA)/Tatoeba-train.$(LANGPAIR).clean.$(SRCEXT).gz

.PHONY: tatoeba-labels
tatoeba-labels: $(TATOEBA_DATA)/Tatoeba-train.$(LANGPAIRSTR).clean.$(SRCEXT).labels
## rebuild the result tables from scratch:
## delete the existing tatoeba-results* files and the markdown pages
## in results/, then regenerate them via the tatoeba-results-md target
.PHONY: tatoeba-results
tatoeba-results:
	rm -f tatoeba-results* results/*.md
	$(MAKE) tatoeba-results-md
## create result tables in various variants and for various subsets
## markdown pages are for reading on-line in the Tatoeba Challenge git
## ---> link results dir to the local copy of the Tatoeba Challenge git
.PHONY: tatoeba-results-md
tatoeba-results-md: tatoeba-results-sorted tatoeba-results-sorted-model tatoeba-results-sorted-langpair \
results/tatoeba-results-sorted.md \
results/tatoeba-results-sorted-model.md \
@ -258,7 +271,7 @@ tatoeba-trainsize-%.txt: tatoeba-%.md
## generic targets for working with multilingual models
###############################################################################
.PHONY: tatoeba-multilingual-train
tatoeba-multilingual-train:
-for s in ${SRCLANGS}; do \
for t in ${TRGLANGS}; do \
@ -273,6 +286,7 @@ tatoeba-multilingual-train:
## evaluate all individual language pairs for a multilingual model
.PHONY: tatoeba-multilingual-eval
tatoeba-multilingual-eval:
${MAKE} tatoeba-multilingual-testsets
for s in ${SRCLANGS}; do \
@ -298,6 +312,7 @@ tatoeba-multilingual-eval:
## copy testsets into the multilingual model's test directory
.PHONY: tatoeba-multilingual-testsets
tatoeba-multilingual-testsets:
for s in ${SRCLANGS}; do \
for t in ${TRGLANGS}; do \
@ -419,14 +434,24 @@ ${TATOEBA_DATA}/Tatoeba-train.${LANGPAIRSTR}.clean.${SRCEXT}.labels:
## modify language IDs in training data to adjust them to test sets
## --> fix codes for chinese and take away script information (not reliable!)
##     except the distinction between traditional and simplified
## --> take away regional codes
## --> take away script extension that may come with some codes
## FIXLANGIDS is a pipeline fragment (it starts with '|') that is meant to be
## appended to a command that prints language IDs. Steps, in order:
##   1. map zho..._HK to yue and zho..._CN / zho..._TW to cmn
##   2. remove two-letter upper-case region codes (_XX)
##   3. remove lower-case extensions introduced by '-'
##   4. remove any script suffix from jpn and kor
##   5. for the Chinese codes, remove every 4-letter script suffix except
##      Hans and Hant
## All substitutions are global, so every ID on a line is treated. The perl
## step uses a negative lookahead so that a leading Hans/Hant ID does not
## stop other IDs on the same line from being fixed.
## There must be exactly one definition here: a trailing backslash after an
## earlier variant would swallow this assignment into its value.
FIXLANGIDS = | sed 's/zho\(.*\)_HK/yue\1/g;s/zho\(.*\)_CN/cmn\1/g;s/zho\(.*\)_TW/cmn\1/g;' \
             | sed 's/\_[A-Z][A-Z]//g' \
             | sed 's/\-[a-z]*//g' \
             | sed 's/jpn_[A-Za-z]*/jpn/g' \
             | sed 's/kor_[A-Za-z]*/kor/g' \
             | perl -pe 's/(cjy|cmn|gan|lzh|nan|wuu|yue|zho)_(?!Hans|Hant)[A-Za-z]{4}/$$1/g'
## assume that all zho is Mandarin Chinese?
# | sed 's/zho/cmn/g'
## take away all script info for Chinese? even traditional vs simplified?
# | sed 's/\(cjy\|cmn\|gan\|lzh\|nan\|wuu\|yue\|zho\)_[A-Za-z]*/\1/'
## convert Tatoeba Challenge data into the format we need
@ -471,7 +496,7 @@ FIXLANGIDS = | sed 's/zho\(.*\)_HK/yue\1/;s/zho\(.*\)_CN/cmn\1/;s/zho\(.*\)_TW/
# - simplified vs traditional script
#
# TODO: should not manipulate test data like this!!!!
#  ---> do Chinese script detection properly in data releases!
#######################################
ifeq ($(filter cjy cmn gan lzh nan wuu yue zho,${SRC}),${SRC})
@echo "treating source language Chinese"
@ -507,11 +532,19 @@ endif
# labels in the data
# TODO: should we take all in all data sets?
# NOW: only look for the ones in test data
# special treatment for Chinese:
# add the generic zho labels as well
#######################################
# cut -f1 ${dir $@}Tatoeba-*.${LANGPAIR}.clean.id | sort -u | tr "\n" ' ' > $(@:.${SRCEXT}.gz=.${SRCEXT}.labels)
# cut -f2 ${dir $@}Tatoeba-*.${LANGPAIR}.clean.id | sort -u | tr "\n" ' ' > $(@:.${SRCEXT}.gz=.${TRGEXT}.labels)
cut -f1 ${dir $@}Tatoeba-test.${LANGPAIR}.clean.id | sort -u | tr "\n" ' ' > $(@:.${SRCEXT}.gz=.${SRCEXT}.labels)
cut -f2 ${dir $@}Tatoeba-test.${LANGPAIR}.clean.id | sort -u | tr "\n" ' ' > $(@:.${SRCEXT}.gz=.${TRGEXT}.labels)
ifeq (${SRC},zho)
echo -n 'zho zho_Hans zho_Hant cmn' >> $(@:.${SRCEXT}.gz=.${SRCEXT}.labels)
endif
ifeq (${TRG},zho)
echo -n 'zho zho_Hans zho_Hant cmn' >> $(@:.${SRCEXT}.gz=.${TRGEXT}.labels)
endif
rm -f $@.d/data/${LANGPAIR}/*
rmdir $@.d/data/${LANGPAIR}
rmdir $@.d/data
@ -597,11 +630,6 @@ testsets/${LANGPAIR}/Tatoeba-test.${LANGPAIR}.%: ${TATOEBA_DATA}/Tatoeba-test.${
## generate result tables
###############################################################################
## print the distinct values of the second tab-separated field of all lines
## in the sorted BLEU table whose reference length (ref_len) is below 200,
## joined by '|' with the trailing separator removed
## (declared phony: the target only prints to stdout and creates no file)
.PHONY: too-small-ref
too-small-ref: tatoeba-results-BLEU-sorted-model
	grep -P 'ref_len = 1?[0-9]?[0-9]\)' $< | cut -f2 | sort -u | tr "\n" '|' | sed 's/|$$//'
results/tatoeba-results%.md: tatoeba-results% tatoeba-results-BLEU-sorted-model
mkdir -p ${dir $@}
echo "# Tatoeba translation results" >$@