add local config parameters

This commit is contained in:
Joerg Tiedemann 2020-04-18 21:40:52 +03:00
parent 294175f0fe
commit 58f042d127
4 changed files with 156 additions and 0 deletions

View File

@ -335,3 +335,69 @@ else
SEED=1234
endif
## pull in the model-specific settings stored in $(WORKDIR)/config;
## the include is guarded so that a missing file is simply skipped at parse time
ifneq ($(wildcard $(WORKDIR)/config),)
  include $(WORKDIR)/config
endif
## Create ${WORKDIR}/config with parameters that depend on the training data size.
##
## The recipe counts the lines of the cleaned, compressed training source if it
## exists; otherwise it first runs a sub-make for ${LOCAL_TRAIN_SRC} and counts
## that file. At most 10,000,001 lines are read (head), which is just enough to
## tell whether the largest threshold is exceeded.
##
## The line count selects one of five size classes (>10M, >1M, >500k, >100k,
## >10k); each class writes its own set of variable assignments into the file.
## With 10,000 lines or fewer only a message is printed and NO config file is
## written.
##
## NOTE(review): the >10M class previously wrote "GPUJOB_SUBMIT = -multipu";
## this is assumed to be a typo for "-multigpu" -- confirm against the
## job-submission targets that consume GPUJOB_SUBMIT.

## local-config is only an alias for the generated file, not a file itself
.PHONY: local-config
local-config: ${WORKDIR}/config

${WORKDIR}/config:
	mkdir -p ${dir $@}
	if [ -e ${TRAIN_SRC}.clean.${PRE_SRC}${TRAINSIZE}.gz ]; then \
	  s=`zcat ${TRAIN_SRC}.clean.${PRE_SRC}${TRAINSIZE}.gz | head -10000001 | wc -l`; \
	else \
	  ${MAKE} ${LOCAL_TRAIN_SRC}; \
	  s=`head -10000001 ${LOCAL_TRAIN_SRC} | wc -l`; \
	fi; \
	if [ $$s -gt 10000000 ]; then \
	  echo "# ${LANGPAIRSTR} bigger than 10 million" > $@; \
	  echo "GPUJOB_HPC_MEM = 8g" >> $@; \
	  echo "GPUJOB_SUBMIT = -multigpu" >> $@; \
	elif [ $$s -gt 1000000 ]; then \
	  echo "# ${LANGPAIRSTR} bigger than 1 million" > $@; \
	  echo "GPUJOB_HPC_MEM = 8g" >> $@; \
	  echo "GPUJOB_SUBMIT = " >> $@; \
	  echo "MARIAN_VALID_FREQ = 2500" >> $@; \
	elif [ $$s -gt 500000 ]; then \
	  echo "# ${LANGPAIRSTR} bigger than 500k" > $@; \
	  echo "GPUJOB_HPC_MEM = 4g" >> $@; \
	  echo "GPUJOB_SUBMIT = " >> $@; \
	  echo "MARIAN_VALID_FREQ = 2500" >> $@; \
	  echo "MARIAN_WORKSPACE = 10000" >> $@; \
	  echo "BPESIZE = 12000" >> $@; \
	elif [ $$s -gt 100000 ]; then \
	  echo "# ${LANGPAIRSTR} bigger than 100k" > $@; \
	  echo "GPUJOB_HPC_MEM = 4g" >> $@; \
	  echo "GPUJOB_SUBMIT = " >> $@; \
	  echo "MARIAN_VALID_FREQ = 1000" >> $@; \
	  echo "MARIAN_WORKSPACE = 5000" >> $@; \
	  echo "MARIAN_VALID_MINI_BATCH = 8" >> $@; \
	  echo "HELDOUTSIZE = 0" >> $@; \
	  echo "BPESIZE = 4000" >> $@; \
	  echo "DEVSIZE = 1000" >> $@; \
	  echo "TESTSIZE = 1000" >> $@; \
	  echo "DEVMINSIZE = 250" >> $@; \
	elif [ $$s -gt 10000 ]; then \
	  echo "# ${LANGPAIRSTR} bigger than 10k" > $@; \
	  echo "GPUJOB_HPC_MEM = 4g" >> $@; \
	  echo "GPUJOB_SUBMIT = " >> $@; \
	  echo "MARIAN_VALID_FREQ = 1000" >> $@; \
	  echo "MARIAN_WORKSPACE = 3500" >> $@; \
	  echo "MARIAN_DROPOUT = 0.5" >> $@; \
	  echo "MARIAN_VALID_MINI_BATCH = 4" >> $@; \
	  echo "HELDOUTSIZE = 0" >> $@; \
	  echo "BPESIZE = 500" >> $@; \
	  echo "DEVSIZE = 500" >> $@; \
	  echo "TESTSIZE = 1000" >> $@; \
	  echo "DEVMINSIZE = 100" >> $@; \
	else \
	  echo "${LANGPAIRSTR} too small"; \
	fi

View File

@ -183,11 +183,49 @@ all2en:
${MAKE} PIVOT=en allopus2pivot
#-------------------------------------------------------------------
# wikimedia tasks
#-------------------------------------------------------------------
# as <-> en: run four sub-makes one after another
# (data-as-en, train-dynamic-as-en, reverse-data-as-en, train-dynamic-en-as)
as-en:
	$(MAKE) data-as-en
	$(MAKE) train-dynamic-as-en
	$(MAKE) reverse-data-as-en
	$(MAKE) train-dynamic-en-as
# Any goal of the form <goal>-as-en: strip the "-as-en" suffix and run <goal>
# in a sub-make with SRCLANGS=as, TRGLANGS=en and fixed size settings.
%-as-en:
	$(MAKE) HELDOUTSIZE=0 DEVSIZE=1000 TESTSIZE=1000 DEVMINSIZE=100 BPESIZE=4000 \
		SRCLANGS="as" TRGLANGS="en" \
		$(patsubst %-as-en,%,$@)
# Any goal of the form <goal>-en-as: strip the "-en-as" suffix and run <goal>
# in a sub-make with SRCLANGS=en, TRGLANGS=as and fixed size settings.
%-en-as:
	$(MAKE) HELDOUTSIZE=0 DEVSIZE=1000 TESTSIZE=1000 DEVMINSIZE=100 BPESIZE=4000 \
		SRCLANGS="en" TRGLANGS="as" \
		$(patsubst %-en-as,%,$@)
#-------------------------------------------------------------------
# important secondary langs in Finland
#-------------------------------------------------------------------
# fi <-> so: run four sub-makes one after another
# (data-fi-so, train-dynamic-fi-so, reverse-data-fi-so, train-dynamic-so-fi)
fi-so:
	$(MAKE) data-fi-so
	$(MAKE) train-dynamic-fi-so
	$(MAKE) reverse-data-fi-so
	$(MAKE) train-dynamic-so-fi
# Any goal of the form <goal>-fi-so: strip the "-fi-so" suffix and run the
# goals "data" and <goal> in a sub-make with SRCLANGS=fi, TRGLANGS=so and
# fixed size settings.
%-fi-so:
	$(MAKE) HELDOUTSIZE=0 BPESIZE=4000 DEVSIZE=1000 TESTSIZE=1000 DEVMINSIZE=100 \
		SRCLANGS=fi TRGLANGS=so data \
		$(patsubst %-fi-so,%,$@)
# Any goal of the form <goal>-so-fi: strip the "-so-fi" suffix and run the
# goals "data" and <goal> in a sub-make with SRCLANGS=so, TRGLANGS=fi and
# fixed size settings.
%-so-fi:
	$(MAKE) HELDOUTSIZE=0 BPESIZE=4000 DEVSIZE=1000 TESTSIZE=1000 DEVMINSIZE=100 \
		SRCLANGS=so TRGLANGS=fi data \
		$(patsubst %-so-fi,%,$@)
fi-xx:
for l in ru et ar so ku fa sq vi th tr es pl; do \
${MAKE} WALLTIME=72 SRCLANGS="$$l" TRGLANGS=fi \

View File

@ -2,6 +2,13 @@
# Things to do
## General settings
* better hyperparameters for low-resource settings (smaller batch sizes, smaller vocabularies, ...)
* better data selection (data cleaning / filtering); use opus-filter?
* better balance between general data sets and backtranslations
## Backtranslation
* status: basically working, but may need better integration

View File

@ -193,6 +193,10 @@ breton:
${MAKE} SRC=br TRG=en MODELHOME=../models/ga+cy+br+gd+kw+gv-en all-wikis
# run the all-wikis goal for as -> en, using the as-en model directory
# under $(HOME)/research/Opus-MT-train/work/models
assamese:
	$(MAKE) SRC=as TRG=en \
		MODELHOME=$(HOME)/research/Opus-MT-train/work/models/as-en \
		all-wikis
@ -306,6 +310,47 @@ translate-sami-wiki:
### NEWNEWNEW
# Run translate.submit in a sub-make for every pair of source "se" and
# target in {sma smn sms smj fi no sv}, all with the same multi-target model
# directory and resource settings.
translate-sami-xx-wiki:
	for s in se; do \
	  for t in sma smn sms smj fi no sv; do \
	    $(MAKE) SRC=$$s TRG=$$t \
	      MULTI_TARGET_MODEL=1 \
	      MODELHOME=$(HOME)/research/Opus-MT-train/models/se+sma+smj+smn+sms-fi+nb+no+nn+ru+sv+en \
	      HPC_MEM=4g HPC_CORES=1 WALLTIME=72 \
	      translate.submit; \
	  done; \
	done
# Run translate.submit in a sub-make for every source in {se sma smn sms smj}
# and target in {fi no sv}, reading from giellatekno/<source> with
# WIKISOURCE=corp. Pairs with identical source and target are skipped.
# Prerequisites: sami-corp and giellatekno/se.
#
# The whole nested loop must be ONE shell command, so every line but the last
# ends with a backslash (the outer "for ... do" line was missing it, which made
# the shell see an unterminated loop and abort with a syntax error).
translate-sami-xx-corp: sami-corp giellatekno/se
	for s in se sma smn sms smj; do \
	  for t in fi no sv; do \
	    if [ "$$s" != "$$t" ]; then \
	      ${MAKE} SRC=$$s TRG=$$t \
	        WIKI_DIR=giellatekno/$$s \
	        WIKISOURCE=corp \
	        MULTI_TARGET_MODEL=1 \
	        MODELHOME=${HOME}/research/Opus-MT-train/models/se+sma+smj+smn+sms-fi+nb+no+nn+ru+sv+en \
	        HPC_MEM=4g HPC_CORES=1 WALLTIME=72 \
	        translate.submit; \
	    fi \
	  done \
	done
# Run translate.submit in a sub-make for every source in {fi no nn sv} and
# target in {se sma smn sms smj}, all with the same multi-target model
# directory and resource settings.
translate-xx-sami-wiki:
	for s in fi no nn sv; do \
	  for t in se sma smn sms smj; do \
	    $(MAKE) SRC=$$s TRG=$$t \
	      MULTI_TARGET_MODEL=1 \
	      MODELHOME=$(HOME)/research/Opus-MT-train/models/fi+nb+no+nn+ru+sv+en-se+sma+smj+smn+sms \
	      HPC_MEM=4g HPC_CORES=1 WALLTIME=72 \
	      translate.submit; \
	  done; \
	done