mirror of
https://github.com/Helsinki-NLP/OPUS-MT-train.git
synced 2024-11-30 22:14:14 +03:00
add local config parameters
This commit is contained in:
parent
294175f0fe
commit
58f042d127
@ -335,3 +335,69 @@ else
|
||||
SEED=1234
|
||||
endif
|
||||
|
||||
|
||||
|
||||
## pull in model-specific configuration parameters:
## $(wildcard ...) expands to nothing when the work directory has no
## config file yet, and an include with an empty file list is a no-op
include $(wildcard $(WORKDIR)/config)
|
||||
|
||||
|
||||
## local-config: convenience alias for creating the data size-specific
## configuration file ${WORKDIR}/config.
## Declared phony because it names a task, not a file; a stray file called
## "local-config" must not make this target look up to date.
.PHONY: local-config
local-config: ${WORKDIR}/config
|
||||
|
||||
## Write ${WORKDIR}/config with settings picked according to the size of
## the training data.
##
## Line count source: ${TRAIN_SRC}.clean.${PRE_SRC}${TRAINSIZE}.gz when it
## exists; otherwise ${LOCAL_TRAIN_SRC}, after asking a sub-make to build it.
## Counting is capped at 10000001 lines (head), which is all that is needed
## to decide whether the data exceeds the largest threshold.
##
## Size classes: >10M, >1M, >500k, >100k and >10k lines. Each class starts
## the file with a comment line (">") and appends its parameters (">>").
## Below 10k lines only a "too small" message is printed and no config
## file is created.
##
## NOTE(review): the >10M branch previously wrote "GPUJOB_SUBMIT = -multipu",
## which looks like a typo for "-multigpu"; confirm against the names of the
## job submission targets.
${WORKDIR}/config:
	mkdir -p ${dir $@}
	if [ -e ${TRAIN_SRC}.clean.${PRE_SRC}${TRAINSIZE}.gz ]; then \
	  s=`zcat ${TRAIN_SRC}.clean.${PRE_SRC}${TRAINSIZE}.gz | head -10000001 | wc -l`; \
	else \
	  ${MAKE} ${LOCAL_TRAIN_SRC}; \
	  s=`head -10000001 ${LOCAL_TRAIN_SRC} | wc -l`; \
	fi; \
	if [ $$s -gt 10000000 ]; then \
	  echo "# ${LANGPAIRSTR} bigger than 10 million" > $@; \
	  echo "GPUJOB_HPC_MEM = 8g" >> $@; \
	  echo "GPUJOB_SUBMIT = -multigpu" >> $@; \
	elif [ $$s -gt 1000000 ]; then \
	  echo "# ${LANGPAIRSTR} bigger than 1 million" > $@; \
	  echo "GPUJOB_HPC_MEM = 8g" >> $@; \
	  echo "GPUJOB_SUBMIT = " >> $@; \
	  echo "MARIAN_VALID_FREQ = 2500" >> $@; \
	elif [ $$s -gt 500000 ]; then \
	  echo "# ${LANGPAIRSTR} bigger than 500k" > $@; \
	  echo "GPUJOB_HPC_MEM = 4g" >> $@; \
	  echo "GPUJOB_SUBMIT = " >> $@; \
	  echo "MARIAN_VALID_FREQ = 2500" >> $@; \
	  echo "MARIAN_WORKSPACE = 10000" >> $@; \
	  echo "BPESIZE = 12000" >> $@; \
	elif [ $$s -gt 100000 ]; then \
	  echo "# ${LANGPAIRSTR} bigger than 100k" > $@; \
	  echo "GPUJOB_HPC_MEM = 4g" >> $@; \
	  echo "GPUJOB_SUBMIT = " >> $@; \
	  echo "MARIAN_VALID_FREQ = 1000" >> $@; \
	  echo "MARIAN_WORKSPACE = 5000" >> $@; \
	  echo "MARIAN_VALID_MINI_BATCH = 8" >> $@; \
	  echo "HELDOUTSIZE = 0" >> $@; \
	  echo "BPESIZE = 4000" >> $@; \
	  echo "DEVSIZE = 1000" >> $@; \
	  echo "TESTSIZE = 1000" >> $@; \
	  echo "DEVMINSIZE = 250" >> $@; \
	elif [ $$s -gt 10000 ]; then \
	  echo "# ${LANGPAIRSTR} bigger than 10k" > $@; \
	  echo "GPUJOB_HPC_MEM = 4g" >> $@; \
	  echo "GPUJOB_SUBMIT = " >> $@; \
	  echo "MARIAN_VALID_FREQ = 1000" >> $@; \
	  echo "MARIAN_WORKSPACE = 3500" >> $@; \
	  echo "MARIAN_DROPOUT = 0.5" >> $@; \
	  echo "MARIAN_VALID_MINI_BATCH = 4" >> $@; \
	  echo "HELDOUTSIZE = 0" >> $@; \
	  echo "BPESIZE = 500" >> $@; \
	  echo "DEVSIZE = 500" >> $@; \
	  echo "TESTSIZE = 1000" >> $@; \
	  echo "DEVMINSIZE = 100" >> $@; \
	else \
	  echo "${LANGPAIRSTR} too small"; \
	fi
|
||||
|
||||
|
@ -183,11 +183,49 @@ all2en:
|
||||
${MAKE} PIVOT=en allopus2pivot
|
||||
|
||||
|
||||
#-------------------------------------------------------------------
|
||||
# wikimedia tasks
|
||||
#-------------------------------------------------------------------
|
||||
|
||||
## as-en: run the four as/en sub-goals one after another, first for the
## as->en direction and then for en->as. Each goal is handed to a separate
## sub-make and is resolved there by the %-as-en / %-en-as pattern rules.
## Declared phony because it names a task, not a file to be built.
.PHONY: as-en
as-en:
	$(MAKE) data-as-en
	$(MAKE) train-dynamic-as-en
	$(MAKE) reverse-data-as-en
	$(MAKE) train-dynamic-en-as
|
||||
|
||||
## %-as-en: strip the "-as-en" suffix from the requested target and run the
## remaining goal in a sub-make with SRCLANGS=as, TRGLANGS=en and the fixed
## data-split / BPE sizes given on the command line below.
%-as-en:
	$(MAKE) HELDOUTSIZE=0 DEVSIZE=1000 TESTSIZE=1000 DEVMINSIZE=100 BPESIZE=4000 \
		SRCLANGS="as" TRGLANGS="en" \
		$(patsubst %-as-en,%,$@)
|
||||
|
||||
## %-en-as: strip the "-en-as" suffix from the requested target and run the
## remaining goal in a sub-make with SRCLANGS=en, TRGLANGS=as and the fixed
## data-split / BPE sizes given on the command line below.
%-en-as:
	$(MAKE) HELDOUTSIZE=0 DEVSIZE=1000 TESTSIZE=1000 DEVMINSIZE=100 BPESIZE=4000 \
		SRCLANGS="en" TRGLANGS="as" \
		$(patsubst %-en-as,%,$@)
|
||||
|
||||
|
||||
|
||||
#-------------------------------------------------------------------
|
||||
# important secondary langs in Finland
|
||||
#-------------------------------------------------------------------
|
||||
|
||||
## fi-so: run the four fi/so sub-goals one after another, first for the
## fi->so direction and then for so->fi. Each goal is handed to a separate
## sub-make and is resolved there by the %-fi-so / %-so-fi pattern rules.
## Declared phony because it names a task, not a file to be built.
.PHONY: fi-so
fi-so:
	$(MAKE) data-fi-so
	$(MAKE) train-dynamic-fi-so
	$(MAKE) reverse-data-fi-so
	$(MAKE) train-dynamic-so-fi
|
||||
|
||||
## %-fi-so: strip the "-fi-so" suffix from the requested target and run the
## remaining goal in a sub-make with SRCLANGS=fi, TRGLANGS=so and the fixed
## data-split / BPE sizes given on the command line below.
## NOTE(review): "data" is always passed as an additional goal in front of
## the stripped target name - confirm that this is intended.
%-fi-so:
	$(MAKE) HELDOUTSIZE=0 BPESIZE=4000 DEVSIZE=1000 TESTSIZE=1000 DEVMINSIZE=100 \
		SRCLANGS=fi TRGLANGS=so data \
		$(patsubst %-fi-so,%,$@)
|
||||
|
||||
## %-so-fi: strip the "-so-fi" suffix from the requested target and run the
## remaining goal in a sub-make with SRCLANGS=so, TRGLANGS=fi and the fixed
## data-split / BPE sizes given on the command line below.
## NOTE(review): "data" is always passed as an additional goal in front of
## the stripped target name - confirm that this is intended.
%-so-fi:
	$(MAKE) HELDOUTSIZE=0 BPESIZE=4000 DEVSIZE=1000 TESTSIZE=1000 DEVMINSIZE=100 \
		SRCLANGS=so TRGLANGS=fi data \
		$(patsubst %-so-fi,%,$@)
|
||||
|
||||
|
||||
fi-xx:
|
||||
for l in ru et ar so ku fa sq vi th tr es pl; do \
|
||||
${MAKE} WALLTIME=72 SRCLANGS="$$l" TRGLANGS=fi \
|
||||
|
7
TODO.md
7
TODO.md
@ -2,6 +2,13 @@
|
||||
# Things to do
|
||||
|
||||
|
||||
## General settings
|
||||
|
||||
* better hyperparameters for low-resource settings (lower batch sizes, smaller vocabularies, ...)
|
||||
* better data selection (data cleaning / filtering); use opus-filter?
|
||||
* better balance between general data sets and backtranslations
|
||||
|
||||
|
||||
## Backtranslation
|
||||
|
||||
* status: basically working, need better integration?!
|
||||
|
@ -193,6 +193,10 @@ breton:
|
||||
${MAKE} SRC=br TRG=en MODELHOME=../models/ga+cy+br+gd+kw+gv-en all-wikis
|
||||
|
||||
|
||||
## assamese: run the all-wikis goal in a sub-make with SRC=as, TRG=en and
## MODELHOME pointing at the as-en model directory under the user's home.
## Declared phony because it names a task, not a file to be built.
.PHONY: assamese
assamese:
	$(MAKE) SRC=as TRG=en MODELHOME=$(HOME)/research/Opus-MT-train/work/models/as-en all-wikis
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@ -306,6 +310,47 @@ translate-sami-wiki:
|
||||
|
||||
|
||||
|
||||
### NEWNEWNEW
|
||||
|
||||
## translate-sami-xx-wiki: for source language se and each target language
## in "sma smn sms smj fi no sv", call the translate.submit goal in a
## sub-make, using the multi-target model directory given in MODELHOME.
## $$s / $$t are shell loop variables (the doubled $ keeps make from
## expanding them). The whole loop is one shell command, hence the
## trailing backslashes.
## Declared phony because it names a task, not a file to be built.
.PHONY: translate-sami-xx-wiki
translate-sami-xx-wiki:
	for s in se; do \
	  for t in sma smn sms smj fi no sv; do \
	    $(MAKE) SRC=$$s TRG=$$t \
		MULTI_TARGET_MODEL=1 \
		MODELHOME=$(HOME)/research/Opus-MT-train/models/se+sma+smj+smn+sms-fi+nb+no+nn+ru+sv+en \
		HPC_MEM=4g HPC_CORES=1 WALLTIME=72 \
		translate.submit; \
	  done \
	done
|
||||
|
||||
|
||||
## translate-sami-xx-corp: after the prerequisites sami-corp and
## giellatekno/se, loop over source languages "se sma smn sms smj" and
## target languages "fi no sv" and, whenever source and target differ, call
## the translate.submit goal in a sub-make with WIKI_DIR=giellatekno/<src>
## and WIKISOURCE=corp.
## The nested loops form ONE shell command, so every line except the last
## must end in a backslash; the outer "for ...; do" line was missing it,
## which split the loop across separate shells and made the recipe fail
## with a shell syntax error.
## Declared phony because it names a task, not a file to be built.
.PHONY: translate-sami-xx-corp
translate-sami-xx-corp: sami-corp giellatekno/se
	for s in se sma smn sms smj; do \
	  for t in fi no sv; do \
	    if [ "$$s" != "$$t" ]; then \
	      ${MAKE} SRC=$$s TRG=$$t \
		WIKI_DIR=giellatekno/$$s \
		WIKISOURCE=corp \
		MULTI_TARGET_MODEL=1 \
		MODELHOME=${HOME}/research/Opus-MT-train/models/se+sma+smj+smn+sms-fi+nb+no+nn+ru+sv+en \
		HPC_MEM=4g HPC_CORES=1 WALLTIME=72 \
		translate.submit; \
	    fi \
	  done \
	done
|
||||
|
||||
## translate-xx-sami-wiki: for each source language in "fi no nn sv" and
## each target language in "se sma smn sms smj", call the translate.submit
## goal in a sub-make, using the multi-target model directory given in
## MODELHOME.
## $$s / $$t are shell loop variables (the doubled $ keeps make from
## expanding them). The whole loop is one shell command, hence the
## trailing backslashes.
## Declared phony because it names a task, not a file to be built.
.PHONY: translate-xx-sami-wiki
translate-xx-sami-wiki:
	for s in fi no nn sv; do \
	  for t in se sma smn sms smj; do \
	    $(MAKE) SRC=$$s TRG=$$t \
		MULTI_TARGET_MODEL=1 \
		MODELHOME=$(HOME)/research/Opus-MT-train/models/fi+nb+no+nn+ru+sv+en-se+sma+smj+smn+sms \
		HPC_MEM=4g HPC_CORES=1 WALLTIME=72 \
		translate.submit; \
	  done \
	done
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user