mirror of
https://github.com/Helsinki-NLP/OPUS-MT-train.git
synced 2024-11-30 12:32:24 +03:00
back to yml vocab files as default
This commit is contained in:
parent
f9a44bdb99
commit
c6356d3a8a
4
TODO.md
4
TODO.md
@ -34,3 +34,7 @@
|
||||
* focus languages: Tagalog (tl, tgl), Central Bikol (bcl), Malayalam (ml, mal), Bengali (bn, ben), and Mongolian (mn, mon)
|
||||
|
||||
|
||||
|
||||
# Other requests
|
||||
|
||||
* Hebrew-->English and Hebrew-->Russian (Shaul Dar)
|
@ -91,9 +91,8 @@ The variables can be set to override defaults. See below to understand how the d
|
||||
Data sets, vocabulary, alignments and segmentation models will be stored in the work directory of the model (`work/LANGPAIRSTR/`). Here is an example for the language pair br-en:
|
||||
|
||||
```
|
||||
# MarianNMT vocabulary files:
|
||||
work/br-en/opus.spm4k-spm4k.src.vocab
|
||||
work/br-en/opus.spm4k-spm4k.trg.vocab
|
||||
# MarianNMT vocabulary file:
|
||||
work/br-en/opus.spm4k-spm4k.vocab.yml
|
||||
|
||||
# test data:
|
||||
work/br-en/test/README.md
|
||||
@ -166,7 +165,7 @@ Currently, the makefile looks at the local copy of released OPUS data to find av
|
||||
Most settings can be adjusted by setting corresponding variables to new values. Common changes are:
|
||||
|
||||
* run word-alignment and train with guided alignment: set `MODELTYPE=transformer-align`
|
||||
* generate the vocabulary file from training data instead of using the sentence piece model: `USE_SPM_VOCAB=0`
|
||||
* generate the vocabulary files directly from the sentence piece model: `USE_SPM_VOCAB=1`
|
||||
* change the vocabulary size: set `BPESIZE=<yourvalue>`, for example `BPESIZE=4000` (this is also used for sentence-piece models)
|
||||
* vocabulary sizes can also be set for source and target language independently (`SRCBPESIZE` and `TRGBPESIZE`)
|
||||
* use BPE instead of sentence-piece (not recommended): set `SUBWORDS=bpe`
|
||||
|
@ -20,7 +20,7 @@ make SRCLANGS=en TRGLANGS=br config
|
||||
make SRCLANGS=en TRGLANGS=br data
|
||||
```
|
||||
|
||||
This will also download the necessary files if they don't exist on the local file system. It will train sentence piece models for each language separately and apply the model to all data sets. Finally, it also creates the vocabulary files from the sentence-piece models.
|
||||
This will also download the necessary files if they don't exist on the local file system. It will train sentence piece models for each language separately and apply the model to all data sets. Finally, it also creates the vocabulary file from the training data.
|
||||
|
||||
|
||||
## Train the model
|
||||
@ -122,8 +122,7 @@ backtranslate/br-en/opus-2020-09-04/source.spm
|
||||
backtranslate/br-en/opus-2020-09-04/target.spm
|
||||
backtranslate/br-en/opus-2020-09-04/preprocess.sh
|
||||
backtranslate/br-en/opus-2020-09-04/postprocess.sh
|
||||
backtranslate/br-en/opus-2020-09-04/opus.spm4k-spm4k.src.vocab
|
||||
backtranslate/br-en/opus-2020-09-04/opus.spm4k-spm4k.trg.vocab
|
||||
backtranslate/br-en/opus-2020-09-04/opus.spm4k-spm4k.vocab.yml
|
||||
backtranslate/br-en/opus-2020-09-04/opus.spm4k-spm4k.transformer.model1.npz.best-perplexity.npz
|
||||
```
|
||||
|
||||
@ -198,7 +197,7 @@ make SRCLANGS=en TRGLANGS=br data-bt
|
||||
make SRCLANGS=en TRGLANGS=br train-bt
|
||||
```
|
||||
|
||||
Those commands will re-use existing sentence-piece models, vocabulary files and will initialize the model with the one trained on OPUS data without back-translations. The early stopping settings are increased to 15 iterations without improvement. The model name will now be changed into `opus+bt` and logs will be stored `br-en/opus+bt.spm4k-spm4k.transformer.train1.log` with validation scores in `br-en/opus+bt.spm4k-spm4k.transformer.valid1.log`
|
||||
Those commands will re-use the existing sentence-piece models and vocabulary file, and will initialize the model with the one trained on OPUS data without back-translations. The early stopping settings are increased to 15 iterations without improvement. The model name will now be changed to `opus+bt`, and logs will be stored in `br-en/opus+bt.spm4k-spm4k.transformer.train1.log` with validation scores in `br-en/opus+bt.spm4k-spm4k.transformer.valid1.log`
|
||||
|
||||
Evaluation can be done in the same way:
|
||||
|
||||
|
@ -347,11 +347,14 @@ MODEL_DECODER = ${MODEL_FINAL}.decoder.yml
|
||||
## for others: extract vocabulary from training data with MarianNMT
|
||||
## backwards compatibility: if there is already a vocab-file then use it
|
||||
|
||||
ifeq (${SUBWORDS},spm)
|
||||
ifeq ($(wildcard ${WORKDIR}/${MODEL}.vocab.yml),)
|
||||
USE_SPM_VOCAB ?= 1
|
||||
endif
|
||||
endif
|
||||
# ifeq (${SUBWORDS},spm)
|
||||
# ifeq ($(wildcard ${WORKDIR}/${MODEL}.vocab.yml),)
|
||||
# USE_SPM_VOCAB ?= 1
|
||||
# endif
|
||||
# endif
|
||||
|
||||
## use vocab from sentence piece instead of
|
||||
## marian_vocab from training data
|
||||
|
||||
ifeq ($(USE_SPM_VOCAB),1)
|
||||
MODEL_VOCAB = ${WORKDIR}/${MODEL}.vocab
|
||||
|
119
lib/data.mk
119
lib/data.mk
@ -362,8 +362,8 @@ ifdef CHECK_TRAINDATA_SIZE
|
||||
echo ${CLEAN_TRAIN_TRG}; \
|
||||
fi
|
||||
endif
|
||||
echo -n "* ${SRC}-${TRG}: " >> ${dir ${LOCAL_TRAIN_SRC}}README.md
|
||||
for d in ${wildcard ${CLEAN_TRAIN_SRC}}; do \
|
||||
@echo -n "* ${SRC}-${TRG}: " >> ${dir ${LOCAL_TRAIN_SRC}}README.md
|
||||
@for d in ${wildcard ${CLEAN_TRAIN_SRC}}; do \
|
||||
l=`${GZIP} -cd < $$d ${CUT_DATA_SETS} | wc -l`; \
|
||||
if [ $$l -gt 0 ]; then \
|
||||
echo "$$d" | xargs basename | \
|
||||
@ -378,45 +378,47 @@ endif
|
||||
# do we need to add target language labels?
|
||||
######################################
|
||||
ifeq (${USE_TARGET_LABELS},1)
|
||||
echo "set target language labels";
|
||||
${ZCAT} ${wildcard ${CLEAN_TRAIN_SRC}} ${CUT_DATA_SETS} |\
|
||||
@echo "set target language labels";
|
||||
@${ZCAT} ${wildcard ${CLEAN_TRAIN_SRC}} ${CUT_DATA_SETS} |\
|
||||
sed "s/^/>>${TRG}<< /" > ${LOCAL_TRAIN_SRC}.${LANGPAIR}.src
|
||||
else
|
||||
echo "only one target language"
|
||||
${ZCAT} ${wildcard ${CLEAN_TRAIN_SRC}} ${CUT_DATA_SETS} > ${LOCAL_TRAIN_SRC}.${LANGPAIR}.src
|
||||
@echo "only one target language"
|
||||
@${ZCAT} ${wildcard ${CLEAN_TRAIN_SRC}} ${CUT_DATA_SETS} > ${LOCAL_TRAIN_SRC}.${LANGPAIR}.src
|
||||
endif
|
||||
${ZCAT} ${wildcard ${CLEAN_TRAIN_TRG}} ${CUT_DATA_SETS} > ${LOCAL_TRAIN_TRG}.${LANGPAIR}.trg
|
||||
@${ZCAT} ${wildcard ${CLEAN_TRAIN_TRG}} ${CUT_DATA_SETS} > ${LOCAL_TRAIN_TRG}.${LANGPAIR}.trg
|
||||
######################################
|
||||
# SHUFFLE_DATA is set?
|
||||
# --> shuffle data for each langpair
|
||||
# --> do this when FIT_DATA_SIZE is set!
|
||||
######################################
|
||||
ifdef SHUFFLE_DATA
|
||||
paste ${LOCAL_TRAIN_SRC}.${LANGPAIR}.src ${LOCAL_TRAIN_TRG}.${LANGPAIR}.trg |\
|
||||
@echo "shuffle training data"
|
||||
@paste ${LOCAL_TRAIN_SRC}.${LANGPAIR}.src ${LOCAL_TRAIN_TRG}.${LANGPAIR}.trg |\
|
||||
${SHUFFLE} > ${LOCAL_TRAIN_SRC}.shuffled
|
||||
cut -f1 ${LOCAL_TRAIN_SRC}.shuffled > ${LOCAL_TRAIN_SRC}.${LANGPAIR}.src
|
||||
cut -f2 ${LOCAL_TRAIN_SRC}.shuffled > ${LOCAL_TRAIN_TRG}.${LANGPAIR}.trg
|
||||
rm -f ${LOCAL_TRAIN_SRC}.shuffled
|
||||
@cut -f1 ${LOCAL_TRAIN_SRC}.shuffled > ${LOCAL_TRAIN_SRC}.${LANGPAIR}.src
|
||||
@cut -f2 ${LOCAL_TRAIN_SRC}.shuffled > ${LOCAL_TRAIN_TRG}.${LANGPAIR}.trg
|
||||
@rm -f ${LOCAL_TRAIN_SRC}.shuffled
|
||||
endif
|
||||
######################################
|
||||
# FIT_DATA_SIZE is set?
|
||||
# --> fit data to specific size
|
||||
# --> under/over sampling!
|
||||
######################################
|
||||
echo -n "* ${SRC}-${TRG}: total size = " >> ${dir ${LOCAL_TRAIN_SRC}}README.md
|
||||
@echo -n "* ${SRC}-${TRG}: total size = " >> ${dir ${LOCAL_TRAIN_SRC}}README.md
|
||||
ifdef FIT_DATA_SIZE
|
||||
scripts/fit-data-size.pl -m ${MAX_OVER_SAMPLING} ${FIT_DATA_SIZE} \
|
||||
@echo "sample data to fit size = ${FIT_DATA_SIZE}"
|
||||
@scripts/fit-data-size.pl -m ${MAX_OVER_SAMPLING} ${FIT_DATA_SIZE} \
|
||||
${LOCAL_TRAIN_SRC}.${LANGPAIR}.src | wc -l >> ${dir ${LOCAL_TRAIN_SRC}}README.md
|
||||
scripts/fit-data-size.pl -m ${MAX_OVER_SAMPLING} ${FIT_DATA_SIZE} \
|
||||
@scripts/fit-data-size.pl -m ${MAX_OVER_SAMPLING} ${FIT_DATA_SIZE} \
|
||||
${LOCAL_TRAIN_SRC}.${LANGPAIR}.src >> ${LOCAL_TRAIN_SRC}
|
||||
scripts/fit-data-size.pl -m ${MAX_OVER_SAMPLING} ${FIT_DATA_SIZE} \
|
||||
@scripts/fit-data-size.pl -m ${MAX_OVER_SAMPLING} ${FIT_DATA_SIZE} \
|
||||
${LOCAL_TRAIN_TRG}.${LANGPAIR}.trg >> ${LOCAL_TRAIN_TRG}
|
||||
else
|
||||
cat ${LOCAL_TRAIN_SRC}.${LANGPAIR}.src | wc -l >> ${dir ${LOCAL_TRAIN_SRC}}README.md
|
||||
cat ${LOCAL_TRAIN_SRC}.${LANGPAIR}.src >> ${LOCAL_TRAIN_SRC}
|
||||
cat ${LOCAL_TRAIN_TRG}.${LANGPAIR}.trg >> ${LOCAL_TRAIN_TRG}
|
||||
@cat ${LOCAL_TRAIN_SRC}.${LANGPAIR}.src | wc -l >> ${dir ${LOCAL_TRAIN_SRC}}README.md
|
||||
@cat ${LOCAL_TRAIN_SRC}.${LANGPAIR}.src >> ${LOCAL_TRAIN_SRC}
|
||||
@cat ${LOCAL_TRAIN_TRG}.${LANGPAIR}.trg >> ${LOCAL_TRAIN_TRG}
|
||||
endif
|
||||
rm -f ${LOCAL_TRAIN_SRC}.${LANGPAIR}.src ${LOCAL_TRAIN_TRG}.${LANGPAIR}.trg
|
||||
@rm -f ${LOCAL_TRAIN_SRC}.${LANGPAIR}.src ${LOCAL_TRAIN_TRG}.${LANGPAIR}.trg
|
||||
|
||||
|
||||
|
||||
@ -468,14 +470,16 @@ ${DEV_SRC}: %: %.shuffled.gz
|
||||
## ---> make sure that we do not have any overlap between the two data sets
|
||||
## ---> reserve at least DEVMINSIZE data for dev data and keep the rest for testing
|
||||
ifeq (${DEVSET},${TESTSET})
|
||||
if (( `${GZIP} -cd < $< | wc -l` < $$((${DEVSIZE} + ${TESTSIZE})) )); then \
|
||||
@if (( `${GZIP} -cd < $< | wc -l` < $$((${DEVSIZE} + ${TESTSIZE})) )); then \
|
||||
if (( `${GZIP} -cd < $< | wc -l` < $$((${DEVSMALLSIZE} + ${DEVMINSIZE})) )); then \
|
||||
echo "extract ${DEVMINSIZE} examples from ${DEVSET} for dev and test"; \
|
||||
${GZIP} -cd < $< | cut -f1 | head -${DEVMINSIZE} > ${DEV_SRC}; \
|
||||
${GZIP} -cd < $< | cut -f2 | head -${DEVMINSIZE} > ${DEV_TRG}; \
|
||||
mkdir -p ${dir ${TEST_SRC}}; \
|
||||
${GZIP} -cd < $< | cut -f1 | tail -n +$$((${DEVMINSIZE} + 1)) > ${TEST_SRC}; \
|
||||
${GZIP} -cd < $< | cut -f2 | tail -n +$$((${DEVMINSIZE} + 1)) > ${TEST_TRG}; \
|
||||
else \
|
||||
echo "extract ${DEVSMALLSIZE} examples from ${DEVSET} for dev and test"; \
|
||||
${GZIP} -cd < $< | cut -f1 | head -${DEVSMALLSIZE} > ${DEV_SRC}; \
|
||||
${GZIP} -cd < $< | cut -f2 | head -${DEVSMALLSIZE} > ${DEV_TRG}; \
|
||||
mkdir -p ${dir ${TEST_SRC}}; \
|
||||
@ -483,34 +487,37 @@ ifeq (${DEVSET},${TESTSET})
|
||||
${GZIP} -cd < $< | cut -f2 | tail -n +$$((${DEVSMALLSIZE} + 1)) > ${TEST_TRG}; \
|
||||
fi; \
|
||||
else \
|
||||
echo "extract ${DEVSIZE} examples from ${DEVSET} for dev"; \
|
||||
echo "extract ${TESTSIZE} examples from ${DEVSET} for test"; \
|
||||
${GZIP} -cd < $< | cut -f1 | head -${DEVSIZE} > ${DEV_SRC}; \
|
||||
${GZIP} -cd < $< | cut -f2 | head -${DEVSIZE} > ${DEV_TRG}; \
|
||||
mkdir -p ${dir ${TEST_SRC}}; \
|
||||
${GZIP} -cd < $< | cut -f1 | head -$$((${DEVSIZE} + ${TESTSIZE})) | tail -${TESTSIZE} > ${TEST_SRC}; \
|
||||
${GZIP} -cd < $< | cut -f2 | head -$$((${DEVSIZE} + ${TESTSIZE})) | tail -${TESTSIZE} > ${TEST_TRG}; \
|
||||
${GZIP} -cd < $< | cut -f1 | tail -n +$$((${DEVSIZE} + ${TESTSIZE})) | ${GZIP} -c > ${DEV_SRC}.notused.gz; \
|
||||
${GZIP} -cd < $< | cut -f2 | tail -n +$$((${DEVSIZE} + ${TESTSIZE})) | ${GZIP} -c > ${DEV_TRG}.notused.gz; \
|
||||
${GZIP} -cd < $< | cut -f1 | tail -n +$$((${DEVSIZE} + ${TESTSIZE} + 1)) | ${GZIP} -c > ${DEV_SRC}.notused.gz; \
|
||||
${GZIP} -cd < $< | cut -f2 | tail -n +$$((${DEVSIZE} + ${TESTSIZE} + 1)) | ${GZIP} -c > ${DEV_TRG}.notused.gz; \
|
||||
fi
|
||||
else
|
||||
${GZIP} -cd < $< | cut -f1 | head -${DEVSIZE} > ${DEV_SRC}
|
||||
${GZIP} -cd < $< | cut -f2 | head -${DEVSIZE} > ${DEV_TRG}
|
||||
${GZIP} -cd < $< | cut -f1 | tail -n +$$((${DEVSIZE} + 1)) | ${GZIP} -c > ${DEV_SRC}.notused.gz
|
||||
${GZIP} -cd < $< | cut -f2 | tail -n +$$((${DEVSIZE} + 1)) | ${GZIP} -c > ${DEV_TRG}.notused.gz
|
||||
@echo "extract ${DEVSIZE} examples from ${DEVSET} for dev"
|
||||
@${GZIP} -cd < $< | cut -f1 | head -${DEVSIZE} > ${DEV_SRC}
|
||||
@${GZIP} -cd < $< | cut -f2 | head -${DEVSIZE} > ${DEV_TRG}
|
||||
@${GZIP} -cd < $< | cut -f1 | tail -n +$$((${DEVSIZE} + 1)) | ${GZIP} -c > ${DEV_SRC}.notused.gz
|
||||
@${GZIP} -cd < $< | cut -f2 | tail -n +$$((${DEVSIZE} + 1)) | ${GZIP} -c > ${DEV_TRG}.notused.gz
|
||||
endif
|
||||
echo "" >> ${dir ${DEV_SRC}}/README.md
|
||||
echo -n "* devset = top " >> ${dir ${DEV_SRC}}/README.md
|
||||
wc -l < ${DEV_SRC} | tr "\n" ' ' >> ${dir ${DEV_SRC}}/README.md
|
||||
echo " lines of ${notdir $@}.shuffled!" >> ${dir ${DEV_SRC}}/README.md
|
||||
@echo "" >> ${dir ${DEV_SRC}}/README.md
|
||||
@echo -n "* devset = top " >> ${dir ${DEV_SRC}}/README.md
|
||||
@wc -l < ${DEV_SRC} | tr "\n" ' ' >> ${dir ${DEV_SRC}}/README.md
|
||||
@echo " lines of ${notdir $@}.shuffled!" >> ${dir ${DEV_SRC}}/README.md
|
||||
ifeq (${DEVSET},${TESTSET})
|
||||
echo -n "* testset = next " >> ${dir ${DEV_SRC}}/README.md
|
||||
wc -l < ${TEST_SRC} | tr "\n" ' ' >> ${dir ${DEV_SRC}}/README.md
|
||||
echo " lines of ${notdir $@}.shuffled!" >> ${dir ${DEV_SRC}}/README.md
|
||||
echo "* remaining lines are added to traindata" >> ${dir ${DEV_SRC}}/README.md
|
||||
echo "# Test data" > ${dir ${TEST_SRC}}/README.md
|
||||
echo "" >> ${dir ${TEST_SRC}}/README.md
|
||||
echo -n "testset = next " >> ${dir ${TEST_SRC}}/README.md
|
||||
wc -l < ${TEST_SRC} | tr "\n" ' ' >> ${dir ${TEST_SRC}}/README.md
|
||||
echo " lines of ../val/${notdir $@}.shuffled!" >> ${dir ${TEST_SRC}}/README.md
|
||||
@echo -n "* testset = next " >> ${dir ${DEV_SRC}}/README.md
|
||||
@wc -l < ${TEST_SRC} | tr "\n" ' ' >> ${dir ${DEV_SRC}}/README.md
|
||||
@echo " lines of ${notdir $@}.shuffled!" >> ${dir ${DEV_SRC}}/README.md
|
||||
@echo "* remaining lines are added to traindata" >> ${dir ${DEV_SRC}}/README.md
|
||||
@echo "# Test data" > ${dir ${TEST_SRC}}/README.md
|
||||
@echo "" >> ${dir ${TEST_SRC}}/README.md
|
||||
@echo -n "testset = next " >> ${dir ${TEST_SRC}}/README.md
|
||||
@wc -l < ${TEST_SRC} | tr "\n" ' ' >> ${dir ${TEST_SRC}}/README.md
|
||||
@echo " lines of ../val/${notdir $@}.shuffled!" >> ${dir ${TEST_SRC}}/README.md
|
||||
endif
|
||||
|
||||
|
||||
@ -519,18 +526,19 @@ ${DEV_TRG}: ${DEV_SRC}
|
||||
|
||||
|
||||
add-to-dev-data: ${CLEAN_DEV_SRC} ${CLEAN_DEV_TRG}
|
||||
mkdir -p ${dir ${DEV_SRC}}
|
||||
echo -n "* ${LANGPAIR}: ${DEVSET}, " >> ${dir ${DEV_SRC}}README.md
|
||||
${ZCAT} ${CLEAN_DEV_SRC} | wc -l >> ${dir ${DEV_SRC}}README.md
|
||||
@echo "add to devset: ${CLEAN_DEV_SRC}"
|
||||
@mkdir -p ${dir ${DEV_SRC}}
|
||||
@echo -n "* ${LANGPAIR}: ${DEVSET}, " >> ${dir ${DEV_SRC}}README.md
|
||||
@${ZCAT} ${CLEAN_DEV_SRC} | wc -l >> ${dir ${DEV_SRC}}README.md
|
||||
ifeq (${USE_TARGET_LABELS},1)
|
||||
echo "more than one target language";
|
||||
${ZCAT} ${CLEAN_DEV_SRC} |\
|
||||
@echo "more than one target language";
|
||||
@${ZCAT} ${CLEAN_DEV_SRC} |\
|
||||
sed "s/^/>>${TRG}<< /" >> ${DEV_SRC}
|
||||
else
|
||||
echo "only one target language"
|
||||
${ZCAT} ${CLEAN_DEV_SRC} >> ${DEV_SRC}
|
||||
@echo "only one target language"
|
||||
@${ZCAT} ${CLEAN_DEV_SRC} >> ${DEV_SRC}
|
||||
endif
|
||||
${ZCAT} ${CLEAN_DEV_TRG} >> ${DEV_TRG}
|
||||
@${ZCAT} ${CLEAN_DEV_TRG} >> ${DEV_TRG}
|
||||
|
||||
|
||||
####################
|
||||
@ -590,16 +598,17 @@ ${TEST_TRG}: ${TEST_SRC}
|
||||
@echo "done!"
|
||||
|
||||
add-to-test-data: ${CLEAN_TEST_SRC}
|
||||
echo "* ${LANGPAIR}: ${TESTSET}" >> ${dir ${TEST_SRC}}README.md
|
||||
@echo "add to testset: ${CLEAN_TEST_SRC}"
|
||||
@echo "* ${LANGPAIR}: ${TESTSET}" >> ${dir ${TEST_SRC}}README.md
|
||||
ifeq (${USE_TARGET_LABELS},1)
|
||||
echo "more than one target language";
|
||||
${ZCAT} ${CLEAN_TEST_SRC} |\
|
||||
@echo "more than one target language";
|
||||
@${ZCAT} ${CLEAN_TEST_SRC} |\
|
||||
sed "s/^/>>${TRG}<< /" >> ${TEST_SRC}
|
||||
else
|
||||
echo "only one target language"
|
||||
${ZCAT} ${CLEAN_TEST_SRC} >> ${TEST_SRC}
|
||||
@echo "only one target language"
|
||||
@${ZCAT} ${CLEAN_TEST_SRC} >> ${TEST_SRC}
|
||||
endif
|
||||
${ZCAT} ${CLEAN_TEST_TRG} >> ${TEST_TRG}
|
||||
@${ZCAT} ${CLEAN_TEST_TRG} >> ${TEST_TRG}
|
||||
|
||||
|
||||
|
||||
@ -614,6 +623,10 @@ endif
|
||||
|
||||
|
||||
|
||||
## monolingual data: for language-specific sentence piece models
|
||||
## that are independent of bitexts
|
||||
## TODO: do we use this?
|
||||
|
||||
${LOCAL_MONO_DATA}.raw:
|
||||
mkdir -p ${dir $@}
|
||||
rm -f $@
|
||||
|
@ -102,6 +102,7 @@ else ifneq ($(wildcard /wrk/tiedeman/research),)
|
||||
LOADMODS = ${LOADGPU}
|
||||
else ifeq (${shell hostname --domain 2>/dev/null},bullx)
|
||||
CSCPROJECT = project_2002688
|
||||
# CSCPROJECT = project_2002982
|
||||
WORKHOME = ${shell realpath ${PWD}/work}
|
||||
APPLHOME = /projappl/project_2001194
|
||||
OPUSHOME = /projappl/nlpl/data/OPUS
|
||||
|
@ -7,6 +7,16 @@ MEMAD_LANGS = de en fi fr nl sv
|
||||
# models for the MeMAD project
|
||||
#-------------------------------------------------------------------
|
||||
|
||||
# FIT_DATA_SIZE=2000000
|
||||
|
||||
memad-multi-subs:
|
||||
${MAKE} SRCLANGS="${MEMAD_LANGS}" TRGLANGS="${MEMAD_LANGS}" \
|
||||
SKIP_LANGPAIRS="de-de|en-en|fi-fi|fr-fr|nl-nl|sv-sv" \
|
||||
DEVSET=OpenSubtitles TRAINSET= MODELTYPE=transformer data
|
||||
${MAKE} SRCLANGS="${MEMAD_LANGS}" TRGLANGS="${MEMAD_LANGS}" \
|
||||
SKIP_LANGPAIRS="de-de|en-en|fi-fi|fr-fr|nl-nl|sv-sv" \
|
||||
DEVSET=OpenSubtitles TRAINSET= MODELTYPE=transformer \
|
||||
WALLTIME=72 HPC_MEM=8g HPC_CORES=1 train.submit-multigpu
|
||||
|
||||
memad-multi-train:
|
||||
${MAKE} SRCLANGS="${MEMAD_LANGS}" TRGLANGS="${MEMAD_LANGS}" MODELTYPE=transformer data
|
||||
|
Loading…
Reference in New Issue
Block a user