back to yml vocab files as default

Joerg Tiedemann 2020-09-25 09:58:25 +03:00
parent f9a44bdb99
commit c6356d3a8a
7 changed files with 95 additions and 66 deletions

View File

@@ -34,3 +34,7 @@
* focus languages: Tagalog (tl, tgl), Central Bikol (bcl), Malayalam (ml, mal), Bengali (bn, ben), and Mongolian (mn, mon)
+# Other requests
+* Hebrew-->English and Hebrew-->Russian (Shaul Dar)

View File

@@ -91,9 +91,8 @@ The variables can be set to override defaults. See below to understand how the d
Data sets, vocabulary, alignments and segmentation models will be stored in the work directory of the model (`work/LANGPAIRSTR/`). Here is an example for the language pair br-en:
```
-# MarianNMT vocabulary files:
-work/br-en/opus.spm4k-spm4k.src.vocab
-work/br-en/opus.spm4k-spm4k.trg.vocab
+# MarianNMT vocabulary file:
+work/br-en/opus.spm4k-spm4k.vocab.yml
# test data:
work/br-en/test/README.md
@@ -166,7 +165,7 @@ Currently, the makefile looks at the local copy of released OPUS data to find av
Most settings can be adjusted by setting the corresponding variables to new values. Common changes are listed below (see the example invocation after this list):
* run word-alignment and train with guided alignment: set `MODELTYPE=transformer-align`
-* generate the vocabulary file from training data instead of using the sentence piece model: `USE_SPM_VOCAB=0`
+* generate the vocabulary files directly from the sentence piece model: `USE_SPM_VOCAB=1`
* change the vocabulary size: set `BPESIZE=<yourvalue>`, for example `BPESIZE=4000` (this is also used for sentence-piece models)
* vocabulary sizes can also be set for source and target language independently (`SRCBPESIZE` and `TRGBPESIZE`)
* use BPE instead of sentence-piece (not recommended): set `SUBWORDS=bpe`
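These are ordinary make variables, so several overrides can be combined in one invocation. A purely illustrative example (the variables are documented above; the values are examples, not recommendations):
```
# hypothetical invocation: guided alignment, 4k subword vocabulary,
# and vocabularies taken from the sentence-piece models
make SRCLANGS=br TRGLANGS=en \
     MODELTYPE=transformer-align \
     BPESIZE=4000 USE_SPM_VOCAB=1 \
     train
```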

View File

@@ -20,7 +20,7 @@ make SRCLANGS=en TRGLANGS=br config
make SRCLANGS=en TRGLANGS=br data
```
-This will also download the necessary files if they don't exist on the local file system. It will train sentence piece models for each language separately and apply the model to all data sets. Finally, it also creates the vocabulary files from the sentence-piece models.
+This will also download the necessary files if they don't exist on the local file system. It will train sentence piece models for each language separately and apply the model to all data sets. Finally, it also creates the vocabulary file from the training data.
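In SentencePiece terms, the per-language steps behind `make ... data` amount to something like the following sketch (file names here are made up for illustration; `spm_train` and `spm_encode` are the standard SentencePiece tools):
```
# train one sentence-piece model per language (illustrative names)
spm_train --input=mono.br.txt --model_prefix=br --vocab_size=4000
# apply it to every data set of that language
spm_encode --model=br.model < train.br > train.br.spm4k
```
The YAML vocabulary file is then built from the segmented training data in a separate step (see the vocabulary sketch further below).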
## Train the model
@@ -122,8 +122,7 @@ backtranslate/br-en/opus-2020-09-04/source.spm
backtranslate/br-en/opus-2020-09-04/target.spm
backtranslate/br-en/opus-2020-09-04/preprocess.sh
backtranslate/br-en/opus-2020-09-04/postprocess.sh
-backtranslate/br-en/opus-2020-09-04/opus.spm4k-spm4k.src.vocab
-backtranslate/br-en/opus-2020-09-04/opus.spm4k-spm4k.trg.vocab
+backtranslate/br-en/opus-2020-09-04/opus.spm4k-spm4k.vocab.yml
backtranslate/br-en/opus-2020-09-04/opus.spm4k-spm4k.transformer.model1.npz.best-perplexity.npz
```
@@ -198,7 +197,7 @@ make SRCLANGS=en TRGLANGS=br data-bt
make SRCLANGS=en TRGLANGS=br train-bt
```
-Those commands will re-use the existing sentence-piece models and vocabulary files, and will initialize the model with the one trained on OPUS data without back-translations. The early-stopping setting is increased to 15 iterations without improvement. The model name will now be changed to `opus+bt`, and logs will be stored in `br-en/opus+bt.spm4k-spm4k.transformer.train1.log` with validation scores in `br-en/opus+bt.spm4k-spm4k.transformer.valid1.log`.
+Those commands will re-use the existing sentence-piece models and vocabulary file, and will initialize the model with the one trained on OPUS data without back-translations. The early-stopping setting is increased to 15 iterations without improvement. The model name will now be changed to `opus+bt`, and logs will be stored in `br-en/opus+bt.spm4k-spm4k.transformer.train1.log` with validation scores in `br-en/opus+bt.spm4k-spm4k.transformer.valid1.log`.
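On the Marian side, this setup corresponds roughly to options like the ones below. This is a hedged sketch with standard MarianNMT flags, not the exact command the Makefile assembles, and the model path only follows the naming conventions shown earlier:
```
# illustrative: initialize from the OPUS-only model, shared yml vocabulary,
# and more patient early stopping (remaining options omitted)
marian --pretrained-model work/br-en/opus.spm4k-spm4k.transformer.model1.npz.best-perplexity.npz \
       --vocabs work/br-en/opus.spm4k-spm4k.vocab.yml work/br-en/opus.spm4k-spm4k.vocab.yml \
       --early-stopping 15 ...
```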
Evaluation can be done in the same way:

View File

@@ -347,11 +347,14 @@ MODEL_DECODER = ${MODEL_FINAL}.decoder.yml
## for others: extract vocabulary from training data with MarianNMT
## backwards compatibility: if there is already a vocab-file then use it
-ifeq (${SUBWORDS},spm)
-ifeq ($(wildcard ${WORKDIR}/${MODEL}.vocab.yml),)
-USE_SPM_VOCAB ?= 1
-endif
-endif
+# ifeq (${SUBWORDS},spm)
+# ifeq ($(wildcard ${WORKDIR}/${MODEL}.vocab.yml),)
+# USE_SPM_VOCAB ?= 1
+# endif
+# endif
## use vocab from sentence piece instead of
## marian_vocab from training data
ifeq ($(USE_SPM_VOCAB),1)
MODEL_VOCAB = ${WORKDIR}/${MODEL}.vocab
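In effect, the switch selects between two ways of producing the vocabulary. Roughly, and with illustrative file names (`marian-vocab` ships with MarianNMT, `spm_export_vocab` with SentencePiece; the real recipes add more plumbing):
```
# default again after this commit: one joint yml vocabulary from the training text
cat train.src train.trg | marian-vocab > opus.spm4k-spm4k.vocab.yml

# USE_SPM_VOCAB=1: separate vocabularies derived from the sentence-piece models
spm_export_vocab --model=source.spm > opus.spm4k-spm4k.src.vocab
spm_export_vocab --model=target.spm > opus.spm4k-spm4k.trg.vocab
```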

View File

@@ -362,8 +362,8 @@ ifdef CHECK_TRAINDATA_SIZE
echo ${CLEAN_TRAIN_TRG}; \
fi
endif
-echo -n "* ${SRC}-${TRG}: " >> ${dir ${LOCAL_TRAIN_SRC}}README.md
-for d in ${wildcard ${CLEAN_TRAIN_SRC}}; do \
+@echo -n "* ${SRC}-${TRG}: " >> ${dir ${LOCAL_TRAIN_SRC}}README.md
+@for d in ${wildcard ${CLEAN_TRAIN_SRC}}; do \
l=`${GZIP} -cd < $$d ${CUT_DATA_SETS} | wc -l`; \
if [ $$l -gt 0 ]; then \
echo "$$d" | xargs basename | \
@@ -378,45 +378,47 @@ endif
# do we need to add target language labels?
######################################
ifeq (${USE_TARGET_LABELS},1)
-echo "set target language labels";
-${ZCAT} ${wildcard ${CLEAN_TRAIN_SRC}} ${CUT_DATA_SETS} |\
+@echo "set target language labels";
+@${ZCAT} ${wildcard ${CLEAN_TRAIN_SRC}} ${CUT_DATA_SETS} |\
sed "s/^/>>${TRG}<< /" > ${LOCAL_TRAIN_SRC}.${LANGPAIR}.src
else
-echo "only one target language"
-${ZCAT} ${wildcard ${CLEAN_TRAIN_SRC}} ${CUT_DATA_SETS} > ${LOCAL_TRAIN_SRC}.${LANGPAIR}.src
+@echo "only one target language"
+@${ZCAT} ${wildcard ${CLEAN_TRAIN_SRC}} ${CUT_DATA_SETS} > ${LOCAL_TRAIN_SRC}.${LANGPAIR}.src
endif
-${ZCAT} ${wildcard ${CLEAN_TRAIN_TRG}} ${CUT_DATA_SETS} > ${LOCAL_TRAIN_TRG}.${LANGPAIR}.trg
+@${ZCAT} ${wildcard ${CLEAN_TRAIN_TRG}} ${CUT_DATA_SETS} > ${LOCAL_TRAIN_TRG}.${LANGPAIR}.trg
######################################
# SHUFFLE_DATA is set?
# --> shuffle data for each langpair
# --> do this when FIT_DATA_SIZE is set!
######################################
ifdef SHUFFLE_DATA
-paste ${LOCAL_TRAIN_SRC}.${LANGPAIR}.src ${LOCAL_TRAIN_TRG}.${LANGPAIR}.trg |\
+@echo "shuffle training data"
+@paste ${LOCAL_TRAIN_SRC}.${LANGPAIR}.src ${LOCAL_TRAIN_TRG}.${LANGPAIR}.trg |\
${SHUFFLE} > ${LOCAL_TRAIN_SRC}.shuffled
-cut -f1 ${LOCAL_TRAIN_SRC}.shuffled > ${LOCAL_TRAIN_SRC}.${LANGPAIR}.src
-cut -f2 ${LOCAL_TRAIN_SRC}.shuffled > ${LOCAL_TRAIN_TRG}.${LANGPAIR}.trg
-rm -f ${LOCAL_TRAIN_SRC}.shuffled
+@cut -f1 ${LOCAL_TRAIN_SRC}.shuffled > ${LOCAL_TRAIN_SRC}.${LANGPAIR}.src
+@cut -f2 ${LOCAL_TRAIN_SRC}.shuffled > ${LOCAL_TRAIN_TRG}.${LANGPAIR}.trg
+@rm -f ${LOCAL_TRAIN_SRC}.shuffled
endif
######################################
# FIT_DATA_SIZE is set?
# --> fit data to specific size
# --> under/over sampling!
######################################
echo -n "* ${SRC}-${TRG}: total size = " >> ${dir ${LOCAL_TRAIN_SRC}}README.md
@echo -n "* ${SRC}-${TRG}: total size = " >> ${dir ${LOCAL_TRAIN_SRC}}README.md
ifdef FIT_DATA_SIZE
scripts/fit-data-size.pl -m ${MAX_OVER_SAMPLING} ${FIT_DATA_SIZE} \
@echo "sample data to fit size = ${FIT_DATA_SIZE}"
@scripts/fit-data-size.pl -m ${MAX_OVER_SAMPLING} ${FIT_DATA_SIZE} \
${LOCAL_TRAIN_SRC}.${LANGPAIR}.src | wc -l >> ${dir ${LOCAL_TRAIN_SRC}}README.md
scripts/fit-data-size.pl -m ${MAX_OVER_SAMPLING} ${FIT_DATA_SIZE} \
@scripts/fit-data-size.pl -m ${MAX_OVER_SAMPLING} ${FIT_DATA_SIZE} \
${LOCAL_TRAIN_SRC}.${LANGPAIR}.src >> ${LOCAL_TRAIN_SRC}
scripts/fit-data-size.pl -m ${MAX_OVER_SAMPLING} ${FIT_DATA_SIZE} \
@scripts/fit-data-size.pl -m ${MAX_OVER_SAMPLING} ${FIT_DATA_SIZE} \
${LOCAL_TRAIN_TRG}.${LANGPAIR}.trg >> ${LOCAL_TRAIN_TRG}
else
cat ${LOCAL_TRAIN_SRC}.${LANGPAIR}.src | wc -l >> ${dir ${LOCAL_TRAIN_SRC}}README.md
cat ${LOCAL_TRAIN_SRC}.${LANGPAIR}.src >> ${LOCAL_TRAIN_SRC}
cat ${LOCAL_TRAIN_TRG}.${LANGPAIR}.trg >> ${LOCAL_TRAIN_TRG}
@cat ${LOCAL_TRAIN_SRC}.${LANGPAIR}.src | wc -l >> ${dir ${LOCAL_TRAIN_SRC}}README.md
@cat ${LOCAL_TRAIN_SRC}.${LANGPAIR}.src >> ${LOCAL_TRAIN_SRC}
@cat ${LOCAL_TRAIN_TRG}.${LANGPAIR}.trg >> ${LOCAL_TRAIN_TRG}
endif
rm -f ${LOCAL_TRAIN_SRC}.${LANGPAIR}.src ${LOCAL_TRAIN_TRG}.${LANGPAIR}.trg
@rm -f ${LOCAL_TRAIN_SRC}.${LANGPAIR}.src ${LOCAL_TRAIN_TRG}.${LANGPAIR}.trg
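Stripped of the Makefile plumbing, the shuffling and size-fitting above reduce to this shell pattern (a sketch: the `-m` cap and target size are placeholder values, and `fit-data-size.pl` presumably samples deterministically so that running it on both sides keeps the bitext parallel):
```
# shuffle source and target together so line pairs stay aligned
paste train.src train.trg | shuf > shuffled
cut -f1 shuffled > train.src
cut -f2 shuffled > train.trg
rm -f shuffled

# over/under-sample to a fixed number of lines (FIT_DATA_SIZE)
scripts/fit-data-size.pl -m 50 1000000 train.src >> all.src
scripts/fit-data-size.pl -m 50 1000000 train.trg >> all.trg
```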
@@ -468,14 +470,16 @@ ${DEV_SRC}: %: %.shuffled.gz
## ---> make sure that we do not have any overlap between the two data sets
## ---> reserve at least DEVMINSIZE data for dev data and keep the rest for testing
ifeq (${DEVSET},${TESTSET})
-if (( `${GZIP} -cd < $< | wc -l` < $$((${DEVSIZE} + ${TESTSIZE})) )); then \
+@if (( `${GZIP} -cd < $< | wc -l` < $$((${DEVSIZE} + ${TESTSIZE})) )); then \
if (( `${GZIP} -cd < $< | wc -l` < $$((${DEVSMALLSIZE} + ${DEVMINSIZE})) )); then \
+echo "extract ${DEVMINSIZE} examples from ${DEVSET} for dev and test"; \
${GZIP} -cd < $< | cut -f1 | head -${DEVMINSIZE} > ${DEV_SRC}; \
${GZIP} -cd < $< | cut -f2 | head -${DEVMINSIZE} > ${DEV_TRG}; \
mkdir -p ${dir ${TEST_SRC}}; \
${GZIP} -cd < $< | cut -f1 | tail -n +$$((${DEVMINSIZE} + 1)) > ${TEST_SRC}; \
${GZIP} -cd < $< | cut -f2 | tail -n +$$((${DEVMINSIZE} + 1)) > ${TEST_TRG}; \
else \
+echo "extract ${DEVSMALLSIZE} examples from ${DEVSET} for dev and test"; \
${GZIP} -cd < $< | cut -f1 | head -${DEVSMALLSIZE} > ${DEV_SRC}; \
${GZIP} -cd < $< | cut -f2 | head -${DEVSMALLSIZE} > ${DEV_TRG}; \
mkdir -p ${dir ${TEST_SRC}}; \
@@ -483,34 +487,37 @@ ifeq (${DEVSET},${TESTSET})
${GZIP} -cd < $< | cut -f2 | tail -n +$$((${DEVSMALLSIZE} + 1)) > ${TEST_TRG}; \
fi; \
else \
+echo "extract ${DEVSIZE} examples from ${DEVSET} for dev"; \
+echo "extract ${TESTSIZE} examples from ${DEVSET} for test"; \
${GZIP} -cd < $< | cut -f1 | head -${DEVSIZE} > ${DEV_SRC}; \
${GZIP} -cd < $< | cut -f2 | head -${DEVSIZE} > ${DEV_TRG}; \
mkdir -p ${dir ${TEST_SRC}}; \
${GZIP} -cd < $< | cut -f1 | head -$$((${DEVSIZE} + ${TESTSIZE})) | tail -${TESTSIZE} > ${TEST_SRC}; \
${GZIP} -cd < $< | cut -f2 | head -$$((${DEVSIZE} + ${TESTSIZE})) | tail -${TESTSIZE} > ${TEST_TRG}; \
-${GZIP} -cd < $< | cut -f1 | tail -n +$$((${DEVSIZE} + ${TESTSIZE})) | ${GZIP} -c > ${DEV_SRC}.notused.gz; \
-${GZIP} -cd < $< | cut -f2 | tail -n +$$((${DEVSIZE} + ${TESTSIZE})) | ${GZIP} -c > ${DEV_TRG}.notused.gz; \
+${GZIP} -cd < $< | cut -f1 | tail -n +$$((${DEVSIZE} + ${TESTSIZE} + 1)) | ${GZIP} -c > ${DEV_SRC}.notused.gz; \
+${GZIP} -cd < $< | cut -f2 | tail -n +$$((${DEVSIZE} + ${TESTSIZE} + 1)) | ${GZIP} -c > ${DEV_TRG}.notused.gz; \
fi
else
-${GZIP} -cd < $< | cut -f1 | head -${DEVSIZE} > ${DEV_SRC}
-${GZIP} -cd < $< | cut -f2 | head -${DEVSIZE} > ${DEV_TRG}
-${GZIP} -cd < $< | cut -f1 | tail -n +$$((${DEVSIZE} + 1)) | ${GZIP} -c > ${DEV_SRC}.notused.gz
-${GZIP} -cd < $< | cut -f2 | tail -n +$$((${DEVSIZE} + 1)) | ${GZIP} -c > ${DEV_TRG}.notused.gz
+@echo "extract ${DEVSIZE} examples from ${DEVSET} for dev"
+@${GZIP} -cd < $< | cut -f1 | head -${DEVSIZE} > ${DEV_SRC}
+@${GZIP} -cd < $< | cut -f2 | head -${DEVSIZE} > ${DEV_TRG}
+@${GZIP} -cd < $< | cut -f1 | tail -n +$$((${DEVSIZE} + 1)) | ${GZIP} -c > ${DEV_SRC}.notused.gz
+@${GZIP} -cd < $< | cut -f2 | tail -n +$$((${DEVSIZE} + 1)) | ${GZIP} -c > ${DEV_TRG}.notused.gz
endif
-echo "" >> ${dir ${DEV_SRC}}/README.md
-echo -n "* devset = top " >> ${dir ${DEV_SRC}}/README.md
-wc -l < ${DEV_SRC} | tr "\n" ' ' >> ${dir ${DEV_SRC}}/README.md
-echo " lines of ${notdir $@}.shuffled!" >> ${dir ${DEV_SRC}}/README.md
+@echo "" >> ${dir ${DEV_SRC}}/README.md
+@echo -n "* devset = top " >> ${dir ${DEV_SRC}}/README.md
+@wc -l < ${DEV_SRC} | tr "\n" ' ' >> ${dir ${DEV_SRC}}/README.md
+@echo " lines of ${notdir $@}.shuffled!" >> ${dir ${DEV_SRC}}/README.md
ifeq (${DEVSET},${TESTSET})
-echo -n "* testset = next " >> ${dir ${DEV_SRC}}/README.md
-wc -l < ${TEST_SRC} | tr "\n" ' ' >> ${dir ${DEV_SRC}}/README.md
-echo " lines of ${notdir $@}.shuffled!" >> ${dir ${DEV_SRC}}/README.md
-echo "* remaining lines are added to traindata" >> ${dir ${DEV_SRC}}/README.md
-echo "# Test data" > ${dir ${TEST_SRC}}/README.md
-echo "" >> ${dir ${TEST_SRC}}/README.md
-echo -n "testset = next " >> ${dir ${TEST_SRC}}/README.md
-wc -l < ${TEST_SRC} | tr "\n" ' ' >> ${dir ${TEST_SRC}}/README.md
-echo " lines of ../val/${notdir $@}.shuffled!" >> ${dir ${TEST_SRC}}/README.md
+@echo -n "* testset = next " >> ${dir ${DEV_SRC}}/README.md
+@wc -l < ${TEST_SRC} | tr "\n" ' ' >> ${dir ${DEV_SRC}}/README.md
+@echo " lines of ${notdir $@}.shuffled!" >> ${dir ${DEV_SRC}}/README.md
+@echo "* remaining lines are added to traindata" >> ${dir ${DEV_SRC}}/README.md
+@echo "# Test data" > ${dir ${TEST_SRC}}/README.md
+@echo "" >> ${dir ${TEST_SRC}}/README.md
+@echo -n "testset = next " >> ${dir ${TEST_SRC}}/README.md
+@wc -l < ${TEST_SRC} | tr "\n" ' ' >> ${dir ${TEST_SRC}}/README.md
+@echo " lines of ../val/${notdir $@}.shuffled!" >> ${dir ${TEST_SRC}}/README.md
endif
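The `+ 1` added to the `tail` offsets in this hunk fixes an off-by-one: `tail -n +K` starts printing at line K, so skipping the first N lines that went into dev and test requires starting at N+1. A quick demonstration:
```
# tail -n +K starts AT line K, so K = N + 1 skips exactly N lines
seq 5 | tail -n +3   # prints 3 4 5 (skips only two lines)
seq 5 | tail -n +4   # prints 4 5   (skips the first three lines)
```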
@@ -519,18 +526,19 @@ ${DEV_TRG}: ${DEV_SRC}
add-to-dev-data: ${CLEAN_DEV_SRC} ${CLEAN_DEV_TRG}
-mkdir -p ${dir ${DEV_SRC}}
-echo -n "* ${LANGPAIR}: ${DEVSET}, " >> ${dir ${DEV_SRC}}README.md
-${ZCAT} ${CLEAN_DEV_SRC} | wc -l >> ${dir ${DEV_SRC}}README.md
+@echo "add to devset: ${CLEAN_DEV_SRC}"
+@mkdir -p ${dir ${DEV_SRC}}
+@echo -n "* ${LANGPAIR}: ${DEVSET}, " >> ${dir ${DEV_SRC}}README.md
+@${ZCAT} ${CLEAN_DEV_SRC} | wc -l >> ${dir ${DEV_SRC}}README.md
ifeq (${USE_TARGET_LABELS},1)
-echo "more than one target language";
-${ZCAT} ${CLEAN_DEV_SRC} |\
+@echo "more than one target language";
+@${ZCAT} ${CLEAN_DEV_SRC} |\
sed "s/^/>>${TRG}<< /" >> ${DEV_SRC}
else
-echo "only one target language"
-${ZCAT} ${CLEAN_DEV_SRC} >> ${DEV_SRC}
+@echo "only one target language"
+@${ZCAT} ${CLEAN_DEV_SRC} >> ${DEV_SRC}
endif
-${ZCAT} ${CLEAN_DEV_TRG} >> ${DEV_TRG}
+@${ZCAT} ${CLEAN_DEV_TRG} >> ${DEV_TRG}
####################
@@ -590,16 +598,17 @@ ${TEST_TRG}: ${TEST_SRC}
@echo "done!"
add-to-test-data: ${CLEAN_TEST_SRC}
-echo "* ${LANGPAIR}: ${TESTSET}" >> ${dir ${TEST_SRC}}README.md
+@echo "add to testset: ${CLEAN_TEST_SRC}"
+@echo "* ${LANGPAIR}: ${TESTSET}" >> ${dir ${TEST_SRC}}README.md
ifeq (${USE_TARGET_LABELS},1)
-echo "more than one target language";
-${ZCAT} ${CLEAN_TEST_SRC} |\
+@echo "more than one target language";
+@${ZCAT} ${CLEAN_TEST_SRC} |\
sed "s/^/>>${TRG}<< /" >> ${TEST_SRC}
else
-echo "only one target language"
-${ZCAT} ${CLEAN_TEST_SRC} >> ${TEST_SRC}
+@echo "only one target language"
+@${ZCAT} ${CLEAN_TEST_SRC} >> ${TEST_SRC}
endif
-${ZCAT} ${CLEAN_TEST_TRG} >> ${TEST_TRG}
+@${ZCAT} ${CLEAN_TEST_TRG} >> ${TEST_TRG}
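The `>>${TRG}<<` labels seen in these recipes mark the desired target language on the source side, which is how one multilingual model can serve several translation directions; the tagging itself is just a `sed` prefix:
```
# prefix each source sentence with the target-language token (br->en example)
echo "Demat d'an holl" | sed "s/^/>>en<< /"
# output: >>en<< Demat d'an holl
```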
@@ -614,6 +623,10 @@ endif
## monolingual data: for language-specific sentence piece models
## that are independent of bitexts
## TODO: do we use this?
${LOCAL_MONO_DATA}.raw:
mkdir -p ${dir $@}
rm -f $@

View File

@@ -102,6 +102,7 @@ else ifneq ($(wildcard /wrk/tiedeman/research),)
LOADMODS = ${LOADGPU}
else ifeq (${shell hostname --domain 2>/dev/null},bullx)
CSCPROJECT = project_2002688
+# CSCPROJECT = project_2002982
WORKHOME = ${shell realpath ${PWD}/work}
APPLHOME = /projappl/project_2001194
OPUSHOME = /projappl/nlpl/data/OPUS

View File

@@ -7,6 +7,16 @@ MEMAD_LANGS = de en fi fr nl sv
# models for the MeMAD project
#-------------------------------------------------------------------
+# FIT_DATA_SIZE=2000000
+memad-multi-subs:
+${MAKE} SRCLANGS="${MEMAD_LANGS}" TRGLANGS="${MEMAD_LANGS}" \
+SKIP_LANGPAIRS="de-de|en-en|fi-fi|fr-fr|nl-nl|sv-sv" \
+DEVSET=OpenSubtitles TRAINSET= MODELTYPE=transformer data
+${MAKE} SRCLANGS="${MEMAD_LANGS}" TRGLANGS="${MEMAD_LANGS}" \
+SKIP_LANGPAIRS="de-de|en-en|fi-fi|fr-fr|nl-nl|sv-sv" \
+DEVSET=OpenSubtitles TRAINSET= MODELTYPE=transformer \
+WALLTIME=72 HPC_MEM=8g HPC_CORES=1 train.submit-multigpu
+memad-multi-train:
memad-multi-train:
${MAKE} SRCLANGS="${MEMAD_LANGS}" TRGLANGS="${MEMAD_LANGS}" MODELTYPE=transformer data