a bit more info added for data sets

This commit is contained in:
Joerg Tiedemann 2020-05-09 22:33:33 +03:00
parent c98cc9bf26
commit e4455e510a
6 changed files with 168 additions and 198 deletions

View File

@ -11,7 +11,8 @@
# make translate .......... translate test set
# make eval ............... evaluate
#
# make train-job .......... create data and submit training job
# make all-job ............ create config, data and submit training job
# make train-job .......... submit training job
#
#--------------------------------------------------------------------
# general parameters / variables (see lib/config.mk)
@ -197,11 +198,19 @@ all: ${WORKDIR}/config.mk
${MAKE} eval
${MAKE} compare
.PHONY: train-job
train-job: ${WORKDIR}/config.mk
.PHONY: all-job
all-job: ${WORKDIR}/config.mk
${MAKE} data
${MAKE} train-and-eval-job
.PHONY: train-job
train-job:
${MAKE} HPC_CORES=1 HPC_MEM=${GPUJOB_HPC_MEM} train.submit${GPUJOB_SUBMIT}
.PHONY: train-and-eval-job
train-and-eval-job:
${MAKE} HPC_CORES=1 HPC_MEM=${GPUJOB_HPC_MEM} train-and-eval.submit${GPUJOB_SUBMIT}
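For context, a typical end-to-end run with the new split of targets (a sketch; the language pair is hypothetical and all other variables are left at their defaults):

    # create config.mk, build the data sets, then submit the GPU training job
    make SRCLANGS=gl TRGLANGS=en all-job

    # later: resubmit only the training job (config and data already exist)
    make SRCLANGS=gl TRGLANGS=en train-job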
#------------------------------------------------------------------------
# make various data sets (and word alignment)
#------------------------------------------------------------------------
@ -235,6 +244,8 @@ wordalign: ${TRAIN_ALG}
## other model types
vocab: ${MODEL_VOCAB}
train: ${WORKDIR}/${MODEL}.${MODELTYPE}.model${NR}.done
train-and-eval: ${WORKDIR}/${MODEL}.${MODELTYPE}.model${NR}.done
${MAKE} ${WORKDIR}/${TESTSET}.${MODEL}${NR}.${MODELTYPE}.${SRC}.${TRG}.compare
translate: ${WORKDIR}/${TESTSET}.${MODEL}${NR}.${MODELTYPE}.${SRC}.${TRG}
eval: ${WORKDIR}/${TESTSET}.${MODEL}${NR}.${MODELTYPE}.${SRC}.${TRG}.eval
compare: ${WORKDIR}/${TESTSET}.${MODEL}${NR}.${MODELTYPE}.${SRC}.${TRG}.compare

View File

@ -145,25 +145,25 @@ endif
## in case we want to use some additional data sets
EXTRA_TRAINSET =
# EXTRA_TRAINSET =
## defaults: TESTSET = DEVSET; TRAINSET = all OPUS corpora minus WMT-News, DEVSET and TESTSET
TESTSET = ${DEVSET}
TRAINSET = $(filter-out WMT-News MPC1 ${DEVSET} ${TESTSET},${OPUSCORPORA} ${EXTRA_TRAINSET})
TUNESET = OpenSubtitles
MONOSET = $(filter-out WMT-News MPC1 ${DEVSET} ${TESTSET},${OPUSMONOCORPORA} ${EXTRA_TRAINSET})
TESTSET ?= ${DEVSET}
TRAINSET ?= $(filter-out WMT-News MPC1 ${DEVSET} ${TESTSET},${OPUSCORPORA} ${EXTRA_TRAINSET})
TUNESET ?= OpenSubtitles
MONOSET ?= $(filter-out WMT-News MPC1 ${DEVSET} ${TESTSET},${OPUSMONOCORPORA} ${EXTRA_TRAINSET})
## 1 = use remaining data from dev/test data for training
USE_REST_DEVDATA = 1
USE_REST_DEVDATA ?= 1
##----------------------------------------------------------------------------
## pre-processing and vocabulary
##----------------------------------------------------------------------------
BPESIZE = 32000
SRCBPESIZE = ${BPESIZE}
TRGBPESIZE = ${BPESIZE}
BPESIZE ?= 32000
SRCBPESIZE ?= ${BPESIZE}
TRGBPESIZE ?= ${BPESIZE}
VOCABSIZE ?= $$((${SRCBPESIZE} + ${TRGBPESIZE} + 1000))
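The change from '=' to '?=' is what makes the generated ${WORKDIR}/config.mk effective: '?=' assigns only if the variable is still undefined, so a value from an earlier include, the environment or the command line wins over these defaults. A minimal sketch with hypothetical values:

    # read first, e.g. the generated work/gl-en/config.mk
    BPESIZE = 12000

    # defaults read afterwards (as in lib/config.mk)
    BPESIZE ?= 32000        # skipped: BPESIZE is already set above
    TESTSET ?= ${DEVSET}    # applied: TESTSET was never defined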
@ -225,15 +225,32 @@ LOCAL_TRAIN_SRC = ${TMPDIR}/${LANGPAIRSTR}/train/${DATASET}.src
LOCAL_TRAIN_TRG = ${TMPDIR}/${LANGPAIRSTR}/train/${DATASET}.trg
LOCAL_MONO_DATA = ${TMPDIR}/${LANGSTR}/train/${DATASET}.mono
TUNE_SRC = ${WORKDIR}/tune/${TUNESET}.src
TUNE_TRG = ${WORKDIR}/tune/${TUNESET}.trg
DEV_SRC = ${WORKDIR}/val/${DEVSET}.src
DEV_TRG = ${WORKDIR}/val/${DEVSET}.trg
## dev and test data come from one specific data set
## if we have a bilingual model
ifeq (${words ${SRCLANGS}},1)
ifeq (${words ${TRGLANGS}},1)
DEV_SRC = ${WORKDIR}/val/${DEVSET}.src
DEV_TRG = ${WORKDIR}/val/${DEVSET}.trg
TEST_SRC = ${WORKDIR}/test/${TESTSET}.src
TEST_TRG = ${WORKDIR}/test/${TESTSET}.trg
endif
endif
## otherwise we give them a generic name
DEV_SRC ?= ${WORKDIR}/val/${DATASET}-dev.src
DEV_TRG ?= ${WORKDIR}/val/${DATASET}-dev.trg
TEST_SRC ?= ${WORKDIR}/test/${DATASET}-test.src
TEST_TRG ?= ${WORKDIR}/test/${DATASET}-test.trg
TEST_SRC = ${WORKDIR}/test/${TESTSET}.src
TEST_TRG = ${WORKDIR}/test/${TESTSET}.trg
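In effect, a bilingual model keeps set-specific dev/test file names, while multilingual models fall back to generic ${DATASET}-based names; roughly (paths hypothetical, assuming DATASET=opus):

    # SRCLANGS=gl TRGLANGS=en (one language on each side):
    #   DEV_SRC  = ${WORKDIR}/val/${DEVSET}.src
    #   TEST_SRC = ${WORKDIR}/test/${TESTSET}.src
    # SRCLANGS="gl pt" TRGLANGS=en (multilingual):
    #   DEV_SRC  = ${WORKDIR}/val/opus-dev.src
    #   TEST_SRC = ${WORKDIR}/test/opus-test.src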
## heldout data directory (keep one set per data set)
@ -326,6 +343,7 @@ endif
## make some data size-specific configuration parameters
## TODO: is it OK to delete LOCAL_TRAIN data?
local-config: ${WORKDIR}/config.mk
@ -336,25 +354,26 @@ ${WORKDIR}/config.mk:
else \
${MAKE} ${LOCAL_TRAIN_SRC}; \
s=`head -10000001 ${LOCAL_TRAIN_SRC} | wc -l`; \
rm -f ${LOCAL_TRAIN_SRC} ${LOCAL_TRAIN_TRG}; \
fi; \
if [ $$s -gt 10000000 ]; then \
echo "# ${LANGPAIRSTR} bigger than 10 million" > $@; \
echo "# ${LANGPAIRSTR} training data bigger than 10 million" > $@; \
echo "GPUJOB_HPC_MEM = 8g" >> $@; \
echo "GPUJOB_SUBMIT = -multipu" >> $@; \
elif [ $$s -gt 1000000 ]; then \
echo "# ${LANGPAIRSTR} bigger than 1 million" > $@; \
echo "# ${LANGPAIRSTR} training data bigger than 1 million" > $@; \
echo "GPUJOB_HPC_MEM = 8g" >> $@; \
echo "GPUJOB_SUBMIT = " >> $@; \
echo "MARIAN_VALID_FREQ = 2500" >> $@; \
elif [ $$s -gt 500000 ]; then \
echo "# ${LANGPAIRSTR} bigger than 500k" > $@; \
echo "# ${LANGPAIRSTR} training data bigger than 500k" > $@; \
echo "GPUJOB_HPC_MEM = 4g" >> $@; \
echo "GPUJOB_SUBMIT = " >> $@; \
echo "MARIAN_VALID_FREQ = 2500" >> $@; \
echo "MARIAN_WORKSPACE = 10000" >> $@; \
echo "BPESIZE = 12000" >> $@; \
elif [ $$s -gt 100000 ]; then \
echo "# ${LANGPAIRSTR} bigger than 100k" > $@; \
echo "# ${LANGPAIRSTR} training data bigger than 100k" > $@; \
echo "GPUJOB_HPC_MEM = 4g" >> $@; \
echo "GPUJOB_SUBMIT = " >> $@; \
echo "MARIAN_VALID_FREQ = 1000" >> $@; \
@ -366,7 +385,7 @@ ${WORKDIR}/config.mk:
echo "TESTSIZE = 1000" >> $@; \
echo "DEVMINSIZE = 250" >> $@; \
elif [ $$s -gt 10000 ]; then \
echo "# ${LANGPAIRSTR} bigger than 10k" > $@; \
echo "# ${LANGPAIRSTR} training data less than 100k" > $@; \
echo "GPUJOB_HPC_MEM = 4g" >> $@; \
echo "GPUJOB_SUBMIT = " >> $@; \
echo "MARIAN_VALID_FREQ = 1000" >> $@; \

View File

@ -83,8 +83,6 @@ DATA_TRG := ${sort ${CLEAN_TRAIN_TRG} ${CLEAN_TUNE_TRG} ${CLEAN_DEV_TRG} ${CLEAN
REV_LANGSTR = ${subst ${SPACE},+,$(TRGLANGS)}-${subst ${SPACE},+,$(SRCLANGS)}
REV_WORKDIR = ${WORKHOME}/${REV_LANGSTR}
reverse-data:
ifeq (${PRE_SRC},${PRE_TRG})
ifeq (${words ${SRCLANGS}},1)
@ -142,33 +140,6 @@ ifndef NEWMODELTYPE
endif
## TODO: this does not seem to work as the config does not match
## (optimizer cannot continue to run ...)
## move model files to a new name
## (useful when using the model as a starting point for another model type,
## for example, to continue training without guided alignment)
OLDMODEL_BASE = ${WORKDIR}/${MODEL}.${OLDMODELTYPE}.model${NR}
NEWMODEL_BASE = ${WORKDIR}/${MODEL}.${NEWMODELTYPE}.model${NR}
move-model:
ifeq (${wildcard ${NEWMODEL_BASE}.npz},)
cp ${OLDMODEL_BASE}.npz ${NEWMODEL_BASE}.npz
cp ${OLDMODEL_BASE}.npz.best-perplexity.npz ${NEWMODEL_BASE}.npz.best-perplexity.npz
cp ${OLDMODEL_BASE}.npz.optimizer.npz ${NEWMODEL_BASE}.npz.optimizer.npz
cp ${OLDMODEL_BASE}.npz.orig.npz ${NEWMODEL_BASE}.npz.orig.npz
cp ${OLDMODEL_BASE}.npz.progress.yml ${NEWMODEL_BASE}.npz.progress.yml
cp ${OLDMODEL_BASE}.npz.yml ${NEWMODEL_BASE}.npz.yml
sed 's/${OLDMODELTYPE}/${NEWMODELTYPE}/' \
< ${OLDMODEL_BASE}.npz.decoder.yml \
> ${NEWMODEL_BASE}.npz.decoder.yml
sed 's/${OLDMODELTYPE}/${NEWMODELTYPE}/' \
< ${OLDMODEL_BASE}.npz.best-perplexity.npz.decoder.yml \
> ${NEWMODEL_BASE}.npz.best-perplexity.npz.decoder.yml
else
@echo "new model ${NEWMODEL_BASE}.npz exists already!"
endif
clean-data:
for s in ${SRCLANGS}; do \
@ -236,30 +207,13 @@ ${TRAIN_ALG}: ${TRAIN_SRC}.clean.${PRE_SRC}${TRAINSIZE}.gz \
rm -f ${LOCAL_TRAIN_SRC}.algtmp ${LOCAL_TRAIN_TRG}.algtmp
## old way of word alignment with all the data in one process
## --> this may take a long time for very large corpora
## --> may also take a lot of memory (split instead, see above)
# ${TRAIN_ALG}: ${TRAIN_SRC}.${PRE_SRC}${TRAINSIZE}.gz \
# ${TRAIN_TRG}.${PRE_TRG}${TRAINSIZE}.gz
# ${MAKE} ${LOCAL_TRAIN_SRC}.algtmp ${LOCAL_TRAIN_TRG}.algtmp
# if [ `head $(LOCAL_TRAIN_SRC).algtmp | wc -l` -gt 0 ]; then \
# ${WORDALIGN} -s $(LOCAL_TRAIN_SRC).algtmp -t $(LOCAL_TRAIN_TRG).algtmp \
# --overwrite -f $(LOCAL_TRAIN_SRC).fwd -r $(LOCAL_TRAIN_TRG).rev; \
# ${ATOOLS} -c grow-diag-final -i $(LOCAL_TRAIN_SRC).fwd -j $(LOCAL_TRAIN_TRG).rev |\
# gzip -c > $@; \
# fi
# rm -f ${LOCAL_TRAIN_SRC}.algtmp ${LOCAL_TRAIN_TRG}.algtmp
# rm -f $(LOCAL_TRAIN_SRC).fwd $(LOCAL_TRAIN_TRG).rev
## copy OPUS data
## (check that the OPUS file really exists! if not, create an empty file)
##
## TODO: should e read all data from scratch using opus_read?
## TODO: should we read all data from scratch using opus_read?
## - also: langid filtering and link prob filtering?
%.${SRCEXT}.raw:
@ -298,40 +252,6 @@ ${TRAIN_ALG}: ${TRAIN_SRC}.clean.${PRE_SRC}${TRAINSIZE}.gz \
## - should we apply some other cleanup scripts here to get rid of some messy stuff?
# ## this is too strict for non-latin languages
# # grep -i '[a-zäöå0-9]' |\
## OLD:
##
# %.clean.${SRCEXT}.gz: %.${SRCEXT}.${PRE} %.${TRGEXT}.${PRE}
# rm -f $@.${SRCEXT} $@.${TRGEXT}
# ln -s ${word 1,$^} $@.${SRCEXT}
# ln -s ${word 2,$^} $@.${TRGEXT}
# $(MOSESSCRIPTS)/training/clean-corpus-n.perl $@ $(SRCEXT) $(TRGEXT) ${@:.${SRCEXT}.gz=} 0 100
# rm -f $@.${SRCEXT} $@.${TRGEXT}
# paste ${@:.gz=} ${@:.${SRCEXT}.gz=.${TRGEXT}} |\
# perl -CS -pe 'tr[\x{9}\x{A}\x{D}\x{20}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}][]cd;' > $@.tmp
# rm -f ${@:.gz=} ${@:.${SRCEXT}.gz=.${TRGEXT}}
# cut -f1 $@.tmp | gzip -c > $@
# cut -f2 $@.tmp | gzip -c > ${@:.${SRCEXT}.gz=.${TRGEXT}.gz}
# rm -f $@.tmp
# %.clean.${TRGEXT}.gz: %.clean.${SRCEXT}.gz
# @echo "done!"
# %.clean.${SRCEXT}.gz: %.${SRCEXT}.${PRE} %.${TRGEXT}.${PRE}
# cat $< |\
# perl -CS -pe 'tr[\x{9}\x{A}\x{D}\x{20}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}][]cd;' |\
# gzip -c > $@
# %.clean.${TRGEXT}.gz: %.${TRGEXT}.${PRE}
# cat $< |\
# perl -CS -pe 'tr[\x{9}\x{A}\x{D}\x{20}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}][]cd;' |\
# gzip -c > $@
%.clean.${SRCEXT}.gz: %.${SRCEXT}.${PRE} %.${TRGEXT}.${PRE}
@ -347,11 +267,6 @@ ${TRAIN_ALG}: ${TRAIN_SRC}.clean.${PRE_SRC}${TRAINSIZE}.gz \
cut -f2 $@.bitext | gzip -c > $(@:.clean.${SRCEXT}.gz=.clean.${TRGEXT}.gz)
rm -f $@.bitext $@.1 $@.2
# paste $@.${SRCEXT} $@.${TRGEXT} |\
# python3 bitext-match-lang.py -s ${SRC} -t ${TRG} > $@.bitext
# cut -f1 $@.bitext > $@
# cut -f2 $@.bitext > $(@:.src.clean.${PRE_SRC}=.trg.clean.${PRE_TRG})
%.clean.${TRGEXT}.gz: %.clean.${SRCEXT}.gz
@echo "done!"
@ -361,6 +276,9 @@ ${TRAIN_ALG}: ${TRAIN_SRC}.clean.${PRE_SRC}${TRAINSIZE}.gz \
${LOCAL_TRAIN_SRC}: ${DEV_SRC} ${DEV_TRG}
mkdir -p ${dir $@}
rm -f ${LOCAL_TRAIN_SRC} ${LOCAL_TRAIN_TRG}
echo "" > ${dir $@}README.md
echo "# ${notdir ${TRAIN_BASE}}" >> ${dir $@}README.md
echo "" >> ${dir $@}README.md
-for s in ${SRCLANGS}; do \
for t in ${TRGLANGS}; do \
if [ ! `echo "$$s-$$t $$t-$$s" | egrep '${SKIP_LANGPAIRS}' | wc -l` -gt 0 ]; then \
@ -378,14 +296,13 @@ ${LOCAL_TRAIN_SRC}: ${DEV_SRC} ${DEV_TRG}
done
ifeq (${USE_REST_DEVDATA},1)
if [ -e ${DEV_SRC}.notused.gz ]; then \
echo "* unused dev/test data is added to training data" >> ${dir $@}README.md; \
zcat ${DEV_SRC}.notused.gz >> ${LOCAL_TRAIN_SRC}; \
zcat ${DEV_TRG}.notused.gz >> ${LOCAL_TRAIN_TRG}; \
fi
endif
# ${MAKE} DATASET=${DATASET} SRC:=$$s TRG:=$$t add-to-local-train-data; \
${LOCAL_TRAIN_TRG}: ${LOCAL_TRAIN_SRC}
@echo "done!"
@ -396,7 +313,7 @@ add-to-local-train-data: ${CLEAN_TRAIN_SRC} ${CLEAN_TRAIN_TRG}
echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"; \
echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"; \
echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"; \
echo "source and target are not of same lengt!"; \
echo "source and target are not of same length!"; \
echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"; \
echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"; \
echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"; \
@ -404,7 +321,11 @@ add-to-local-train-data: ${CLEAN_TRAIN_SRC} ${CLEAN_TRAIN_TRG}
echo ${CLEAN_TRAIN_TRG}; \
fi
ifneq (${CLEAN_TRAIN_SRC},)
echo "${CLEAN_TRAIN_SRC}" >> ${dir ${LOCAL_TRAIN_SRC}}/README
echo "* ${LANGPAIR}: ${TRAINSET}" >> ${dir ${LOCAL_TRAIN_SRC}}README.md
ifeq (${USE_BACKTRANS},1)
echo "* ${LANGPAIR} backtranslations: ${basename ${basename ${dir ${BACKTRANS_SRC}}}}" \
>> ${dir ${LOCAL_TRAIN_SRC}}README.md
endif
ifneq (${words ${TRGLANGS}},1)
echo "more than one target language";
zcat ${CLEAN_TRAIN_SRC} |\
@ -423,7 +344,11 @@ endif
## extract training data but keep some heldout data for each dataset
add-to-local-train-and-heldout-data: ${CLEAN_TRAIN_SRC} ${CLEAN_TRAIN_TRG}
ifneq (${CLEAN_TRAIN_SRC},)
echo "${CLEAN_TRAIN_SRC}" >> ${dir ${LOCAL_TRAIN_SRC}}/README
echo "* ${LANGPAIR}: ${TRAINSET}" >> ${dir ${LOCAL_TRAIN_SRC}}README.md
ifeq (${USE_BACKTRANS},1)
echo "* ${LANGPAIR} backtranslations: ${basename ${basename ${BACKTRANS_SRC}}}" \
>> ${dir ${LOCAL_TRAIN_SRC}}README.md
endif
mkdir -p ${HELDOUT_DIR}/${SRC}-${TRG}
ifneq (${words ${TRGLANGS}},1)
echo "more than one target language";
@ -472,6 +397,8 @@ endif
${DEV_SRC}.shuffled.gz:
mkdir -p ${dir $@}
rm -f ${DEV_SRC} ${DEV_TRG}
echo "# Validation data" > ${dir ${DEV_SRC}}/README.md
echo "" >> ${dir ${DEV_SRC}}/README.md
-for s in ${SRCLANGS}; do \
for t in ${TRGLANGS}; do \
if [ ! `echo "$$s-$$t $$t-$$s" | egrep '${SKIP_LANGPAIRS}' | wc -l` -gt 0 ]; then \
@ -524,72 +451,30 @@ else
zcat $@.shuffled.gz | cut -f1 | tail -n +$$((${DEVSIZE} + 1)) | gzip -c > ${DEV_SRC}.notused.gz
zcat $@.shuffled.gz | cut -f2 | tail -n +$$((${DEVSIZE} + 1)) | gzip -c > ${DEV_TRG}.notused.gz
endif
echo -n "devset = top " >> ${dir ${DEV_SRC}}/README
wc -l < ${DEV_SRC} | tr "\n" ' ' >> ${dir ${DEV_SRC}}/README
echo " lines of ${notdir $@}.shuffled!" >> ${dir ${DEV_SRC}}/README
echo "" >> ${dir ${DEV_SRC}}/README.md
echo -n "* devset = top " >> ${dir ${DEV_SRC}}/README.md
wc -l < ${DEV_SRC} | tr "\n" ' ' >> ${dir ${DEV_SRC}}/README.md
echo " lines of ${notdir $@}.shuffled!" >> ${dir ${DEV_SRC}}/README.md
ifeq (${DEVSET},${TESTSET})
echo -n "testset = last " >> ${dir ${TEST_SRC}}/README
wc -l < ${TEST_SRC} | tr "\n" ' ' >> ${dir ${TEST_SRC}}/README
echo " lines of ../val/${notdir $@}.shuffled!" >> ${dir ${TEST_SRC}}/README
echo -n "* testset = next " >> ${dir ${DEV_SRC}}/README.md
wc -l < ${TEST_SRC} | tr "\n" ' ' >> ${dir ${DEV_SRC}}/README.md
echo " lines of ${notdir $@}.shuffled!" >> ${dir ${DEV_SRC}}/README.md
echo "* remaining lines are added to traindata" >> ${dir ${DEV_SRC}}/README.md
echo "# Test data" > ${dir ${TEST_SRC}}/README.md
echo "" >> ${dir ${TEST_SRC}}/README.md
echo -n "testset = next " >> ${dir ${TEST_SRC}}/README.md
wc -l < ${TEST_SRC} | tr "\n" ' ' >> ${dir ${TEST_SRC}}/README.md
echo " lines of ../val/${notdir $@}.shuffled!" >> ${dir ${TEST_SRC}}/README.md
endif
# zcat $@.shuffled.gz | cut -f1 | tail -${TESTSIZE} > ${TEST_SRC}; \
# zcat $@.shuffled.gz | cut -f2 | tail -${TESTSIZE} > ${TEST_TRG}; \
${DEV_TRG}: ${DEV_SRC}
@echo "done!"
### OLD: extract data from training data as dev/test set if the devdata is too small
### ---> this is confusing - skip this
###
### otherwise copy this directly after the target for ${DEV_SRC} above!
### and add dependency on train-data for ${DEV_SRC}.shuffled.gz like this:
### ${DEV_SRC}.shuffled.gz: ${TRAIN_SRC}.${PRE_SRC}.gz ${TRAIN_TRG}.${PRE_TRG}.gz
### and remove dependency on dev-data for ${LOCAL_TRAIN_SRC}, change
### ${LOCAL_TRAIN_SRC}: ${DEV_SRC} ${DEV_TRG} to
### ${LOCAL_TRAIN_SRC}:
#
# if (( `zcat $@.shuffled.gz | wc -l` < $$((${DEVMINSIZE} + ${DEVMINSIZE})) )); then \
# echo "Need more devdata - take some from traindata!"; \
# echo ".......... (1) extract top $$((${DEVSIZE} + ${TESTSIZE})) lines"; \
# echo "Too little dev/test data in ${DEVSET}!" >> ${dir $@}/README; \
# echo "Add top $$((${DEVSIZE} + ${TESTSIZE})) lines from ${DATASET} to dev/test" >> ${dir $@}/README; \
# echo "and remove those lines from training data" >> ${dir $@}/README; \
# zcat ${TRAIN_SRC}.${PRE_SRC}.gz | \
# head -$$((${DEVSIZE} + ${TESTSIZE})) | \
# sed 's/\@\@ //g' > $@.extra.${SRC}; \
# zcat ${TRAIN_TRG}.${PRE_TRG}.gz | \
# head -$$((${DEVSIZE} + ${TESTSIZE})) | \
# sed 's/\@\@ //g' > $@.extra.${TRG}; \
# echo ".......... (2) remaining lines for training"; \
# zcat ${TRAIN_SRC}.${PRE_SRC}.gz | \
# tail -n +$$((${DEVSIZE} + ${TESTSIZE} + 1)) | \
# sed 's/\@\@ //g' | gzip -c > $@.remaining.${SRC}.gz; \
# zcat ${TRAIN_TRG}.${PRE_TRG}.gz | \
# tail -n +$$((${DEVSIZE} + ${TESTSIZE} + 1)) | \
# sed 's/\@\@ //g' | gzip -c > $@.remaining.${TRG}.gz; \
# mv -f $@.remaining.${SRC}.gz ${TRAIN_SRC}.${PRE_SRC}.gz; \
# mv -f $@.remaining.${TRG}.gz ${TRAIN_TRG}.${PRE_TRG}.gz; \
# echo ".......... (3) append to devdata"; \
# mv $@.shuffled.gz $@.oldshuffled.gz; \
# paste $@.extra.${SRC} $@.extra.${TRG} > $@.shuffled; \
# zcat $@.oldshuffled.gz >> $@.shuffled; \
# rm $@.oldshuffled.gz; \
# gzip -f $@.shuffled; \
# rm -f $@.extra.${SRC} $@.extra.${TRG}; \
# fi
add-to-dev-data: ${CLEAN_DEV_SRC} ${CLEAN_DEV_TRG}
ifneq (${CLEAN_DEV_SRC},)
echo "* ${LANGPAIR}: ${DEVSET}" >> ${dir ${DEV_SRC}}README.md
ifneq (${words ${TRGLANGS}},1)
echo "more than one target language";
zcat ${CLEAN_DEV_SRC} |\
@ -616,6 +501,8 @@ ${TEST_SRC}: ${DEV_SRC}
ifneq (${TESTSET},${DEVSET})
mkdir -p ${dir $@}
rm -f ${TEST_SRC} ${TEST_TRG}
echo "# Test data" > ${dir ${TEST_SRC}}/README.md
echo "" >> ${dir ${TEST_SRC}}/README.md
if [ -e ${TESTSET_DIR}/${TESTSET}.${SRCEXT}.${PRE}.gz ]; then \
${MAKE} CLEAN_TEST_SRC=${TESTSET_DIR}/${TESTSET}.${SRCEXT}.${PRE}.gz \
CLEAN_TEST_TRG=${TESTSET_DIR}/${TESTSET}.${TRGEXT}.${PRE}.gz \
@ -634,7 +521,8 @@ ifneq (${TESTSET},${DEVSET})
paste ${TEST_SRC} ${TEST_TRG} | ${SHUFFLE} | gzip -c > $@.shuffled.gz; \
zcat $@.shuffled.gz | cut -f1 | tail -${TESTSIZE} > ${TEST_SRC}; \
zcat $@.shuffled.gz | cut -f2 | tail -${TESTSIZE} > ${TEST_TRG}; \
echo "testset = top ${TESTSIZE} lines of $@.shuffled!" >> ${dir $@}/README; \
echo "" >> ${dir $@}/README.md; \
echo "testset = top ${TESTSIZE} lines of $@.shuffled!" >> ${dir $@}/README.md; \
fi \
fi
else
@ -657,6 +545,7 @@ ${TEST_TRG}: ${TEST_SRC}
add-to-test-data: ${CLEAN_TEST_SRC}
ifneq (${CLEAN_TEST_SRC},)
echo "* ${LANGPAIR}: ${TESTSET}" >> ${dir ${TEST_SRC}}README.md
ifneq (${words ${TRGLANGS}},1)
echo "more than one target language";
zcat ${CLEAN_TEST_SRC} |\
@ -680,11 +569,6 @@ ${TRAIN_TRG}.clean.${PRE_TRG}${TRAINSIZE}.gz: ${TRAIN_TRG}.clean.${PRE_TRG}.gz
endif
# %.clean.gz: %.gz
# mkdir -p ${TMPDIR}/${LANGPAIRSTR}/cleanup
# gzip -cd < $< > ${TMPDIR}/${LANGPAIRSTR}/cleanup/$(notdir $@).${SRCEXT}
########################
# tune data
# TODO: do we use this?
@ -907,13 +791,6 @@ MAX_NR_TOKENS = 250
mv $@.${SRCEXT} $@
mv $@.${TRGEXT} $(@:.src.clean.${PRE_SRC}=.trg.clean.${PRE_TRG})
# paste $@.${SRCEXT} $@.${TRGEXT} |\
# scripts/filter/bitext-match-lang.py -s ${SRC} -t ${TRG} > $@.bitext
# cut -f1 $@.bitext > $@
# cut -f2 $@.bitext > $(@:.src.clean.${PRE_SRC}=.trg.clean.${PRE_TRG})
# rm -f $@.${SRCEXT} $@.${TRGEXT} $@.bitext
%.trg.clean.${PRE_TRG}: %.src.clean.${PRE_SRC}
@echo "done!"
@ -1273,6 +1150,7 @@ endif
${WORKDIR}/%.clean.${PRE_SRC}.gz: ${TMPDIR}/${LANGPAIRSTR}/%.clean.${PRE_SRC}
mkdir -p ${dir $@}
gzip -c < $< > $@
-cat ${dir $<}README.md >> ${dir $@}README.md
ifneq (${PRE_SRC},${PRE_TRG})
${WORKDIR}/%.clean.${PRE_TRG}.gz: ${TMPDIR}/${LANGPAIRSTR}/%.clean.${PRE_TRG}
@ -1282,17 +1160,3 @@ endif
## make symbolic links to spm-models
## (previously we had data-specific models but now we want to re-use existing ones)
fix-spm-models:
cd work-spm; \
for l in ${ALL_LANG_PAIRS}; do \
cd $$l/train; \
if [ ! -e opus.src.spm32k-model ]; then \
ln -s *.src.spm32k-model opus.src.spm32k-model; \
ln -s *.trg.spm32k-model opus.trg.spm32k-model; \
fi; \
cd ../..; \
done

View File

@ -153,12 +153,12 @@ ifneq ("$(wildcard ${BPESRCMODEL})","")
PREPROCESS_TYPE = bpe
PREPROCESS_SRCMODEL = ${BPESRCMODEL}
PREPROCESS_TRGMODEL = ${BPETRGMODEL}
PREPROCESS_DESCRIPTION = normalization + tokenization + BPE
PREPROCESS_DESCRIPTION = normalization + tokenization + BPE (${PRE_SRC},${PRE_TRG})
else
PREPROCESS_TYPE = spm
PREPROCESS_SRCMODEL = ${SPMSRCMODEL}
PREPROCESS_TRGMODEL = ${SPMTRGMODEL}
PREPROCESS_DESCRIPTION = normalization + SentencePiece
PREPROCESS_DESCRIPTION = normalization + SentencePiece (${PRE_SRC},${PRE_TRG})
endif
ifneq (${words ${TRGLANGS}},1)
@ -184,6 +184,9 @@ endif
@echo '' >> ${WORKDIR}/README.md
@echo "* dataset: ${DATASET}" >> ${WORKDIR}/README.md
@echo "* model: ${MODELTYPE}" >> ${WORKDIR}/README.md
@echo "* source language(s): ${SRCLANGS}" >> ${WORKDIR}/README.md
@echo "* target language(s): ${TRGLANGS}" >> ${WORKDIR}/README.md
@echo "* model: ${MODELTYPE}" >> ${WORKDIR}/README.md
@echo "* pre-processing: ${PREPROCESS_DESCRIPTION}" >> ${WORKDIR}/README.md
@cp ${PREPROCESS_SRCMODEL} ${WORKDIR}/source.${PREPROCESS_TYPE}
@cp ${PREPROCESS_TRGMODEL} ${WORKDIR}/target.${PREPROCESS_TYPE}
@ -198,6 +201,18 @@ endif
echo "* test set translations: [$(notdir ${@:.zip=})-${DATE}.test.txt](${MODELS_URL}/${LANGPAIRSTR}/$(notdir ${@:.zip=})-${DATE}.test.txt)" >> ${WORKDIR}/README.md; \
echo "* test set scores: [$(notdir ${@:.zip=})-${DATE}.eval.txt](${MODELS_URL}/${LANGPAIRSTR}/$(notdir ${@:.zip=})-${DATE}.eval.txt)" >> ${WORKDIR}/README.md; \
echo '' >> ${WORKDIR}/README.md; \
if [ -e ${WORKDIR}/train/README.md ]; then \
echo -n "## Training data: " >> ${WORKDIR}/README.md; \
tr "\n" "~" < ${WORKDIR}/train/README.md |\
tr "#" "\n" | grep '${DATASET}' | \
tail -1 | tr "~" "\n" >> ${WORKDIR}/README.md; \
echo '' >> ${WORKDIR}/README.md; \
fi; \
if [ -e ${WORKDIR}/val/README.md ]; then \
echo -n "#" >> ${WORKDIR}/README.md; \
cat ${WORKDIR}/val/README.md >> ${WORKDIR}/README.md; \
echo '' >> ${WORKDIR}/README.md; \
fi; \
echo '## Benchmarks' >> ${WORKDIR}/README.md; \
echo '' >> ${WORKDIR}/README.md; \
cd ${WORKDIR}; \
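The tr/grep pipeline above pulls the last '#' section of train/README.md that mentions ${DATASET}: newlines are mapped to '~' so the file becomes one long line, each '#' then starts a new line (one per section), grep and tail -1 select the final matching section, and the closing tr restores the newlines. The same idiom in isolation (file name and content hypothetical):

    tr "\n" "~" < train/README.md | tr "#" "\n" | grep 'opus' | tail -1 | tr "~" "\n"
    # -> prints " opus" followed by that section's data set listing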

View File

@ -49,3 +49,49 @@ fix-missing-val:
fi; \
fi \
done
## TODO: this does not seem to work as the config does not match
## (optimizer cannot continue to run ...)
## move model files to a new name
## (useful when using the model as a starting point for another model type,
## for example, to continue training without guided alignment)
OLDMODEL_BASE = ${WORKDIR}/${MODEL}.${OLDMODELTYPE}.model${NR}
NEWMODEL_BASE = ${WORKDIR}/${MODEL}.${NEWMODELTYPE}.model${NR}
move-model:
ifeq (${wildcard ${NEWMODEL_BASE}.npz},)
cp ${OLDMODEL_BASE}.npz ${NEWMODEL_BASE}.npz
cp ${OLDMODEL_BASE}.npz.best-perplexity.npz ${NEWMODEL_BASE}.npz.best-perplexity.npz
cp ${OLDMODEL_BASE}.npz.optimizer.npz ${NEWMODEL_BASE}.npz.optimizer.npz
cp ${OLDMODEL_BASE}.npz.orig.npz ${NEWMODEL_BASE}.npz.orig.npz
cp ${OLDMODEL_BASE}.npz.progress.yml ${NEWMODEL_BASE}.npz.progress.yml
cp ${OLDMODEL_BASE}.npz.yml ${NEWMODEL_BASE}.npz.yml
sed 's/${OLDMODELTYPE}/${NEWMODELTYPE}/' \
< ${OLDMODEL_BASE}.npz.decoder.yml \
> ${NEWMODEL_BASE}.npz.decoder.yml
sed 's/${OLDMODELTYPE}/${NEWMODELTYPE}/' \
< ${OLDMODEL_BASE}.npz.best-perplexity.npz.decoder.yml \
> ${NEWMODEL_BASE}.npz.best-perplexity.npz.decoder.yml
else
@echo "new model ${NEWMODEL_BASE}.npz exists already!"
endif
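A hypothetical invocation, reusing a finished guided-alignment model as the starting point for training without guided alignment (the model type names are assumptions):

    make OLDMODELTYPE=transformer-align NEWMODELTYPE=transformer move-model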
## make symbolic links to spm-models
## (previously we had data-specific models but now we want to re-use existing ones)
fix-spm-models:
cd work-spm; \
for l in ${ALL_LANG_PAIRS}; do \
cd $$l/train; \
if [ ! -e opus.src.spm32k-model ]; then \
ln -s *.src.spm32k-model opus.src.spm32k-model; \
ln -s *.trg.spm32k-model opus.trg.spm32k-model; \
fi; \
cd ../..; \
done

View File

@ -28,3 +28,18 @@
|-----------------------|-------|-------|
| Tatoeba.gl.en | 42.5 | 0.604 |
# opus-2020-05-09.zip
* dataset: opus
* model: transformer-align
* pre-processing: normalization + SentencePiece
* download: [opus-2020-05-09.zip](https://object.pouta.csc.fi/OPUS-MT-models/gl-en/opus-2020-05-09.zip)
* test set translations: [opus-2020-05-09.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/gl-en/opus-2020-05-09.test.txt)
* test set scores: [opus-2020-05-09.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/gl-en/opus-2020-05-09.eval.txt)
## Benchmarks
| testset | BLEU | chr-F |
|-----------------------|-------|-------|
| Tatoeba.gl.en | 42.4 | 0.612 |