From fd6db4e93afa01f886eba6033fed361898341aaf Mon Sep 17 00:00:00 2001 From: Joerg Tiedemann Date: Thu, 19 Mar 2020 20:42:27 +0200 Subject: [PATCH] new marian and fixed path to mono lang check in backtranslation --- Makefile | 6 ++++ Makefile.config | 7 ++-- Makefile.data | 2 +- Makefile.env | 9 +++-- Makefile.simplify | 77 ++++++++++++++++++++++++++++++++++++++++++ Makefile.tasks | 71 ++++++++++++++++++++++++++++++++++++++ backtranslate/Makefile | 2 +- 7 files changed, 167 insertions(+), 7 deletions(-) diff --git a/Makefile b/Makefile index e6e13fd1..2c0bd469 100644 --- a/Makefile +++ b/Makefile @@ -151,6 +151,12 @@ ifeq (${MODELTYPE},transformer-align) endif +showdata: + echo ${LOCAL_TRAIN_SRC} +# echo ${CLEAN_TRAIN_SRC} +# echo ${TRAIN_SRC}.clean.${PRE_SRC}.gz +# echo ${TRAIN_TRG}.clean.${PRE_TRG}.gz + traindata: ${TRAIN_SRC}.clean.${PRE_SRC}.gz ${TRAIN_TRG}.clean.${PRE_TRG}.gz tunedata: ${TUNE_SRC}.${PRE_SRC} ${TUNE_TRG}.${PRE_TRG} devdata: ${DEV_SRC}.${PRE_SRC} ${DEV_TRG}.${PRE_TRG} diff --git a/Makefile.config b/Makefile.config index 5e0fc355..1f89eb1a 100644 --- a/Makefile.config +++ b/Makefile.config @@ -275,9 +275,12 @@ MARIAN_EARLY_STOPPING = 10 MARIAN_VALID_MINI_BATCH = 16 MARIAN_MAXI_BATCH = 500 MARIAN_DROPOUT = 0.1 +MARIAN_MAX_LENGTH = 500 -MARIAN_DECODER_GPU = -b 12 -n1 -d ${MARIAN_GPUS} --mini-batch 8 --maxi-batch 32 --maxi-batch-sort src -MARIAN_DECODER_CPU = -b 12 -n1 --cpu-threads ${HPC_CORES} --mini-batch 8 --maxi-batch 32 --maxi-batch-sort src +MARIAN_DECODER_GPU = -b 12 -n1 -d ${MARIAN_GPUS} --mini-batch 8 --maxi-batch 32 --maxi-batch-sort src \ + --max-length ${MARIAN_MAX_LENGTH} --max-length-crop +MARIAN_DECODER_CPU = -b 12 -n1 --cpu-threads ${HPC_CORES} --mini-batch 8 --maxi-batch 32 --maxi-batch-sort src \ + --max-length ${MARIAN_MAX_LENGTH} --max-length-crop MARIAN_DECODER_FLAGS = ${MARIAN_DECODER_GPU} ## TODO: currently marianNMT crashes with workspace > 26000 diff --git a/Makefile.data b/Makefile.data index 63784267..de5e9156 100644 --- a/Makefile.data +++ b/Makefile.data @@ -1059,7 +1059,7 @@ ifneq (${SPMVOCAB},${SPMSRCVOCAB}) endif ifneq (${SPMVOCAB},${SPMTRGVOCAB}) - ${SPMSRCVOCAB}: + ${SPMTRGVOCAB}: ${MAKE} LANGS=${TRGLANGS} BPESIZE=${TRGBPESIZE} mono-spm-vocab endif diff --git a/Makefile.env b/Makefile.env index b8310682..a754dc95 100644 --- a/Makefile.env +++ b/Makefile.env @@ -93,9 +93,12 @@ else EFLOMAL_HOME = ${APPLHOME}/eflomal/ # MARIAN = ${APPLHOME}/marian/build # MARIANCPU = ${APPLHOME}/marian/build - MARIAN = ${APPLHOME}/marian-dev/build-spm - MARIANCPU = ${APPLHOME}/marian-dev/build-cpu - MARIANSPM = ${APPLHOME}/marian-dev/build-spm + # MARIAN = ${APPLHOME}/marian-dev/build-spm + # MARIANCPU = ${APPLHOME}/marian-dev/build-cpu + # MARIANSPM = ${APPLHOME}/marian-dev/build-spm + MARIAN = ${APPLHOME}/marian-dev/build-new + MARIANCPU = ${APPLHOME}/marian-dev/build-new + MARIANSPM = ${APPLHOME}/marian-dev/build-new # GPU_MODULES = cuda intel-mkl GPU = v100 GPU_MODULES = python-env diff --git a/Makefile.simplify b/Makefile.simplify index ea39e708..baf1060e 100644 --- a/Makefile.simplify +++ b/Makefile.simplify @@ -39,6 +39,47 @@ testsets/en-en/simplification.en2.gz: simplification simplify-testset: testsets/en-en/simplification.en1.gz testsets/en-en/simplification.en2.gz +#--------------------------------------------------------------------- +# document-level data +#--------------------------------------------------------------------- + +simplewiki-docdata: ${DATADIR}/${PRE}/simplewiki_v2_doc${MARIAN_MAX_LENGTH}-test.en-en.en1.raw \ + ${DATADIR}/${PRE}/simplewiki_v2_doc${MARIAN_MAX_LENGTH}-test.en-en.en2.raw \ + ${DATADIR}/${PRE}/simplewiki_v2_doc${MARIAN_MAX_LENGTH}-dev.en-en.en1.raw \ + ${DATADIR}/${PRE}/simplewiki_v2_doc${MARIAN_MAX_LENGTH}-dev.en-en.en2.raw \ + ${DATADIR}/${PRE}/simplewiki_v2_doc${MARIAN_MAX_LENGTH}-train.en-en.en1.raw \ + ${DATADIR}/${PRE}/simplewiki_v2_doc${MARIAN_MAX_LENGTH}-train.en-en.en2.raw + +${DATADIR}/simplify/simplewiki_v2_doc${MARIAN_MAX_LENGTH}.en-en.en1.raw: ${HOME}/work/SimplifyRussian/data/simplification_datasets/simplewiki_docs.csv + mkdir -p ${dir $@} + tail -n +2 $< | cut -f2 | sed 's/^"//;s/ "$$//' > $@.en1 + tail -n +2 $< | cut -f3 | sed 's/^"//;s/ "$$//' > $@.en2 + $(MOSESSCRIPTS)/training/clean-corpus-n.perl $@ en1 en2 $@.clean 0 ${MARIAN_MAX_LENGTH} + ${TOKENIZER}/detokenizer.perl -l en < $@.clean.en1 > $@ + ${TOKENIZER}/detokenizer.perl -l en < $@.clean.en2 > $(@:.en1.raw=.en2.raw) + rm -f $@.en1 $@.en2 $@.clean.en1 $@.clean.en2 + +${DATADIR}/simplify/simplewiki_v2_doc${MARIAN_MAX_LENGTH}.en-en.en2.raw: ${DATADIR}/simplify/simplewiki_v2_doc${MARIAN_MAX_LENGTH}.en-en.en1.raw + @echo "done!" + +${DATADIR}/${PRE}/simplewiki_v2_doc${MARIAN_MAX_LENGTH}-test.en-en.en1.raw: ${DATADIR}/simplify/simplewiki_v2_doc${MARIAN_MAX_LENGTH}.en-en.en1.raw + head -1000 $< > $@ + +${DATADIR}/${PRE}/simplewiki_v2_doc${MARIAN_MAX_LENGTH}-test.en-en.en2.raw: ${DATADIR}/simplify/simplewiki_v2_doc${MARIAN_MAX_LENGTH}.en-en.en2.raw + head -1000 $< > $@ + +${DATADIR}/${PRE}/simplewiki_v2_doc${MARIAN_MAX_LENGTH}-dev.en-en.en1.raw: ${DATADIR}/simplify/simplewiki_v2_doc${MARIAN_MAX_LENGTH}.en-en.en1.raw + head -2000 $< | tail -1000 > $@ + +${DATADIR}/${PRE}/simplewiki_v2_doc${MARIAN_MAX_LENGTH}-dev.en-en.en2.raw: ${DATADIR}/simplify/simplewiki_v2_doc${MARIAN_MAX_LENGTH}.en-en.en2.raw + head -2000 $< | tail -1000 > $@ + +${DATADIR}/${PRE}/simplewiki_v2_doc${MARIAN_MAX_LENGTH}-train.en-en.en1.raw: ${DATADIR}/simplify/simplewiki_v2_doc${MARIAN_MAX_LENGTH}.en-en.en1.raw + tail -n +2001 $< > $@ + +${DATADIR}/${PRE}/simplewiki_v2_doc${MARIAN_MAX_LENGTH}-train.en-en.en2.raw: ${DATADIR}/simplify/simplewiki_v2_doc${MARIAN_MAX_LENGTH}.en-en.en2.raw + tail -n +2001 $< > $@ + #--------------------------------------------------------------------- # data from https://cs.pomona.edu/~dkauchak/simplification/ @@ -115,6 +156,42 @@ simplewiki-v1-english-prepare: ${WORKHOME}/simplewiki/${SIMPLEWIKI_DATA1} ${@:-simplewiki-v2sent-english=} +%-simplewiki-v2doc-english: simplewiki-docdata + rm -f ${WORKDIR}/*.submit + ${MAKE} DATASET=simplewiki_v2_doc \ + BPEMODELNAME=simplewiki_v2_doc${MARIAN_MAX_LENGTH} \ + TRAINSET=simplewiki_v2_doc${MARIAN_MAX_LENGTH}-train \ + DEVSET=simplewiki_v2_doc${MARIAN_MAX_LENGTH}-dev \ + TESTSET=simplewiki_v2_doc${MARIAN_MAX_LENGTH}-test \ + HELDOUTSIZE=0 MAX_NR_TOKENS=${MARIAN_MAX_LENGTH} \ + SRCLANGS=en TRGLANGS=en \ + MARIAN_VALID_FREQ=1000 \ + MARIAN_WORKSPACE=5000 \ + MARIAN_MAX_LENGTH=500 \ + HPC_MEM=12g \ + ${@:-simplewiki-v2doc-english=} + +# MARIAN_EXTRA="--max-length-crop" \ + + + + +%-simplewiki-v2sent+doc-english: ${WORKHOME}/simplewiki/${SIMPLEWIKI_DATA2_SENT} simplewiki-docdata + rm -f ${WORKDIR}/*.submit + ${MAKE} DATASET=simplewiki_v2_sent+doc${MARIAN_MAX_LENGTH} \ + BPEMODELNAME=simplewiki_v2-sent+doc${MARIAN_MAX_LENGTH} \ + TRAINSET="simplewiki_v2_doc${MARIAN_MAX_LENGTH}-train simplewiki_v2_sent-training" \ + DEVSET=simplewiki_v2_doc${MARIAN_MAX_LENGTH}-dev \ + TESTSET=simplewiki_v2_doc${MARIAN_MAX_LENGTH}-test \ + HELDOUTSIZE=0 MAX_NR_TOKENS=${MARIAN_MAX_LENGTH} \ + SRCLANGS=en TRGLANGS=en \ + MARIAN_VALID_FREQ=1000 \ + MARIAN_WORKSPACE=5000 \ + HPC_MEM=16g \ + ${@:-simplewiki-v2sent+doc-english=} + + + #--------------------------------------------------------------------- # data from https://github.com/XingxingZhang/dress #--------------------------------------------------------------------- diff --git a/Makefile.tasks b/Makefile.tasks index 1725670d..7654e135 100644 --- a/Makefile.tasks +++ b/Makefile.tasks @@ -248,6 +248,77 @@ all2en: +##------------------------------------------------- +## make some tests with crawled fiskmo data + +FISKMO-DATASETS = crawl-v2-2M \ + crawl-v2-clean \ + yle-rss-v2-100K \ + yle-rss-v2-clean \ + fiskmo-crawl-articles-v1 \ + fiskmo-crawl-articles-v1-0.5 \ + fiskmo-crawl-articles-v1-0.8 \ + yle-2011-2018-articles-v1-0.8 + +## make fiskmo-fisv-data +## make fiskmo-fisv-train.submit +## make fiskmo-fisv-eval +## make fiskmo-fisv-eval-testsets +## +## make fiskmo-fisv-reverse-data +## make fiskmo-svfi-train.submit +## make fiskmo-fisv-eval +## make fiskmo-fisv-eval-testsets + +fiskmo-missing: + for d in crawl-v2-2M crawl-v2-clean; do \ + rm -f ${WORKHOME}/fi-sv/*.submit; \ + ${MAKE} SRCLANGS=fi TRGLANGS=sv DATASET=$$d TRAINSET=finnish-swedish-$$d \ + MODELTYPE=transformer train-dynamic; \ + done + +fiskmo-svfi-missing: + for d in crawl-v2-clean; do \ + rm -f ${WORKHOME}/sv-fi/*.submit; \ + ${MAKE} SRCLANGS=sv TRGLANGS=fi DATASET=$$d TRAINSET=finnish-swedish-$$d \ + train-dynamic; \ + done + + +fiskmo-fisv-%: + for d in ${FISKMO-DATASETS}; do \ + rm -f ${WORKHOME}/fi-sv/*.submit; \ + ${MAKE} SRCLANGS=fi TRGLANGS=sv DATASET=$$d TRAINSET=finnish-swedish-$$d \ + ${patsubst fiskmo-fisv-%,%,$@}; \ + done + rm -f ${WORKHOME}/fi-sv/*.submit + ${MAKE} DATASET=fiskmo-crawl-all SRCLANGS=fi TRGLANGS=sv \ + TRAINSET="finnish-swedish-crawl-v2-2M yle-rss-v2-100K fiskmo-crawl-articles-v1 yle-2011-2018-articles-v1-0.8" \ + ${patsubst fiskmo-fisv-%,%,$@} + rm -f ${WORKHOME}/fi-sv/*.submit + ${MAKE} DATASET=fiskmo-crawl-clean SRCLANGS=fi TRGLANGS=sv \ + TRAINSET="finnish-swedish-crawl-v2-clean yle-rss-v2-clean fiskmo-crawl-articles-v1-0.8" \ + ${patsubst fiskmo-fisv-%,%,$@} + +fiskmo-svfi-%: + for d in ${FISKMO-DATASETS}; do \ + rm -f ${WORKHOME}/sv-fi/*.submit; \ + ${MAKE} SRCLANGS=sv TRGLANGS=fi DATASET=$$d TRAINSET=finnish-swedish-$$d \ + ${patsubst fiskmo-svfi-%,%,$@}; \ + done + rm -f ${WORKHOME}/sv-fi/*.submit + ${MAKE} DATASET=fiskmo-crawl-all SRCLANGS=sv TRGLANGS=fi \ + TRAINSET="finnish-swedish-crawl-v2-2M yle-rss-v2-100K fiskmo-crawl-articles-v1 yle-2011-2018-articles-v1-0.8" \ + ${patsubst fiskmo-svfi-%,%,$@} + rm -f ${WORKHOME}/sv-fi/*.submit + ${MAKE} DATASET=fiskmo-crawl-clean SRCLANGS=sv TRGLANGS=fi \ + TRAINSET="finnish-swedish-crawl-v2-clean yle-rss-v2-clean fiskmo-crawl-articles-v1-0.8" \ + ${patsubst fiskmo-svfi-%,%,$@} + + +##------------------------------------------------- + + ## a batch of interesting models .... diff --git a/backtranslate/Makefile b/backtranslate/Makefile index 24bab452..007690bb 100644 --- a/backtranslate/Makefile +++ b/backtranslate/Makefile @@ -225,7 +225,7 @@ ${WIKI_TXT}: ${WIKI_JSON} $(TOKENIZER)/replace-unicode-punctuation.perl |\ $(TOKENIZER)/remove-non-printing-char.perl |\ sed 's/ */ /g;s/^ *//g;s/ *$$//g' |\ - python3 ../mono-match-lang.py -l ${LANGID} |\ + python3 ../scripts/filter/mono-match-lang.py -l ${LANGID} |\ split -l ${MAX_SENTENCES} - ${patsubst %${PART}.gz,%,$@} gzip -f ${patsubst %${PART}.gz,%,$@}*