new marian and fixed path to mono lang check in backtranslation

This commit is contained in:
Joerg Tiedemann 2020-03-19 20:42:27 +02:00
parent c573551713
commit fd6db4e93a
7 changed files with 167 additions and 7 deletions

View File

@ -151,6 +151,12 @@ ifeq (${MODELTYPE},transformer-align)
endif
# Debugging aid: print the value of LOCAL_TRAIN_SRC.
# Declared phony because it names a command, not a file; a stray file called
# `showdata` must not make the target look up to date.
.PHONY: showdata
showdata:
	echo ${LOCAL_TRAIN_SRC}
# Further values that can be printed when debugging (currently disabled):
#	echo ${CLEAN_TRAIN_SRC}
#	echo ${TRAIN_SRC}.clean.${PRE_SRC}.gz
#	echo ${TRAIN_TRG}.clean.${PRE_TRG}.gz
# Alias goals that only pull in the corresponding data files.
# They have no recipe and do not name real files, so they are declared phony.
.PHONY: traindata tunedata devdata
traindata: ${TRAIN_SRC}.clean.${PRE_SRC}.gz ${TRAIN_TRG}.clean.${PRE_TRG}.gz
tunedata:  ${TUNE_SRC}.${PRE_SRC} ${TUNE_TRG}.${PRE_TRG}
devdata:   ${DEV_SRC}.${PRE_SRC} ${DEV_TRG}.${PRE_TRG}

View File

@ -275,9 +275,12 @@ MARIAN_EARLY_STOPPING = 10
# Marian batch/regularisation defaults.
MARIAN_VALID_MINI_BATCH = 16
MARIAN_MAXI_BATCH       = 500
MARIAN_DROPOUT          = 0.1

# Length limit shared by the decoder option sets below.
MARIAN_MAX_LENGTH       = 500

# Decoder option sets. The GPU and CPU variants differ only in how the device
# is selected (-d ${MARIAN_GPUS} vs. --cpu-threads ${HPC_CORES}); both pass
# --max-length ${MARIAN_MAX_LENGTH} together with --max-length-crop.
# The variables stay recursively expanded (=) so that MARIAN_GPUS / HPC_CORES
# may be defined later or overridden on the command line.
# Each variable is defined exactly once here: the earlier definitions without
# the --max-length options were overridden immediately and have been dropped.
MARIAN_DECODER_GPU = -b 12 -n1 -d ${MARIAN_GPUS} --mini-batch 8 --maxi-batch 32 --maxi-batch-sort src \
                     --max-length ${MARIAN_MAX_LENGTH} --max-length-crop
MARIAN_DECODER_CPU = -b 12 -n1 --cpu-threads ${HPC_CORES} --mini-batch 8 --maxi-batch 32 --maxi-batch-sort src \
                     --max-length ${MARIAN_MAX_LENGTH} --max-length-crop

# GPU decoding is the default.
MARIAN_DECODER_FLAGS = ${MARIAN_DECODER_GPU}
## TODO: currently marianNMT crashes with workspace > 26000

View File

@ -1059,7 +1059,7 @@ ifneq (${SPMVOCAB},${SPMSRCVOCAB})
endif
# When the target-side vocabulary file differs from ${SPMVOCAB}, build it by
# re-invoking make for the target languages with the target-side BPE size.
# The rule must name ${SPMTRGVOCAB}, the file tested by this conditional;
# the stale ${SPMSRCVOCAB} header that preceded it has been removed.
ifneq (${SPMVOCAB},${SPMTRGVOCAB})
${SPMTRGVOCAB}:
	${MAKE} LANGS=${TRGLANGS} BPESIZE=${TRGBPESIZE} mono-spm-vocab
endif

View File

@ -93,9 +93,12 @@ else
# Tool locations below ${APPLHOME}.
EFLOMAL_HOME = ${APPLHOME}/eflomal/

# Earlier Marian build directories, kept for reference only:
# MARIAN    = ${APPLHOME}/marian/build
# MARIANCPU = ${APPLHOME}/marian/build
# MARIAN    = ${APPLHOME}/marian-dev/build-spm
# MARIANCPU = ${APPLHOME}/marian-dev/build-cpu
# MARIANSPM = ${APPLHOME}/marian-dev/build-spm

# Active Marian build: all three variables point at the same build tree.
# Each is assigned once; the previous live assignments to build-spm/build-cpu
# were overridden by these lines and have been dropped.
MARIAN    = ${APPLHOME}/marian-dev/build-new
MARIANCPU = ${APPLHOME}/marian-dev/build-new
MARIANSPM = ${APPLHOME}/marian-dev/build-new

# GPU type and the modules used for GPU jobs.
# GPU_MODULES = cuda intel-mkl
GPU = v100
GPU_MODULES = python-env

View File

@ -39,6 +39,47 @@ testsets/en-en/simplification.en2.gz: simplification
# Convenience goal: both sides (en1/en2) of the en-en simplification test set.
simplify-testset: $(addprefix testsets/en-en/simplification.,en1.gz en2.gz)
#---------------------------------------------------------------------
# document-level data
#---------------------------------------------------------------------
# All document-level simplewiki splits: test, dev and train, each with its
# en1 and en2 side. The list is generated in the order
# test.en1, test.en2, dev.en1, dev.en2, train.en1, train.en2.
simplewiki-docdata: $(foreach part,test dev train,\
                      $(foreach side,en1 en2,\
                        $(DATADIR)/$(PRE)/simplewiki_v2_doc$(MARIAN_MAX_LENGTH)-$(part).en-en.$(side).raw))
# Build both raw sides of the document-level corpus from the CSV dump.
#  1. skip the first line of the dump, take fields 2 and 3 (cut's default
#     delimiter) and strip the surrounding quote characters
#  2. run clean-corpus-n.perl with length bounds 0..$(MARIAN_MAX_LENGTH)
#  3. detokenize the cleaned sides into the en1 target and its en2 sibling
#  4. delete the temporary files
# The recipe writes the en2 file as a side effect; the en2 rule further down
# only hooks that file into the dependency graph.
$(DATADIR)/simplify/simplewiki_v2_doc$(MARIAN_MAX_LENGTH).en-en.en1.raw: $(HOME)/work/SimplifyRussian/data/simplification_datasets/simplewiki_docs.csv
	mkdir -p $(dir $@)
	tail -n +2 $< | cut -f2 | sed 's/^"//;s/ "$$//' > $@.en1
	tail -n +2 $< | cut -f3 | sed 's/^"//;s/ "$$//' > $@.en2
	$(MOSESSCRIPTS)/training/clean-corpus-n.perl $@ en1 en2 $@.clean 0 $(MARIAN_MAX_LENGTH)
	$(TOKENIZER)/detokenizer.perl -l en < $@.clean.en1 > $@
	$(TOKENIZER)/detokenizer.perl -l en < $@.clean.en2 > $(patsubst %.en1.raw,%.en2.raw,$@)
	rm -f $@.en1 $@.en2 $@.clean.en1 $@.clean.en2

# The en2 side is produced by the en1 recipe above; nothing is built here.
$(DATADIR)/simplify/simplewiki_v2_doc$(MARIAN_MAX_LENGTH).en-en.en2.raw: $(DATADIR)/simplify/simplewiki_v2_doc$(MARIAN_MAX_LENGTH).en-en.en1.raw
	@echo "done!"
# Split the raw document-level corpus into test/dev/train.
# One static pattern rule per split; the stem (%) is the side, en1 or en2,
# and each side is cut from the raw file of the same side.

# test: the first 1000 lines
$(foreach side,en1 en2,$(DATADIR)/$(PRE)/simplewiki_v2_doc$(MARIAN_MAX_LENGTH)-test.en-en.$(side).raw): \
$(DATADIR)/$(PRE)/simplewiki_v2_doc$(MARIAN_MAX_LENGTH)-test.en-en.%.raw: \
$(DATADIR)/simplify/simplewiki_v2_doc$(MARIAN_MAX_LENGTH).en-en.%.raw
	head -1000 $< > $@

# dev: lines 1001-2000
$(foreach side,en1 en2,$(DATADIR)/$(PRE)/simplewiki_v2_doc$(MARIAN_MAX_LENGTH)-dev.en-en.$(side).raw): \
$(DATADIR)/$(PRE)/simplewiki_v2_doc$(MARIAN_MAX_LENGTH)-dev.en-en.%.raw: \
$(DATADIR)/simplify/simplewiki_v2_doc$(MARIAN_MAX_LENGTH).en-en.%.raw
	head -2000 $< | tail -1000 > $@

# train: everything from line 2001 onwards
$(foreach side,en1 en2,$(DATADIR)/$(PRE)/simplewiki_v2_doc$(MARIAN_MAX_LENGTH)-train.en-en.$(side).raw): \
$(DATADIR)/$(PRE)/simplewiki_v2_doc$(MARIAN_MAX_LENGTH)-train.en-en.%.raw: \
$(DATADIR)/simplify/simplewiki_v2_doc$(MARIAN_MAX_LENGTH).en-en.%.raw
	tail -n +2001 $< > $@
#---------------------------------------------------------------------
# data from https://cs.pomona.edu/~dkauchak/simplification/
@ -115,6 +156,42 @@ simplewiki-v1-english-prepare: ${WORKHOME}/simplewiki/${SIMPLEWIKI_DATA1}
${@:-simplewiki-v2sent-english=}
# Run a goal on the document-level simplewiki v2 data: the part of the goal
# name before "-simplewiki-v2doc-english" is forwarded to a recursive make
# together with the data-set settings in the recipe. Any leftover *.submit
# files in ${WORKDIR} are removed first.
#
# NOTE(review): the sub-make is given MARIAN_MAX_LENGTH=500 while the data-set
# names in the same command use the caller's ${MARIAN_MAX_LENGTH}; confirm
# this is intended when the length is overridden.
#
# Disabled option, kept for reference: MARIAN_EXTRA="--max-length-crop"
# (This note must not end in a backslash: a trailing backslash would continue
# the comment onto the next line and swallow whatever follows the recipe.)
%-simplewiki-v2doc-english: simplewiki-docdata
	rm -f ${WORKDIR}/*.submit
	${MAKE} DATASET=simplewiki_v2_doc \
		BPEMODELNAME=simplewiki_v2_doc${MARIAN_MAX_LENGTH} \
		TRAINSET=simplewiki_v2_doc${MARIAN_MAX_LENGTH}-train \
		DEVSET=simplewiki_v2_doc${MARIAN_MAX_LENGTH}-dev \
		TESTSET=simplewiki_v2_doc${MARIAN_MAX_LENGTH}-test \
		HELDOUTSIZE=0 MAX_NR_TOKENS=${MARIAN_MAX_LENGTH} \
		SRCLANGS=en TRGLANGS=en \
		MARIAN_VALID_FREQ=1000 \
		MARIAN_WORKSPACE=5000 \
		MARIAN_MAX_LENGTH=500 \
		HPC_MEM=12g \
		${@:-simplewiki-v2doc-english=}
# Run a goal on the combined sentence- and document-level simplewiki v2 data.
# The goal name with its "-simplewiki-v2sent+doc-english" suffix removed is
# handed to a recursive make; training uses the document-level train split
# plus the sentence-level training set, while dev and test are the
# document-level splits. Leftover *.submit files in $(WORKDIR) are removed
# before the sub-make starts.
%-simplewiki-v2sent+doc-english: $(WORKHOME)/simplewiki/$(SIMPLEWIKI_DATA2_SENT) simplewiki-docdata
	rm -f $(WORKDIR)/*.submit
	$(MAKE) DATASET=simplewiki_v2_sent+doc$(MARIAN_MAX_LENGTH) \
		BPEMODELNAME=simplewiki_v2-sent+doc$(MARIAN_MAX_LENGTH) \
		TRAINSET="simplewiki_v2_doc$(MARIAN_MAX_LENGTH)-train simplewiki_v2_sent-training" \
		DEVSET=simplewiki_v2_doc$(MARIAN_MAX_LENGTH)-dev \
		TESTSET=simplewiki_v2_doc$(MARIAN_MAX_LENGTH)-test \
		HELDOUTSIZE=0 MAX_NR_TOKENS=$(MARIAN_MAX_LENGTH) \
		SRCLANGS=en TRGLANGS=en \
		MARIAN_VALID_FREQ=1000 \
		MARIAN_WORKSPACE=5000 \
		HPC_MEM=16g \
		$(patsubst %-simplewiki-v2sent+doc-english,%,$@)
#---------------------------------------------------------------------
# data from https://github.com/XingxingZhang/dress
#---------------------------------------------------------------------

View File

@ -248,6 +248,77 @@ all2en:
##-------------------------------------------------
## make some tests with crawled fiskmo data
# Data-set names looped over by the fiskmo-fisv-% and fiskmo-svfi-% goals;
# each name is used as DATASET and, prefixed with "finnish-swedish-", as
# TRAINSET.
FISKMO-DATASETS = crawl-v2-2M crawl-v2-clean \
                  yle-rss-v2-100K yle-rss-v2-clean \
                  fiskmo-crawl-articles-v1 \
                  fiskmo-crawl-articles-v1-0.5 fiskmo-crawl-articles-v1-0.8 \
                  yle-2011-2018-articles-v1-0.8
## make fiskmo-fisv-data
## make fiskmo-fisv-train.submit
## make fiskmo-fisv-eval
## make fiskmo-fisv-eval-testsets
##
## make fiskmo-fisv-reverse-data
## make fiskmo-svfi-train.submit
## make fiskmo-svfi-eval
## make fiskmo-svfi-eval-testsets
# Run train-dynamic for fi->sv with MODELTYPE=transformer on the two crawl-v2
# data sets, clearing leftover *.submit files before each run.
fiskmo-missing:
	for d in crawl-v2-2M crawl-v2-clean; do \
	  rm -f $(WORKHOME)/fi-sv/*.submit; \
	  $(MAKE) SRCLANGS=fi TRGLANGS=sv DATASET=$$d TRAINSET=finnish-swedish-$$d \
		MODELTYPE=transformer train-dynamic; \
	done
# Run train-dynamic for sv->fi on the crawl-v2-clean data set, clearing
# leftover *.submit files first.
fiskmo-svfi-missing:
	for d in crawl-v2-clean; do \
	  rm -f $(WORKHOME)/sv-fi/*.submit; \
	  $(MAKE) SRCLANGS=sv TRGLANGS=fi DATASET=$$d TRAINSET=finnish-swedish-$$d \
		train-dynamic; \
	done
# fiskmo-fisv-<goal>: run <goal> for fi->sv
#   - once per entry of FISKMO-DATASETS,
#   - once on the combined "fiskmo-crawl-all" training sets,
#   - once on the combined "fiskmo-crawl-clean" training sets.
# Leftover *.submit files in $(WORKHOME)/fi-sv are removed before every run.
# <goal> is the target name with its "fiskmo-fisv-" prefix removed.
fiskmo-fisv-%:
	for d in $(FISKMO-DATASETS); do \
	  rm -f $(WORKHOME)/fi-sv/*.submit; \
	  $(MAKE) SRCLANGS=fi TRGLANGS=sv DATASET=$$d TRAINSET=finnish-swedish-$$d \
		$(@:fiskmo-fisv-%=%); \
	done
	rm -f $(WORKHOME)/fi-sv/*.submit
	$(MAKE) DATASET=fiskmo-crawl-all SRCLANGS=fi TRGLANGS=sv \
		TRAINSET="finnish-swedish-crawl-v2-2M yle-rss-v2-100K fiskmo-crawl-articles-v1 yle-2011-2018-articles-v1-0.8" \
		$(@:fiskmo-fisv-%=%)
	rm -f $(WORKHOME)/fi-sv/*.submit
	$(MAKE) DATASET=fiskmo-crawl-clean SRCLANGS=fi TRGLANGS=sv \
		TRAINSET="finnish-swedish-crawl-v2-clean yle-rss-v2-clean fiskmo-crawl-articles-v1-0.8" \
		$(@:fiskmo-fisv-%=%)
# fiskmo-svfi-<goal>: run <goal> for sv->fi
#   - once per entry of FISKMO-DATASETS,
#   - once on the combined "fiskmo-crawl-all" training sets,
#   - once on the combined "fiskmo-crawl-clean" training sets.
# Leftover *.submit files in $(WORKHOME)/sv-fi are removed before every run.
# <goal> is the target name with its "fiskmo-svfi-" prefix removed.
fiskmo-svfi-%:
	for d in $(FISKMO-DATASETS); do \
	  rm -f $(WORKHOME)/sv-fi/*.submit; \
	  $(MAKE) SRCLANGS=sv TRGLANGS=fi DATASET=$$d TRAINSET=finnish-swedish-$$d \
		$(@:fiskmo-svfi-%=%); \
	done
	rm -f $(WORKHOME)/sv-fi/*.submit
	$(MAKE) DATASET=fiskmo-crawl-all SRCLANGS=sv TRGLANGS=fi \
		TRAINSET="finnish-swedish-crawl-v2-2M yle-rss-v2-100K fiskmo-crawl-articles-v1 yle-2011-2018-articles-v1-0.8" \
		$(@:fiskmo-svfi-%=%)
	rm -f $(WORKHOME)/sv-fi/*.submit
	$(MAKE) DATASET=fiskmo-crawl-clean SRCLANGS=sv TRGLANGS=fi \
		TRAINSET="finnish-swedish-crawl-v2-clean yle-rss-v2-clean fiskmo-crawl-articles-v1-0.8" \
		$(@:fiskmo-svfi-%=%)
##-------------------------------------------------
## a batch of interesting models ....

View File

@ -225,7 +225,7 @@ ${WIKI_TXT}: ${WIKI_JSON}
$(TOKENIZER)/replace-unicode-punctuation.perl |\
$(TOKENIZER)/remove-non-printing-char.perl |\
sed 's/ */ /g;s/^ *//g;s/ *$$//g' |\
python3 ../mono-match-lang.py -l ${LANGID} |\
python3 ../scripts/filter/mono-match-lang.py -l ${LANGID} |\
split -l ${MAX_SENTENCES} - ${patsubst %${PART}.gz,%,$@}
gzip -f ${patsubst %${PART}.gz,%,$@}*