mirror of
https://github.com/Helsinki-NLP/OPUS-MT-train.git
synced 2024-10-26 21:19:02 +03:00
new marian and fixed path to mono lang check in backtranslation
This commit is contained in:
parent
c573551713
commit
fd6db4e93a
6
Makefile
6
Makefile
@ -151,6 +151,12 @@ ifeq (${MODELTYPE},transformer-align)
|
||||
endif
|
||||
|
||||
|
||||
## Debugging aid: print the value of ${LOCAL_TRAIN_SRC}.
## The commented-out lines name other data variables that can be echoed
## instead when inspecting the data setup.
## Declared phony because it never creates a file called `showdata`;
## without the declaration such a file would silently disable the target.
.PHONY: showdata
showdata:
	echo ${LOCAL_TRAIN_SRC}
#	echo ${CLEAN_TRAIN_SRC}
#	echo ${TRAIN_SRC}.clean.${PRE_SRC}.gz
#	echo ${TRAIN_TRG}.clean.${PRE_TRG}.gz
|
||||
|
||||
## Convenience goals: each one only lists the source/target data files
## that have to exist for the named data split; there is no recipe here.
traindata: \
    $(TRAIN_SRC).clean.$(PRE_SRC).gz \
    $(TRAIN_TRG).clean.$(PRE_TRG).gz

tunedata: \
    $(TUNE_SRC).$(PRE_SRC) \
    $(TUNE_TRG).$(PRE_TRG)

devdata: \
    $(DEV_SRC).$(PRE_SRC) \
    $(DEV_TRG).$(PRE_TRG)
|
||||
|
@ -275,9 +275,12 @@ MARIAN_EARLY_STOPPING = 10
|
||||
MARIAN_VALID_MINI_BATCH = 16
MARIAN_MAXI_BATCH       = 500
MARIAN_DROPOUT          = 0.1
MARIAN_MAX_LENGTH       = 500

## Decoder command-line options for GPU and CPU decoding.
## Both variants pass --max-length ${MARIAN_MAX_LENGTH} together with
## --max-length-crop. Each variable is defined exactly once: an earlier
## pair of definitions without the length options was overridden
## immediately by these and has been removed as dead code.
## Recursive assignment (=) is kept on purpose so that ${MARIAN_GPUS},
## ${HPC_CORES} and ${MARIAN_MAX_LENGTH} are resolved at use time.
MARIAN_DECODER_GPU = -b 12 -n1 -d ${MARIAN_GPUS} --mini-batch 8 --maxi-batch 32 --maxi-batch-sort src \
		--max-length ${MARIAN_MAX_LENGTH} --max-length-crop
MARIAN_DECODER_CPU = -b 12 -n1 --cpu-threads ${HPC_CORES} --mini-batch 8 --maxi-batch 32 --maxi-batch-sort src \
		--max-length ${MARIAN_MAX_LENGTH} --max-length-crop

## default: decode with the GPU settings
MARIAN_DECODER_FLAGS = ${MARIAN_DECODER_GPU}
|
||||
|
||||
## TODO: currently marianNMT crashes with workspace > 26000
|
||||
|
@ -1059,7 +1059,7 @@ ifneq (${SPMVOCAB},${SPMSRCVOCAB})
|
||||
endif
|
||||
|
||||
## Only when the target-side vocabulary file differs from ${SPMVOCAB}:
## build ${SPMTRGVOCAB} by re-invoking make on the `mono-spm-vocab` goal
## with the target languages and the target BPE size.
## The rule header names ${SPMTRGVOCAB} only; a left-over, recipe-less
## `${SPMSRCVOCAB}:` header in front of it did not belong to this
## conditional and has been removed.
ifneq (${SPMVOCAB},${SPMTRGVOCAB})
${SPMTRGVOCAB}:
	${MAKE} LANGS=${TRGLANGS} BPESIZE=${TRGBPESIZE} mono-spm-vocab
endif
|
||||
|
||||
|
@ -93,9 +93,12 @@ else
|
||||
## Tool locations below ${APPLHOME} and GPU settings for this environment.
EFLOMAL_HOME = ${APPLHOME}/eflomal/

## Previously used Marian builds, kept as comments for reference:
# MARIAN    = ${APPLHOME}/marian/build
# MARIANCPU = ${APPLHOME}/marian/build
# MARIAN    = ${APPLHOME}/marian-dev/build-spm
# MARIANCPU = ${APPLHOME}/marian-dev/build-cpu
# MARIANSPM = ${APPLHOME}/marian-dev/build-spm

## Active Marian build: all three variables point at the same build-new
## directory. Each is assigned exactly once; the former live build-spm /
## build-cpu assignments were overridden right away by these lines and
## are now only present in the commented list above.
MARIAN    = ${APPLHOME}/marian-dev/build-new
MARIANCPU = ${APPLHOME}/marian-dev/build-new
MARIANSPM = ${APPLHOME}/marian-dev/build-new

# GPU_MODULES = cuda intel-mkl
GPU         = v100
GPU_MODULES = python-env
|
||||
|
@ -39,6 +39,47 @@ testsets/en-en/simplification.en2.gz: simplification
|
||||
## Aggregate goal: requires both sides (en1 and en2) of the
## simplification test set; it has no recipe of its own.
simplify-testset: \
    testsets/en-en/simplification.en1.gz \
    testsets/en-en/simplification.en2.gz
|
||||
|
||||
|
||||
#---------------------------------------------------------------------
|
||||
# document-level data
|
||||
#---------------------------------------------------------------------
|
||||
|
||||
## Aggregate goal for the document-level simplewiki data: requires the
## test, dev and train splits (in that order), each with its en1 and en2
## side, below ${DATADIR}/${PRE}. The prerequisite list is generated
## instead of being spelled out file by file.
simplewiki-docdata: $(foreach part,test dev train,\
                      $(foreach side,en1 en2,\
                        ${DATADIR}/${PRE}/simplewiki_v2_doc${MARIAN_MAX_LENGTH}-$(part).en-en.$(side).raw))
|
||||
|
||||
## Input table with the document-level simplewiki data. The default is
## the location that used to be hard-coded in the rule below; override
## it on the command line or in the environment to read another copy.
SIMPLEWIKI_DOCS_CSV ?= ${HOME}/work/SimplifyRussian/data/simplification_datasets/simplewiki_docs.csv

## Create BOTH sides of the document-level corpus in one go:
##   1. skip the first line of the table and write column 2 to $@.en1
##      and column 3 to $@.en2, stripping the enclosing double quotes
##   2. run clean-corpus-n.perl with length limits 0..${MARIAN_MAX_LENGTH}
##   3. detokenize the cleaned files into the en1 target ($@) and the
##      matching en2 file, then delete the temporary files
## NOTE(review): `cut` with its default delimiter assumes the table is
## tab-separated despite the .csv extension -- confirm.
${DATADIR}/simplify/simplewiki_v2_doc${MARIAN_MAX_LENGTH}.en-en.en1.raw: ${SIMPLEWIKI_DOCS_CSV}
	mkdir -p ${dir $@}
	tail -n +2 $< | cut -f2 | sed 's/^"//;s/ "$$//' > $@.en1
	tail -n +2 $< | cut -f3 | sed 's/^"//;s/ "$$//' > $@.en2
	$(MOSESSCRIPTS)/training/clean-corpus-n.perl $@ en1 en2 $@.clean 0 ${MARIAN_MAX_LENGTH}
	${TOKENIZER}/detokenizer.perl -l en < $@.clean.en1 > $@
	${TOKENIZER}/detokenizer.perl -l en < $@.clean.en2 > $(@:.en1.raw=.en2.raw)
	rm -f $@.en1 $@.en2 $@.clean.en1 $@.clean.en2

## The en2 side is a by-product of the en1 rule above. If en2 has been
## deleted while en1 still exists, make would consider en1 up to date
## and never recreate en2; in that case drop en1 and rebuild it, which
## regenerates both files. The removal and the re-invocation are on
## separate lines so that `make -n` does not delete anything.
${DATADIR}/simplify/simplewiki_v2_doc${MARIAN_MAX_LENGTH}.en-en.en2.raw: ${DATADIR}/simplify/simplewiki_v2_doc${MARIAN_MAX_LENGTH}.en-en.en1.raw
	@test -f $@ || rm -f $<
	@test -f $@ || ${MAKE} $<
	@echo "done!"
|
||||
|
||||
## Split the document-level corpus in ${DATADIR}/simplify into
##   test  = lines    1 .. 1000
##   dev   = lines 1001 .. 2000
##   train = line  2001 .. end
## The same split is applied to the en1 and the en2 side, so every
## split is written as one static pattern rule whose stem (%) is the
## side name. The output directory is created first because nothing
## else in these rules guarantees that ${DATADIR}/${PRE} exists.

${DATADIR}/${PRE}/simplewiki_v2_doc${MARIAN_MAX_LENGTH}-test.en-en.en1.raw \
${DATADIR}/${PRE}/simplewiki_v2_doc${MARIAN_MAX_LENGTH}-test.en-en.en2.raw: \
    ${DATADIR}/${PRE}/simplewiki_v2_doc${MARIAN_MAX_LENGTH}-test.en-en.%.raw: \
    ${DATADIR}/simplify/simplewiki_v2_doc${MARIAN_MAX_LENGTH}.en-en.%.raw
	mkdir -p ${dir $@}
	head -n 1000 $< > $@

${DATADIR}/${PRE}/simplewiki_v2_doc${MARIAN_MAX_LENGTH}-dev.en-en.en1.raw \
${DATADIR}/${PRE}/simplewiki_v2_doc${MARIAN_MAX_LENGTH}-dev.en-en.en2.raw: \
    ${DATADIR}/${PRE}/simplewiki_v2_doc${MARIAN_MAX_LENGTH}-dev.en-en.%.raw: \
    ${DATADIR}/simplify/simplewiki_v2_doc${MARIAN_MAX_LENGTH}.en-en.%.raw
	mkdir -p ${dir $@}
	head -n 2000 $< | tail -n 1000 > $@

${DATADIR}/${PRE}/simplewiki_v2_doc${MARIAN_MAX_LENGTH}-train.en-en.en1.raw \
${DATADIR}/${PRE}/simplewiki_v2_doc${MARIAN_MAX_LENGTH}-train.en-en.en2.raw: \
    ${DATADIR}/${PRE}/simplewiki_v2_doc${MARIAN_MAX_LENGTH}-train.en-en.%.raw: \
    ${DATADIR}/simplify/simplewiki_v2_doc${MARIAN_MAX_LENGTH}.en-en.%.raw
	mkdir -p ${dir $@}
	tail -n +2001 $< > $@
|
||||
|
||||
|
||||
#---------------------------------------------------------------------
|
||||
# data from https://cs.pomona.edu/~dkauchak/simplification/
|
||||
@ -115,6 +156,42 @@ simplewiki-v1-english-prepare: ${WORKHOME}/simplewiki/${SIMPLEWIKI_DATA1}
|
||||
${@:-simplewiki-v2sent-english=}
|
||||
|
||||
|
||||
## Run any goal on the document-level simplewiki data (en -> en):
## `<goal>-simplewiki-v2doc-english` first makes sure the doc-level data
## exists, removes old *.submit files from ${WORKDIR} and then re-invokes
## make for <goal> with the data-set names and settings listed below.
##
## MARIAN_MAX_LENGTH is handed down as ${MARIAN_MAX_LENGTH} rather than
## as a literal 500: the data-set names and MAX_NR_TOKENS are derived
## from that variable, so a fixed number here would put the sub-make out
## of sync with them whenever the length is overridden. With the default
## value (500) the command line is the same as before.
%-simplewiki-v2doc-english: simplewiki-docdata
	rm -f ${WORKDIR}/*.submit
	${MAKE} DATASET=simplewiki_v2_doc \
		BPEMODELNAME=simplewiki_v2_doc${MARIAN_MAX_LENGTH} \
		TRAINSET=simplewiki_v2_doc${MARIAN_MAX_LENGTH}-train \
		DEVSET=simplewiki_v2_doc${MARIAN_MAX_LENGTH}-dev \
		TESTSET=simplewiki_v2_doc${MARIAN_MAX_LENGTH}-test \
		HELDOUTSIZE=0 MAX_NR_TOKENS=${MARIAN_MAX_LENGTH} \
		SRCLANGS=en TRGLANGS=en \
		MARIAN_VALID_FREQ=1000 \
		MARIAN_WORKSPACE=5000 \
		MARIAN_MAX_LENGTH=${MARIAN_MAX_LENGTH} \
		HPC_MEM=12g \
		${@:-simplewiki-v2doc-english=}

## currently unused extra option for the call above:
##   MARIAN_EXTRA="--max-length-crop"
|
||||
|
||||
|
||||
|
||||
|
||||
## Run any goal on the combined sentence-level + document-level
## simplewiki data (en -> en). The stem of the pattern is the goal that
## is passed on to the sub-make; old *.submit files in $(WORKDIR) are
## removed first. Dev and test data are the document-level sets only.
## NOTE(review): BPEMODELNAME is spelled "v2-sent+doc" while DATASET is
## spelled "v2_sent+doc" -- left untouched, confirm this is intended.
%-simplewiki-v2sent+doc-english: $(WORKHOME)/simplewiki/$(SIMPLEWIKI_DATA2_SENT) simplewiki-docdata
	rm -f $(WORKDIR)/*.submit
	$(MAKE) DATASET=simplewiki_v2_sent+doc$(MARIAN_MAX_LENGTH) \
		BPEMODELNAME=simplewiki_v2-sent+doc$(MARIAN_MAX_LENGTH) \
		TRAINSET="simplewiki_v2_doc$(MARIAN_MAX_LENGTH)-train simplewiki_v2_sent-training" \
		DEVSET=simplewiki_v2_doc$(MARIAN_MAX_LENGTH)-dev \
		TESTSET=simplewiki_v2_doc$(MARIAN_MAX_LENGTH)-test \
		HELDOUTSIZE=0 MAX_NR_TOKENS=$(MARIAN_MAX_LENGTH) \
		SRCLANGS=en TRGLANGS=en \
		MARIAN_VALID_FREQ=1000 \
		MARIAN_WORKSPACE=5000 \
		HPC_MEM=16g \
		$*
|
||||
|
||||
|
||||
|
||||
#---------------------------------------------------------------------
|
||||
# data from https://github.com/XingxingZhang/dress
|
||||
#---------------------------------------------------------------------
|
||||
|
@ -248,6 +248,77 @@ all2en:
|
||||
|
||||
|
||||
|
||||
##-------------------------------------------------
|
||||
## make some tests with crawled fiskmo data
|
||||
|
||||
## Data sets that the fiskmo-fisv-% / fiskmo-svfi-% goals loop over,
## grouped by origin (crawl, yle-rss, fiskmo article crawl, yle articles).
FISKMO-DATASETS  = crawl-v2-2M crawl-v2-clean
FISKMO-DATASETS += yle-rss-v2-100K yle-rss-v2-clean
FISKMO-DATASETS += fiskmo-crawl-articles-v1 fiskmo-crawl-articles-v1-0.5 fiskmo-crawl-articles-v1-0.8
FISKMO-DATASETS += yle-2011-2018-articles-v1-0.8
|
||||
|
||||
## make fiskmo-fisv-data
|
||||
## make fiskmo-fisv-train.submit
|
||||
## make fiskmo-fisv-eval
|
||||
## make fiskmo-fisv-eval-testsets
|
||||
##
|
||||
## make fiskmo-fisv-reverse-data
|
||||
## make fiskmo-svfi-train.submit
|
||||
## make fiskmo-svfi-eval
|
||||
## make fiskmo-svfi-eval-testsets
|
||||
|
||||
## Start `train-dynamic` (MODELTYPE=transformer) for Finnish->Swedish on
## the two crawl-v2 data sets named in the loop. Old *.submit files in
## $(WORKHOME)/fi-sv are removed before each run. The loop is a single
## shell command, so a failing run does not stop the following one.
fiskmo-missing:
	for dataset in crawl-v2-2M crawl-v2-clean; do \
	  rm -f $(WORKHOME)/fi-sv/*.submit; \
	  $(MAKE) SRCLANGS=fi TRGLANGS=sv DATASET=$$dataset TRAINSET=finnish-swedish-$$dataset \
		MODELTYPE=transformer train-dynamic; \
	done
|
||||
|
||||
## Start `train-dynamic` for Swedish->Finnish on the crawl-v2-clean data
## set. Old *.submit files in $(WORKHOME)/sv-fi are removed first. The
## loop form is kept so further data sets can be added to the list.
fiskmo-svfi-missing:
	for dataset in crawl-v2-clean; do \
	  rm -f $(WORKHOME)/sv-fi/*.submit; \
	  $(MAKE) SRCLANGS=sv TRGLANGS=fi DATASET=$$dataset TRAINSET=finnish-swedish-$$dataset \
		train-dynamic; \
	done
|
||||
|
||||
|
||||
## Run the goal given by the pattern stem for Finnish->Swedish
##   1. once per entry of FISKMO-DATASETS
##      (TRAINSET = finnish-swedish-<entry>),
##   2. once on the combined training sets as DATASET=fiskmo-crawl-all,
##   3. once on the cleaned training sets as DATASET=fiskmo-crawl-clean.
## Old *.submit files in $(WORKHOME)/fi-sv are removed before every run.
## NOTE(review): the loop prefixes every data set with "finnish-swedish-"
## while the combined TRAINSET lists use that prefix only for the
## crawl-v2 sets -- confirm both spellings match existing data files.
fiskmo-fisv-%:
	for dataset in $(FISKMO-DATASETS); do \
	  rm -f $(WORKHOME)/fi-sv/*.submit; \
	  $(MAKE) SRCLANGS=fi TRGLANGS=sv DATASET=$$dataset TRAINSET=finnish-swedish-$$dataset \
		$*; \
	done
	rm -f $(WORKHOME)/fi-sv/*.submit
	$(MAKE) DATASET=fiskmo-crawl-all SRCLANGS=fi TRGLANGS=sv \
		TRAINSET="finnish-swedish-crawl-v2-2M yle-rss-v2-100K fiskmo-crawl-articles-v1 yle-2011-2018-articles-v1-0.8" \
		$*
	rm -f $(WORKHOME)/fi-sv/*.submit
	$(MAKE) DATASET=fiskmo-crawl-clean SRCLANGS=fi TRGLANGS=sv \
		TRAINSET="finnish-swedish-crawl-v2-clean yle-rss-v2-clean fiskmo-crawl-articles-v1-0.8" \
		$*
|
||||
|
||||
## Swedish->Finnish counterpart of the fi->sv batch goal: run the goal
## given by the pattern stem
##   1. once per entry of FISKMO-DATASETS
##      (TRAINSET = finnish-swedish-<entry>),
##   2. once on the combined training sets as DATASET=fiskmo-crawl-all,
##   3. once on the cleaned training sets as DATASET=fiskmo-crawl-clean.
## Old *.submit files in $(WORKHOME)/sv-fi are removed before every run.
fiskmo-svfi-%:
	for dataset in $(FISKMO-DATASETS); do \
	  rm -f $(WORKHOME)/sv-fi/*.submit; \
	  $(MAKE) SRCLANGS=sv TRGLANGS=fi DATASET=$$dataset TRAINSET=finnish-swedish-$$dataset \
		$*; \
	done
	rm -f $(WORKHOME)/sv-fi/*.submit
	$(MAKE) DATASET=fiskmo-crawl-all SRCLANGS=sv TRGLANGS=fi \
		TRAINSET="finnish-swedish-crawl-v2-2M yle-rss-v2-100K fiskmo-crawl-articles-v1 yle-2011-2018-articles-v1-0.8" \
		$*
	rm -f $(WORKHOME)/sv-fi/*.submit
	$(MAKE) DATASET=fiskmo-crawl-clean SRCLANGS=sv TRGLANGS=fi \
		TRAINSET="finnish-swedish-crawl-v2-clean yle-rss-v2-clean fiskmo-crawl-articles-v1-0.8" \
		$*
|
||||
|
||||
|
||||
##-------------------------------------------------
|
||||
|
||||
|
||||
## a batch of interesting models ....
|
||||
|
||||
|
||||
|
@ -225,7 +225,7 @@ ${WIKI_TXT}: ${WIKI_JSON}
|
||||
$(TOKENIZER)/replace-unicode-punctuation.perl |\
|
||||
$(TOKENIZER)/remove-non-printing-char.perl |\
|
||||
sed 's/ */ /g;s/^ *//g;s/ *$$//g' |\
|
||||
python3 ../mono-match-lang.py -l ${LANGID} |\
|
||||
python3 ../scripts/filter/mono-match-lang.py -l ${LANGID} |\
|
||||
split -l ${MAX_SENTENCES} - ${patsubst %${PART}.gz,%,$@}
|
||||
gzip -f ${patsubst %${PART}.gz,%,$@}*
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user