diff --git a/Makefile.config b/Makefile.config
index 44443b4b..6dfe7140 100644
--- a/Makefile.config
+++ b/Makefile.config
@@ -73,9 +73,10 @@ ALL_MULTILINGUAL_MODELS = ${shell echo '${ALL_LANG_PAIRS}' | tr ' ' "\n" | grep
 
 ## size of dev data, test data and BPE merge operations
 
+## NEW default size = 2500 (keep more data for training on small languages)
-DEVSIZE = 5000
-TESTSIZE = 5000
+DEVSIZE = 2500
+TESTSIZE = 2500
 
 ## NEW: significantly reduce devminsize
 ## (= absolute minimum we need as devdata)
 
@@ -132,6 +133,14 @@ ifndef DEVSET
   DEVSET = bible-uedin
 endif
 
+
+## increase dev/test sets for Tatoeba (very short sentences!)
+ifeq (${DEVSET},Tatoeba)
+  DEVSIZE = 5000
+  TESTSIZE = 5000
+endif
+
+
 ## in case we want to use some additional data sets
 EXTRA_TRAINSET = 
 
@@ -211,6 +220,7 @@ LOCAL_TRAIN_SRC = ${TMPDIR}/${LANGPAIRSTR}/train/${DATASET}.src
 LOCAL_TRAIN_TRG = ${TMPDIR}/${LANGPAIRSTR}/train/${DATASET}.trg
 LOCAL_MONO_DATA = ${TMPDIR}/${LANGSTR}/train/${DATASET}.mono
 
+
 TUNE_SRC = ${WORKDIR}/tune/${TUNESET}.src
 TUNE_TRG = ${WORKDIR}/tune/${TUNESET}.trg
 
diff --git a/Makefile.data b/Makefile.data
index 51e11d7a..ff1f3ff1 100644
--- a/Makefile.data
+++ b/Makefile.data
@@ -13,6 +13,18 @@ ifndef THREADS
 endif
 
+## look for cleanup scripts and put them into a pipe
+## they should be executable and should basically read STDIN and print to STDOUT
+## no further arguments are supported
+
+ifneq (${wildcard scripts/cleanup/${SRC}},)
+  SRC_CLEANUP_SCRIPTS = | ${subst ${SPACE}, | ,${shell find scripts/cleanup/${SRC} -executable -type f}} |
+endif
+
+ifneq (${wildcard scripts/cleanup/${TRG}},)
+  TRG_CLEANUP_SCRIPTS = | ${subst ${SPACE}, | ,${shell find scripts/cleanup/${TRG} -executable -type f}} |
+endif
+
 CLEAN_TRAIN_SRC = ${patsubst %,${DATADIR}/${PRE}/%.${LANGPAIR}.clean.${SRCEXT}.gz,${TRAINSET}}
 CLEAN_TRAIN_TRG = ${patsubst %.${SRCEXT}.gz,%.${TRGEXT}.gz,${CLEAN_TRAIN_SRC}}
 
@@ -87,7 +99,39 @@ endif
 endif
 endif
 
+ifndef OLDMODELTYPE
+  OLDMODELTYPE=transformer-align
+endif
+ifndef NEWMODELTYPE
+  NEWMODELTYPE=transformer
+endif
+
+
+## move model files to a new name
+## (useful if the model is used as a starting point for another model type,
+## for example, to continue training without guided alignment)
+
+OLDMODEL_BASE = ${WORKDIR}/${MODEL}.${OLDMODELTYPE}.model${NR}
+NEWMODEL_BASE = ${WORKDIR}/${MODEL}.${NEWMODELTYPE}.model${NR}
+
+move-model:
+ifeq (${wildcard ${NEWMODEL_BASE}.npz},)
+	cp ${OLDMODEL_BASE}.npz ${NEWMODEL_BASE}.npz
+	cp ${OLDMODEL_BASE}.npz.best-perplexity.npz ${NEWMODEL_BASE}.npz.best-perplexity.npz
+	cp ${OLDMODEL_BASE}.npz.optimizer.npz ${NEWMODEL_BASE}.npz.optimizer.npz
+	cp ${OLDMODEL_BASE}.npz.orig.npz ${NEWMODEL_BASE}.npz.orig.npz
+	cp ${OLDMODEL_BASE}.npz.progress.yml ${NEWMODEL_BASE}.npz.progress.yml
+	cp ${OLDMODEL_BASE}.npz.yml ${NEWMODEL_BASE}.npz.yml
+	sed 's/${OLDMODELTYPE}/${NEWMODELTYPE}/' \
+		< ${OLDMODEL_BASE}.npz.decoder.yml \
+		> ${NEWMODEL_BASE}.npz.decoder.yml
+	sed 's/${OLDMODELTYPE}/${NEWMODELTYPE}/' \
+		< ${OLDMODEL_BASE}.npz.best-perplexity.npz.decoder.yml \
+		> ${NEWMODEL_BASE}.npz.best-perplexity.npz.decoder.yml
+else
+	@echo "new model ${NEWMODEL_BASE}.npz exists already!"
+endif
 
 
 clean-data:
 
@@ -260,7 +304,7 @@ ${TRAIN_ALG}: ${TRAIN_SRC}.clean.${PRE_SRC}${TRAINSIZE}.gz \
 	cat ${word 2,$^} |\
 	perl -CS -pe 'tr[\x{9}\x{A}\x{D}\x{20}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}][]cd;' > $@.2
 	paste $@.1 $@.2 |\
-	python3 bitext-match-lang.py -s ${SRC} -t ${TRG} > $@.bitext
+	scripts/filter/bitext-match-lang.py -s ${SRC} -t ${TRG} > $@.bitext
 	cut -f1 $@.bitext | gzip -c > $@
 	cut -f2 $@.bitext | gzip -c > $(@:.clean.${SRCEXT}.gz=.clean.${TRGEXT}.gz)
 	rm -f $@.bitext $@.1 $@.2
 
@@ -622,7 +666,7 @@ add-to-local-mono-data:
 	for c in ${MONOSET}; do \
 	  if [ -e ${OPUSHOME}/$$c/latest/mono/${LANGID}.txt.gz ]; then \
 	    zcat ${OPUSHOME}/$$c/latest/mono/${LANGID}.txt.gz |\
-	    python3 mono-match-lang.py -l ${LANGID} >> ${LOCAL_MONO_DATA}.raw; \
+	    scripts/filter/mono-match-lang.py -l ${LANGID} >> ${LOCAL_MONO_DATA}.raw; \
 	  fi \
 	done
 
@@ -666,6 +710,9 @@ add-to-local-mono-data:
 
 
 
+### TODO: make language-specific pre-processing ....
+### use SRC_CLEANUP_SCRIPTS TRG_CLEANUP_SCRIPTS
+
 ## only normalisation
 %.norm: %.raw
 	$(LOAD_MOSES) cat $< |\
@@ -674,13 +721,6 @@ add-to-local-mono-data:
 	$(TOKENIZER)/normalize-punctuation.perl |\
 	sed 's/ */ /g;s/^ *//g;s/ *$$//g' > $@
 
-%.norm.gz: %.gz
-	$(LOAD_MOSES) zcat $< |\
-	$(TOKENIZER)/replace-unicode-punctuation.perl |\
-	$(TOKENIZER)/remove-non-printing-char.perl |\
-	$(TOKENIZER)/normalize-punctuation.perl |\
-	sed 's/ */ /g;s/^ *//g;s/ *$$//g' | gzip -c > $@
-
 
 ## minimal pre-processing
 %.simple: %.raw
@@ -690,12 +730,6 @@ add-to-local-mono-data:
 	$(TOKENIZER)/deescape-special-chars.perl |\
 	sed 's/ */ /g;s/^ *//g;s/ *$$//g' > $@
 
-%.simple.gz: %.gz
-	$(LOAD_MOSES) zcat $< |\
-	$(TOKENIZER)/replace-unicode-punctuation.perl |\
-	$(TOKENIZER)/remove-non-printing-char.perl |\
-	$(TOKENIZER)/deescape-special-chars.perl |\
-	sed 's/ */ /g;s/^ *//g;s/ *$$//g' | gzip -c > $@
 
 ## remove all spaces (treat everything as a long string)
 %.nospace: %.raw
@@ -706,14 +740,49 @@ add-to-local-mono-data:
 	sed 's/ */ /g;s/^ *//g;s/ *$$//g' |\
 	sed 's/ /▁/g' > $@
 
-%.nospace.gz: %.gz
-	$(LOAD_MOSES) zcat $< |\
-	$(TOKENIZER)/replace-unicode-punctuation.perl |\
-	$(TOKENIZER)/remove-non-printing-char.perl |\
-	$(TOKENIZER)/deescape-special-chars.perl |\
-	sed 's/ */ /g;s/^ *//g;s/ *$$//g' |\
-	sed 's/ /▁/g' |\
-	gzip -c > $@
+
+## generic targets to make it possible to work with compressed data
+## when running the same pre-processing pipeline
+## TODO: does that destroy anything?
+## TODO: do we need this?
+
+# %.raw: %.gz
+#	gzip -cd < $< > $@
+
+# %.${PRE}.gz: %.${PRE}
+#	gzip -c < $< > $@
+
+
+
+
+
+
+## the above should avoid having to repeat the pipelines below
+
+# %.norm.gz: %.gz
+#	$(LOAD_MOSES) zcat $< |\
+#	$(TOKENIZER)/replace-unicode-punctuation.perl |\
+#	$(TOKENIZER)/remove-non-printing-char.perl |\
+#	$(TOKENIZER)/normalize-punctuation.perl |\
+#	sed 's/ */ /g;s/^ *//g;s/ *$$//g' | gzip -c > $@
+
+# %.simple.gz: %.gz
+#	$(LOAD_MOSES) zcat $< |\
+#	$(TOKENIZER)/replace-unicode-punctuation.perl |\
+#	$(TOKENIZER)/remove-non-printing-char.perl |\
+#	$(TOKENIZER)/deescape-special-chars.perl |\
+#	sed 's/ */ /g;s/^ *//g;s/ *$$//g' | gzip -c > $@
+
+# %.nospace.gz: %.gz
+#	$(LOAD_MOSES) zcat $< |\
+#	$(TOKENIZER)/replace-unicode-punctuation.perl |\
+#	$(TOKENIZER)/remove-non-printing-char.perl |\
+#	$(TOKENIZER)/deescape-special-chars.perl |\
+#	sed 's/ */ /g;s/^ *//g;s/ *$$//g' |\
+#	sed 's/ /▁/g' |\
+#	gzip -c > $@
+
+
@@ -733,7 +802,7 @@ MAX_NR_TOKENS = 250
 	mv $@.${TRGEXT} $(@:.src.clean.${PRE_SRC}=.trg.clean.${PRE_TRG})
 
 # paste $@.${SRCEXT} $@.${TRGEXT} |\
-# python3 bitext-match-lang.py -s ${SRC} -t ${TRG} > $@.bitext
+# scripts/filter/bitext-match-lang.py -s ${SRC} -t ${TRG} > $@.bitext
 # cut -f1 $@.bitext > $@
 # cut -f2 $@.bitext > $(@:.src.clean.${PRE_SRC}=.trg.clean.${PRE_TRG})
 # rm -f $@.${SRCEXT} $@.${TRGEXT} $@.bitext
diff --git a/Makefile.env b/Makefile.env
index aafcbadc..b8310682 100644
--- a/Makefile.env
+++ b/Makefile.env
@@ -81,8 +81,9 @@ else ifneq ($(wildcard /wrk/tiedeman/research),)
   MARIANCPU = ${HOME}/appl_taito/tools/marian/build-cpu
   LOADMODS = ${LOADGPU}
 else
-  CSCPROJECT = project_2001194
+  # CSCPROJECT = project_2001194
   # CSCPROJECT = project_2000309
+  CSCPROJECT = project_2002688
   DATAHOME = ${HOME}/work/opentrans/data/${LANGPAIR}
   WORKHOME = ${shell realpath ${PWD}/work-langid}
   APPLHOME = ${HOME}/projappl
diff --git a/Makefile.simplify b/Makefile.simplify
new file mode 100644
index 00000000..30366624
--- /dev/null
+++ b/Makefile.simplify
@@ -0,0 +1,46 @@
+# -*-makefile-*-
+
+
+
+SIMPLEWIKI_DATA1_URL = https://cs.pomona.edu/~dkauchak/simplification/data.v1/
+SIMPLEWIKI_DATA2_URL = https://cs.pomona.edu/~dkauchak/simplification/data.v2/
+
+SIMPLEWIKI_DATA1 = data.v1.split
+SIMPLEWIKI_DATA2_SENT = sentence-aligned.v2
+SIMPLEWIKI_DATA2_DOC = document-aligned.v2
+
+
+${WORKHOME}/simplewiki/${SIMPLEWIKI_DATA1}:
+	mkdir -p ${dir $@}
+	wget -O $@.tar.gz ${SIMPLEWIKI_DATA1_URL}/${SIMPLEWIKI_DATA1}.tar.gz
+	tar -C ${dir $@} -xzf $@.tar.gz
+	rm -f $@.tar.gz
+	${TOKENIZER}/detokenizer.perl -l en < $@/normal.training.txt > ${DATADIR}/${PRE}/simplewiki_v1-training.en-en.en1.raw
+	${TOKENIZER}/detokenizer.perl -l en < $@/simple.training.txt > ${DATADIR}/${PRE}/simplewiki_v1-training.en-en.en2.raw
+	${TOKENIZER}/detokenizer.perl -l en < $@/normal.tuning.txt > ${DATADIR}/${PRE}/simplewiki_v1-tuning.en-en.en1.raw
+	${TOKENIZER}/detokenizer.perl -l en < $@/simple.tuning.txt > ${DATADIR}/${PRE}/simplewiki_v1-tuning.en-en.en2.raw
+	${TOKENIZER}/detokenizer.perl -l en < $@/normal.testing.txt > ${DATADIR}/${PRE}/simplewiki_v1-testing.en-en.en1.raw
+	${TOKENIZER}/detokenizer.perl -l en < $@/simple.testing.txt > ${DATADIR}/${PRE}/simplewiki_v1-testing.en-en.en2.raw
+
+
+${DATADIR}/${PRE}/simplewiki_v1-%.en-en.en1.raw: ${WORKHOME}/simplewiki/${SIMPLEWIKI_DATA1}/normal.%.txt
+	mkdir -p ${dir $@}
+	${TOKENIZER}/detokenizer.perl -l en < $< > $@
+
+${DATADIR}/${PRE}/simplewiki_v1-%.en-en.en2.raw: ${WORKHOME}/simplewiki/${SIMPLEWIKI_DATA1}/simple.%.txt
+	mkdir -p ${dir $@}
+	${TOKENIZER}/detokenizer.perl -l en < $< > $@
+
+simplify-english-prepare: ${WORKHOME}/simplewiki/${SIMPLEWIKI_DATA1}
+
+## train a simplification model from simplewiki for English
+
+%-simplify-english: ${WORKHOME}/simplewiki/${SIMPLEWIKI_DATA1}
+	${MAKE} DATASET=simplewiki_v1 \
+		TRAINSET=simplewiki_v1-training \
+		DEVSET=simplewiki_v1-tuning \
+		TESTSET=simplewiki_v1-testing \
+		HELDOUTSIZE=0 \
+		SRCLANGS=en TRGLANGS=en \
+	${@:-simplify-english=}
+
diff --git a/Makefile.tasks b/Makefile.tasks
index 5472dfda..1725670d 100644
--- a/Makefile.tasks
+++ b/Makefile.tasks
@@ -3,6 +3,7 @@
 # pre-defined tasks that we might want to run
 #
 
+include Makefile.simplify
 
 MEMAD_LANGS = de en fi fr nl sv
 
@@ -235,12 +236,6 @@ all2en:
 
-
-
-
-
-
-
 ## run things with individual data sets only
 
 %-fiskmo:
 	${MAKE} TRAINSET=fiskmo ${@:-fiskmo=}
 
@@ -289,7 +284,7 @@ enru-yandex:
 
 enru-yandex-bt:
 	${MAKE} DATASET=opus+yandex MODELTYPE=transformer SRCLANGS=en TRGLANGS=ru EXTRA_TRAINSET=yandex data-bt
 	${MAKE} DATASET=opus+yandex MODELTYPE=transformer SRCLANGS=en TRGLANGS=ru EXTRA_TRAINSET=yandex \
-	WALLTIME=72 HPC_CORES=1 HPC_MEM=8g MARIAN_WORKSPACE=12000 MARIAN_EARLY_STOPPING=15 train-bt.submit-multigpu
+	WALLTIME=72 HPC_CORES=1 HPC_MEM=12g MARIAN_WORKSPACE=12000 MARIAN_EARLY_STOPPING=15 train-bt.submit-multigpu
 
 enit:
diff --git a/scripts/cleanup/ml/normalize.sed b/scripts/cleanup/ml/normalize.sed
new file mode 100644
index 00000000..f4e09dc2
--- /dev/null
+++ b/scripts/cleanup/ml/normalize.sed
@@ -0,0 +1,33 @@
+# Misc clean-up on the corpus
+# sed -i -f normalize.sed corpus/*.txt
+# Chillu normalization
+s/ന്‍/ൻ/g
+s/ള്‍/ൾ/g
+s/ല്‍/ൽ/g
+s/ര്‍/ർ/g
+s/ന്‍/ൻ/g
+s/ണ്‍/ൺ/g
+# Remove ZWNJ at end of words
+s/\xE2\x80\x8C$//g
+# Remove all ZWJ
+s/\xE2\x80\x8D//g
+# Remove all soft hyphens
+s/\xC2\xAD//g
+# Replace old au sign with new one
+s/‍ൌ/ൗ/g
+
+# Common mistakes
+s/പക്ഷെ/പക്ഷേ/g
+# ZWNJs
+s/ു‌/ു/g
+s/ി‌/ി/g
+s/ോ‌/ോ/g
+s/ാ‌/ാ/g
+s/ഒാ/ഓ/g
+# ൻറെ -> ന്റെ at the end of words
+s/ൻറെ/ന്റെ/g
+s/ൻറ്$/ന്റ്/g
+s/ൻറും$/ന്റും/g
+s/ൻറിൽ$/ന്റിൽ/g
+# ുൻപോൾ -> ുമ്പോൾ
+s/ുൻപോൾ/ുമ്പോൾ/g
\ No newline at end of file
diff --git a/scripts/cleanup/ml/normalize.sh b/scripts/cleanup/ml/normalize.sh
new file mode 100755
index 00000000..3d7ce804
--- /dev/null
+++ b/scripts/cleanup/ml/normalize.sh
@@ -0,0 +1,4 @@
+#!/bin/bash
+
+sed -f `dirname "$0"`/normalize.sed
+
diff --git a/bitext-match-lang.py b/scripts/filter/bitext-match-lang.py
similarity index 100%
rename from bitext-match-lang.py
rename to scripts/filter/bitext-match-lang.py
diff --git a/mono-match-lang.py b/scripts/filter/mono-match-lang.py
similarity index 100%
rename from mono-match-lang.py
rename to scripts/filter/mono-match-lang.py
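
Notes on the changes above (sketches and usage examples, not part of the patch):

The new cleanup-script hook in Makefile.data turns every executable file found under scripts/cleanup/<lang> into a stage of the pre-processing pipeline; each script is expected to read STDIN and write STDOUT. Assuming a hypothetical language code xx with two executable filters (the script names are illustrative)

    scripts/cleanup/xx/normalize.sh
    scripts/cleanup/xx/strip-markup.sh

SRC_CLEANUP_SCRIPTS (for SRC=xx) expands to a pipeline fragment with a leading and a trailing pipe symbol:

    | scripts/cleanup/xx/normalize.sh | scripts/cleanup/xx/strip-markup.sh |

so it can be dropped between two commands of a recipe, e.g. zcat $< ${SRC_CLEANUP_SCRIPTS} gzip -c > $@. The variable is empty when no cleanup directory exists for the language, so a recipe of that form would break in the default case; presumably that is why the actual wiring is still marked as TODO in the pre-processing section.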
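
The move-model target copies (despite its name) all checkpoint files of ${OLDMODEL_BASE} to ${NEWMODEL_BASE} and rewrites the model type inside the two decoder.yml files; it refuses to run if the target model already exists. A hypothetical invocation for the use case named in the comment, continuing a transformer-align model without guided alignment (the language pair is illustrative; OLDMODELTYPE and NEWMODELTYPE could be omitted here since these are their defaults):

    make SRCLANGS=en TRGLANGS=ml \
         OLDMODELTYPE=transformer-align NEWMODELTYPE=transformer \
         move-model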
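
On the two TODOs next to the commented-out generic targets %.raw: %.gz and %.${PRE}.gz: %.${PRE}: if enabled, GNU make can chain them with the existing uncompressed rules, which is what would make the deleted %.norm.gz, %.simple.gz and %.nospace.gz rules redundant. A minimal sketch of the chain, with a stand-in body instead of the real normalisation pipeline:

    %.raw: %.gz
    	gzip -cd < $< > $@

    %.norm: %.raw              # stand-in for the real normalisation rule
    	tr -s ' ' < $< > $@

    %.norm.gz: %.norm
    	gzip -c < $< > $@

Asking for x.norm.gz then builds x.gz -> x.raw -> x.norm -> x.norm.gz automatically. Two caveats speak to the "does that destroy anything?" question: make treats the chained x.raw and x.norm as intermediate files and deletes them afterwards unless they are marked .PRECIOUS or .SECONDARY, and such a broad decompression rule also lets make "build" .raw files that are meant to come from dedicated download or extraction rules.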
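
Makefile.simplify (now included from Makefile.tasks) wires the Simple English Wikipedia data in as a regular parallel corpus for an en-en model; the %-simplify-english pattern strips its own suffix (${@:-simplify-english=}) and re-invokes make with the simplewiki train/dev/test sets. Assuming the generic data and train targets of the main Makefile, usage should look like:

    make data-simplify-english     # runs 'make data' with the simplewiki sets
    make train-simplify-english    # runs 'make train' with the simplewiki sets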
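
The Malayalam normalization script is a plain STDIN-to-STDOUT filter, which is exactly the contract the new cleanup hook expects, and since it is executable it should be picked up automatically whenever SRC or TRG is ml. It can also be applied by hand (file names illustrative):

    zcat corpus.ml.gz | scripts/cleanup/ml/normalize.sh | gzip -c > corpus.norm.ml.gz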