train text simplification model

Joerg Tiedemann 2020-02-29 17:59:27 +02:00
parent 2805bf49e7
commit 0ff0e625d5
9 changed files with 192 additions and 34 deletions


@ -73,9 +73,10 @@ ALL_MULTILINGUAL_MODELS = ${shell echo '${ALL_LANG_PAIRS}' | tr ' ' "\n" | grep
## size of dev data, test data and BPE merge operations
## NEW default size = 2500 (keep more for training for small languages)
-DEVSIZE = 5000
-TESTSIZE = 5000
+DEVSIZE = 2500
+TESTSIZE = 2500
## NEW: significantly reduce devminsize
## (= absolute minimum we need as devdata)
@ -132,6 +133,14 @@ ifndef DEVSET
DEVSET = bible-uedin
endif
## increase dev/test sets for Tatoeba (very short sentences!)
ifeq (${DEVSET},Tatoeba)
DEVSIZE = 5000
TESTSIZE = 5000
endif
## in case we want to use some additional data sets
EXTRA_TRAINSET =
@ -211,6 +220,7 @@ LOCAL_TRAIN_SRC = ${TMPDIR}/${LANGPAIRSTR}/train/${DATASET}.src
LOCAL_TRAIN_TRG = ${TMPDIR}/${LANGPAIRSTR}/train/${DATASET}.trg
LOCAL_MONO_DATA = ${TMPDIR}/${LANGSTR}/train/${DATASET}.mono
TUNE_SRC = ${WORKDIR}/tune/${TUNESET}.src
TUNE_TRG = ${WORKDIR}/tune/${TUNESET}.trg


@ -13,6 +13,18 @@ ifndef THREADS
endif
## look for cleanup scripts and put them into a pipe
## they should be executable and should basically read STDIN and print to STDOUT
## no further arguments are supported
ifneq (${wildcard scripts/cleanup/${SRC}},)
SRC_CLEANUP_SCRIPTS = | ${subst ${SPACE}, | ,${shell find scripts/cleanup/${SRC} -executable -type f}} |
endif
ifneq (${wildcard scripts/cleanup/${TRG}},)
TRG_CLEANUP_SCRIPTS = | ${subst ${SPACE}, | ,${shell find scripts/cleanup/${TRG} -executable -type f}} |
endif
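A cleanup script that follows this convention could look like the sketch below (the file name and the rule it applies are hypothetical); each executable found under scripts/cleanup/${SRC} becomes one stage of the pipe built above:

#!/bin/bash
# hypothetical example: scripts/cleanup/en/strip-control-chars.sh
# no arguments; reads STDIN, deletes ASCII control characters, writes STDOUT
tr -d '\000-\010\013\014\016-\037'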
CLEAN_TRAIN_SRC = ${patsubst %,${DATADIR}/${PRE}/%.${LANGPAIR}.clean.${SRCEXT}.gz,${TRAINSET}}
CLEAN_TRAIN_TRG = ${patsubst %.${SRCEXT}.gz,%.${TRGEXT}.gz,${CLEAN_TRAIN_SRC}}
@ -87,7 +99,39 @@ endif
endif
endif
ifndef OLDMODELTYPE
OLDMODELTYPE=transformer-align
endif
ifndef NEWMODELTYPE
NEWMODELTYPE=transformer
endif
## move model files to a new name
## (useful when using the model as a starting point for another model type,
## for example, to continue training without guided alignment)
OLDMODEL_BASE = ${WORKDIR}/${MODEL}.${OLDMODELTYPE}.model${NR}
NEWMODEL_BASE = ${WORKDIR}/${MODEL}.${NEWMODELTYPE}.model${NR}
move-model:
ifeq (${wildcard ${NEWMODEL_BASE}.npz},)
cp ${OLDMODEL_BASE}.npz ${NEWMODEL_BASE}.npz
cp ${OLDMODEL_BASE}.npz.best-perplexity.npz ${NEWMODEL_BASE}.npz.best-perplexity.npz
cp ${OLDMODEL_BASE}.npz.optimizer.npz ${NEWMODEL_BASE}.npz.optimizer.npz
cp ${OLDMODEL_BASE}.npz.orig.npz ${NEWMODEL_BASE}.npz.orig.npz
cp ${OLDMODEL_BASE}.npz.progress.yml ${NEWMODEL_BASE}.npz.progress.yml
cp ${OLDMODEL_BASE}.npz.yml ${NEWMODEL_BASE}.npz.yml
sed 's/${OLDMODELTYPE}/${NEWMODELTYPE}/' \
< ${OLDMODEL_BASE}.npz.decoder.yml \
> ${NEWMODEL_BASE}.npz.decoder.yml
sed 's/${OLDMODELTYPE}/${NEWMODELTYPE}/' \
< ${OLDMODEL_BASE}.npz.best-perplexity.npz.decoder.yml \
> ${NEWMODEL_BASE}.npz.best-perplexity.npz.decoder.yml
else
@echo "new model ${NEWMODEL_BASE}.npz already exists!"
endif
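With the defaults above, turning a guided-alignment checkpoint into the starting point for a plain transformer run is a single call; the variables are shown explicitly only for clarity, since they match the defaults:

make OLDMODELTYPE=transformer-align NEWMODELTYPE=transformer move-model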
clean-data:
@ -260,7 +304,7 @@ ${TRAIN_ALG}: ${TRAIN_SRC}.clean.${PRE_SRC}${TRAINSIZE}.gz \
cat ${word 2,$^} |\
perl -CS -pe 'tr[\x{9}\x{A}\x{D}\x{20}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}][]cd;' > $@.2
paste $@.1 $@.2 |\
-python3 bitext-match-lang.py -s ${SRC} -t ${TRG} > $@.bitext
+scripts/filter/bitext-match-lang.py -s ${SRC} -t ${TRG} > $@.bitext
cut -f1 $@.bitext | gzip -c > $@
cut -f2 $@.bitext | gzip -c > $(@:.clean.${SRCEXT}.gz=.clean.${TRGEXT}.gz)
rm -f $@.bitext $@.1 $@.2
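The relocated filter can also be run on its own; a minimal sketch, assuming uncompressed text files with hypothetical names:

paste corpus.en corpus.sv |\
scripts/filter/bitext-match-lang.py -s en -t sv > corpus.bitext
cut -f1 corpus.bitext > corpus.matched.en
cut -f2 corpus.bitext > corpus.matched.sv
rm -f corpus.bitext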
@ -622,7 +666,7 @@ add-to-local-mono-data:
for c in ${MONOSET}; do \
if [ -e ${OPUSHOME}/$$c/latest/mono/${LANGID}.txt.gz ]; then \
zcat ${OPUSHOME}/$$c/latest/mono/${LANGID}.txt.gz |\
-python3 mono-match-lang.py -l ${LANGID} >> ${LOCAL_MONO_DATA}.raw; \
+scripts/filter/mono-match-lang.py -l ${LANGID} >> ${LOCAL_MONO_DATA}.raw; \
fi \
done
@ -666,6 +710,9 @@ add-to-local-mono-data:
### TODO: make language-specific pre-processing ....
### use SRC_CLEANUP_SCRIPTS TRG_CLEANUP_SCRIPTS
## only normalisation
%.norm: %.raw
$(LOAD_MOSES) cat $< |\
@ -674,13 +721,6 @@ add-to-local-mono-data:
$(TOKENIZER)/normalize-punctuation.perl |\
sed 's/ */ /g;s/^ *//g;s/ *$$//g' > $@
-%.norm.gz: %.gz
-$(LOAD_MOSES) zcat $< |\
-$(TOKENIZER)/replace-unicode-punctuation.perl |\
-$(TOKENIZER)/remove-non-printing-char.perl |\
-$(TOKENIZER)/normalize-punctuation.perl |\
-sed 's/ */ /g;s/^ *//g;s/ *$$//g' | gzip -c > $@
## minimal pre-processing
%.simple: %.raw
@ -690,12 +730,6 @@ add-to-local-mono-data:
$(TOKENIZER)/deescape-special-chars.perl |\
sed 's/ */ /g;s/^ *//g;s/ *$$//g' > $@
-%.simple.gz: %.gz
-$(LOAD_MOSES) zcat $< |\
-$(TOKENIZER)/replace-unicode-punctuation.perl |\
-$(TOKENIZER)/remove-non-printing-char.perl |\
-$(TOKENIZER)/deescape-special-chars.perl |\
-sed 's/ */ /g;s/^ *//g;s/ *$$//g' | gzip -c > $@
## remove all spaces (treat everything as a long string)
%.nospace: %.raw
@ -706,14 +740,49 @@ add-to-local-mono-data:
sed 's/ */ /g;s/^ *//g;s/ *$$//g' |\
sed 's/ /▁/g' > $@
-%.nospace.gz: %.gz
-$(LOAD_MOSES) zcat $< |\
-$(TOKENIZER)/replace-unicode-punctuation.perl |\
-$(TOKENIZER)/remove-non-printing-char.perl |\
-$(TOKENIZER)/deescape-special-chars.perl |\
-sed 's/ */ /g;s/^ *//g;s/ *$$//g' |\
-sed 's/ /▁/g' |\
-gzip -c > $@
## generic targets to make it possible to work with compressed data
## when running the same pre-processing pipeline
## TODO: does that destroy anything?
## TODO: do we need this?
# %.raw: %.gz
# gzip -cd < $< > $@
# %.${PRE}.gz: %.${PRE}
# gzip -c < $< > $@
## the above should avoid having to repeat the pipeline below
# %.norm.gz: %.gz
# $(LOAD_MOSES) zcat $< |\
# $(TOKENIZER)/replace-unicode-punctuation.perl |\
# $(TOKENIZER)/remove-non-printing-char.perl |\
# $(TOKENIZER)/normalize-punctuation.perl |\
# sed 's/ */ /g;s/^ *//g;s/ *$$//g' | gzip -c > $@
# %.simple.gz: %.gz
# $(LOAD_MOSES) zcat $< |\
# $(TOKENIZER)/replace-unicode-punctuation.perl |\
# $(TOKENIZER)/remove-non-printing-char.perl |\
# $(TOKENIZER)/deescape-special-chars.perl |\
# sed 's/ */ /g;s/^ *//g;s/ *$$//g' | gzip -c > $@
# %.nospace.gz: %.gz
# $(LOAD_MOSES) zcat $< |\
# $(TOKENIZER)/replace-unicode-punctuation.perl |\
# $(TOKENIZER)/remove-non-printing-char.perl |\
# $(TOKENIZER)/deescape-special-chars.perl |\
# sed 's/ */ /g;s/^ *//g;s/ *$$//g' |\
# sed 's/ /▁/g' |\
# gzip -c > $@
@ -733,7 +802,7 @@ MAX_NR_TOKENS = 250
mv $@.${TRGEXT} $(@:.src.clean.${PRE_SRC}=.trg.clean.${PRE_TRG})
# paste $@.${SRCEXT} $@.${TRGEXT} |\
-# python3 bitext-match-lang.py -s ${SRC} -t ${TRG} > $@.bitext
+# scripts/filter/bitext-match-lang.py -s ${SRC} -t ${TRG} > $@.bitext
# cut -f1 $@.bitext > $@
# cut -f2 $@.bitext > $(@:.src.clean.${PRE_SRC}=.trg.clean.${PRE_TRG})
# rm -f $@.${SRCEXT} $@.${TRGEXT} $@.bitext


@ -81,8 +81,9 @@ else ifneq ($(wildcard /wrk/tiedeman/research),)
MARIANCPU = ${HOME}/appl_taito/tools/marian/build-cpu
LOADMODS = ${LOADGPU}
else
-CSCPROJECT = project_2001194
# CSCPROJECT = project_2001194
# CSCPROJECT = project_2000309
CSCPROJECT = project_2002688
DATAHOME = ${HOME}/work/opentrans/data/${LANGPAIR}
WORKHOME = ${shell realpath ${PWD}/work-langid}
APPLHOME = ${HOME}/projappl

Makefile.simplify Normal file

@ -0,0 +1,46 @@
# -*-makefile-*-
SIMPLEWIKI_DATA1_URL = https://cs.pomona.edu/~dkauchak/simplification/data.v1/
SIMPLEWIKI_DATA2_URL = https://cs.pomona.edu/~dkauchak/simplification/data.v2/
SIMPLEWIKI_DATA1 = data.v1.split
SIMPLEWIKI_DATA2_SENT = sentence-aligned.v2
SIMPLEWIKI_DATA2_DOC = document-aligned.v2
${WORKHOME}/simplewiki/${SIMPLEWIKI_DATA1}:
mkdir -p ${dir $@}
wget -O $@.tar.gz ${SIMPLEWIKI_DATA1_URL}/${SIMPLEWIKI_DATA1}.tar.gz
tar -C ${dir $@} -xzf $@.tar.gz
rm -f $@.tar.gz
${TOKENIZER}/detokenizer.perl -l en < $@/normal.training.txt > ${DATADIR}/${PRE}/simplewiki_v1-training.en-en.en1.raw
${TOKENIZER}/detokenizer.perl -l en < $@/simple.training.txt > ${DATADIR}/${PRE}/simplewiki_v1-training.en-en.en2.raw
${TOKENIZER}/detokenizer.perl -l en < $@/normal.tuning.txt > ${DATADIR}/${PRE}/simplewiki_v1-tuning.en-en.en1.raw
${TOKENIZER}/detokenizer.perl -l en < $@/simple.tuning.txt > ${DATADIR}/${PRE}/simplewiki_v1-tuning.en-en.en2.raw
${TOKENIZER}/detokenizer.perl -l en < $@/normal.testing.txt > ${DATADIR}/${PRE}/simplewiki_v1-testing.en-en.en1.raw
${TOKENIZER}/detokenizer.perl -l en < $@/simple.testing.txt > ${DATADIR}/${PRE}/simplewiki_v1-testing.en-en.en2.raw
${DATADIR}/${PRE}/simplewiki_v1-%.en-en.en1.raw: ${WORKHOME}/simplewiki/${SIMPLEWIKI_DATA1}/normal.%.txt
mkdir -p ${dir $@}
${TOKENIZER}/detokenizer.perl -l en < $< > $@
${DATADIR}/${PRE}/simplewiki_v1-%.en-en.en2.raw: ${WORKHOME}/simplewiki/${SIMPLEWIKI_DATA1}/simple.%.txt
mkdir -p ${dir $@}
${TOKENIZER}/detokenizer.perl -l en < $< > $@
simplify-english-prepare: ${WORKHOME}/simplewiki/${SIMPLEWIKI_DATA1}
## train a simplification model from simplewiki for English
%-simplify-english: ${WORKHOME}/simplewiki/${SIMPLEWIKI_DATA1}
${MAKE} DATASET=simplewiki_v1 \
TRAINSET=simplewiki_v1-training \
DEVSET=simplewiki_v1-tuning \
TESTSET=simplewiki_v1-testing \
HELDOUTSIZE=0 \
SRCLANGS=en TRGLANGS=en \
${@:-simplify-english=}
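The stem selects the top-level target to run with these settings; assuming the generic train and eval targets of the main Makefile, a full round would look like:

make train-simplify-english
make eval-simplify-english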


@ -3,6 +3,7 @@
# pre-defined tasks that we might want to run
#
include Makefile.simplify
MEMAD_LANGS = de en fi fr nl sv
@ -235,12 +236,6 @@ all2en:
## run things with individual data sets only
%-fiskmo:
${MAKE} TRAINSET=fiskmo ${@:-fiskmo=}
@ -289,7 +284,7 @@ enru-yandex:
enru-yandex-bt:
${MAKE} DATASET=opus+yandex MODELTYPE=transformer SRCLANGS=en TRGLANGS=ru EXTRA_TRAINSET=yandex data-bt
${MAKE} DATASET=opus+yandex MODELTYPE=transformer SRCLANGS=en TRGLANGS=ru EXTRA_TRAINSET=yandex \
-WALLTIME=72 HPC_CORES=1 HPC_MEM=8g MARIAN_WORKSPACE=12000 MARIAN_EARLY_STOPPING=15 train-bt.submit-multigpu
+WALLTIME=72 HPC_CORES=1 HPC_MEM=12g MARIAN_WORKSPACE=12000 MARIAN_EARLY_STOPPING=15 train-bt.submit-multigpu
enit:


@ -0,0 +1,33 @@
# Misc cleanup of the corpus
# sed -i -f corpora-cleanup.sed corpus/*.txt
# Chillu normalization
s/ന്‍/ൻ/g
s/ള്‍/ൾ/g
s/ല്‍/ൽ/g
s/ര്‍/ർ/g
s/ന്‍/ൻ/g
s/ണ്‍/ൺ/g
# Remove ZWNJ at end of words
s/\xE2\x80\x8C$//g
# Remove all other ZWJ
s/\xE2\x80\x8D//g
# Remove all soft hyphens
s/\xC2\xAD//g
# Replace old au sign with new one
s/‍ൌ/ൗ/g
# Common mistakes
s/പക്ഷെ/പക്ഷേ/g
# ZWNJs
s/ു‌/ു/g
s/ി‌/ി/g
s/ോ‌/ോ/g
s/ാ‌/ാ/g
s/ഒാ/ഓ/g
# ൻറെ -> ന്റെ at the end of words
s/ൻറെ/ന്റെ/g
s/ൻറ്$/ന്റ്/g
s/ൻറും$/ന്റും/g
s/ൻറിൽ$/ന്റിൽ/g
# ുൻപോൾ -> ുമ്പോൾ
s/ുൻപോൾ/ുമ്പോൾ/g


@ -0,0 +1,4 @@
#!/bin/bash
sed -f "$(dirname "$0")/normalize.sed"
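Like the other cleanup scripts, the wrapper reads STDIN and writes STDOUT; assuming it is installed as scripts/cleanup/ml/normalize.sh (a hypothetical path), the ൻറെ rule above rewrites, for example, അവൻറെ to അവന്റെ:

echo 'അവൻറെ' | scripts/cleanup/ml/normalize.sh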