added function to convert TMX file for fine-tuning (requires OpusTools-perl)

This commit is contained in:
Joerg Tiedemann 2020-02-10 21:49:44 +02:00
parent 8ff98705b7
commit 4b7ae1a39b
4 changed files with 51 additions and 6 deletions

View File

@ -102,10 +102,12 @@ clean-data-source: ${DATA_SRC} ${DATA_TRG}
## monolingual data sets (for sentence piece models)
mono-data: ${LOCAL_MONO_DATA}.${PRE}
.INTERMEDIATE: ${LOCAL_MONO_DATA}.${PRE} ${LOCAL_MONO_DATA}.raw ${LOCAL_MONO_DATA}.${PRE}.charfreq
## word alignment used for guided alignment
.INTERMEDIATE: ${LOCAL_TRAIN_SRC}.algtmp ${LOCAL_TRAIN_TRG}.algtmp
.INTERMEDIATE: ${LOCAL_TRAIN_SRC}.algtmp ${LOCAL_TRAIN_TRG}.algtmp
${LOCAL_TRAIN_SRC}.algtmp: ${TRAIN_SRC}.clean.${PRE_SRC}${TRAINSIZE}.gz
mkdir -p ${dir $@}
@ -756,7 +758,7 @@ BPETRGMODEL = ${WORKDIR}/train/opus.trg.bpe${TRGBPESIZE:000=}k-model
.PRECIOUS: ${BPESRCMODEL} ${BPETRGMODEL}
.INTERMEDIATE: ${LOCAL_TRAIN_SRC} ${LOCAL_TRAIN_TRG}
.INTERMEDIATE: ${LOCAL_TRAIN_SRC} ${LOCAL_TRAIN_TRG} ${LOCAL_TRAIN_SRC}.charfreq ${LOCAL_TRAIN_TRG}.charfreq
# ${BPESRCMODEL}: ${WORKDIR}/%.bpe${SRCBPESIZE:000=}k-model: ${TMPDIR}/${LANGPAIRSTR}/%
${BPESRCMODEL}: ${LOCAL_TRAIN_SRC}

View File

@ -24,7 +24,7 @@ scores:
## in the following sub directories (add prefix work-)
# ALT_MODEL_DIR = bpe-old bpe-memad bpe spm-noalign bpe-align spm
ALT_MODEL_DIR = spm
ALT_MODEL_DIR = spm langid
best_dist_all:
for l in $(sort ${shell ls work* | grep -- '-' | grep -v old | grep -v work}); do \

View File

@ -20,9 +20,9 @@ endif
%.submit:
mkdir -p ${WORKDIR}
echo '#!/bin/bash -l' > $@
echo '#SBATCH -J "${DATASET}-${@:.submit=}"' >>$@
echo '#SBATCH -o ${DATASET}-${@:.submit=}.out.%j' >> $@
echo '#SBATCH -e ${DATASET}-${@:.submit=}.err.%j' >> $@
echo '#SBATCH -J "${LANGSTR}-${DATASET}-${@:.submit=}"' >>$@
echo '#SBATCH -o ${LANGSTR}-${DATASET}-${@:.submit=}.out.%j' >> $@
echo '#SBATCH -e ${LANGSTR}-${DATASET}-${@:.submit=}.err.%j' >> $@
echo '#SBATCH --mem=${HPC_MEM}' >> $@
echo '#SBATCH --exclude=r18g08' >> $@
ifdef EMAIL

View File

@ -69,6 +69,49 @@ MARIAN_EARLY_STOPPING = 5
all: ${TEST_SRC}.${TRG}.compare ${TEST_SRC}.baseline.${TRG}.compare
## convert a TMX file to create dev-test-train data
## and start fine-tuning
##
## Usage: make TMX=<file>.tmx.gz tmx-tune
##
## Steps (BASE below is ${TMX} without its .tmx.gz suffix):
##  1. unpack ${TMX} and feed it to tmx2moses (from OpusTools-perl);
##     the recipe expects it to leave files matching BASE.*-*.<lang>
##     in the current directory (NOTE(review): confirm against the
##     tmx2moses documentation for the -r / -o flags)
##  2. take the alphabetically first / last file suffix as source (s)
##     and target (t) language
##  3. paste both sides together and shuffle the sentence pairs
##  4. split the shuffled pairs into
##        test  = lines    1 - 1000
##        dev   = lines 1001 - 2000
##        train = lines 2001 - end
##     (the split assumes more than 2000 sentence pairs; with fewer
##     lines the dev set overlaps the test set)
##  5. move all intermediate BASE.*-* files into the s-t/ directory
##  6. call this makefile recursively (target "all") with the new data;
##     the data paths are handed over without the .gz extension
##
## The second recipe runs under "set -e" so that a failing step aborts
## the recipe instead of starting a fine-tuning run on incomplete data.
TMX = vero-20200123.tmx.gz

.PHONY: tmx-tune
tmx-tune:
	zcat ${TMX} |\
	tmx2moses -r -o ${TMX:.tmx.gz=}
	set -e; \
	s=`ls ${TMX:.tmx.gz=}.*-* | sort | sed 's/^.*\.\([a-z]*\)$$/\1/' | head -1`; \
	t=`ls ${TMX:.tmx.gz=}.*-* | sort | sed 's/^.*\.\([a-z]*\)$$/\1/' | tail -1`; \
	echo $$s; echo $$t; \
	if [ -z "$$s" ] || [ -z "$$t" ] || [ "$$s" = "$$t" ]; then \
	  echo "tmx-tune: cannot detect a language pair from ${TMX:.tmx.gz=}.*-*" >&2; \
	  exit 1; \
	fi; \
	mkdir -p $$s-$$t; \
	paste ${TMX:.tmx.gz=}.*-*.$$s ${TMX:.tmx.gz=}.*-*.$$t | \
	shuf > ${TMX:.tmx.gz=}.$$s-$$t.shuffled; \
	mkdir -p $$s-$$t/${TMX:.tmx.gz=}/dev; \
	mkdir -p $$s-$$t/${TMX:.tmx.gz=}/test; \
	mkdir -p $$s-$$t/${TMX:.tmx.gz=}/train; \
	head -n 1000 ${TMX:.tmx.gz=}.$$s-$$t.shuffled | cut -f1 | gzip -c \
		> $$s-$$t/${TMX:.tmx.gz=}/test/${TMX:.tmx.gz=}.$$s.gz; \
	head -n 1000 ${TMX:.tmx.gz=}.$$s-$$t.shuffled | cut -f2 | gzip -c \
		> $$s-$$t/${TMX:.tmx.gz=}/test/${TMX:.tmx.gz=}.$$t.gz; \
	head -n 2000 ${TMX:.tmx.gz=}.$$s-$$t.shuffled | tail -n 1000 | cut -f1 | gzip -c \
		> $$s-$$t/${TMX:.tmx.gz=}/dev/${TMX:.tmx.gz=}.$$s.gz; \
	head -n 2000 ${TMX:.tmx.gz=}.$$s-$$t.shuffled | tail -n 1000 | cut -f2 | gzip -c \
		> $$s-$$t/${TMX:.tmx.gz=}/dev/${TMX:.tmx.gz=}.$$t.gz; \
	tail -n +2001 ${TMX:.tmx.gz=}.$$s-$$t.shuffled | cut -f1 | gzip -c \
		> $$s-$$t/${TMX:.tmx.gz=}/train/${TMX:.tmx.gz=}.$$s.gz; \
	tail -n +2001 ${TMX:.tmx.gz=}.$$s-$$t.shuffled | cut -f2 | gzip -c \
		> $$s-$$t/${TMX:.tmx.gz=}/train/${TMX:.tmx.gz=}.$$t.gz; \
	mv ${TMX:.tmx.gz=}.*-* $$s-$$t/; \
	${MAKE} SRC=$$s TRG=$$t MODEL=${TMX:.tmx.gz=} \
		TRAIN_SRC=$$s-$$t/${TMX:.tmx.gz=}/train/${TMX:.tmx.gz=}.$$s \
		TRAIN_TRG=$$s-$$t/${TMX:.tmx.gz=}/train/${TMX:.tmx.gz=}.$$t \
		DEV_SRC=$$s-$$t/${TMX:.tmx.gz=}/dev/${TMX:.tmx.gz=}.$$s \
		DEV_TRG=$$s-$$t/${TMX:.tmx.gz=}/dev/${TMX:.tmx.gz=}.$$t \
		TEST_SRC=$$s-$$t/${TMX:.tmx.gz=}/test/${TMX:.tmx.gz=}.$$s \
		TEST_TRG=$$s-$$t/${TMX:.tmx.gz=}/test/${TMX:.tmx.gz=}.$$t \
	all

.PHONY: news-enfi
news-enfi:
${MAKE} SRC=en TRG=fi MODEL=news \