From 4b7ae1a39bbc7c30df254509b53eefcbb274bf32 Mon Sep 17 00:00:00 2001
From: Joerg Tiedemann
Date: Mon, 10 Feb 2020 21:49:44 +0200
Subject: [PATCH] added function to convert TMX file for fine-tuning (requires OpusTools-perl)

---
 Makefile.data     |  6 ++++--
 Makefile.dist     |  2 +-
 Makefile.slurm    |  6 +++---
 finetune/Makefile | 43 +++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 51 insertions(+), 6 deletions(-)

diff --git a/Makefile.data b/Makefile.data
index 4f745fde..cb40eca3 100644
--- a/Makefile.data
+++ b/Makefile.data
@@ -102,10 +102,12 @@ clean-data-source: ${DATA_SRC} ${DATA_TRG}
 
 ## monolingual data sets (for sentence piece models)
 mono-data: ${LOCAL_MONO_DATA}.${PRE}
+.INTERMEDIATE: ${LOCAL_MONO_DATA}.${PRE} ${LOCAL_MONO_DATA}.raw ${LOCAL_MONO_DATA}.${PRE}.charfreq
+
 
 ## word alignment used for guided alignment
 
-.INTERMEDIATE: ${LOCAL_TRAIN_SRC}.algtmp ${LOCAL_TRAIN_TRG}.algtmp
+.INTERMEDIATE: ${LOCAL_TRAIN_SRC}.algtmp ${LOCAL_TRAIN_TRG}.algtmp
 
 ${LOCAL_TRAIN_SRC}.algtmp: ${TRAIN_SRC}.clean.${PRE_SRC}${TRAINSIZE}.gz
 	mkdir -p ${dir $@}
@@ -756,7 +758,7 @@ BPETRGMODEL = ${WORKDIR}/train/opus.trg.bpe${TRGBPESIZE:000=}k-model
 
 .PRECIOUS: ${BPESRCMODEL} ${BPETRGMODEL}
 
-.INTERMEDIATE: ${LOCAL_TRAIN_SRC} ${LOCAL_TRAIN_TRG}
+.INTERMEDIATE: ${LOCAL_TRAIN_SRC} ${LOCAL_TRAIN_TRG} ${LOCAL_TRAIN_SRC}.charfreq ${LOCAL_TRAIN_TRG}.charfreq
 
 # ${BPESRCMODEL}: ${WORKDIR}/%.bpe${SRCBPESIZE:000=}k-model: ${TMPDIR}/${LANGPAIRSTR}/%
 ${BPESRCMODEL}: ${LOCAL_TRAIN_SRC}
diff --git a/Makefile.dist b/Makefile.dist
index 443d57d1..7ae1b3ef 100644
--- a/Makefile.dist
+++ b/Makefile.dist
@@ -24,7 +24,7 @@ scores:
 ## in the following sub directories (add prefix work-)
 
 # ALT_MODEL_DIR = bpe-old bpe-memad bpe spm-noalign bpe-align spm
-ALT_MODEL_DIR = spm
+ALT_MODEL_DIR = spm langid
 
 best_dist_all:
 	for l in $(sort ${shell ls work* | grep -- '-' | grep -v old | grep -v work}); do \
diff --git a/Makefile.slurm b/Makefile.slurm
index 3b0ff36b..55fd27d1 100644
--- a/Makefile.slurm
+++ b/Makefile.slurm
@@ -20,9 +20,9 @@ endif
 %.submit:
 	mkdir -p ${WORKDIR}
 	echo '#!/bin/bash -l' > $@
-	echo '#SBATCH -J "${DATASET}-${@:.submit=}"' >>$@
-	echo '#SBATCH -o ${DATASET}-${@:.submit=}.out.%j' >> $@
-	echo '#SBATCH -e ${DATASET}-${@:.submit=}.err.%j' >> $@
+	echo '#SBATCH -J "${LANGSTR}-${DATASET}-${@:.submit=}"' >>$@
+	echo '#SBATCH -o ${LANGSTR}-${DATASET}-${@:.submit=}.out.%j' >> $@
+	echo '#SBATCH -e ${LANGSTR}-${DATASET}-${@:.submit=}.err.%j' >> $@
 	echo '#SBATCH --mem=${HPC_MEM}' >> $@
 	echo '#SBATCH --exclude=r18g08' >> $@
 ifdef EMAIL
diff --git a/finetune/Makefile b/finetune/Makefile
index c1b9699e..afb6190c 100644
--- a/finetune/Makefile
+++ b/finetune/Makefile
@@ -69,6 +69,49 @@ MARIAN_EARLY_STOPPING = 5
 
 all: ${TEST_SRC}.${TRG}.compare ${TEST_SRC}.baseline.${TRG}.compare
 
+
+## convert a TMX file to create dev-test-train data
+## and start fine-tuning
+## (runs tmx2moses; the commit subject names OpusTools-perl as the requirement)
+## split of the shuffled pairs: test = lines 1-1000, dev = 1001-2000, train = rest
+
+TMX = vero-20200123.tmx.gz
+.PHONY: tmx-tune
+tmx-tune:
+	zcat ${TMX} |\
+	tmx2moses -r -o ${TMX:.tmx.gz=}
+	s=`ls ${TMX:.tmx.gz=}.*-* | sort | sed 's/^.*\.\([a-z]*\)$$/\1/' | head -n 1`; \
+	t=`ls ${TMX:.tmx.gz=}.*-* | sort | sed 's/^.*\.\([a-z]*\)$$/\1/' | tail -n 1`; \
+	echo $$s; echo $$t; \
+	mkdir -p $$s-$$t; \
+	paste ${TMX:.tmx.gz=}.*-*.$$s ${TMX:.tmx.gz=}.*-*.$$t | \
+	shuf > ${TMX:.tmx.gz=}.$$s-$$t.shuffled; \
+	mkdir -p $$s-$$t/${TMX:.tmx.gz=}/dev; \
+	mkdir -p $$s-$$t/${TMX:.tmx.gz=}/test; \
+	mkdir -p $$s-$$t/${TMX:.tmx.gz=}/train; \
+	head -n 1000 ${TMX:.tmx.gz=}.$$s-$$t.shuffled | cut -f1 | gzip -c \
+		> $$s-$$t/${TMX:.tmx.gz=}/test/${TMX:.tmx.gz=}.$$s.gz; \
+	head -n 1000 ${TMX:.tmx.gz=}.$$s-$$t.shuffled | cut -f2 | gzip -c \
+		> $$s-$$t/${TMX:.tmx.gz=}/test/${TMX:.tmx.gz=}.$$t.gz; \
+	sed -n '1001,2000p' ${TMX:.tmx.gz=}.$$s-$$t.shuffled | cut -f1 | gzip -c \
+		> $$s-$$t/${TMX:.tmx.gz=}/dev/${TMX:.tmx.gz=}.$$s.gz; \
+	sed -n '1001,2000p' ${TMX:.tmx.gz=}.$$s-$$t.shuffled | cut -f2 | gzip -c \
+		> $$s-$$t/${TMX:.tmx.gz=}/dev/${TMX:.tmx.gz=}.$$t.gz; \
+	tail -n +2001 ${TMX:.tmx.gz=}.$$s-$$t.shuffled | cut -f1 | gzip -c \
+		> $$s-$$t/${TMX:.tmx.gz=}/train/${TMX:.tmx.gz=}.$$s.gz; \
+	tail -n +2001 ${TMX:.tmx.gz=}.$$s-$$t.shuffled | cut -f2 | gzip -c \
+		> $$s-$$t/${TMX:.tmx.gz=}/train/${TMX:.tmx.gz=}.$$t.gz; \
+	mv ${TMX:.tmx.gz=}.*-* $$s-$$t/; \
+	${MAKE} SRC=$$s TRG=$$t MODEL=${TMX:.tmx.gz=} \
+		TRAIN_SRC=$$s-$$t/${TMX:.tmx.gz=}/train/${TMX:.tmx.gz=}.$$s \
+		TRAIN_TRG=$$s-$$t/${TMX:.tmx.gz=}/train/${TMX:.tmx.gz=}.$$t \
+		DEV_SRC=$$s-$$t/${TMX:.tmx.gz=}/dev/${TMX:.tmx.gz=}.$$s \
+		DEV_TRG=$$s-$$t/${TMX:.tmx.gz=}/dev/${TMX:.tmx.gz=}.$$t \
+		TEST_SRC=$$s-$$t/${TMX:.tmx.gz=}/test/${TMX:.tmx.gz=}.$$s \
+		TEST_TRG=$$s-$$t/${TMX:.tmx.gz=}/test/${TMX:.tmx.gz=}.$$t \
+	all
+
+
 .PHONY: news-enfi
 news-enfi:
 	${MAKE} SRC=en TRG=fi MODEL=news \