mirror of
https://github.com/Helsinki-NLP/OPUS-MT-train.git
synced 2024-09-11 20:27:19 +03:00
added function to convert TMX file for fine-tuning (requires OpusTools-perl)
This commit is contained in:
parent
8ff98705b7
commit
4b7ae1a39b
@ -102,10 +102,12 @@ clean-data-source: ${DATA_SRC} ${DATA_TRG}
|
|||||||
## monolingual data sets (for sentence piece models)
|
## monolingual data sets (for sentence piece models)
|
||||||
mono-data: ${LOCAL_MONO_DATA}.${PRE}
|
mono-data: ${LOCAL_MONO_DATA}.${PRE}
|
||||||
|
|
||||||
|
.INTERMEDIATE: ${LOCAL_MONO_DATA}.${PRE} ${LOCAL_MONO_DATA}.raw ${LOCAL_MONO_DATA}.${PRE}.charfreq
|
||||||
|
|
||||||
|
|
||||||
## word alignment used for guided alignment
|
## word alignment used for guided alignment
|
||||||
|
|
||||||
.INTERMEDIATE: ${LOCAL_TRAIN_SRC}.algtmp ${LOCAL_TRAIN_TRG}.algtmp
|
.INTERMEDIATE: ${LOCAL_TRAIN_SRC}.algtmp ${LOCAL_TRAIN_TRG}.algtmp
|
||||||
|
|
||||||
${LOCAL_TRAIN_SRC}.algtmp: ${TRAIN_SRC}.clean.${PRE_SRC}${TRAINSIZE}.gz
|
${LOCAL_TRAIN_SRC}.algtmp: ${TRAIN_SRC}.clean.${PRE_SRC}${TRAINSIZE}.gz
|
||||||
mkdir -p ${dir $@}
|
mkdir -p ${dir $@}
|
||||||
@ -756,7 +758,7 @@ BPETRGMODEL = ${WORKDIR}/train/opus.trg.bpe${TRGBPESIZE:000=}k-model
|
|||||||
|
|
||||||
|
|
||||||
.PRECIOUS: ${BPESRCMODEL} ${BPETRGMODEL}
|
.PRECIOUS: ${BPESRCMODEL} ${BPETRGMODEL}
|
||||||
.INTERMEDIATE: ${LOCAL_TRAIN_SRC} ${LOCAL_TRAIN_TRG}
|
.INTERMEDIATE: ${LOCAL_TRAIN_SRC} ${LOCAL_TRAIN_TRG} ${LOCAL_TRAIN_SRC}.charfreq ${LOCAL_TRAIN_TRG}.charfreq
|
||||||
|
|
||||||
# ${BPESRCMODEL}: ${WORKDIR}/%.bpe${SRCBPESIZE:000=}k-model: ${TMPDIR}/${LANGPAIRSTR}/%
|
# ${BPESRCMODEL}: ${WORKDIR}/%.bpe${SRCBPESIZE:000=}k-model: ${TMPDIR}/${LANGPAIRSTR}/%
|
||||||
${BPESRCMODEL}: ${LOCAL_TRAIN_SRC}
|
${BPESRCMODEL}: ${LOCAL_TRAIN_SRC}
|
||||||
|
@ -24,7 +24,7 @@ scores:
|
|||||||
## in the following sub directories (add prefix work-)
|
## in the following sub directories (add prefix work-)
|
||||||
|
|
||||||
# ALT_MODEL_DIR = bpe-old bpe-memad bpe spm-noalign bpe-align spm
|
# ALT_MODEL_DIR = bpe-old bpe-memad bpe spm-noalign bpe-align spm
|
||||||
ALT_MODEL_DIR = spm
|
ALT_MODEL_DIR = spm langid
|
||||||
|
|
||||||
best_dist_all:
|
best_dist_all:
|
||||||
for l in $(sort ${shell ls work* | grep -- '-' | grep -v old | grep -v work}); do \
|
for l in $(sort ${shell ls work* | grep -- '-' | grep -v old | grep -v work}); do \
|
||||||
|
@ -20,9 +20,9 @@ endif
|
|||||||
%.submit:
|
%.submit:
|
||||||
mkdir -p ${WORKDIR}
|
mkdir -p ${WORKDIR}
|
||||||
echo '#!/bin/bash -l' > $@
|
echo '#!/bin/bash -l' > $@
|
||||||
echo '#SBATCH -J "${DATASET}-${@:.submit=}"' >>$@
|
echo '#SBATCH -J "${LANGSTR}-${DATASET}-${@:.submit=}"' >>$@
|
||||||
echo '#SBATCH -o ${DATASET}-${@:.submit=}.out.%j' >> $@
|
echo '#SBATCH -o ${LANGSTR}-${DATASET}-${@:.submit=}.out.%j' >> $@
|
||||||
echo '#SBATCH -e ${DATASET}-${@:.submit=}.err.%j' >> $@
|
echo '#SBATCH -e ${LANGSTR}-${DATASET}-${@:.submit=}.err.%j' >> $@
|
||||||
echo '#SBATCH --mem=${HPC_MEM}' >> $@
|
echo '#SBATCH --mem=${HPC_MEM}' >> $@
|
||||||
echo '#SBATCH --exclude=r18g08' >> $@
|
echo '#SBATCH --exclude=r18g08' >> $@
|
||||||
ifdef EMAIL
|
ifdef EMAIL
|
||||||
|
@ -69,6 +69,49 @@ MARIAN_EARLY_STOPPING = 5
|
|||||||
all: ${TEST_SRC}.${TRG}.compare ${TEST_SRC}.baseline.${TRG}.compare
|
all: ${TEST_SRC}.${TRG}.compare ${TEST_SRC}.baseline.${TRG}.compare
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
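## convert a TMX file with tmx2moses (requires OpusTools-perl) and shuffle the sentence pairs to create dev-test-train data
|
||||||
|
## (test = first 1000 pairs, dev = pairs 1002-2001, train = pairs 2002 onwards) and start fine-tuning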
|
||||||
|
|
||||||
|
TMX = vero-20200123.tmx.gz
|
||||||
|
tmx-tune:
|
||||||
|
zcat ${TMX} |\
|
||||||
|
tmx2moses -r -o ${TMX:.tmx.gz=}
|
||||||
|
s=`ls ${TMX:.tmx.gz=}.*-* | sort | sed 's/^.*\.\([a-z]*\)$$/\1/' | head -1`; \
|
||||||
|
t=`ls ${TMX:.tmx.gz=}.*-* | sort | sed 's/^.*\.\([a-z]*\)$$/\1/' | tail -1`; \
|
||||||
|
echo $$s; echo $$t; \
|
||||||
|
mkdir -p $$s-$$t; \
|
||||||
|
paste ${TMX:.tmx.gz=}.*-*.$$s ${TMX:.tmx.gz=}.*-*.$$t | \
|
||||||
|
shuf > ${TMX:.tmx.gz=}.$$s-$$t.shuffled; \
|
||||||
|
mkdir -p $$s-$$t/${TMX:.tmx.gz=}/dev; \
|
||||||
|
mkdir -p $$s-$$t/${TMX:.tmx.gz=}/test; \
|
||||||
|
mkdir -p $$s-$$t/${TMX:.tmx.gz=}/train; \
|
||||||
|
head -1000 ${TMX:.tmx.gz=}.$$s-$$t.shuffled | cut -f1 | gzip -c \
|
||||||
|
> $$s-$$t/${TMX:.tmx.gz=}/test/${TMX:.tmx.gz=}.$$s.gz; \
|
||||||
|
head -1000 ${TMX:.tmx.gz=}.$$s-$$t.shuffled | cut -f2 | gzip -c \
|
||||||
|
> $$s-$$t/${TMX:.tmx.gz=}/test/${TMX:.tmx.gz=}.$$t.gz; \
|
||||||
|
head -2001 ${TMX:.tmx.gz=}.$$s-$$t.shuffled | tail -1000 | cut -f1 | gzip -c \
|
||||||
|
> $$s-$$t/${TMX:.tmx.gz=}/dev/${TMX:.tmx.gz=}.$$s.gz; \
|
||||||
|
head -2001 ${TMX:.tmx.gz=}.$$s-$$t.shuffled | tail -1000 | cut -f2 | gzip -c \
|
||||||
|
> $$s-$$t/${TMX:.tmx.gz=}/dev/${TMX:.tmx.gz=}.$$t.gz; \
|
||||||
|
tail -n +2002 ${TMX:.tmx.gz=}.$$s-$$t.shuffled | cut -f1 | gzip -c \
|
||||||
|
> $$s-$$t/${TMX:.tmx.gz=}/train/${TMX:.tmx.gz=}.$$s.gz; \
|
||||||
|
tail -n +2002 ${TMX:.tmx.gz=}.$$s-$$t.shuffled | cut -f2 | gzip -c \
|
||||||
|
> $$s-$$t/${TMX:.tmx.gz=}/train/${TMX:.tmx.gz=}.$$t.gz; \
|
||||||
|
mv ${TMX:.tmx.gz=}.*-* $$s-$$t/; \
|
||||||
|
${MAKE} SRC=$$s TRG=$$t MODEL=${TMX:.tmx.gz=} \
|
||||||
|
TRAIN_SRC=$$s-$$t/${TMX:.tmx.gz=}/train/${TMX:.tmx.gz=}.$$s \
|
||||||
|
TRAIN_TRG=$$s-$$t/${TMX:.tmx.gz=}/train/${TMX:.tmx.gz=}.$$t \
|
||||||
|
DEV_SRC=$$s-$$t/${TMX:.tmx.gz=}/dev/${TMX:.tmx.gz=}.$$s \
|
||||||
|
DEV_TRG=$$s-$$t/${TMX:.tmx.gz=}/dev/${TMX:.tmx.gz=}.$$t \
|
||||||
|
TEST_SRC=$$s-$$t/${TMX:.tmx.gz=}/test/${TMX:.tmx.gz=}.$$s \
|
||||||
|
TEST_TRG=$$s-$$t/${TMX:.tmx.gz=}/test/${TMX:.tmx.gz=}.$$t \
|
||||||
|
all
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
.PHONY: news-enfi
|
.PHONY: news-enfi
|
||||||
news-enfi:
|
news-enfi:
|
||||||
${MAKE} SRC=en TRG=fi MODEL=news \
|
${MAKE} SRC=en TRG=fi MODEL=news \
|
||||||
|
Loading…
Reference in New Issue
Block a user