mirror of
https://github.com/Helsinki-NLP/OPUS-MT-train.git
synced 2024-10-26 21:19:02 +03:00
added function to convert TMX file for fine-tuning (requires OpusTools-perl)
This commit is contained in:
parent
8ff98705b7
commit
4b7ae1a39b
@ -102,6 +102,8 @@ clean-data-source: ${DATA_SRC} ${DATA_TRG}
|
||||
## monolingual data sets (for sentence piece models)
|
||||
mono-data: ${LOCAL_MONO_DATA}.${PRE}
|
||||
|
||||
.INTERMEDIATE: ${LOCAL_MONO_DATA}.${PRE} ${LOCAL_MONO_DATA}.raw ${LOCAL_MONO_DATA}.${PRE}.charfreq
|
||||
|
||||
|
||||
## word alignment used for guided alignment
|
||||
|
||||
@ -756,7 +758,7 @@ BPETRGMODEL = ${WORKDIR}/train/opus.trg.bpe${TRGBPESIZE:000=}k-model
|
||||
|
||||
|
||||
.PRECIOUS: ${BPESRCMODEL} ${BPETRGMODEL}
|
||||
.INTERMEDIATE: ${LOCAL_TRAIN_SRC} ${LOCAL_TRAIN_TRG}
|
||||
.INTERMEDIATE: ${LOCAL_TRAIN_SRC} ${LOCAL_TRAIN_TRG} ${LOCAL_TRAIN_SRC}.charfreq ${LOCAL_TRAIN_TRG}.charfreq
|
||||
|
||||
# ${BPESRCMODEL}: ${WORKDIR}/%.bpe${SRCBPESIZE:000=}k-model: ${TMPDIR}/${LANGPAIRSTR}/%
|
||||
${BPESRCMODEL}: ${LOCAL_TRAIN_SRC}
|
||||
|
@ -24,7 +24,7 @@ scores:
|
||||
## in the following sub directories (add prefix work-)
|
||||
|
||||
# ALT_MODEL_DIR = bpe-old bpe-memad bpe spm-noalign bpe-align spm
|
||||
ALT_MODEL_DIR = spm
|
||||
ALT_MODEL_DIR = spm langid
|
||||
|
||||
best_dist_all:
|
||||
for l in $(sort ${shell ls work* | grep -- '-' | grep -v old | grep -v work}); do \
|
||||
|
@ -20,9 +20,9 @@ endif
|
||||
%.submit:
|
||||
mkdir -p ${WORKDIR}
|
||||
echo '#!/bin/bash -l' > $@
|
||||
echo '#SBATCH -J "${DATASET}-${@:.submit=}"' >>$@
|
||||
echo '#SBATCH -o ${DATASET}-${@:.submit=}.out.%j' >> $@
|
||||
echo '#SBATCH -e ${DATASET}-${@:.submit=}.err.%j' >> $@
|
||||
echo '#SBATCH -J "${LANGSTR}-${DATASET}-${@:.submit=}"' >>$@
|
||||
echo '#SBATCH -o ${LANGSTR}-${DATASET}-${@:.submit=}.out.%j' >> $@
|
||||
echo '#SBATCH -e ${LANGSTR}-${DATASET}-${@:.submit=}.err.%j' >> $@
|
||||
echo '#SBATCH --mem=${HPC_MEM}' >> $@
|
||||
echo '#SBATCH --exclude=r18g08' >> $@
|
||||
ifdef EMAIL
|
||||
|
@ -69,6 +69,49 @@ MARIAN_EARLY_STOPPING = 5
|
||||
all: ${TEST_SRC}.${TRG}.compare ${TEST_SRC}.baseline.${TRG}.compare
|
||||
|
||||
|
||||
|
||||
## convert a TMX file to create dev-test-train data
|
||||
## and start fine-tuning
|
||||
|
||||
TMX = vero-20200123.tmx.gz
|
||||
## tmx-tune: split the translation memory ${TMX} into test/dev/train sets
## and start fine-tuning on them by re-invoking make with the goal 'all'.
##
## Steps:
##   1. unpack ${TMX} and pipe it through tmx2moses (per the commit note this
##      needs OpusTools-perl); presumably this writes one plain-text file per
##      language named <base>.<xx>-<yy>.<lang> -- TODO confirm against tmx2moses
##   2. take the alphabetically first/last file extension as source/target
##      language (s/t)
##   3. paste both sides together, shuffle, and split the shuffled rows:
##        rows    1-1000  -> test
##        rows 1001-2000  -> dev
##        rows 2001-      -> train
##   4. move the intermediate files into the <s>-<t>/ directory
##   5. run ${MAKE} with SRC/TRG/MODEL and the data paths set on the command line
##
## Notes:
##   - 'set -e' aborts the whole recipe on the first failing step, so
##     fine-tuning is never started on incomplete data.
##   - dev is selected with an explicit row range (sed -n) so that it can
##     never overlap with test when the corpus has fewer than 2000 rows.
##   - the data paths passed to ${MAKE} omit the .gz suffix of the files written
##     here; presumably the receiving rules append it -- verify in the Makefile.

.PHONY: tmx-tune
tmx-tune:
	zcat ${TMX} |\
	tmx2moses -r -o ${TMX:.tmx.gz=}
	set -e; \
	s=`ls ${TMX:.tmx.gz=}.*-* | sort | sed 's/^.*\.\([a-z]*\)$$/\1/' | head -1`; \
	t=`ls ${TMX:.tmx.gz=}.*-* | sort | sed 's/^.*\.\([a-z]*\)$$/\1/' | tail -1`; \
	echo $$s; echo $$t; \
	if [ -z "$$s" ] || [ -z "$$t" ] || [ "$$s" = "$$t" ]; then \
	  echo "tmx-tune: cannot detect a language pair from ${TMX:.tmx.gz=}.*-*" >&2; \
	  exit 1; \
	fi; \
	mkdir -p $$s-$$t; \
	paste ${TMX:.tmx.gz=}.*-*.$$s ${TMX:.tmx.gz=}.*-*.$$t | \
	shuf > ${TMX:.tmx.gz=}.$$s-$$t.shuffled; \
	mkdir -p $$s-$$t/${TMX:.tmx.gz=}/dev; \
	mkdir -p $$s-$$t/${TMX:.tmx.gz=}/test; \
	mkdir -p $$s-$$t/${TMX:.tmx.gz=}/train; \
	head -1000 ${TMX:.tmx.gz=}.$$s-$$t.shuffled | cut -f1 | gzip -c \
		> $$s-$$t/${TMX:.tmx.gz=}/test/${TMX:.tmx.gz=}.$$s.gz; \
	head -1000 ${TMX:.tmx.gz=}.$$s-$$t.shuffled | cut -f2 | gzip -c \
		> $$s-$$t/${TMX:.tmx.gz=}/test/${TMX:.tmx.gz=}.$$t.gz; \
	sed -n '1001,2000p' ${TMX:.tmx.gz=}.$$s-$$t.shuffled | cut -f1 | gzip -c \
		> $$s-$$t/${TMX:.tmx.gz=}/dev/${TMX:.tmx.gz=}.$$s.gz; \
	sed -n '1001,2000p' ${TMX:.tmx.gz=}.$$s-$$t.shuffled | cut -f2 | gzip -c \
		> $$s-$$t/${TMX:.tmx.gz=}/dev/${TMX:.tmx.gz=}.$$t.gz; \
	tail -n +2001 ${TMX:.tmx.gz=}.$$s-$$t.shuffled | cut -f1 | gzip -c \
		> $$s-$$t/${TMX:.tmx.gz=}/train/${TMX:.tmx.gz=}.$$s.gz; \
	tail -n +2001 ${TMX:.tmx.gz=}.$$s-$$t.shuffled | cut -f2 | gzip -c \
		> $$s-$$t/${TMX:.tmx.gz=}/train/${TMX:.tmx.gz=}.$$t.gz; \
	mv ${TMX:.tmx.gz=}.*-* $$s-$$t/; \
	${MAKE} SRC=$$s TRG=$$t MODEL=${TMX:.tmx.gz=} \
		TRAIN_SRC=$$s-$$t/${TMX:.tmx.gz=}/train/${TMX:.tmx.gz=}.$$s \
		TRAIN_TRG=$$s-$$t/${TMX:.tmx.gz=}/train/${TMX:.tmx.gz=}.$$t \
		DEV_SRC=$$s-$$t/${TMX:.tmx.gz=}/dev/${TMX:.tmx.gz=}.$$s \
		DEV_TRG=$$s-$$t/${TMX:.tmx.gz=}/dev/${TMX:.tmx.gz=}.$$t \
		TEST_SRC=$$s-$$t/${TMX:.tmx.gz=}/test/${TMX:.tmx.gz=}.$$s \
		TEST_TRG=$$s-$$t/${TMX:.tmx.gz=}/test/${TMX:.tmx.gz=}.$$t \
		all
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
.PHONY: news-enfi
|
||||
news-enfi:
|
||||
${MAKE} SRC=en TRG=fi MODEL=news \
|
||||
|
Loading…
Reference in New Issue
Block a user