mirror of
https://github.com/Helsinki-NLP/OPUS-MT-train.git
synced 2024-11-30 22:14:14 +03:00
84 lines
2.5 KiB
Makefile
84 lines
2.5 KiB
Makefile
# -*-makefile-*-
|
|
|
|
|
|
##----------------------------------------------
## BPE
##----------------------------------------------

## source/target specific bpe
## - make sure to leave the language flags alone!
## - make sure that we do not delete the BPE code files
##   if the BPE models already exist
##   ---> do not create new ones and always keep the old ones
##   ---> need to delete the old ones if we want to create new BPE models

## OLD naming scheme (kept for reference only):
# BPESRCMODEL = ${TRAIN_SRC}.bpe${SRCBPESIZE:000=}k-model
# BPETRGMODEL = ${TRAIN_TRG}.bpe${TRGBPESIZE:000=}k-model

## NEW: always use the same name for the BPE models
## --> avoid overwriting validation/test data with new segmentation models
##     if a new data set is used
## (${SRCBPESIZE:000=} removes a trailing "000" from the vocabulary size,
##  e.g. 32000 --> 32, so the file name ends in "bpe32k-model")
BPESRCMODEL = ${WORKDIR}/train/${BPEMODELNAME}.src.bpe${SRCBPESIZE:000=}k-model
BPETRGMODEL = ${WORKDIR}/train/${BPEMODELNAME}.trg.bpe${TRGBPESIZE:000=}k-model

## never let make remove the model files
## (neither as intermediate files nor when a recipe is interrupted)
.PRECIOUS: ${BPESRCMODEL} ${BPETRGMODEL}

## convenience target: create both BPE models
## NOTE: make expands targets and prerequisites at the moment a rule is read,
##       so this rule has to come AFTER the two assignments above; otherwise
##       the prerequisite list is empty unless the variables were already
##       defined by an including makefile.
## NOTE(review): WORKDIR, BPEMODELNAME, SRCBPESIZE and TRGBPESIZE are assumed
##       to be set before this file is read -- confirm against the includer.
.PHONY: bpe-models
bpe-models: ${BPESRCMODEL} ${BPETRGMODEL}
|
|
|
|
## Learn the source-side BPE model from ${LOCAL_TRAIN_SRC}.
##
## The training data is not listed as a prerequisite (see the commented-out
## rule headers below); it is created on demand by a recursive make call.
## Presumably this is what keeps an existing model from being re-created
## when the training data changes.
##
## If there is more than one target language, the first space-separated
## field of every source line is cut off before learning BPE
## (presumably the target-language label -- verify against the data format).
##
## The model is written to $@.tmp and renamed only after learn_bpe.py has
## succeeded. The target has no prerequisites and is .PRECIOUS, so a truncated
## file left by a failed or interrupted run would otherwise be treated as a
## valid, up-to-date model forever.
# ${BPESRCMODEL}: ${WORKDIR}/%.bpe${SRCBPESIZE:000=}k-model: ${TMPDIR}/${LANGPAIRSTR}/%
# ${BPESRCMODEL}: ${LOCAL_TRAIN_SRC}
${BPESRCMODEL}:
	${MAKE} ${LOCAL_TRAIN_SRC}
	mkdir -p ${dir $@}
ifeq ($(TRGLANGS),${firstword ${TRGLANGS}})
	python3 ${SNMTPATH}/learn_bpe.py -s $(SRCBPESIZE) < ${LOCAL_TRAIN_SRC} > $@.tmp
else
	cut -f2- -d ' ' ${LOCAL_TRAIN_SRC} > ${LOCAL_TRAIN_SRC}.text
	python3 ${SNMTPATH}/learn_bpe.py -s $(SRCBPESIZE) < ${LOCAL_TRAIN_SRC}.text > $@.tmp
	rm -f ${LOCAL_TRAIN_SRC}.text
endif
	mv -f $@.tmp $@
|
|
|
|
|
|
## Learn the target-side BPE model from ${LOCAL_TRAIN_TRG}.
## no labels on the target language side
##
## The training data is not a prerequisite (see the commented-out rule
## headers below); it is created on demand by a recursive make call.
##
## The model is written to $@.tmp and renamed only after learn_bpe.py has
## succeeded, so that a failed or interrupted run cannot leave a truncated
## file that make would treat as a finished model.
# ${BPETRGMODEL}: ${WORKDIR}/%.bpe${TRGBPESIZE:000=}k-model: ${TMPDIR}/${LANGPAIRSTR}/%
# ${BPETRGMODEL}: ${LOCAL_TRAIN_TRG}
${BPETRGMODEL}:
	${MAKE} ${LOCAL_TRAIN_TRG}
	mkdir -p ${dir $@}
	python3 ${SNMTPATH}/learn_bpe.py -s $(TRGBPESIZE) < ${LOCAL_TRAIN_TRG} > $@.tmp
	mv -f $@.tmp $@
|
|
|
|
|
|
## Apply the source-side BPE model to a *.src file.
##   $<            the plain source file
##   $(word 2,$^)  the BPE model (second prerequisite)
##
## With a single target language the file is segmented as it is.
## Otherwise the first space-separated field of each line is split off,
## only the remaining text is segmented, and the field is pasted back in
## front of the segmented text.
##
## All scratch files are named after the target ($@.*.tmp). The previous
## scratch name $<.txt (= X.src.txt) is also the target name of the
## "%.src.txt: %.src" rule, so an existing file of that name would have been
## overwritten and deleted. The result is written to $@.tmp and renamed at
## the end, so a failed step never leaves a truncated $@ behind.
%.src.bpe${SRCBPESIZE:000=}k: %.src ${BPESRCMODEL}
ifeq ($(TRGLANGS),${firstword ${TRGLANGS}})
	python3 ${SNMTPATH}/apply_bpe.py -c $(word 2,$^) < $< > $@.tmp
	mv -f $@.tmp $@
else
	cut -f1 -d ' ' $< > $@.labels.tmp
	cut -f2- -d ' ' $< > $@.text.tmp
	python3 ${SNMTPATH}/apply_bpe.py -c $(word 2,$^) < $@.text.tmp > $@.bpe.tmp
	paste -d ' ' $@.labels.tmp $@.bpe.tmp > $@.tmp
	mv -f $@.tmp $@
	rm -f $@.labels.tmp $@.text.tmp $@.bpe.tmp
endif
|
|
|
|
## Apply the target-side BPE model to a *.trg file.
##   $<            the plain target file
##   $(word 2,$^)  the BPE model (second prerequisite)
## The output goes to $@.tmp first and is renamed on success, so a failing
## apply_bpe.py cannot leave a truncated file that looks up to date.
%.trg.bpe${TRGBPESIZE:000=}k: %.trg ${BPETRGMODEL}
	python3 ${SNMTPATH}/apply_bpe.py -c $(word 2,$^) < $< > $@.tmp
	mv -f $@.tmp $@
|
|
|
|
|
|
## this places @@ markers in front of punctuations
## if they appear to the right of the segment boundary
## (useful if we use BPE without tokenization)
##
## i.e.  "X@@ ."  becomes  "X @@."  whenever X is not a punctuation character
## and the following segment starts with one.
##
## -CSD makes perl decode/encode the streams as UTF-8. Without it the
## \p{P} / \P{P} classes are tested on single bytes, so bytes of multi-byte
## characters are misclassified and non-ASCII punctuation is not recognised.
## ($$1 / $$2 are make-escaped; perl sees $1 / $2.)
## The result is written to $@.tmp and renamed on success.
%.segfix: %
	perl -CSD -pe 's/(\P{P})\@\@ (\p{P})/$$1 \@\@$$2/g' < $< > $@.tmp
	mv -f $@.tmp $@
|
|
|
|
|
|
|
|
## Turn X.trg into X.trg.txt by renaming it.
## NOTE: the prerequisite is moved, not copied -- X.trg is gone afterwards.
%.trg.txt: %.trg
	mkdir -p $(@D)
	mv $< $@
|
|
|
|
## Turn X.src into X.src.txt by renaming it.
## NOTE: the prerequisite is moved, not copied -- X.src is gone afterwards.
%.src.txt: %.src
	mkdir -p $(@D)
	mv $< $@
|
|
|