2020-06-03 15:39:18 +03:00
|
|
|
# -*-makefile-*-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
##----------------------------------------------
|
|
|
|
## sentence piece
|
|
|
|
##----------------------------------------------
|
|
|
|
|
|
|
|
## convenience target: builds the source-side and the target-side SentencePiece model
## (declared phony so that a stray file called "spm-models" cannot mask it)
.PHONY: spm-models
spm-models: ${SPMSRCMODEL} ${SPMTRGMODEL}
|
|
|
|
|
|
|
|
# SPMSRCMODEL = ${TRAIN_SRC}.spm${SRCBPESIZE:000=}k-model
|
|
|
|
# SPMTRGMODEL = ${TRAIN_TRG}.spm${TRGBPESIZE:000=}k-model
|
|
|
|
|
|
|
|
## NEW: always use the same name for the SPM models
|
|
|
|
## --> avoid overwriting validation/test data with new segmentation models
|
|
|
|
## if a new data set is used
|
|
|
|
## source-side model file: ${WORKDIR}/train/<BPEMODELNAME>.src.spm<N>k-model
## (<N>k is SRCBPESIZE with a trailing "000" removed by the :000= substitution)
SPMSRCMODEL = ${WORKDIR}/train/${BPEMODELNAME}.src.spm${SRCBPESIZE:000=}k-model
|
|
|
|
## target-side model file, same layout but named after TRGBPESIZE
SPMTRGMODEL = ${WORKDIR}/train/${BPEMODELNAME}.trg.spm${TRGBPESIZE:000=}k-model
|
|
|
|
# SPMEXTRA = --split_by_whitespace=false
|
|
|
|
## extra options inserted into the spm_train calls of this file (empty by default)
SPMEXTRA =
|
|
|
|
|
|
|
|
## never let make delete the trained src/trg models automatically
.PRECIOUS: ${SPMSRCMODEL} ${SPMTRGMODEL}
|
|
|
|
|
|
|
|
## set to 1 to additionally write <model>.voc (spm_encode --generate_vocabulary)
## after training the src/trg models
GENERATE_SPM_VOC = 0
|
|
|
|
|
|
|
|
# ${SPMSRCMODEL}: ${WORKDIR}/%.spm${SRCBPESIZE:000=}k-model: ${TMPDIR}/${LANGPAIRSTR}/%
|
|
|
|
## Train the source-side SentencePiece model.
##  1. a sub-make creates ${LOCAL_TRAIN_SRC}
##  2. non-empty lines are shuffled into ${LOCAL_TRAIN_SRC}.text; with
##     USE_TARGET_LABELS=1 the first space-separated field of every line is
##     cut off first (presumably the target-language label -- confirm)
##  3. the number of lines in the .charfreq table selects the character
##     coverage: 0.9995 if there are more than 1000 lines, 1.0 otherwise
##  4. spm_train writes $@.model, which is renamed to $@
${SPMSRCMODEL}:
	${MAKE} ${LOCAL_TRAIN_SRC}
	mkdir -p $(@D)
ifeq (${USE_TARGET_LABELS},1)
	cut -f2- -d ' ' ${LOCAL_TRAIN_SRC} | grep . | ${SHUFFLE} > ${LOCAL_TRAIN_SRC}.text
else
	grep . ${LOCAL_TRAIN_SRC} | ${SHUFFLE} > ${LOCAL_TRAIN_SRC}.text
endif
	${MAKE} ${LOCAL_TRAIN_SRC}.charfreq
	coverage=1.0; \
	if [ $$(cat ${LOCAL_TRAIN_SRC}.charfreq | wc -l) -gt 1000 ]; then coverage=0.9995; fi; \
	${SPM_HOME}/spm_train ${SPMEXTRA} \
		--model_prefix=$@ --vocab_size=$(SRCBPESIZE) --input=${LOCAL_TRAIN_SRC}.text \
		--character_coverage=$$coverage --hard_vocab_limit=false
	mv $@.model $@
ifeq (${GENERATE_SPM_VOC},1)
	${SPM_HOME}/spm_encode --model=$@ --generate_vocabulary < ${LOCAL_TRAIN_SRC}.text > $@.voc
endif
	rm -f ${LOCAL_TRAIN_SRC}.text
|
|
|
|
|
|
|
|
|
|
|
|
## no labels on the target language side
|
|
|
|
# ${SPMTRGMODEL}: ${WORKDIR}/%.spm${TRGBPESIZE:000=}k-model: ${TMPDIR}/${LANGPAIRSTR}/%
|
|
|
|
## Train the target-side SentencePiece model (no label handling on this side).
## Non-empty lines of ${LOCAL_TRAIN_TRG} are shuffled into a temporary .text
## file; the character coverage is 0.9995 if the .charfreq table has more than
## 1000 lines and 1.0 otherwise; spm_train writes $@.model, renamed to $@.
${SPMTRGMODEL}:
	${MAKE} ${LOCAL_TRAIN_TRG}
	mkdir -p $(@D)
	grep . ${LOCAL_TRAIN_TRG} | ${SHUFFLE} > ${LOCAL_TRAIN_TRG}.text
	${MAKE} ${LOCAL_TRAIN_TRG}.charfreq
	coverage=1.0; \
	if [ $$(cat ${LOCAL_TRAIN_TRG}.charfreq | wc -l) -gt 1000 ]; then coverage=0.9995; fi; \
	${SPM_HOME}/spm_train ${SPMEXTRA} \
		--model_prefix=$@ --vocab_size=$(TRGBPESIZE) --input=${LOCAL_TRAIN_TRG}.text \
		--character_coverage=$$coverage --hard_vocab_limit=false
	mv $@.model $@
ifeq (${GENERATE_SPM_VOC},1)
	${SPM_HOME}/spm_encode --model=$@ --generate_vocabulary < ${LOCAL_TRAIN_TRG}.text > $@.voc
endif
	rm -f ${LOCAL_TRAIN_TRG}.text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
## sentence piece model trained on monolingual data
|
|
|
|
## generic model path: ${SPMDIR}/<LANGSTR>/<BPEMODELNAME>.spm<N>k-model, <N>k from BPESIZE
SPMMODEL    = ${SPMDIR}/${LANGSTR}/${BPEMODELNAME}.spm${BPESIZE:000=}k-model
|
|
|
|
## same layout for the source side (LANGSRCSTR, SRCBPESIZE)
SPMSRCMONO  = ${SPMDIR}/${LANGSRCSTR}/${BPEMODELNAME}.spm${SRCBPESIZE:000=}k-model
|
|
|
|
## same layout for the target side (LANGTRGSTR, TRGBPESIZE)
SPMTRGMONO  = ${SPMDIR}/${LANGTRGSTR}/${BPEMODELNAME}.spm${TRGBPESIZE:000=}k-model
|
|
|
|
|
|
|
|
## vocabulary files created from monolingual data
|
|
|
|
## generic vocabulary path; the ${SPMVOCAB} rule fills it with marian-vocab output
SPMVOCAB    = ${SPMDIR}/${LANGSTR}/${BPEMODELNAME}.spm${BPESIZE:000=}k.vocab.yml
|
|
|
|
## source-side vocabulary (LANGSRCSTR, SRCBPESIZE)
SPMSRCVOCAB = ${SPMDIR}/${LANGSRCSTR}/${BPEMODELNAME}.spm${SRCBPESIZE:000=}k.vocab.yml
|
|
|
|
## target-side vocabulary (LANGTRGSTR, TRGBPESIZE)
SPMTRGVOCAB = ${SPMDIR}/${LANGTRGSTR}/${BPEMODELNAME}.spm${TRGBPESIZE:000=}k.vocab.yml
|
|
|
|
|
|
|
|
## never let make delete these files automatically
## NOTE(review): SPMSRCVOCAB and SPMTRGVOCAB are not listed -- confirm that this is intended
.PRECIOUS: ${SPMMODEL} ${SPMSRCMONO} ${SPMTRGMONO} ${SPMVOCAB}
|
|
|
|
|
|
|
|
## convenience target: builds the vocabulary file ${SPMVOCAB}
## (declared phony so that a stray file called "mono-spm-vocab" cannot mask it)
.PHONY: mono-spm-vocab
mono-spm-vocab: ${SPMVOCAB}
|
|
|
|
|
|
|
|
## Source-side vocabulary: built by a sub-make that re-runs mono-spm-vocab with
## LANGS/BPESIZE set to the source-side values. The rule is only defined when
## the source-side file differs from ${SPMVOCAB} (otherwise the ${SPMVOCAB}
## rule already covers it).
## LANGS is quoted so that a list of several languages is passed as ONE
## variable value instead of being split into extra sub-make goals.
ifneq (${SPMVOCAB},${SPMSRCVOCAB})
${SPMSRCVOCAB}:
	${MAKE} LANGS="${SRCLANGS}" BPESIZE=${SRCBPESIZE} mono-spm-vocab
endif
|
|
|
|
|
|
|
|
## Target-side vocabulary: built by a sub-make that re-runs mono-spm-vocab with
## LANGS/BPESIZE set to the target-side values. The rule is only defined when
## the target-side file differs from ${SPMVOCAB}.
## LANGS is quoted so that a list of several languages is passed as ONE
## variable value instead of being split into extra sub-make goals.
ifneq (${SPMVOCAB},${SPMTRGVOCAB})
${SPMTRGVOCAB}:
	${MAKE} LANGS="${TRGLANGS}" BPESIZE=${TRGBPESIZE} mono-spm-vocab
endif
|
|
|
|
|
|
|
|
|
|
|
|
## Create the vocabulary file from the monolingual data: the data ($<) is
## segmented with ${SPMMODEL} and piped into marian-vocab (limited to
## ${VOCABSIZE} entries).
## The wildcard test is evaluated when the makefile is read: if the file
## already exists at that point, the recipe only prints warnings and touches
## the file instead of re-creating it.
${SPMVOCAB}: ${LOCAL_MONO_DATA}.${PRE} ${SPMMODEL}
ifeq ($(wildcard ${SPMVOCAB}),)
	mkdir -p $(@D)
	${SPM_HOME}/spm_encode --model ${SPMMODEL} < $< | \
		${MARIAN}/marian-vocab --max-size ${VOCABSIZE} > $@
else
	@echo "$@ already exists!"; \
	echo "WARNING! No new vocabulary is created even though the data has changed!"; \
	echo "WARNING! Delete the file if you want to start from scratch!"
	touch $@
endif
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
## sentence piece model trained on monolingual data
|
|
|
|
|
|
|
|
## convenience target: builds the monolingual SentencePiece model ${SPMMODEL}
## (declared phony so that a stray file called "mono-spm-model" cannot mask it)
.PHONY: mono-spm-model
mono-spm-model: ${SPMMODEL}
|
|
|
|
|
|
|
|
## Source-side monolingual model: built by a sub-make that re-runs
## mono-spm-model with LANGS/BPESIZE set to the source-side values. Only
## defined when the source-side file differs from ${SPMMODEL}.
## LANGS is quoted so that a list of several languages is passed as ONE
## variable value instead of being split into extra sub-make goals.
ifneq (${SPMMODEL},${SPMSRCMONO})
${SPMSRCMONO}:
	${MAKE} LANGS="${SRCLANGS}" BPESIZE=${SRCBPESIZE} mono-spm-model
endif
|
|
|
|
|
|
|
|
## Target-side monolingual model: built by a sub-make that re-runs
## mono-spm-model with LANGS/BPESIZE set to the target-side values. Only
## defined when the target-side file differs from ${SPMMODEL}.
## LANGS is quoted so that a list of several languages is passed as ONE
## variable value instead of being split into extra sub-make goals.
ifneq (${SPMMODEL},${SPMTRGMONO})
${SPMTRGMONO}:
	${MAKE} LANGS="${TRGLANGS}" BPESIZE=${TRGBPESIZE} mono-spm-model
endif
|
|
|
|
|
|
|
|
|
|
|
|
## Train a SentencePiece model on monolingual data.
## Non-empty lines of the data ($<) are shuffled into $<.text; the character
## coverage is 0.9995 if the .charfreq table has more than 1000 lines and 1.0
## otherwise; spm_train writes $@.model, which is renamed to $@, and a
## vocabulary listing is written to $@.voc.
## The vocabulary size is taken from BPESIZE, the same variable that names the
## model file (and that the src/trg sub-makes override), so the trained size
## always matches the file name.
## The wildcard test is evaluated when the makefile is read: an already
## existing model is never re-trained, only warnings are printed.
${SPMMODEL}: ${LOCAL_MONO_DATA}.${PRE}
ifeq ($(wildcard ${SPMMODEL}),)
	mkdir -p ${dir $@}
	grep . $< | ${SHUFFLE} > $<.text
	${MAKE} ${LOCAL_MONO_DATA}.${PRE}.charfreq
	if [ `cat ${LOCAL_MONO_DATA}.${PRE}.charfreq | wc -l` -gt 1000 ]; then \
	  ${SPM_HOME}/spm_train ${SPMEXTRA} \
		--model_prefix=$@ --vocab_size=$(BPESIZE) --input=$<.text \
		--character_coverage=0.9995 --hard_vocab_limit=false; \
	else \
	  ${SPM_HOME}/spm_train ${SPMEXTRA} \
		--model_prefix=$@ --vocab_size=$(BPESIZE) --input=$<.text \
		--character_coverage=1.0 --hard_vocab_limit=false; \
	fi
	mv $@.model $@
	${SPM_HOME}/spm_encode --model=$@ --generate_vocabulary < $<.text > $@.voc
	rm -f $<.text
else
	@echo "$@ already exists!"
	@echo "WARNING! No new SPM model created!"
	@echo "WARNING! Delete the file if you want to start from scratch!"
endif
|
|
|
|
|
|
|
|
## SentencePiece parameters:
|
|
|
|
##
|
|
|
|
# --input_sentence_size (maximum size of sentences the trainer loads) type: int32 default: 10000000
|
|
|
|
# --hard_vocab_limit (If set to false, --vocab_size is considered as a soft limit.) type: bool default: true
|
|
|
|
# --training_sentence_size (maximum size of sentences to train sentence pieces) type: int32 default: 10000000
|
|
|
|
# --vocab_size (vocabulary size) type: int32 default: 8000
|
|
|
|
|
|
|
|
|
|
|
|
## character frequency table
|
|
|
|
## --> used to decide about the character coverage level
|
|
|
|
|
|
|
|
## awk-based char-counter
|
|
|
|
#%.charfreq: %
|
|
|
|
# sed 's/./& /g' < $< | tr ' ' "\n" | grep . |\
|
|
|
|
# awk '!/^$$/{a[$$0]++}END{for (i in a)print i,a[i];}' > $@
|
|
|
|
|
|
|
|
## python-based char-counter (seems to be the fastest version)
|
|
|
|
## Character counts of the first 10,000,000 lines of a plain-text file,
## pretty-printed as a Python dict.
## The python call is prefixed with "-": a failing interpreter does not abort
## the build (the output file is still created by the redirection).
%.charfreq: %
	head -n 10000000 $< > $<.10m
	-python -c "import collections, pprint; pprint.pprint(dict(collections.Counter(open('$<.10m', 'r').read())))" > $@
	rm -f $<.10m
|
|
|
|
|
2020-06-06 20:49:54 +03:00
|
|
|
## Same character count for gzip-compressed input: the first 10,000,000
## decompressed lines are written to a temporary file and counted.
## The python call is prefixed with "-": a failing interpreter does not abort
## the build (the output file is still created by the redirection).
%.charfreq: %.gz
	${GZIP} -cd < $< | head -n 10000000 > $<.10m
	-python -c "import collections, pprint; pprint.pprint(dict(collections.Counter(open('$<.10m', 'r').read())))" > $@
	rm -f $<.10m
|
|
|
|
|
|
|
|
|
2020-06-03 15:39:18 +03:00
|
|
|
## slow version
|
|
|
|
## Shell-only character counter (slow): puts every character of the first
## 10,000,000 lines on a line of its own, drops empty lines and counts the
## distinct lines with sort | uniq -c.
%.charfreq2: %
	head -n 10000000 $< | sed 's/./& /g' | tr ' ' "\n" | grep . | \
		sort | uniq -c > $@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
## TODO: should we have vocab limits?
|
|
|
|
## --vocabulary={vocab_file}.L1 --vocabulary_threshold=50
|
|
|
|
## see https://github.com/google/sentencepiece#c-from-source
|
|
|
|
|
|
|
|
## Segment a source-side file with the source SentencePiece model (the last
## prerequisite).
## With USE_TARGET_LABELS=1 the first space-separated field of each line is
## split off, only the remainder is segmented, and the field is pasted back in
## front of the segmented text.
%.src.spm${SRCBPESIZE:000=}k: %.src ${SPMSRCMODEL}
ifeq (${USE_TARGET_LABELS},1)
	cut -d ' ' -f1 $< > $<.labels
	cut -d ' ' -f2- $< > $<.txt
	${SPM_HOME}/spm_encode --model $(lastword $^) < $<.txt > $@.txt
	paste -d ' ' $<.labels $@.txt > $@
	rm -f $<.labels $<.txt $@.txt
else
	${SPM_HOME}/spm_encode --model $(lastword $^) < $< > $@
endif
|
|
|
|
|
|
|
|
## Segment a target-side file with the target SentencePiece model (the last
## prerequisite).
%.trg.spm${TRGBPESIZE:000=}k: %.trg ${SPMTRGMODEL}
	${SPM_HOME}/spm_encode --model $(lastword $^) < $< > $@
|
|
|
|
|
|
|
|
|
|
|
|
## document-level models (with guided alignment)
|
|
|
|
## Document-level training data.
## A sub-make runs the goal "wordalign" (defined outside this section; it is
## expected to provide the segmented source/target files and the alignment
## file read below). large-context.pl then combines the three files; its
## tab-separated output is split into
##   column 1 -> $@ (source side)
##   column 2 -> the matching .trg. file
##   column 3 -> the matching .src-trg.alg.gz alignment file
## The source-side target is named after SRCBPESIZE -- the name the .trg. rule
## below requires as its prerequisite -- and all sibling files are derived
## from the pattern stem ($*) instead of substituting ".src." anywhere in the
## path.
%.src.spm${SRCBPESIZE:000=}k.doc${CONTEXT_SIZE}.gz:
	${MAKE} PRE_SRC=spm${SRCBPESIZE:000=}k PRE_TRG=spm${TRGBPESIZE:000=}k wordalign
	./large-context.pl -l ${CONTEXT_SIZE} \
		$*.src.spm${SRCBPESIZE:000=}k.gz \
		$*.trg.spm${TRGBPESIZE:000=}k.gz \
		$*.spm${SRCBPESIZE:000=}k-spm${TRGBPESIZE:000=}k.src-trg.alg.gz \
	| ${GZIP} > $@.tmp.gz
	${GZIP} -cd < $@.tmp.gz | cut -f1 | ${GZIP} -c > $@
	${GZIP} -cd < $@.tmp.gz | cut -f2 | ${GZIP} -c > $*.trg.spm${TRGBPESIZE:000=}k.doc${CONTEXT_SIZE}.gz
	${GZIP} -cd < $@.tmp.gz | cut -f3 | \
		${GZIP} -c > $*.spm${SRCBPESIZE:000=}k.doc${CONTEXT_SIZE}-spm${TRGBPESIZE:000=}k.doc${CONTEXT_SIZE}.src-trg.alg.gz
	rm -f $@.tmp.gz

## the target-side file is written by the source-side rule above; this rule
## only records the dependency
%.trg.spm${TRGBPESIZE:000=}k.doc${CONTEXT_SIZE}.gz: %.src.spm${SRCBPESIZE:000=}k.doc${CONTEXT_SIZE}.gz
	@echo "done!"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
## for validation and test data:
|
|
|
|
## Document-level validation and test data.
## Sub-makes run the goals "devdata" and "testdata" (defined outside this
## section; they are expected to provide the segmented files read below).
## large-context.pl combines the source and target file; its tab-separated
## output is split into column 1 -> $@ and column 2 -> the matching .trg. file.
## The source-side file is named after SRCBPESIZE (the size it was segmented
## with) and the sibling files are derived from the pattern stem ($*) instead
## of substituting ".src." anywhere in the path. Nothing changes when
## SRCBPESIZE equals TRGBPESIZE.
%.src.spm${SRCBPESIZE:000=}k.doc${CONTEXT_SIZE}:
	${MAKE} PRE_SRC=spm${SRCBPESIZE:000=}k PRE_TRG=spm${TRGBPESIZE:000=}k devdata
	${MAKE} PRE_SRC=spm${SRCBPESIZE:000=}k PRE_TRG=spm${TRGBPESIZE:000=}k testdata
	./large-context.pl -l ${CONTEXT_SIZE} \
		$*.src.spm${SRCBPESIZE:000=}k \
		$*.trg.spm${TRGBPESIZE:000=}k \
	| ${GZIP} > $@.tmp.gz
	${GZIP} -cd < $@.tmp.gz | cut -f1 > $@
	${GZIP} -cd < $@.tmp.gz | cut -f2 > $*.trg.spm${TRGBPESIZE:000=}k.doc${CONTEXT_SIZE}
	rm -f $@.tmp.gz

## the target-side file is written by the source-side rule above; this rule
## only records the dependency
%.trg.spm${TRGBPESIZE:000=}k.doc${CONTEXT_SIZE}: %.src.spm${SRCBPESIZE:000=}k.doc${CONTEXT_SIZE}
	@echo "done!"
|
|
|
|
|