Mirror of https://github.com/Helsinki-NLP/OPUS-MT-train.git

Commit 8ff98705b7 (parent 811815064b): vocabularies built from monolingual data, and character coverage in SentencePiece models chosen depending on the size of the alphabet.

Changed file: Makefile.data — 141 changed lines
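The central change in the hunks below: SentencePiece training now also depends on a character-frequency file (*.charfreq) for its training data, and --character_coverage is lowered when the alphabet is large (more than 1000 distinct characters). A minimal stand-alone sketch of that decision, with illustrative file names and a hypothetical 32k vocabulary size (the spm_train flags are the ones used in the Makefile):

nchars=`wc -l < train.src.charfreq`        # roughly one line per distinct character
if [ "$nchars" -gt 1000 ]; then
    coverage=0.9995    # large alphabet (e.g. CJK scripts): let rare characters drop out
else
    coverage=1.0       # small alphabet: keep every character
fi
spm_train --input=train.src.text --model_prefix=opus.src.spm32k-model \
          --vocab_size=32000 --character_coverage=$coverage --hard_vocab_limit=false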
@@ -837,32 +837,28 @@ endif
 SPMSRCMODEL = ${WORKDIR}/train/opus.src.spm${SRCBPESIZE:000=}k-model
 SPMTRGMODEL = ${WORKDIR}/train/opus.trg.spm${TRGBPESIZE:000=}k-model
 
-## sentence piece model trained on monolingual data
-SPMMODEL = ${SPMDIR}/${LANGSTR}/opus.spm${BPESIZE:000=}k-model
-SPMSRCMONO = ${SPMDIR}/${LANGSRCSTR}/opus.spm${SRCBPESIZE:000=}k-model
-SPMTRGMONO = ${SPMDIR}/${LANGTRGSTR}/opus.spm${TRGBPESIZE:000=}k-model
-
-.PRECIOUS: ${SPMSRCMODEL} ${SPMTRGMODEL} ${SPMMODEL} ${SPMSRCMONO} ${SPMTRGMONO}
+.PRECIOUS: ${SPMSRCMODEL} ${SPMTRGMODEL}
 
 
 # ${SPMSRCMODEL}: ${WORKDIR}/%.spm${SRCBPESIZE:000=}k-model: ${TMPDIR}/${LANGPAIRSTR}/%
-${SPMSRCMODEL}: ${LOCAL_TRAIN_SRC}
+${SPMSRCMODEL}: ${LOCAL_TRAIN_SRC} ${LOCAL_TRAIN_SRC}.charfreq
 ifeq ($(wildcard ${SPMSRCMODEL}),)
 	mkdir -p ${dir $@}
 ifeq ($(TRGLANGS),${firstword ${TRGLANGS}})
-	grep . $< > $<.text
-	${SPM_HOME}/spm_train \
-		--model_prefix=$@ --vocab_size=$(SRCBPESIZE) --input=$<.text \
-		--character_coverage=1.0 --hard_vocab_limit=false
+	grep . $< | shuf > $<.text
 else
-	cut -f2- -d ' ' $< | grep . > $<.text
-	${SPM_HOME}/spm_train \
-		--model_prefix=$@ --vocab_size=$(SRCBPESIZE) --input=$<.text \
-		--character_coverage=1.0 --hard_vocab_limit=false
+	cut -f2- -d ' ' $< | grep . | shuf > $<.text
 endif
+	if [ `cat ${word 2,$^} | wc -l` -gt 1000 ]; then \
+	  ${SPM_HOME}/spm_train \
+		--model_prefix=$@ --vocab_size=$(SRCBPESIZE) --input=$<.text \
+		--character_coverage=0.9995 --hard_vocab_limit=false; \
+	else \
+	  ${SPM_HOME}/spm_train \
+		--model_prefix=$@ --vocab_size=$(SRCBPESIZE) --input=$<.text \
+		--character_coverage=1.0 --hard_vocab_limit=false; \
+	fi
 	mv $@.model $@
+	${SPM_HOME}/spm_encode --model=$@ --generate_vocabulary < $<.text > $@.voc
 	rm -f $<.text
 else
 	@echo "$@ already exists!"
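Besides the coverage decision, the source-side recipe above (and the analogous target-side recipe below) now also dumps the vocabulary that the trained model induces on its own training text into a *.voc file. Run by hand, with illustrative file names, that step is roughly:

# --generate_vocabulary prints the pieces (with counts) used to encode the input
spm_encode --model=opus.src.spm32k-model --generate_vocabulary \
    < train.src.text > opus.src.spm32k-model.voc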
@@ -870,16 +866,24 @@ else
 	@echo "WARNING! Delete the file if you want to start from scratch!"
 endif
 
 
 ## no labels on the target language side
 # ${SPMTRGMODEL}: ${WORKDIR}/%.spm${TRGBPESIZE:000=}k-model: ${TMPDIR}/${LANGPAIRSTR}/%
-${SPMTRGMODEL}: ${LOCAL_TRAIN_TRG}
+${SPMTRGMODEL}: ${LOCAL_TRAIN_TRG} ${LOCAL_TRAIN_TRG}.charfreq
 ifeq ($(wildcard ${SPMTRGMODEL}),)
 	mkdir -p ${dir $@}
-	grep . $< > $<.text
-	${SPM_HOME}/spm_train \
-		--model_prefix=$@ --vocab_size=$(TRGBPESIZE) --input=$<.text \
-		--character_coverage=1.0 --hard_vocab_limit=false
+	grep . $< | shuf > $<.text
+	if [ `cat ${word 2,$^} | wc -l` -gt 1000 ]; then \
+	  ${SPM_HOME}/spm_train \
+		--model_prefix=$@ --vocab_size=$(TRGBPESIZE) --input=$<.text \
+		--character_coverage=0.9995 --hard_vocab_limit=false; \
+	else \
+	  ${SPM_HOME}/spm_train \
+		--model_prefix=$@ --vocab_size=$(TRGBPESIZE) --input=$<.text \
+		--character_coverage=1.0 --hard_vocab_limit=false; \
+	fi
 	mv $@.model $@
+	${SPM_HOME}/spm_encode --model=$@ --generate_vocabulary < $<.text > $@.voc
 	rm -f $<.text
 else
 	@echo "$@ already exists!"
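The threshold test in both recipes leans on a GNU Make idiom: $^ expands to the full prerequisite list, so ${word 2,$^} picks the second prerequisite, i.e. the .charfreq file, and its line count stands in for the alphabet size. A tiny self-contained sketch of the idiom (target and file names are hypothetical):

count-alphabet: data.txt data.txt.charfreq
	@echo "training data:  ${word 1,$^}"
	@echo "alphabet size:  `cat ${word 2,$^} | wc -l` distinct characters (roughly)"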
@@ -890,24 +894,77 @@ endif
 
 
+## sentence piece model trained on monolingual data
+SPMMODEL = ${SPMDIR}/${LANGSTR}/opus.spm${BPESIZE:000=}k-model
+SPMSRCMONO = ${SPMDIR}/${LANGSRCSTR}/opus.spm${SRCBPESIZE:000=}k-model
+SPMTRGMONO = ${SPMDIR}/${LANGTRGSTR}/opus.spm${TRGBPESIZE:000=}k-model
+
+## vocabulary files created from monolingual data
+SPMVOCAB = ${SPMDIR}/${LANGSTR}/opus.spm${BPESIZE:000=}k.vocab.yml
+SPMSRCVOCAB = ${SPMDIR}/${LANGSRCSTR}/opus.spm${SRCBPESIZE:000=}k.vocab.yml
+SPMTRGVOCAB = ${SPMDIR}/${LANGTRGSTR}/opus.spm${TRGBPESIZE:000=}k.vocab.yml
+
+.PRECIOUS: ${SPMMODEL} ${SPMSRCMONO} ${SPMTRGMONO} ${SPMVOCAB}
+
+mono-spm-vocab: ${SPMVOCAB}
+
+ifneq (${SPMVOCAB},${SPMSRCVOCAB})
+${SPMSRCVOCAB}:
+	${MAKE} LANGS=${SRCLANGS} BPESIZE=${SRCBPESIZE} mono-spm-vocab
+endif
+
+ifneq (${SPMVOCAB},${SPMTRGVOCAB})
+${SPMTRGVOCAB}:
+	${MAKE} LANGS=${TRGLANGS} BPESIZE=${TRGBPESIZE} mono-spm-vocab
+endif
+
+
+${SPMVOCAB}: ${LOCAL_MONO_DATA}.${PRE} ${SPMMODEL}
+ifeq ($(wildcard ${SPMVOCAB}),)
+	mkdir -p ${dir $@}
+	${SPM_HOME}/spm_encode --model ${SPMMODEL} < $< |\
+	${MARIAN}/marian-vocab --max-size ${VOCABSIZE} > $@
+else
+	@echo "$@ already exists!"
+	@echo "WARNING! No new vocabulary is created even though the data has changed!"
+	@echo "WARNING! Delete the file if you want to start from scratch!"
+	touch $@
+endif
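The new mono-spm-vocab target builds a Marian vocabulary straight from monolingual data: the text is segmented with the monolingual SentencePiece model and marian-vocab keeps the most frequent pieces. Done by hand, with illustrative names and a hypothetical 32k size limit, the pipeline is roughly:

spm_encode --model opus.spm32k-model < mono.clean.txt \
    | marian-vocab --max-size 32000 > opus.spm32k.vocab.yml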
 
 
 ## sentence piece model trained on monolingual data
 
 mono-spm-model: ${SPMMODEL}
 
-${SPMSRCMONO}:
+ifneq (${SPMMODEL},${SPMSRCMONO})
+${SPMSRCMONO}:
 	${MAKE} LANGS=${SRCLANGS} BPESIZE=${SRCBPESIZE} mono-spm-model
+endif
 
-${SPMTRGMONO}:
+ifneq (${SPMMODEL},${SPMTRGMONO})
+${SPMTRGMONO}:
 	${MAKE} LANGS=${TRGLANGS} BPESIZE=${TRGBPESIZE} mono-spm-model
+endif
 
-${SPMMODEL}: ${LOCAL_MONO_DATA}.${PRE}
+${SPMMODEL}: ${LOCAL_MONO_DATA}.${PRE} ${LOCAL_MONO_DATA}.${PRE}.charfreq
 ifeq ($(wildcard ${SPMMODEL}),)
 	mkdir -p ${dir $@}
-	grep . $< > $<.text
-	${SPM_HOME}/spm_train \
-		--model_prefix=$@ --vocab_size=$(TRGBPESIZE) --input=$<.text \
-		--character_coverage=1.0 --hard_vocab_limit=false
+	grep . $< | shuf > $<.text
+	if [ `cat ${word 2,$^} | wc -l` -gt 1000 ]; then \
+	  ${SPM_HOME}/spm_train \
+		--model_prefix=$@ --vocab_size=$(TRGBPESIZE) --input=$<.text \
+		--character_coverage=0.9995 --hard_vocab_limit=false; \
+	else \
+	  ${SPM_HOME}/spm_train \
+		--model_prefix=$@ --vocab_size=$(TRGBPESIZE) --input=$<.text \
+		--character_coverage=1.0 --hard_vocab_limit=false; \
+	fi
 	mv $@.model $@
+	${SPM_HOME}/spm_encode --model=$@ --generate_vocabulary < $<.text > $@.voc
 	rm -f $<.text
 else
 	@echo "$@ already exists!"
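When the shared model path differs from a per-side path, the rules above simply recurse into make with the language set and BPE size of the side that is needed; the vocabulary targets earlier in this hunk follow the same pattern. The equivalent manual calls (with whatever SRCLANGS/TRGLANGS and the BPE sizes expand to) would be roughly:

make LANGS="${SRCLANGS}" BPESIZE=${SRCBPESIZE} mono-spm-model
make LANGS="${TRGLANGS}" BPESIZE=${TRGBPESIZE} mono-spm-vocab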
@@ -915,8 +972,36 @@ else
 	@echo "WARNING! Delete the file if you want to start from scratch!"
 endif
 
+## SentencePiece parameters:
+##
+# --input_sentence_size (maximum size of sentences the trainer loads) type: int32 default: 10000000
+# --hard_vocab_limit (If set to false, --vocab_size is considered as a soft limit.) type: bool default: true
+# --training_sentence_size (maximum size of sentences to train sentence pieces) type: int32 default: 10000000
+# --vocab_size (vocabulary size) type: int32 default: 8000
+
+
+## character frequency table
+## --> used to decide about the character coverage level
+
+## awk-based char-counter
+#%.charfreq: %
+#	sed 's/./& /g' < $< | tr ' ' "\n" | grep . |\
+#	awk '!/^$$/{a[$$0]++}END{for (i in a)print i,a[i];}' > $@
+
+## python-based char-counter (seems to be the fastest version)
+%.charfreq: %
+	python -c "import collections, pprint; pprint.pprint(dict(collections.Counter(open('$<', 'r').read())))" > $@
+
+## slow version
+%.charfreq2: %
+	sed 's/./& /g' < $< | tr ' ' "\n" | grep . |\
+	sort | uniq -c > $@
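The %.charfreq rule above writes a pretty-printed Python dict of character counts, roughly one counted character per line, so the file's line count is a cheap estimate of the alphabet size that feeds the "> 1000" test in the training recipes. A stand-alone sketch for a hypothetical mono.txt:

python -c "import collections, pprint; pprint.pprint(dict(collections.Counter(open('mono.txt', 'r').read())))" > mono.txt.charfreq
head -3 mono.txt.charfreq     # one "'char': count" entry per line (plus the surrounding braces)
wc -l < mono.txt.charfreq     # more than 1000 lines suggests a large script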
+
+## TODO: should we have vocab limits?
+## --vocabulary={vocab_file}.L1 --vocabulary_threshold=50
+## see https://github.com/google/sentencepiece#c-from-source
 
 %.src.spm${SRCBPESIZE:000=}k: %.src ${SPMSRCMODEL}
 ifeq ($(TRGLANGS),${firstword ${TRGLANGS}})
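The TODO above points at SentencePiece's vocabulary restriction, which this Makefile does not enable: spm_encode can be told to emit only pieces that reach a minimum count in a vocabulary file, such as the *.voc files generated by the training recipes. A hedged sketch with illustrative file names:

# only produce pieces that occur at least 50 times in the given vocabulary;
# everything else falls back to smaller pieces or single characters
spm_encode --model=opus.src.spm32k-model \
    --vocabulary=opus.src.spm32k-model.voc --vocabulary_threshold=50 \
    < test.src > test.src.spm32k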