Mirror of https://github.com/Helsinki-NLP/OPUS-MT-train.git

Commit 8ff98705b7 (parent 811815064b): vocabularies built from monolingual data, and character coverage in SentencePiece models chosen depending on the size of the alphabet.

Changed file: Makefile.data — 141 changed lines
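The central change in the hunks below: SentencePiece training now also depends on a character-frequency file (*.charfreq) for its training data, and --character_coverage is lowered when the alphabet is large (more than 1000 distinct characters). A minimal stand-alone sketch of that decision, with illustrative file names and a hypothetical 32k vocabulary size (the spm_train flags are the ones used in the Makefile):

nchars=`wc -l < train.src.charfreq`        # roughly one line per distinct character
if [ "$nchars" -gt 1000 ]; then
    coverage=0.9995    # large alphabet (e.g. CJK scripts): let rare characters drop out
else
    coverage=1.0       # small alphabet: keep every character
fi
spm_train --input=train.src.text --model_prefix=opus.src.spm32k-model \
          --vocab_size=32000 --character_coverage=$coverage --hard_vocab_limit=false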
@@ -837,32 +837,28 @@ endif
 SPMSRCMODEL = ${WORKDIR}/train/opus.src.spm${SRCBPESIZE:000=}k-model
 SPMTRGMODEL = ${WORKDIR}/train/opus.trg.spm${TRGBPESIZE:000=}k-model
 
-## sentence piece model trained on monolingual data
-SPMMODEL = ${SPMDIR}/${LANGSTR}/opus.spm${BPESIZE:000=}k-model
-SPMSRCMONO = ${SPMDIR}/${LANGSRCSTR}/opus.spm${SRCBPESIZE:000=}k-model
-SPMTRGMONO = ${SPMDIR}/${LANGTRGSTR}/opus.spm${TRGBPESIZE:000=}k-model
-
-.PRECIOUS: ${SPMSRCMODEL} ${SPMTRGMODEL} ${SPMMODEL} ${SPMSRCMONO} ${SPMTRGMONO}
+.PRECIOUS: ${SPMSRCMODEL} ${SPMTRGMODEL}
 
 
 # ${SPMSRCMODEL}: ${WORKDIR}/%.spm${SRCBPESIZE:000=}k-model: ${TMPDIR}/${LANGPAIRSTR}/%
-${SPMSRCMODEL}: ${LOCAL_TRAIN_SRC}
+${SPMSRCMODEL}: ${LOCAL_TRAIN_SRC} ${LOCAL_TRAIN_SRC}.charfreq
 ifeq ($(wildcard ${SPMSRCMODEL}),)
 	mkdir -p ${dir $@}
 ifeq ($(TRGLANGS),${firstword ${TRGLANGS}})
-	grep . $< > $<.text
-	${SPM_HOME}/spm_train \
-		--model_prefix=$@ --vocab_size=$(SRCBPESIZE) --input=$<.text \
-		--character_coverage=1.0 --hard_vocab_limit=false
+	grep . $< | shuf > $<.text
 else
-	cut -f2- -d ' ' $< | grep . > $<.text
-	${SPM_HOME}/spm_train \
-		--model_prefix=$@ --vocab_size=$(SRCBPESIZE) --input=$<.text \
-		--character_coverage=1.0 --hard_vocab_limit=false
+	cut -f2- -d ' ' $< | grep . | shuf > $<.text
 endif
+	if [ `cat ${word 2,$^} | wc -l` -gt 1000 ]; then \
+	  ${SPM_HOME}/spm_train \
+		--model_prefix=$@ --vocab_size=$(SRCBPESIZE) --input=$<.text \
+		--character_coverage=0.9995 --hard_vocab_limit=false; \
+	else \
+	  ${SPM_HOME}/spm_train \
+		--model_prefix=$@ --vocab_size=$(SRCBPESIZE) --input=$<.text \
+		--character_coverage=1.0 --hard_vocab_limit=false; \
+	fi
 	mv $@.model $@
+	${SPM_HOME}/spm_encode --model=$@ --generate_vocabulary < $<.text > $@.voc
 	rm -f $<.text
 else
 	@echo "$@ already exists!"
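Besides the coverage decision, the source-side recipe above (and the analogous target-side recipe below) now also dumps the vocabulary that the trained model induces on its own training text into a *.voc file. Run by hand, with illustrative file names, that step is roughly:

# --generate_vocabulary prints the pieces (with counts) used to encode the input
spm_encode --model=opus.src.spm32k-model --generate_vocabulary \
    < train.src.text > opus.src.spm32k-model.voc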
@@ -870,16 +866,24 @@ else
 	@echo "WARNING! Delete the file if you want to start from scratch!"
 endif
 
 
 ## no labels on the target language side
 # ${SPMTRGMODEL}: ${WORKDIR}/%.spm${TRGBPESIZE:000=}k-model: ${TMPDIR}/${LANGPAIRSTR}/%
-${SPMTRGMODEL}: ${LOCAL_TRAIN_TRG}
+${SPMTRGMODEL}: ${LOCAL_TRAIN_TRG} ${LOCAL_TRAIN_TRG}.charfreq
 ifeq ($(wildcard ${SPMTRGMODEL}),)
 	mkdir -p ${dir $@}
-	grep . $< > $<.text
-	${SPM_HOME}/spm_train \
-		--model_prefix=$@ --vocab_size=$(TRGBPESIZE) --input=$<.text \
-		--character_coverage=1.0 --hard_vocab_limit=false
+	grep . $< | shuf > $<.text
+	if [ `cat ${word 2,$^} | wc -l` -gt 1000 ]; then \
+	  ${SPM_HOME}/spm_train \
+		--model_prefix=$@ --vocab_size=$(TRGBPESIZE) --input=$<.text \
+		--character_coverage=0.9995 --hard_vocab_limit=false; \
+	else \
+	  ${SPM_HOME}/spm_train \
+		--model_prefix=$@ --vocab_size=$(TRGBPESIZE) --input=$<.text \
+		--character_coverage=1.0 --hard_vocab_limit=false; \
+	fi
 	mv $@.model $@
+	${SPM_HOME}/spm_encode --model=$@ --generate_vocabulary < $<.text > $@.voc
 	rm -f $<.text
 else
 	@echo "$@ already exists!"
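The threshold test in both recipes leans on a GNU Make idiom: $^ expands to the full prerequisite list, so ${word 2,$^} picks the second prerequisite, i.e. the .charfreq file, and its line count stands in for the alphabet size. A tiny self-contained sketch of the idiom (target and file names are hypothetical):

count-alphabet: data.txt data.txt.charfreq
	@echo "training data:  ${word 1,$^}"
	@echo "alphabet size:  `cat ${word 2,$^} | wc -l` distinct characters (roughly)"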
@@ -890,24 +894,77 @@ endif
 
 
+## sentence piece model trained on monolingual data
+SPMMODEL = ${SPMDIR}/${LANGSTR}/opus.spm${BPESIZE:000=}k-model
+SPMSRCMONO = ${SPMDIR}/${LANGSRCSTR}/opus.spm${SRCBPESIZE:000=}k-model
+SPMTRGMONO = ${SPMDIR}/${LANGTRGSTR}/opus.spm${TRGBPESIZE:000=}k-model
+
+## vocabulary files created from monolingual data
+SPMVOCAB = ${SPMDIR}/${LANGSTR}/opus.spm${BPESIZE:000=}k.vocab.yml
+SPMSRCVOCAB = ${SPMDIR}/${LANGSRCSTR}/opus.spm${SRCBPESIZE:000=}k.vocab.yml
+SPMTRGVOCAB = ${SPMDIR}/${LANGTRGSTR}/opus.spm${TRGBPESIZE:000=}k.vocab.yml
+
+.PRECIOUS: ${SPMMODEL} ${SPMSRCMONO} ${SPMTRGMONO} ${SPMVOCAB}
+
+mono-spm-vocab: ${SPMVOCAB}
+
+ifneq (${SPMVOCAB},${SPMSRCVOCAB})
+${SPMSRCVOCAB}:
+	${MAKE} LANGS=${SRCLANGS} BPESIZE=${SRCBPESIZE} mono-spm-vocab
+endif
+
+ifneq (${SPMVOCAB},${SPMTRGVOCAB})
+${SPMTRGVOCAB}:
+	${MAKE} LANGS=${TRGLANGS} BPESIZE=${TRGBPESIZE} mono-spm-vocab
+endif
+
+
+${SPMVOCAB}: ${LOCAL_MONO_DATA}.${PRE} ${SPMMODEL}
+ifeq ($(wildcard ${SPMVOCAB}),)
+	mkdir -p ${dir $@}
+	${SPM_HOME}/spm_encode --model ${SPMMODEL} < $< |\
+	${MARIAN}/marian-vocab --max-size ${VOCABSIZE} > $@
+else
+	@echo "$@ already exists!"
+	@echo "WARNING! No new vocabulary is created even though the data has changed!"
+	@echo "WARNING! Delete the file if you want to start from scratch!"
+	touch $@
+endif
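The new mono-spm-vocab target builds a Marian vocabulary straight from monolingual data: the text is segmented with the monolingual SentencePiece model and marian-vocab keeps the most frequent pieces. Done by hand, with illustrative names and a hypothetical 32k size limit, the pipeline is roughly:

spm_encode --model opus.spm32k-model < mono.clean.txt \
    | marian-vocab --max-size 32000 > opus.spm32k.vocab.yml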
 
 
 ## sentence piece model trained on monolingual data
 
 mono-spm-model: ${SPMMODEL}
 
-${SPMSRCMONO}:
+ifneq (${SPMMODEL},${SPMSRCMONO})
+${SPMSRCMONO}:
 	${MAKE} LANGS=${SRCLANGS} BPESIZE=${SRCBPESIZE} mono-spm-model
+endif
 
-${SPMTRGMONO}:
+ifneq (${SPMMODEL},${SPMTRGMONO})
+${SPMTRGMONO}:
 	${MAKE} LANGS=${TRGLANGS} BPESIZE=${TRGBPESIZE} mono-spm-model
+endif
 
-${SPMMODEL}: ${LOCAL_MONO_DATA}.${PRE}
+${SPMMODEL}: ${LOCAL_MONO_DATA}.${PRE} ${LOCAL_MONO_DATA}.${PRE}.charfreq
 ifeq ($(wildcard ${SPMMODEL}),)
 	mkdir -p ${dir $@}
-	grep . $< > $<.text
-	${SPM_HOME}/spm_train \
-		--model_prefix=$@ --vocab_size=$(TRGBPESIZE) --input=$<.text \
-		--character_coverage=1.0 --hard_vocab_limit=false
+	grep . $< | shuf > $<.text
+	if [ `cat ${word 2,$^} | wc -l` -gt 1000 ]; then \
+	  ${SPM_HOME}/spm_train \
+		--model_prefix=$@ --vocab_size=$(TRGBPESIZE) --input=$<.text \
+		--character_coverage=0.9995 --hard_vocab_limit=false; \
+	else \
+	  ${SPM_HOME}/spm_train \
+		--model_prefix=$@ --vocab_size=$(TRGBPESIZE) --input=$<.text \
+		--character_coverage=1.0 --hard_vocab_limit=false; \
+	fi
 	mv $@.model $@
+	${SPM_HOME}/spm_encode --model=$@ --generate_vocabulary < $<.text > $@.voc
 	rm -f $<.text
 else
 	@echo "$@ already exists!"
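When the shared model path differs from a per-side path, the rules above simply recurse into make with the language set and BPE size of the side that is needed; the vocabulary targets earlier in this hunk follow the same pattern. The equivalent manual calls (with whatever SRCLANGS/TRGLANGS and the BPE sizes expand to) would be roughly:

make LANGS="${SRCLANGS}" BPESIZE=${SRCBPESIZE} mono-spm-model
make LANGS="${TRGLANGS}" BPESIZE=${TRGBPESIZE} mono-spm-vocab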
@@ -915,8 +972,36 @@ else
 	@echo "WARNING! Delete the file if you want to start from scratch!"
 endif
 
+## SentencePiece parameters:
+##
+# --input_sentence_size (maximum size of sentences the trainer loads) type: int32 default: 10000000
+# --hard_vocab_limit (If set to false, --vocab_size is considered as a soft limit.) type: bool default: true
+# --training_sentence_size (maximum size of sentences to train sentence pieces) type: int32 default: 10000000
+# --vocab_size (vocabulary size) type: int32 default: 8000
+
+
+## character frequency table
+## --> used to decide about the character coverage level
+
+## awk-based char-counter
+#%.charfreq: %
+#	sed 's/./& /g' < $< | tr ' ' "\n" | grep . |\
+#	awk '!/^$$/{a[$$0]++}END{for (i in a)print i,a[i];}' > $@
+
+## python-based char-counter (seems to be the fastest version)
+%.charfreq: %
+	python -c "import collections, pprint; pprint.pprint(dict(collections.Counter(open('$<', 'r').read())))" > $@
+
+## slow version
+%.charfreq2: %
+	sed 's/./& /g' < $< | tr ' ' "\n" | grep . |\
+	sort | uniq -c > $@
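The %.charfreq rule above writes a pretty-printed Python dict of character counts, roughly one counted character per line, so the file's line count is a cheap estimate of the alphabet size that feeds the "> 1000" test in the training recipes. A stand-alone sketch for a hypothetical mono.txt:

python -c "import collections, pprint; pprint.pprint(dict(collections.Counter(open('mono.txt', 'r').read())))" > mono.txt.charfreq
head -3 mono.txt.charfreq     # one "'char': count" entry per line (plus the surrounding braces)
wc -l < mono.txt.charfreq     # more than 1000 lines suggests a large script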
+
+## TODO: should we have vocab limits?
+## --vocabulary={vocab_file}.L1 --vocabulary_threshold=50
+## see https://github.com/google/sentencepiece#c-from-source
 
 %.src.spm${SRCBPESIZE:000=}k: %.src ${SPMSRCMODEL}
 ifeq ($(TRGLANGS),${firstword ${TRGLANGS}})
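The TODO above points at SentencePiece's vocabulary restriction, which this Makefile does not enable: spm_encode can be told to emit only pieces that reach a minimum count in a vocabulary file, such as the *.voc files generated by the training recipes. A hedged sketch with illustrative file names:

# only produce pieces that occur at least 50 times in the given vocabulary;
# everything else falls back to smaller pieces or single characters
spm_encode --model=opus.src.spm32k-model \
    --vocabulary=opus.src.spm32k-model.voc --vocabulary_threshold=50 \
    < test.src > test.src.spm32k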