tokenizer

Joerg Tiedemann 2022-05-16 20:43:10 +03:00
parent d62f74dcc3
commit 74625dfc9e
4 changed files with 70 additions and 2 deletions


@@ -558,6 +558,8 @@ endif
.PHONY: add-to-local-train-data
add-to-local-train-data: ${CLEAN_TRAIN_SRC} ${CLEAN_TRAIN_TRG}
ifneq (${wildcard ${CLEAN_TRAIN_SRC}},)
ifneq (${wildcard ${CLEAN_TRAIN_TRG}},)
ifdef CHECK_TRAINDATA_SIZE
@if [ `${GZCAT} ${wildcard ${CLEAN_TRAIN_SRC}} | wc -l` != `${GZCAT} ${wildcard ${CLEAN_TRAIN_TRG}} | wc -l` ]; then \
echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"; \
@@ -593,6 +595,7 @@ endif
${LABEL_SOURCE_DATA} > ${LOCAL_TRAIN_SRC}.${LANGPAIR}.src
@${GZCAT} ${wildcard ${CLEAN_TRAIN_TRG}} ${CUT_DATA_SETS} 2>/dev/null \
> ${LOCAL_TRAIN_TRG}.${LANGPAIR}.trg
@touch ${LOCAL_TRAIN_SRC}.${LANGPAIR}.src ${LOCAL_TRAIN_TRG}.${LANGPAIR}.trg
######################################
# SHUFFLE_DATA is set?
# --> shuffle data for each langpair
@@ -630,8 +633,8 @@ else
@cat ${LOCAL_TRAIN_TRG}.${LANGPAIR}.trg >> ${LOCAL_TRAIN_TRG}
endif
@rm -f ${LOCAL_TRAIN_SRC}.${LANGPAIR}.src ${LOCAL_TRAIN_TRG}.${LANGPAIR}.trg
endif
endif
####################
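
The size check in the first hunk guards against mismatched bitext before it is appended to the local training data, and the newly added @touch guarantees that both per-langpair files exist even when the extraction pipe produced no output. A minimal standalone sketch of the same logic, assuming gzip-compressed bitext and placeholder file names (GZCAT mirrors the Makefile's gzip -cd):

    #!/bin/sh
    # placeholder file names; GZCAT corresponds to ${GZCAT} in the Makefile
    GZCAT="gzip -cd"
    src=train.src.gz
    trg=train.trg.gz
    if [ "`$GZCAT $src | wc -l`" != "`$GZCAT $trg | wc -l`" ]; then
        echo "line counts differ between $src and $trg - skipping" >&2
        exit 1
    fi
    # like the added @touch: make sure both output files exist even if
    # the extraction pipe wrote nothing, so later '>>' appends cannot fail
    touch local-train.src local-train.trg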


@@ -332,6 +332,33 @@ elg-fin2zle-pivot:
${MAKE} MARIAN_EXTRA=--no-restore-corpus MODELTYPE=transformer-big tatoeba-fin2zle-trainjob-pbt-pft-bt
elg-new-bigmodels:
${MAKE} MODELTYPE=transformer-big tatoeba-deu2fin-trainjob-bt
${MAKE} MODELTYPE=transformer-big tatoeba-spa2fin-trainjob
${MAKE} MODELTYPE=transformer-big tatoeba-fra2fin-trainjob
${MAKE} MODELTYPE=transformer-big tatoeba-por2fin-trainjob
${MAKE} MODELTYPE=transformer-big tatoeba-ita2fin-trainjob
${MAKE} MODELTYPE=transformer-big tatoeba-tur2fin-trainjob
${MAKE} MODELTYPE=transformer-big tatoeba-ara2fin-trainjob
${MAKE} MODELTYPE=transformer-big tatoeba-zho2fin-trainjob
${MAKE} MODELTYPE=transformer-big tatoeba-zls2fin-trainjob
${MAKE} MODELTYPE=transformer-big tatoeba-zlw2fin-trainjob
${MAKE} MODELTYPE=transformer-big tatoeba-spa2deu-trainjob
${MAKE} MODELTYPE=transformer-big tatoeba-fra2deu-trainjob
${MAKE} MODELTYPE=transformer-big tatoeba-por2deu-trainjob
${MAKE} MODELTYPE=transformer-big tatoeba-ita2deu-trainjob
${MAKE} MODELTYPE=transformer-big tatoeba-tur2deu-trainjob
${MAKE} MODELTYPE=transformer-big tatoeba-ara2deu-trainjob
${MAKE} MODELTYPE=transformer-big tatoeba-zho2deu-trainjob
${MAKE} MODELTYPE=transformer-big tatoeba-zls2deu-trainjob
${MAKE} MODELTYPE=transformer-big tatoeba-zlw2deu-trainjob
${MAKE} MODELTYPE=transformer-big tatoeba-bat2deu-trainjob
${MAKE} MODELTYPE=transformer-big tatoeba-gmq2deu-trainjob
${MAKE} MODELTYPE=transformer-big tatoeba-heb2deu-trainjob
${MAKE} MODELTYPE=transformer-big tatoeba-vie2deu-trainjob
elg-continue-missing:
for l in deu fra ita por spa; do \


@@ -21,6 +21,8 @@
##----------------------------------------------
spm-models: ${SPMSRCMODEL} ${SPMTRGMODEL}
spm-srcmodel: ${SPMSRCMODEL}
spm-trgmodel: ${SPMTRGMODEL}
# SPMEXTRA = --split_by_whitespace=false
SPMEXTRA =
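
SPMEXTRA presumably collects extra flags that get appended to the spm_train call in the spm-srcmodel/spm-trgmodel recipes; those recipes are not part of this diff, so everything in the sketch below except SPMEXTRA and the commented --split_by_whitespace option is an assumption based on standard SentencePiece training options:

    # hypothetical spm_train invocation; only SPMEXTRA and
    # --split_by_whitespace=false appear in the diff itself
    spm_train --input=train.trg \
              --model_prefix=opusTC.fin.32k \
              --vocab_size=32000 \
              --input_sentence_size=10000000 \
              --shuffle_input_sentence=true \
              ${SPMEXTRA}
    # e.g. SPMEXTRA=--split_by_whitespace=false lets pieces cross
    # whitespace boundaries instead of splitting on spaces first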


@@ -132,6 +132,42 @@ all:
tatoeba-tokenizer-sizes:
for s in 16000 32000 64000 8000 4000; do \
${MAKE} SUBWORD_TRGVOCAB_SIZE=$$s tatoeba-tokenizer-langs; \
done
# or rather base 2 sizes?
# 16384 32768 65536 8192 4096
# (but this breaks the way we create the short size string)
tatoeba-tokenizer-langs:
for l in ${filter-out eng,${TATOEBA_LANGS}}; do \
${MAKE} SRC=eng TRG=$$l tatoeba-tokenizer; \
done
${MAKE} SRC=fra TRG=eng tatoeba-tokenizer
SUBWORD_TRG_SHORTSIZE = ${SUBWORD_TRGVOCAB_SIZE:000=}k
SUBWORD_TRG_NAME = ${lastword ${subst -, ,${LANGPAIRSTR}}}
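
To illustrate the two definitions above (example values are made up): the substitution reference strips a literal 000 suffix from the vocabulary size, and subst/lastword pick the target language out of the langpair string. This also shows why the base-2 sizes mentioned earlier would break the short size string:

    # tiny standalone Makefile sketch; recipe lines must be tab-indented
    SUBWORD_TRGVOCAB_SIZE = 32000
    LANGPAIRSTR = eng-fin
    demo:
    	@echo ${SUBWORD_TRGVOCAB_SIZE:000=}k          # prints "32k"
    	@echo ${lastword ${subst -, ,${LANGPAIRSTR}}} # prints "fin"
    # a base-2 size like 32768 has no "000" suffix, so the substitution
    # is a no-op and the short name would come out as "32768k"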
tatoeba-tokenizer: ${RELEASEDIR}/spm/${LANGPAIRSTR}/opusTC.${SUBWORD_TRG_NAME}.${SUBWORD_TRG_SHORTSIZE}.spm
${RELEASEDIR}/spm/${LANGPAIRSTR}/opusTC.${SUBWORD_TRG_NAME}.${SUBWORD_TRG_SHORTSIZE}.spm:
mkdir -p ${WORKDIR}/spm/train
${MAKE} WORKDIR=${WORKDIR}/spm fetch-datasets
${MAKE} WORKDIR=${WORKDIR}/spm langlabel-files
${MAKE} WORKDIR=${WORKDIR}/spm SPM_INPUT_SIZE=10000000 spm-trgmodel-tatoeba \
> ${WORKDIR}/train/spm-trgmodel.out 2>${WORKDIR}/train/spm-trgmodel.err
mkdir -p ${dir $@}
mv ${WORKDIR}/spm/train/$(notdir ${SPMTRGMODEL}) $@
mv ${WORKDIR}/spm/train/$(notdir ${SPMTRGMODEL}).vocab $@.vocab
mv ${WORKDIR}/train/spm-trgmodel.out $@.stdout
mv ${WORKDIR}/train/spm-trgmodel.err $@.stderr
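
A hypothetical invocation of the new target, assuming the variables shown above; with SUBWORD_TRGVOCAB_SIZE=32000 and a langpair of eng-fin, the released model would be named opusTC.fin.32k.spm:

    # train and release a target-side SentencePiece model for English-Finnish
    make SRC=eng TRG=fin SUBWORD_TRGVOCAB_SIZE=32000 tatoeba-tokenizer
    # expected release files (paths depend on RELEASEDIR):
    #   ${RELEASEDIR}/spm/eng-fin/opusTC.fin.32k.spm
    #   ${RELEASEDIR}/spm/eng-fin/opusTC.fin.32k.spm.vocab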
## start unidirectional training job
## - make data first, then submit a job
.PHONY: tatoeba-job