mirror of
https://github.com/Helsinki-NLP/OPUS-MT-train.git
synced 2024-10-05 16:47:21 +03:00
tokenizer
This commit is contained in:
parent
d62f74dcc3
commit
74625dfc9e
@ -558,6 +558,8 @@ endif
|
||||
|
||||
.PHONY: add-to-local-train-data
|
||||
add-to-local-train-data: ${CLEAN_TRAIN_SRC} ${CLEAN_TRAIN_TRG}
|
||||
ifneq (${wildcard ${CLEAN_TRAIN_SRC}},)
|
||||
ifneq (${wildcard ${CLEAN_TRAIN_TRG}},)
|
||||
ifdef CHECK_TRAINDATA_SIZE
|
||||
@if [ `${GZCAT} ${wildcard ${CLEAN_TRAIN_SRC}} | wc -l` != `${GZCAT} ${wildcard ${CLEAN_TRAIN_TRG}} | wc -l` ]; then \
|
||||
echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"; \
|
||||
@ -593,6 +595,7 @@ endif
|
||||
${LABEL_SOURCE_DATA} > ${LOCAL_TRAIN_SRC}.${LANGPAIR}.src
|
||||
@${GZCAT} ${wildcard ${CLEAN_TRAIN_TRG}} ${CUT_DATA_SETS} 2>/dev/null \
|
||||
> ${LOCAL_TRAIN_TRG}.${LANGPAIR}.trg
|
||||
@touch ${LOCAL_TRAIN_SRC}.${LANGPAIR}.src ${LOCAL_TRAIN_TRG}.${LANGPAIR}.trg
|
||||
######################################
|
||||
# SHUFFLE_DATA is set?
|
||||
# --> shuffle data for each langpair
|
||||
@ -630,8 +633,8 @@ else
|
||||
@cat ${LOCAL_TRAIN_TRG}.${LANGPAIR}.trg >> ${LOCAL_TRAIN_TRG}
|
||||
endif
|
||||
@rm -f ${LOCAL_TRAIN_SRC}.${LANGPAIR}.src ${LOCAL_TRAIN_TRG}.${LANGPAIR}.trg
|
||||
|
||||
|
||||
endif
|
||||
endif
|
||||
|
||||
|
||||
####################
|
||||
|
@ -332,6 +332,33 @@ elg-fin2zle-pivot:
|
||||
${MAKE} MARIAN_EXTRA=--no-restore-corpus MODELTYPE=transformer-big tatoeba-fin2zle-trainjob-pbt-pft-bt
|
||||
|
||||
|
||||
## Submit transformer-big training jobs for a fixed list of language pairs
## with Finnish or German as the target language.
## Each recipe line is a separate recursive make call, so the first
## failing call aborts the rest of the recipe.
## Note: only the deu2fin job uses the "-bt" variant of the trainjob target.
.PHONY: elg-new-bigmodels
elg-new-bigmodels:
	${MAKE} MODELTYPE=transformer-big tatoeba-deu2fin-trainjob-bt
	${MAKE} MODELTYPE=transformer-big tatoeba-spa2fin-trainjob
	${MAKE} MODELTYPE=transformer-big tatoeba-fra2fin-trainjob
	${MAKE} MODELTYPE=transformer-big tatoeba-por2fin-trainjob
	${MAKE} MODELTYPE=transformer-big tatoeba-ita2fin-trainjob
	${MAKE} MODELTYPE=transformer-big tatoeba-tur2fin-trainjob
	${MAKE} MODELTYPE=transformer-big tatoeba-ara2fin-trainjob
	${MAKE} MODELTYPE=transformer-big tatoeba-zho2fin-trainjob
	${MAKE} MODELTYPE=transformer-big tatoeba-zls2fin-trainjob
	${MAKE} MODELTYPE=transformer-big tatoeba-zlw2fin-trainjob
	${MAKE} MODELTYPE=transformer-big tatoeba-spa2deu-trainjob
	${MAKE} MODELTYPE=transformer-big tatoeba-fra2deu-trainjob
	${MAKE} MODELTYPE=transformer-big tatoeba-por2deu-trainjob
	${MAKE} MODELTYPE=transformer-big tatoeba-ita2deu-trainjob
	${MAKE} MODELTYPE=transformer-big tatoeba-tur2deu-trainjob
	${MAKE} MODELTYPE=transformer-big tatoeba-ara2deu-trainjob
	${MAKE} MODELTYPE=transformer-big tatoeba-zho2deu-trainjob
	${MAKE} MODELTYPE=transformer-big tatoeba-zls2deu-trainjob
	${MAKE} MODELTYPE=transformer-big tatoeba-zlw2deu-trainjob
	${MAKE} MODELTYPE=transformer-big tatoeba-bat2deu-trainjob
	${MAKE} MODELTYPE=transformer-big tatoeba-gmq2deu-trainjob
	${MAKE} MODELTYPE=transformer-big tatoeba-heb2deu-trainjob
	${MAKE} MODELTYPE=transformer-big tatoeba-vie2deu-trainjob
|
||||
|
||||
|
||||
|
||||
|
||||
elg-continue-missing:
|
||||
for l in deu fra ita por spa; do \
|
||||
|
@ -21,6 +21,8 @@
|
||||
##----------------------------------------------
|
||||
|
||||
## Convenience aliases for building the subword models:
##   spm-models   - both source and target model
##   spm-srcmodel - source-side model only
##   spm-trgmodel - target-side model only
## They have no recipe of their own and never correspond to a file,
## so they are declared phony.
.PHONY: spm-models spm-srcmodel spm-trgmodel
spm-models: ${SPMSRCMODEL} ${SPMTRGMODEL}
spm-srcmodel: ${SPMSRCMODEL}
spm-trgmodel: ${SPMTRGMODEL}
|
||||
|
||||
# SPMEXTRA = --split_by_whitespace=false
|
||||
SPMEXTRA =
|
||||
|
@ -132,6 +132,42 @@ all:
|
||||
|
||||
|
||||
|
||||
|
||||
## Build target-side tokenizers for all languages in several vocabulary
## sizes by calling tatoeba-tokenizer-langs once per size.
## The loop body ends in ';', so a failing sub-make does not stop the
## remaining sizes (unless the recipe shell is configured to exit on error).
.PHONY: tatoeba-tokenizer-sizes
tatoeba-tokenizer-sizes:
	for s in 16000 32000 64000 8000 4000; do \
	  ${MAKE} SUBWORD_TRGVOCAB_SIZE=$$s tatoeba-tokenizer-langs; \
	done
|
||||
|
||||
# Alternatively, use power-of-two sizes: 16384 32768 65536 8192 4096
# (but that breaks the short size string, which is derived by
# stripping a trailing "000" from the vocabulary size)
|
||||
|
||||
## Build one tokenizer per language in TATOEBA_LANGS.
## Every language except eng is used as TRG with SRC=eng; eng itself is
## skipped in the loop and handled by the final call with SRC=fra TRG=eng.
## The loop body ends in ';', so a failing sub-make does not stop the
## remaining languages (unless the recipe shell is configured to exit on error).
.PHONY: tatoeba-tokenizer-langs
tatoeba-tokenizer-langs:
	for l in ${filter-out eng,${TATOEBA_LANGS}}; do \
	  ${MAKE} SRC=eng TRG=$$l tatoeba-tokenizer; \
	done
	${MAKE} SRC=fra TRG=eng tatoeba-tokenizer
|
||||
|
||||
## Short label for the target vocabulary size: a trailing "000" is
## replaced by "k" (e.g. 32000 -> 32k). Sizes that do not end in "000"
## are kept as they are and only get the "k" appended.
SUBWORD_TRG_SHORTSIZE = ${SUBWORD_TRGVOCAB_SIZE:000=}k

## Target-side name: the last component of LANGPAIRSTR after splitting on '-'.
SUBWORD_TRG_NAME      = ${lastword ${subst -, ,${LANGPAIRSTR}}}

## Alias for the released tokenizer model file of the current language pair
## and vocabulary size. It is a command name, not a file, hence phony.
.PHONY: tatoeba-tokenizer
tatoeba-tokenizer: ${RELEASEDIR}/spm/${LANGPAIRSTR}/opusTC.${SUBWORD_TRG_NAME}.${SUBWORD_TRG_SHORTSIZE}.spm
|
||||
|
||||
## Train the target-side subword model in a separate work directory
## (${WORKDIR}/spm) and move the result into the release directory.
##
## Steps:
##   1. create the work directories (model training and log files)
##   2. run fetch-datasets and langlabel-files with WORKDIR=${WORKDIR}/spm
##   3. run spm-trgmodel-tatoeba with SPM_INPUT_SIZE=10000000 and capture
##      its stdout/stderr in ${WORKDIR}/train
##   4. move vocabulary and logs next to the target, then the model itself
##
## The log files are written to ${WORKDIR}/train (not ${WORKDIR}/spm/train),
## so that directory has to exist as well; otherwise the shell redirection
## fails before the sub-make is even started.
${RELEASEDIR}/spm/${LANGPAIRSTR}/opusTC.${SUBWORD_TRG_NAME}.${SUBWORD_TRG_SHORTSIZE}.spm:
	mkdir -p ${WORKDIR}/spm/train ${WORKDIR}/train
	${MAKE} WORKDIR=${WORKDIR}/spm fetch-datasets
	${MAKE} WORKDIR=${WORKDIR}/spm langlabel-files
	${MAKE} WORKDIR=${WORKDIR}/spm SPM_INPUT_SIZE=10000000 spm-trgmodel-tatoeba \
		> ${WORKDIR}/train/spm-trgmodel.out 2>${WORKDIR}/train/spm-trgmodel.err
	mkdir -p ${dir $@}
	mv ${WORKDIR}/spm/train/$(notdir ${SPMTRGMODEL}).vocab $@.vocab
	mv ${WORKDIR}/train/spm-trgmodel.out $@.stdout
	mv ${WORKDIR}/train/spm-trgmodel.err $@.stderr
# move the model itself last: if any step above fails, the target does not
# exist yet and the rule is re-run instead of being considered up to date
	mv ${WORKDIR}/spm/train/$(notdir ${SPMTRGMODEL}) $@
|
||||
|
||||
|
||||
|
||||
|
||||
## start unidirectional training job
|
||||
## - make data first, then submit a job
|
||||
.PHONY: tatoeba-job
|
||||
|
Loading…
Reference in New Issue
Block a user