fixing many bugs with tatoeba model recipes

This commit is contained in:
Joerg Tiedemann 2022-02-07 20:55:31 +02:00
parent 8b05bb352a
commit 4cc0ccb18d
10 changed files with 325 additions and 170 deletions

View File

@ -149,7 +149,7 @@ include lib/projects.mk
.PHONY: all
all:
${MAKE} rawdata
${MAKE} ${WORKDIR}/${MODELCONFIG}
${MAKE} local-config
${MAKE} data
${MAKE} train
${MAKE} eval

View File

@ -4,6 +4,52 @@
#
# load model-specific configuration parameters
# if they exist in the work directory
##---------------------------------------------------------------
## default name of the data set (and the model)
##---------------------------------------------------------------
TRAINSET_NAME ?= opus
DATASET ?= ${TRAINSET_NAME}
## various ways of setting the model languages
##
## (1) explicitly set source and target languages, for example:
## SRCLANGS="da no sv" TRGLANGS="fi da"
##
## (2) specify language pairs, for example:
## LANGPAIRS="de-en fi-sv da-es"
## this will set SRCLANGS="de fi da" TRGLANGS="en sv es"
##
## if LANGPAIRS are set and the model is not supposed to be SYMMETRIC
## then set SRCLANGS and TRGLANGS to the languages in LANGPAIRS
ifdef LANGPAIRS
SRCLANGS ?= ${sort ${shell echo "${LANGPAIRS}" | tr ' ' "\n" | cut -f1 -d '-'}}
TRGLANGS ?= ${sort ${shell echo "${LANGPAIRS}" | tr ' ' "\n" | cut -f2 -d '-'}}
endif
## LANGPAIRSTR is used as a sub-dir in WORKHOME
SPACE := $(empty) $(empty)
LANGSRCSTR ?= ${subst ${SPACE},+,$(SRCLANGS)}
LANGTRGSTR ?= ${subst ${SPACE},+,$(TRGLANGS)}
LANGPAIRSTR ?= ${LANGSRCSTR}-${LANGTRGSTR}
WORKDIR = ${WORKHOME}/${LANGPAIRSTR}
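##
## illustration with the example languages from above: SRCLANGS="da no sv" and
## TRGLANGS="fi da" yield
##   LANGSRCSTR  = da+no+sv
##   LANGTRGSTR  = fi+da
##   LANGPAIRSTR = da+no+sv-fi+da
##   WORKDIR     = ${WORKHOME}/da+no+sv-fi+da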
## default model type
MODELTYPE = transformer-align
MODELCONFIG = ${DATASET}${MODEL_VARIANT}.${MODELTYPE}.mk
ifneq ($(wildcard ${WORKDIR}/${MODELCONFIG}),)
include ${WORKDIR}/${MODELCONFIG}
endif
## some pre-defined language sets
include ${REPOHOME}lib/langsets.mk
@ -25,10 +71,6 @@ MODELTYPES = transformer \
transformer-tiny11 \
transformer-tiny11-align
## default model type
MODELTYPE = transformer-align
NR = 1
## name of the model-specific configuration file
@ -37,35 +79,18 @@ NR = 1
# MODELCONFIG ?= config.mk
## various ways of setting the model languages
##
## (1) explicitly set source and target languages, for example:
## SRCLANGS="da no sv" TRGLANGS="fi da"
##
## (2) specify language pairs, for example:
## LANGPAIRS="de-en fi-sv da-es"
## this will set SRCLANGS="de fi da" TRGLANGS="en sv es"
##
## if LANGPAIRS are set and the model is not supposed to be SYMMETRIC
## then set SRCLANGS and TRGLANGS to the languages in LANGPAIRS
ifdef LANGPAIRS
SRCLANGS ?= ${sort ${shell echo "${LANGPAIRS}" | tr ' ' "\n" | cut -f1 -d '-'}}
TRGLANGS ?= ${sort ${shell echo "${LANGPAIRS}" | tr ' ' "\n" | cut -f2 -d '-'}}
endif
## set SRC and TRG unless they are specified already
ifneq (${words ${SRCLANGS}},1)
SRC ?= multi
else
SRC = ${SRCLANGS}
SRC ?= ${SRCLANGS}
endif
ifneq (${words ${TRGLANGS}},1)
TRG ?= multi
else
TRG = ${TRGLANGS}
TRG ?= ${TRGLANGS}
endif
@ -96,14 +121,14 @@ SKIP_SAME_LANG ?= 0
## --> is that a problem (would MarianNMT use different random shuffles / epoch?)
##----------------------------------------------------------------------
SHUFFLE_DATA = 1
# DATA_IS_SHUFFLED = 1
SHUFFLE_DATA ?= 1
# DATA_IS_SHUFFLED ?= 1
## devtest data is shuffled by default
SHUFFLE_DEVDATA = 1
SHUFFLE_DEVDATA ?= 1
## shuffle multilingual training data to mix language examples
SHUFFLE_MULTILINGUAL_DATA = 1
SHUFFLE_MULTILINGUAL_DATA ?= 1
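##
## all three switches can now be overridden on the command line; a sketch that
## mirrors the Tatoeba setup for releases that are already shuffled:
##
##   make SHUFFLE_DATA=0 DATA_IS_SHUFFLED=1 data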
##----------------------------------------------------------------------
## set FIT_DATA_SIZE to a specific value to fit the training data
@ -113,7 +138,7 @@ SHUFFLE_MULTILINGUAL_DATA = 1
## the script does both, over- and undersampling
##----------------------------------------------------------------------
# FIT_DATA_SIZE = 100000
# FIT_DATA_SIZE ?= 100000
## similar for the dev data: set FIT_DEVDATA_SIZE to
## balance the size of the devdata for each language pair
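##
## example with purely illustrative sizes: fit the training data to about 1M
## segments and balance the dev data at 2500 segments per language pair:
##
##   make FIT_DATA_SIZE=1000000 FIT_DEVDATA_SIZE=2500 data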
@ -149,10 +174,6 @@ SORTLANGS = $(sort ${SRC} ${TRG})
SORTSRC = ${firstword ${SORTLANGS}}
SORTTRG = ${lastword ${SORTLANGS}}
LANGPAIR = ${SORTSRC}-${SORTTRG}
SPACE = $(empty) $(empty)
LANGSRCSTR ?= ${subst ${SPACE},+,$(SRCLANGS)}
LANGTRGSTR ?= ${subst ${SPACE},+,$(TRGLANGS)}
LANGPAIRSTR ?= ${LANGSRCSTR}-${LANGTRGSTR}
## for monolingual things
@ -360,7 +381,7 @@ endif
VOCABSIZE ?= $$((${SUBWORD_SRCVOCAB_SIZE} + ${SUBWORD_TRGVOCAB_SIZE} + 1000))
## for document-level models
CONTEXT_SIZE = 100
CONTEXT_SIZE ?= 100
## pre-processing/data-cleanup type
@ -371,10 +392,10 @@ CONTEXT_SIZE = 100
## we need those data sets to get the parameters
## for the strict mode
PRE = simple
CLEAN_TRAINDATA_TYPE = strict
CLEAN_DEVDATA_TYPE = strict
CLEAN_TESTDATA_TYPE = clean
PRE ?= simple
CLEAN_TRAINDATA_TYPE ?= strict
CLEAN_DEVDATA_TYPE ?= strict
CLEAN_TESTDATA_TYPE ?= clean
## subword splitting type
@ -382,12 +403,6 @@ PRE_SRC = ${SUBWORDS}${SUBWORD_SRCVOCAB_SIZE:000=}k
PRE_TRG = ${SUBWORDS}${SUBWORD_TRGVOCAB_SIZE:000=}k
##-------------------------------------
## default name of the data set (and the model)
##-------------------------------------
TRAINSET_NAME ?= opus
DATASET ?= ${TRAINSET_NAME}
## dev and test data come from one specific data set
## if we have a bilingual model
@ -406,10 +421,10 @@ TESTSET_NAME ?= opus-test
## DATADIR = directory where the train/dev/test data are
## WORKDIR = directory used for training
## TODO: MODELDIR still in use?
## TODO: SPMDIR still in use? (monolingual sp models)
DATADIR = ${WORKHOME}/data
WORKDIR = ${WORKHOME}/${LANGPAIRSTR}
MODELDIR = ${WORKHOME}/models/${LANGPAIRSTR}
SPMDIR = ${WORKHOME}/SentencePieceModels
@ -443,11 +458,19 @@ TEST_SRC ?= ${WORKDIR}/test/${TESTSET_NAME}.src
TEST_TRG ?= ${WORKDIR}/test/${TESTSET_NAME}.trg
## model basename and optional sub-dir
## home directories for back and forward translation
BACKTRANS_HOME ?= backtranslate
FORWARDTRANS_HOME ?= ${BACKTRANS_HOME}
PIVOTTRANS_HOME ?= pivoting
# MODEL_SUBDIR =
# MODEL_VARIANT =
MODEL = ${MODEL_SUBDIR}${DATASET}${TRAINSIZE}${MODEL_VARIANT}.${PRE_SRC}-${PRE_TRG}
## model basename and optional sub-dir
## NR is used to create model ensembles
## NR is also used to generate a seed value for initialisation
MODEL = ${MODEL_SUBDIR}${DATASET}${TRAINSIZE}${MODEL_VARIANT}.${PRE_SRC}-${PRE_TRG}
NR = 1
MODEL_BASENAME = ${MODEL}.${MODELTYPE}.model${NR}
MODEL_VALIDLOG = ${MODEL}.${MODELTYPE}.valid${NR}.log
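##
## illustration with assumed defaults (empty MODEL_SUBDIR/TRAINSIZE/MODEL_VARIANT,
## SUBWORDS=spm with 32k vocabularies): DATASET=opus and MODELTYPE=transformer-align give
##   MODEL          = opus.spm32k-spm32k
##   MODEL_BASENAME = opus.spm32k-spm32k.transformer-align.model1
## running the same setup with NR=2 trains a second ensemble member (model2)
## initialised with a different random seed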
@ -650,24 +673,12 @@ endif
# load model-specific configuration parameters
# if they exist in the work directory
# MODELCONFIG ?= ${MODEL}.${MODELTYPE}.mk
MODELCONFIG = ${DATASET}${MODEL_VARIANT}.${MODELTYPE}.mk
ifneq ($(wildcard ${WORKDIR}/${MODELCONFIG}),)
include ${WORKDIR}/${MODELCONFIG}
endif
## make some data size-specific configuration parameters
## TODO: is it OK to delete LOCAL_TRAIN data?
.PHONY: config local-config
config local-config: ${WORKDIR}/${MODELCONFIG}
SMALLEST_TRAINSIZE ?= 10000
SMALL_TRAINSIZE ?= 100000
MEDIUM_TRAINSIZE ?= 500000
@ -761,8 +772,6 @@ ${WORKDIR}/${MODELCONFIG}:
@echo "TESTSET = ${TESTSET}" >> $@
@echo "PRE = ${PRE}" >> $@
@echo "SUBWORDS = ${SUBWORDS}" >> $@
@echo "MODEL_SRCVOCAB = ${MODEL_SRCVOCAB}" >> $@
@echo "MODEL_TRGVOCAB = ${MODEL_TRGVOCAB}" >> $@
ifdef SHUFFLE_DATA
@echo "SHUFFLE_DATA = ${SHUFFLE_DATA}" >> $@
endif

View File

@ -3,7 +3,6 @@
TATOEBA_VERSION ?= v2021-08-07
TATOEBA_VERSION_NOHYPHEN = $(subst -,,${TATOEBA_VERSION})
ifeq (${SRCLANGS},)
ifdef SRC
SRCLANGS = ${SRC}
@ -17,34 +16,43 @@ endif
# WORKHOME := ${PWD}/work-tatoeba
SMALLEST_TRAINSIZE = 1000
USE_REST_DEVDATA = 0
DATA_IS_SHUFFLED = 1
DEVSIZE = 5000
TESTSIZE = 10000
DEVMINSIZE = 200
SMALLEST_TRAINSIZE ?= 1000
DEVSIZE ?= 5000
TESTSIZE ?= 10000
DEVMINSIZE ?= 200
# by default skip aligned data of the same language
# don't use anything from dev-data
# don't shuffle data because they are already shuffled
# but shuffle multilingual data to mix languages
SKIP_SAME_LANG = 1
USE_REST_DEVDATA = 0
SHUFFLE_DATA = 0
SHUFFLE_DEVDATA = 1
SHUFFLE_MULTILINGUAL_DATA = 1
DATA_IS_SHUFFLED = 1
## this will be the base name of the model file
TATOEBA_DATASET := opusTC${TATOEBA_VERSION_NOHYPHEN}
TATOEBA_TRAINSET := Tatoeba-train-${TATOEBA_VERSION}
TATOEBA_DEVSET := Tatoeba-dev-${TATOEBA_VERSION}
TATOEBA_TESTSET := Tatoeba-test-${TATOEBA_VERSION}
DATASET = ${TATOEBA_DATASET}
TRAINSET = ${TATOEBA_TRAINSET}
DEVSET = ${TATOEBA_DEVSET}
TESTSET = ${TATOEBA_TESTSET}
DEVSET_NAME = ${TATOEBA_DEVSET}
TESTSET_NAME = ${TATOEBA_TESTSET}
TRAINSET_NAME = ${TATOEBA_TRAINSET}
## Tatoeba specific data sets
TATOEBA_DATASET := opusTC${TATOEBA_VERSION_NOHYPHEN}
TATOEBA_TRAINSET := Tatoeba-train-${TATOEBA_VERSION}
TATOEBA_DEVSET := Tatoeba-dev-${TATOEBA_VERSION}
TATOEBA_TESTSET := Tatoeba-test-${TATOEBA_VERSION}
## change data set names
## DATASET will also be the base name of the model file
DATASET := ${TATOEBA_DATASET}
TRAINSET := ${TATOEBA_TRAINSET}
DEVSET := ${TATOEBA_DEVSET}
TESTSET := ${TATOEBA_TESTSET}
DEVSET_NAME := ${TATOEBA_DEVSET}
TESTSET_NAME := ${TATOEBA_TESTSET}
TRAINSET_NAME := ${TATOEBA_TRAINSET}
##
BACKTRANS_HOME = ${PWD}/back-translate
FORWARDTRANS_HOME = ${PWD}/forward-translate
MODELSHOME = ${PWD}/models
@ -93,14 +101,14 @@ RELEASED_TATOEBA_DATA_FILE = tatoeba/released-bitexts-${TATOEBA_VERSION}.txt
## also extract all source languages that are available for a given target language
## and vice versa
TATOEBA_RELEASED_DATA := $(shell wget -qq -O - ${TATOEBA_DATA_COUNT_BASE}-min200.txt | cut -f1)
TATOEBA_AVAILABLE_TRG := ${sort ${filter-out ${SRC},${subst -, ,${filter %-${SRC} ${SRC}-%,${TATOEBA_RELEASED_DATA}}}}}
TATOEBA_AVAILABLE_SRC := ${sort ${filter-out ${TRG},${subst -, ,${filter %-${TRG} ${TRG}-%,${TATOEBA_RELEASED_DATA}}}}}
TATOEBA_AVAILABLE_TRG = ${sort ${filter-out ${SRC},${subst -, ,${filter %-${SRC} ${SRC}-%,${TATOEBA_RELEASED_DATA}}}}}
TATOEBA_AVAILABLE_SRC = ${sort ${filter-out ${TRG},${subst -, ,${filter %-${TRG} ${TRG}-%,${TATOEBA_RELEASED_DATA}}}}}
## extract language pairs for a specific subset
TATOEBA_SUBSET := lower
TATOEBA_RELEASED_SUBSET := $(shell wget -qq -O - ${TATOEBA_DATA_COUNT_BASE}-${TATOEBA_SUBSET}.txt | cut -f1)
TATOEBA_AVAILABLE_SUBSET_TRG := ${sort ${filter-out ${SRC},${subst -, ,${filter %-${SRC} ${SRC}-%,${TATOEBA_RELEASED_SUBSET}}}}}
TATOEBA_AVAILABLE_SUBSET_SRC := ${sort ${filter-out ${TRG},${subst -, ,${filter %-${TRG} ${TRG}-%,${TATOEBA_RELEASED_SUBSET}}}}}
TATOEBA_AVAILABLE_SUBSET_TRG = ${sort ${filter-out ${SRC},${subst -, ,${filter %-${SRC} ${SRC}-%,${TATOEBA_RELEASED_SUBSET}}}}}
TATOEBA_AVAILABLE_SUBSET_SRC = ${sort ${filter-out ${TRG},${subst -, ,${filter %-${TRG} ${TRG}-%,${TATOEBA_RELEASED_SUBSET}}}}}
@ -117,8 +125,8 @@ TATOEBA_LANGS := $(sort $(subst -, ,${TATOEBA_LANGPAIRS}))
## SRCLANGS converted to macro languages used in tatoeba releases
## and all non-available languages filtered out
MACRO_SRCLANGS := $(filter ${sort ${shell iso639 -m -n ${SRCLANGS}}},${TATOEBA_LANGS})
MACRO_TRGLANGS := $(filter ${sort ${shell iso639 -m -n ${TRGLANGS}}},${TATOEBA_LANGS})
MACRO_SRCLANGS = $(filter ${sort ${shell iso639 -m -n ${SRCLANGS}}},${TATOEBA_LANGS})
MACRO_TRGLANGS = $(filter ${sort ${shell iso639 -m -n ${TRGLANGS}}},${TATOEBA_LANGS})
@ -138,14 +146,14 @@ TATOEBA_TRGLABELFILE = ${WORKHOME}/${LANGPAIRSTR}/${DATASET}-langlabels.trg
## get source and target languages from the label files
ifneq (${wildcard ${TATOEBA_SRCLABELFILE}},)
TATOEBA_SRCLANGS := ${shell cat ${TATOEBA_SRCLABELFILE}}
TATOEBA_SRCLANGS = ${shell cat ${TATOEBA_SRCLABELFILE}}
else
TATOEBA_SRCLANGS := ${SRCLANGS}
TATOEBA_SRCLANGS = ${SRCLANGS}
endif
ifneq (${wildcard ${TATOEBA_TRGLABELFILE}},)
TATOEBA_TRGLANGS := ${shell cat ${TATOEBA_TRGLABELFILE}}
TATOEBA_TRGLANGS = ${shell cat ${TATOEBA_TRGLABELFILE}}
else
TATOEBA_TRGLANGS := ${TRGLANGS}
TATOEBA_TRGLANGS = ${TRGLANGS}
endif
ifdef TATOEBA_TRGLANGS

View File

@ -55,9 +55,6 @@ endif
## - use only the latest backtranslations
## if such a subdir exists
BACKTRANS_HOME ?= backtranslate
FORWARDTRANS_HOME ?= ${BACKTRANS_HOME}
PIVOTTRANS_HOME ?= pivoting
ifneq (${wildcard ${BACKTRANS_HOME}/${TRG}-${SRC}/latest},)

View File

@ -46,8 +46,8 @@ endif
ifdef NR_WORDS_RAWSRCTEST
ifdef NR_WORDS_RAWTRGTEST
WORD_RATIO_SRCTRG_RAWTEST = $$(( ${NR_WORDS_RAWSRCTEST} / ${NR_WORDS_RAWTRGTEST} ))
WORD_RATIO_TRGSRC_RAWTEST = $$(( ${NR_WORDS_RAWTRGTEST} / ${NR_WORDS_RAWSRCTEST} ))
WORD_RATIO_SRCTRG_RAWTEST = $$(( (${NR_WORDS_RAWSRCTEST} + 1) / (${NR_WORDS_RAWTRGTEST} + 1) ))
WORD_RATIO_TRGSRC_RAWTEST = $$(( (${NR_WORDS_RAWTRGTEST} + 1) / (${NR_WORDS_RAWSRCTEST} + 1) ))
WORD_RATIO_RAWTEST = ${shell printf "%s\n" ${WORD_RATIO_SRCTRG_RAWTEST} ${WORD_RATIO_TRGSRC_RAWTEST} | sort -nr | head -1}
WORD_RATIO_THRESHOLD = $$(( ${WORD_RATIO_RAWTEST} + 1 ))
endif
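## a worked example with made-up counts: NR_WORDS_RAWSRCTEST=12000 and
## NR_WORDS_RAWTRGTEST=9000 give (shell integer division)
##   WORD_RATIO_SRCTRG_RAWTEST = (12000+1) / (9000+1)  = 1
##   WORD_RATIO_TRGSRC_RAWTEST = (9000+1)  / (12000+1) = 0
##   WORD_RATIO_RAWTEST        = 1   (the larger of the two)
##   WORD_RATIO_THRESHOLD      = 1 + 1 = 2
## the +1 terms avoid a division by zero when one of the counts is 0; the same
## smoothing is applied to the character and charset ratios below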
@ -55,8 +55,8 @@ endif
ifdef NR_CHARS_RAWSRCTEST
ifdef NR_CHARS_RAWTRGTEST
CHAR_RATIO_SRCTRG_RAWTEST = $$(( ${NR_CHARS_RAWSRCTEST} / ${NR_CHARS_RAWTRGTEST} ))
CHAR_RATIO_TRGSRC_RAWTEST = $$(( ${NR_CHARS_RAWTRGTEST} / ${NR_CHARS_RAWSRCTEST} ))
CHAR_RATIO_SRCTRG_RAWTEST = $$(( (${NR_CHARS_RAWSRCTEST} + 1) / (${NR_CHARS_RAWTRGTEST} + 1) ))
CHAR_RATIO_TRGSRC_RAWTEST = $$(( (${NR_CHARS_RAWTRGTEST} + 1) / (${NR_CHARS_RAWSRCTEST} + 1) ))
CHAR_RATIO_RAWTEST = ${shell printf "%s\n" ${CHAR_RATIO_SRCTRG_RAWTEST} ${CHAR_RATIO_TRGSRC_RAWTEST} | sort -nr | head -1}
CHAR_RATIO_THRESHOLD = $$(( ${CHAR_RATIO_RAWTEST} + 1 ))
endif
@ -64,8 +64,8 @@ endif
ifdef UNIQUE_CHARS_RAWSRCTEST
ifdef UNIQUE_CHARS_RAWTRGTEST
CHARSET_RATIO_SRCTRG_RAWTEST = $$(( ${UNIQUE_CHARS_RAWSRCTEST} / ${UNIQUE_CHARS_RAWTRGTEST} ))
CHARSET_RATIO_TRGSRC_RAWTEST = $$(( ${UNIQUE_CHARS_RAWTRGTEST} / ${UNIQUE_CHARS_RAWSRCTEST} ))
CHARSET_RATIO_SRCTRG_RAWTEST = $$(( (${UNIQUE_CHARS_RAWSRCTEST} + 1) / ( ${UNIQUE_CHARS_RAWTRGTEST} + 1) ))
CHARSET_RATIO_TRGSRC_RAWTEST = $$(( (${UNIQUE_CHARS_RAWTRGTEST} + 1) / ( ${UNIQUE_CHARS_RAWSRCTEST} + 1) ))
CHARSET_RATIO_RAWTEST = ${shell printf "%s\n" ${CHARSET_RATIO_SRCTRG_RAWTEST} ${CHARSET_RATIO_TRGSRC_RAWTEST} | sort -nr | head -1}
CHARSET_RATIO_THRESHOLD = $$(( ${CHARSET_RATIO_RAWTEST} + 1 ))
endif
@ -102,27 +102,57 @@ print_data_thresholds:
STRICT_TRAIN_SRC = $(patsubst %.clean.${SRCEXT}.gz,%.strict.${SRCEXT}.gz,${CLEAN_TRAIN_SRC})
strict-clean-data: ${STRICT_TRAIN_SRC}
%.strict.${SRCEXT}.gz: %.clean.${SRCEXT}.gz
ifdef WORD_RATIO_THRESHOLD
$(MOSESSCRIPTS)/training/clean-corpus-n.perl \
if [ -e $< ]; then \
$(MOSESSCRIPTS)/training/clean-corpus-n.perl \
-ratio ${WORD_RATIO_THRESHOLD} \
-max-word-length ${LONGEST_WORD_THRESHOLD} \
$(<:.${SRCEXT}.gz=) \
$(SRCEXT) $(TRGEXT) \
$(@:.${SRCEXT}.gz=) \
${MIN_NR_TOKENS} ${MAX_NR_TOKENS}
${GZIP} -f $(@:.gz=) $(@:.${SRCEXT}.gz=.${TRGEXT})
${MIN_NR_TOKENS} ${MAX_NR_TOKENS}; \
${GZIP} -f $(@:.gz=) $(@:.${SRCEXT}.gz=.${TRGEXT}); \
fi
else
-ln -s $< $@
-ln -s $(<:.${SRCEXT}.gz=.${TRGEXT}.gz) $(@:.${SRCEXT}.gz=.${TRGEXT}.gz)
-if [ -e $< ]; then \
ln -s $< $@; \
ln -s $(<:.${SRCEXT}.gz=.${TRGEXT}.gz) $(@:.${SRCEXT}.gz=.${TRGEXT}.gz); \
fi
endif
%.strict.${TRGEXT}.gz: %.strict.${SRCEXT}.gz
@echo "done!"
## a second filter based on character-length and unique-character ratios (bitext_filter.pl)
STRICT2_TRAIN_SRC = $(patsubst %.clean.${SRCEXT}.gz,%.strict2.${SRCEXT}.gz,${CLEAN_TRAIN_SRC})
strict2-clean-data: ${STRICT2_TRAIN_SRC}
%.strict2.${SRCEXT}.gz: %.strict.${SRCEXT}.gz
ifdef CHAR_RATIO_THRESHOLD
if [ -e $< ]; then \
$(SCRIPTDIR)/bitext_filter.pl \
-l ${CHAR_RATIO_THRESHOLD} \
-c ${CHARSET_RATIO_THRESHOLD} \
$(SRCEXT) $(TRGEXT) \
$(<:.${SRCEXT}.gz=) \
$(@:.${SRCEXT}.gz=); \
fi
else
-if [ -e $< ]; then \
ln -s $< $@; \
ln -s $(<:.${SRCEXT}.gz=.${TRGEXT}.gz) $(@:.${SRCEXT}.gz=.${TRGEXT}.gz); \
fi
endif
%.strict2.${TRGEXT}.gz: %.strict2.${SRCEXT}.gz
@echo "done!"
@ -167,10 +197,13 @@ endif
## - line 4: length-of-longest-word
%.stats: %.gz
${GZCAT} $< | wc -lwmc > $@
${GZCAT} $< | sed 's/./& /g' | tr ' ' "\n" | sort -u | wc -l >> $@
${GZCAT} $< | wc -L >> $@
${GZCAT} $< | tr ' ' "\n" | wc -L >> $@
@if [ -e $< ]; then \
echo ".... create some stats for $<"; \
${GZCAT} $< | wc -lwmc > $@; \
${GZCAT} $< | sed 's/./& /g' | tr ' ' "\n" | sort -u | wc -l >> $@; \
${GZCAT} $< | wc -L >> $@; \
${GZCAT} $< | tr ' ' "\n" | wc -L >> $@; \
fi
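##
## for illustration, a resulting .stats file could look like this (made-up numbers):
##   125000 2100000 13500000 14200000   <- lines, words, characters, bytes (wc -lwmc)
##   213                                <- number of distinct characters
##   1024                               <- longest line in characters (wc -L)
##   47                                 <- longest word in characters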
##----------------------------------------------

View File

@ -40,3 +40,8 @@ elg-eng2all:
for l in ${ELG_EU_SELECTED}; do \
${MAKE} MODELTYPE=transformer-big tatoeba-eng2$${l}-trainjob-bt; \
done
elg-eng2missing:
for l in est lav ron hbs sqi spa fra ita por zlw ara heb deu fin; do \
${MAKE} MODELTYPE=transformer-big tatoeba-eng2$${l}-trainjob-bt; \
done

View File

@ -17,20 +17,27 @@ include ${REPOHOME}lib/allas.mk
include ${REPOHOME}lib/dist.mk
#------------------------------------------------------------------------
# create a model-specific config file
#------------------------------------------------------------------------
.PHONY: config local-config
config local-config: ${WORKDIR}/${MODELCONFIG}
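##
## a sketch of what this produces, assuming WORKHOME=work and the defaults
## DATASET=opus and MODELTYPE=transformer-align:
##
##   make SRCLANGS=de TRGLANGS=en local-config
##
## writes the model-specific parameters to work/de-en/opus.transformer-align.mk,
## which is picked up again on subsequent make calls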
#------------------------------------------------------------------------
# make various data sets (and word alignment)
#------------------------------------------------------------------------
.PHONY: data
data:
data:
@${MAKE} rawdata
@${MAKE} ${WORKDIR}/${MODELCONFIG}
@${MAKE} ${TRAINDATA_SRC} ${TRAINDATA_TRG}
@${MAKE} ${DEVDATA_SRC} ${DEVDATA_TRG}
@${MAKE} ${TESTDATA_SRC} ${TESTDATA_TRG}
@${MAKE} ${MODEL_SRCVOCAB} ${MODEL_TRGVOCAB}
@${MAKE} local-config
@${MAKE} traindata
@${MAKE} devdata
@${MAKE} testdata
@${MAKE} vocab
ifeq ($(filter align,${subst -, ,${MODELTYPE}}),align)
@${MAKE} ${TRAIN_ALG}
@${MAKE} wordalign
endif
traindata: ${TRAINDATA_SRC} ${TRAINDATA_TRG}
@ -160,7 +167,7 @@ endif
#------------------------------------------------------------------------
## use different HPC params depending on whether the job needs to word-align the data
ifeq ($(findstring align,${MODELTYPE}),)
ifeq ($(findstring align,${MODELTYPE}),align)
DATAJOB_HPCPARAMS = ${DATA_ALIGN_HPCPARAMS}
ALLJOB_HPCPARAMS = ${DATA_ALIGN_HPCPARAMS} ${TRAINJOB_HPCPARAMS}
else
@ -168,7 +175,6 @@ else
ALLJOB_HPCPARAMS = ${DATA_PREPARE_HPCPARAMS} ${TRAINJOB_HPCPARAMS}
endif
# all-job:
# - check whether data files exist
# - if not: create a CPU job that makes the data and starts a training job after that
@ -202,8 +208,8 @@ ifdef SLURM_JOBID
make ${TRAINJOB_HPCPARAMS} SBATCH_ARGS="-d afterok:${SLURM_JOBID}" train-and-eval.submit${GPUJOB_SUBMIT}
${MAKE} data
else
${MAKE} ${TRAINJOB_HPCPARAMS} train-and-eval.submit${GPUJOB_SUBMIT}
${MAKE} data
${MAKE} ${TRAINJOB_HPCPARAMS} train-and-eval.submit${GPUJOB_SUBMIT}
endif

View File

@ -226,42 +226,56 @@ ifneq ($(filter ${LANGPAIR},${TATOEBA_LANGPAIRS}),${LANGPAIR})
else
%/${TATOEBA_TRAINSET}.${LANGPAIR}.clean.${SRCEXT}.gz:
${MAKE} ${TMPWORKDIR}/$@.d/source.labels ${TMPWORKDIR}/$@.d/target.labels
@if [ `cat ${TMPWORKDIR}/$@.d/source.labels ${TMPWORKDIR}/$@.d/target.labels | wc -w` -gt 1 ]; then \
mkdir -p ${dir $@} ${TMPWORKDIR}/${notdir $@}.d
ln -s ${TMPWORKDIR}/${notdir $@}.d $@.d
@${MAKE} $@.d/source.labels $@.d/target.labels
@if [ `cat $@.d/source.labels $@.d/target.labels | wc -w` -gt 1 ]; then \
echo ".... found sublanguages in the data"; \
b="${TMPWORKDIR}/$@.d/${TATOEBADATA}"; \
for s in `cat ${TMPWORKDIR}/$@.d/source.labels`; do \
for t in `cat ${TMPWORKDIR}/$@.d/target.labels`; do \
if [ "$$s" \< "$$t" ]; then \
echo ".... extract $$s-$$t data"; \
for d in dev test train; do \
paste <(gzip -cd $$b/$$d.id.gz) <(gzip -cd $$b/$$d.src.gz) <(gzip -cd $$b/$$d.trg.gz) | \
grep -P "^$$s\t$$t\t" > ${TMPWORKDIR}/$@.d/$$d; \
if [ -s ${TMPWORKDIR}/$@.d/$$d ]; then \
cut -f1,2 ${TMPWORKDIR}/$@.d/$$d | ${GZIP} -c \
> ${dir $@}Tatoeba-$$d-${TATOEBA_VERSION}.$$s-$$t.clean.id.gz; \
cut -f3 ${TMPWORKDIR}/$@.d/$$d | ${GZIP} -c \
> ${dir $@}Tatoeba-$$d-${TATOEBA_VERSION}.$$s-$$t.clean.$$s.gz; \
cut -f4 ${TMPWORKDIR}/$@.d/$$d | ${GZIP} -c \
> ${dir $@}Tatoeba-$$d-${TATOEBA_VERSION}.$$s-$$t.clean.$$t.gz; \
fi \
done; \
if [ -e ${dir $@}Tatoeba-train-${TATOEBA_VERSION}.$$s-$$t.clean.id.gz ]; then \
paste <(gzip -cd $$b/$$d.id.gz) <(gzip -cd $$b/$$d.domain.gz) | \
grep -P "^$$s\t$$t\t" | cut -f3 | \
${GZIP} -c > ${dir $@}Tatoeba-train-${TATOEBA_VERSION}.$$s-$$t.clean.domain.gz; \
${ZCAT} ${dir $@}Tatoeba-train-${TATOEBA_VERSION}.$$s-$$t.clean.domain.gz |\
sort -u > ${dir $@}Tatoeba-train-${TATOEBA_VERSION}.$$s-$$t.clean.domains; \
echo "$$s" >> ${dir $@}Tatoeba-train-${TATOEBA_VERSION}.$$s-$$t.clean.$$s.labels; \
echo "$$t" >> ${dir $@}Tatoeba-train-${TATOEBA_VERSION}.$$s-$$t.clean.$$t.labels; \
b="$@.d/${TATOEBADATA}"; \
for s in `cat $@.d/source.labels`; do \
for t in `cat $@.d/target.labels`; do \
echo ".... extract $$s-$$t data"; \
for d in dev test train; do \
paste <(gzip -cd $$b/$$d.id.gz) <(gzip -cd $$b/$$d.src.gz) <(gzip -cd $$b/$$d.trg.gz) | \
grep -P "^$$s\t$$t\t" > $@.d/$$d; \
if [ -s $@.d/$$d ]; then \
if [ "$$s" \< "$$t" ]; then \
c="${dir $@}Tatoeba-$$d-${TATOEBA_VERSION}.$$s-$$t.clean"; \
cut -f1,2 $@.d/$$d | ${GZIP} -c > $$c.id.gz; \
else \
c="${dir $@}Tatoeba-$$d-${TATOEBA_VERSION}.$$t-$$s.clean"; \
cut -f1,2 $@.d/$$d | \
awk ' { t = $$1; $$1 = $$2; $$2 = t; print; } ' FS='\t' OFS='\t' |\
${GZIP} -c > $$c.id.gz; \
fi; \
cut -f3 $@.d/$$d | ${GZIP} -c > $$c.$$s.gz; \
cut -f4 $@.d/$$d | ${GZIP} -c > $$c.$$t.gz; \
fi \
done; \
if [ -e ${dir $@}Tatoeba-train-${TATOEBA_VERSION}.$$s-$$t.clean.id.gz ]; then \
paste <(gzip -cd $$b/$$d.id.gz) <(gzip -cd $$b/$$d.domain.gz) | \
grep -P "^$$s\t$$t\t" | cut -f3 | \
${GZIP} -c > ${dir $@}Tatoeba-train-${TATOEBA_VERSION}.$$s-$$t.clean.domain.gz; \
${ZCAT} ${dir $@}Tatoeba-train-${TATOEBA_VERSION}.$$s-$$t.clean.domain.gz |\
sort -u > ${dir $@}Tatoeba-train-${TATOEBA_VERSION}.$$s-$$t.clean.domains; \
echo "$$s" >> ${dir $@}Tatoeba-train-${TATOEBA_VERSION}.$$s-$$t.clean.$$s.labels; \
echo "$$t" >> ${dir $@}Tatoeba-train-${TATOEBA_VERSION}.$$s-$$t.clean.$$t.labels; \
fi \
done \
done \
fi
## NOTE: always need to copy label files to keep all sublanguages
## --> this is confusing if a sublanguage has the same ID as the macro-language
## --> example: ron includes ron and mol
## --> the label file for ron will include mol but the data files will not
## TODO: can we do that in a better way somehow?
@mv $@.d/source.labels \
${dir $@}Tatoeba-train-${TATOEBA_VERSION}.${LANGPAIR}.clean.${SORTSRCEXT}.labels
@mv $@.d/target.labels \
${dir $@}Tatoeba-train-${TATOEBA_VERSION}.${LANGPAIR}.clean.${SORTTRGEXT}.labels
@if [ ! -e ${dir $@}Tatoeba-train-${TATOEBA_VERSION}.${LANGPAIR}.clean.${SRCEXT}.gz ]; then \
echo ".... move data files"; \
b="${TMPWORKDIR}/$@.d/${TATOEBADATA}"; \
b="$@.d/${TATOEBADATA}"; \
for d in dev test train; do \
mv $$b/$$d.src.gz ${dir $@}Tatoeba-$$d-${TATOEBA_VERSION}.${LANGPAIR}.clean.${SORTSRCEXT}.gz; \
mv $$b/$$d.trg.gz ${dir $@}Tatoeba-$$d-${TATOEBA_VERSION}.${LANGPAIR}.clean.${SORTTRGEXT}.gz; \
@ -270,13 +284,9 @@ else
${ZCAT} $$b/train.domain.gz | sort -u | tr "\n" ' ' | sed 's/ *$$//' \
> ${dir $@}Tatoeba-train-${TATOEBA_VERSION}.${LANGPAIR}.clean.domains; \
mv $$b/train.domain.gz ${dir $@}Tatoeba-train-${TATOEBA_VERSION}.${LANGPAIR}.clean.domain.gz; \
mv ${TMPWORKDIR}/$@.d/source.labels \
${dir $@}Tatoeba-train-${TATOEBA_VERSION}.${LANGPAIR}.clean.${SORTSRCEXT}.labels; \
mv ${TMPWORKDIR}/$@.d/target.labels \
${dir $@}Tatoeba-train-${TATOEBA_VERSION}.${LANGPAIR}.clean.${SORTTRGEXT}.labels; \
fi
@echo ".... cleanup of temporary files"
@rm -fr ${TMPWORKDIR}/$@.d
@rm -fr ${TMPWORKDIR}/${notdir $@}.d $@.d
endif
@ -332,7 +342,8 @@ endif
@echo ".... fix language codes"
@mkdir -p ${dir $@}${TATOEBADATA}
@if [ -e ${dir $@}${TATOEBADATA}/train.id.gz ]; then \
${GZCAT} ${dir $@}${TATOEBADATA}/train.id.gz | cut -f2,3 $(FIXLANGIDS) | ${GZIP} -c > ${dir $@}train.id.gz; \
${GZCAT} ${dir $@}${TATOEBADATA}/train.id.gz | cut -f2,3 $(FIXLANGIDS) | \
${GZIP} -c > ${dir $@}train.id.gz; \
${GZCAT} ${dir $@}${TATOEBADATA}/train.id.gz | cut -f1 | ${GZIP} -c > ${dir $@}train.domain.gz; \
mv ${dir $@}train.id.gz ${dir $@}train.domain.gz ${dir $@}${TATOEBADATA}/; \
else \

scripts/bitext_filter.pl (new executable file, 100 lines)
View File

@ -0,0 +1,100 @@
#!/usr/bin/env perl
#
#
# bitext_filter.pl srclang trglang inputbase outputbase
# assumes that data is in
# inputbase.srclang.gz
# inputbase.trglang.gz
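#
# options: -l <max character-length ratio>    (default 2)
#          -c <max unique-character ratio>    (default 2)
#          -v  verbose: report every skipped line on STDERR
#
# example invocation (hypothetical file names):
#
#   bitext_filter.pl -l 2 -c 2 de en data/train.clean data/train.strict
#
# reads data/train.clean.de.gz and data/train.clean.en.gz and writes the
# retained sentence pairs to data/train.strict.de.gz and data/train.strict.en.gz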
use strict;
use Getopt::Std;
use vars qw/$opt_c $opt_l $opt_v/;
getopts('c:l:v');
# thresholds default to 2 and can be overridden with -l and -c
my $CharLengthRatio = $opt_l || 2;
my $UniqueCharRatio = $opt_c || 2;
die "USAGE: bitext_filter.pl srclang trglang inputbase outputbase"
unless ($#ARGV == 3);
my ($src, $trg, $inputbase, $outputbase) = @ARGV;
die "input and output have the same name" if ($inputbase eq $outputbase);
my $inputsrcfile = "$inputbase.$src.gz";
my $inputtrgfile = "$inputbase.$trg.gz";
my $outputsrcfile = "$outputbase.$src.gz";
my $outputtrgfile = "$outputbase.$trg.gz";
open SI,"gzip -cd <$inputsrcfile |" or die "cannot read from $inputsrcfile\n";
open TI,"gzip -cd <$inputtrgfile |" or die "cannot read from $inputtrgfile\n";
open SO,"| gzip -c >$outputsrcfile" or die "cannot write to $outputsrcfile\n";
open TO,"| gzip -c >$outputtrgfile" or die "cannot write to $outputtrgfile\n";
binmode(SI,":utf8");
binmode(TI,":utf8");
binmode(SO,":utf8");
binmode(TO,":utf8");
binmode(STDOUT,":utf8");
binmode(STDERR,":utf8");
my $count = 0;
my $skipped = 0;
my $skippedLength = 0;
my $skippedAlphabet = 0;
while (my $s = <SI>){
$count++;
unless ($opt_v){
print STDERR '.' unless ($count % 100000);
print STDERR " $count\n" unless ($count % 5000000);
}
my $t = <TI>;
my $sl = length($s);
my $tl = length($t);
unless ($sl && $tl){
$skipped++;
next;
}
my $LengthRatio = $sl > $tl ? $sl/$tl : $tl/$sl;
if ($LengthRatio > $CharLengthRatio){
print STDERR "skip line $count (length ratio $LengthRatio > $CharLengthRatio)\n" if ($opt_v);
$skipped++;
$skippedLength++;
next;
}
my %s = ();
my %t = ();
map { $s{$_}++ } split(//,$s);
map { $t{$_}++ } split(//,$t);
my $sa = scalar keys %s;
my $ta = scalar keys %t;
my $AlphabetRatio = $sa > $ta ? $sa/$ta : $ta/$sa;
if ( $AlphabetRatio > $UniqueCharRatio ){
print STDERR "skip line $count (unique char ratio $AlphabetRatio > $UniqueCharRatio\n" if ($opt_v);
$skipped++;
$skippedAlphabet++;
next;
}
print SO $s;
print TO $t;
}
print "\noriginal: $count\n";
print "skipped : $skipped\n";
print " skipped because of character length ratio: $skippedLength\n";
print " skipped because of alphabet size ratio : $skippedAlphabet\n";
print "retained: ",$count-$skipped,"\n";

View File

@ -108,21 +108,10 @@ REPOHOME := ${PWD}/../
# TATOEBA_VERSION = v2020-07-28
TATOEBA_VERSION = v2021-08-07
SMALLEST_TRAINSIZE = 1000
USE_REST_DEVDATA = 0
DATA_IS_SHUFFLED = 1
DEVSIZE = 5000
TESTSIZE = 10000
DEVMINSIZE = 200
# by default skip aligned data of the same language
SKIP_SAME_LANG = 1
include ${REPOHOME}lib/env.mk
include ${REPOHOME}lib/config.mk
include ${REPOHOME}lib/config/tatoeba.mk
include ${REPOHOME}lib/config.mk
include ${REPOHOME}lib/tasks.mk
include ${REPOHOME}lib/tasks/tatoeba/data.mk
include ${REPOHOME}lib/tasks/tatoeba/tune.mk
@ -133,11 +122,9 @@ include ${REPOHOME}lib/projects/elg.mk
.PHONY: all
all:
${MAKE} prepare
${MAKE} tatoeba-prepare
${MAKE} data-tatoeba
${MAKE} train-tatoeba
${MAKE} eval-tatoeba
@ -188,7 +175,6 @@ prepare tatoeba-prepare: ${TATOEBA_LANGIDS_TRAINONLY}
prepare-and-data tatoeba-prepare-and-data: ${TATOEBA_LANGIDS_TRAINONLY}
${MAKE} fetch-datasets
${MAKE} langlabel-files
${MAKE} local-config-tatoeba
${MAKE} data-tatoeba
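## example (hypothetical language pair): fetch the released Tatoeba data, create
## the language-label files and the local config, and build train/dev/test sets:
##
##   make SRCLANGS=deu TRGLANGS=eng tatoeba-prepare-and-data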