mirror of
https://github.com/Helsinki-NLP/OPUS-MT-train.git
synced 2024-10-03 23:57:47 +03:00
fixing many bugs with tatoeba model recipes
This commit is contained in:
parent
8b05bb352a
commit
4cc0ccb18d
2
Makefile
2
Makefile
@ -149,7 +149,7 @@ include lib/projects.mk
|
||||
.PHONY: all
|
||||
all:
|
||||
${MAKE} rawdata
|
||||
${MAKE} ${WORKDIR}/${MODELCONFIG}
|
||||
${MAKE} local-config
|
||||
${MAKE} data
|
||||
${MAKE} train
|
||||
${MAKE} eval
|
||||
|
135
lib/config.mk
135
lib/config.mk
@ -4,6 +4,52 @@
|
||||
#
|
||||
|
||||
|
||||
# load model-specific configuration parameters
|
||||
# if they exist in the work directory
|
||||
|
||||
##---------------------------------------------------------------
|
||||
## default name of the data set (and the model)
|
||||
##---------------------------------------------------------------
|
||||
|
||||
TRAINSET_NAME ?= opus
|
||||
DATASET ?= ${TRAINSET_NAME}
|
||||
|
||||
## various ways of setting the model languages
|
||||
##
|
||||
## (1) explicitly set source and target languages, for example:
|
||||
## SRCLANGS="da no sv" TRGLANGS="fi da"
|
||||
##
|
||||
## (2) specify language pairs, for example:
|
||||
## LANGPAIRS="de-en fi-sv da-es"
|
||||
## this will set SRCLANGS="de fi da" TRGLANGS="en sv es"
|
||||
##
|
||||
|
||||
## if LANGPAIRS are set and the model is not supposed to be SYMMETRIC
|
||||
## then set SRCLANGS and TRGLANGS to the languages in LANGPAIRS
|
||||
ifdef LANGPAIRS
|
||||
SRCLANGS ?= ${sort ${shell echo "${LANGPAIRS}" | tr ' ' "\n" | cut -f1 -d '-'}}
|
||||
TRGLANGS ?= ${sort ${shell echo "${LANGPAIRS}" | tr ' ' "\n" | cut -f2 -d '-'}}
|
||||
endif
|
||||
|
||||
|
||||
## LANGPAIRSTR is used as a sub-dir in WORKHOME
|
||||
SPACE := $(empty) $(empty)
|
||||
LANGSRCSTR ?= ${subst ${SPACE},+,$(SRCLANGS)}
|
||||
LANGTRGSTR ?= ${subst ${SPACE},+,$(TRGLANGS)}
|
||||
LANGPAIRSTR ?= ${LANGSRCSTR}-${LANGTRGSTR}
|
||||
WORKDIR = ${WORKHOME}/${LANGPAIRSTR}
|
||||
|
||||
## default model type
|
||||
MODELTYPE = transformer-align
|
||||
|
||||
|
||||
|
||||
MODELCONFIG = ${DATASET}${MODEL_VARIANT}.${MODELTYPE}.mk
|
||||
ifneq ($(wildcard ${WORKDIR}/${MODELCONFIG}),)
|
||||
include ${WORKDIR}/${MODELCONFIG}
|
||||
endif
|
||||
|
||||
|
||||
|
||||
## some pre-defined language sets
|
||||
include ${REPOHOME}lib/langsets.mk
|
||||
@ -25,10 +71,6 @@ MODELTYPES = transformer \
|
||||
transformer-tiny11 \
|
||||
transformer-tiny11-align
|
||||
|
||||
## default model type
|
||||
|
||||
MODELTYPE = transformer-align
|
||||
NR = 1
|
||||
|
||||
|
||||
## name of the model-specific configuration file
|
||||
@ -37,35 +79,18 @@ NR = 1
|
||||
# MODELCONFIG ?= config.mk
|
||||
|
||||
|
||||
## various ways of setting the model languages
|
||||
##
|
||||
## (1) explicitly set source and target languages, for example:
|
||||
## SRCLANGS="da no sv" TRGLANGS="fi da"
|
||||
##
|
||||
## (2) specify language pairs, for example:
|
||||
## LANGPAIRS="de-en fi-sv da-es"
|
||||
## this will set SRCLANGS="de fi da" TRGLANGS="en sv es"
|
||||
##
|
||||
|
||||
|
||||
## if LANGPAIRS are set and the model is not supposed to be SYMMETRIC
|
||||
## then set SRCLANGS and TRGLANGS to the languages in LANGPAIRS
|
||||
ifdef LANGPAIRS
|
||||
SRCLANGS ?= ${sort ${shell echo "${LANGPAIRS}" | tr ' ' "\n" | cut -f1 -d '-'}}
|
||||
TRGLANGS ?= ${sort ${shell echo "${LANGPAIRS}" | tr ' ' "\n" | cut -f2 -d '-'}}
|
||||
endif
|
||||
|
||||
|
||||
## set SRC and TRG unless they are specified already
|
||||
ifneq (${words ${SRCLANGS}},1)
|
||||
SRC ?= multi
|
||||
else
|
||||
SRC = ${SRCLANGS}
|
||||
SRC ?= ${SRCLANGS}
|
||||
endif
|
||||
ifneq (${words ${TRGLANGS}},1)
|
||||
TRG ?= multi
|
||||
else
|
||||
TRG = ${TRGLANGS}
|
||||
TRG ?= ${TRGLANGS}
|
||||
endif
|
||||
|
||||
|
||||
@ -96,14 +121,14 @@ SKIP_SAME_LANG ?= 0
|
||||
## --> is that a problem (would MarianNMT use different random shuffles / epoch?)
|
||||
##----------------------------------------------------------------------
|
||||
|
||||
SHUFFLE_DATA = 1
|
||||
# DATA_IS_SHUFFLED = 1
|
||||
SHUFFLE_DATA ?= 1
|
||||
# DATA_IS_SHUFFLED ?= 1
|
||||
|
||||
## devtest data is shuffled by default
|
||||
SHUFFLE_DEVDATA = 1
|
||||
SHUFFLE_DEVDATA ?= 1
|
||||
|
||||
## shuffle multilingual training data to mix language examples
|
||||
SHUFFLE_MULTILINGUAL_DATA = 1
|
||||
SHUFFLE_MULTILINGUAL_DATA ?= 1
|
||||
|
||||
##----------------------------------------------------------------------
|
||||
## set FIT_DATA_SIZE to a specific value to fit the training data
|
||||
@ -113,7 +138,7 @@ SHUFFLE_MULTILINGUAL_DATA = 1
|
||||
## the script does both, over- and undersampling
|
||||
##----------------------------------------------------------------------
|
||||
|
||||
# FIT_DATA_SIZE = 100000
|
||||
# FIT_DATA_SIZE ?= 100000
|
||||
|
||||
## similar for the dev data: set FIT_DEVDATA_SIZE to
|
||||
## balance the size of the devdata for each language pair
|
||||
@ -149,10 +174,6 @@ SORTLANGS = $(sort ${SRC} ${TRG})
|
||||
SORTSRC = ${firstword ${SORTLANGS}}
|
||||
SORTTRG = ${lastword ${SORTLANGS}}
|
||||
LANGPAIR = ${SORTSRC}-${SORTTRG}
|
||||
SPACE = $(empty) $(empty)
|
||||
LANGSRCSTR ?= ${subst ${SPACE},+,$(SRCLANGS)}
|
||||
LANGTRGSTR ?= ${subst ${SPACE},+,$(TRGLANGS)}
|
||||
LANGPAIRSTR ?= ${LANGSRCSTR}-${LANGTRGSTR}
|
||||
|
||||
|
||||
## for monolingual things
|
||||
@ -360,7 +381,7 @@ endif
|
||||
VOCABSIZE ?= $$((${SUBWORD_SRCVOCAB_SIZE} + ${SUBWORD_TRGVOCAB_SIZE} + 1000))
|
||||
|
||||
## for document-level models
|
||||
CONTEXT_SIZE = 100
|
||||
CONTEXT_SIZE ?= 100
|
||||
|
||||
|
||||
## pre-processing/data-cleanup type
|
||||
@ -371,10 +392,10 @@ CONTEXT_SIZE = 100
|
||||
## we need those data sets to get the parameters
|
||||
## for the strict mode
|
||||
|
||||
PRE = simple
|
||||
CLEAN_TRAINDATA_TYPE = strict
|
||||
CLEAN_DEVDATA_TYPE = strict
|
||||
CLEAN_TESTDATA_TYPE = clean
|
||||
PRE ?= simple
|
||||
CLEAN_TRAINDATA_TYPE ?= strict
|
||||
CLEAN_DEVDATA_TYPE ?= strict
|
||||
CLEAN_TESTDATA_TYPE ?= clean
|
||||
|
||||
|
||||
## subword splitting type
|
||||
@ -382,12 +403,6 @@ PRE_SRC = ${SUBWORDS}${SUBWORD_SRCVOCAB_SIZE:000=}k
|
||||
PRE_TRG = ${SUBWORDS}${SUBWORD_TRGVOCAB_SIZE:000=}k
|
||||
|
||||
|
||||
##-------------------------------------
|
||||
## default name of the data set (and the model)
|
||||
##-------------------------------------
|
||||
|
||||
TRAINSET_NAME ?= opus
|
||||
DATASET ?= ${TRAINSET_NAME}
|
||||
|
||||
## dev and test data come from one specific data set
|
||||
## if we have a bilingual model
|
||||
@ -406,10 +421,10 @@ TESTSET_NAME ?= opus-test
|
||||
|
||||
|
||||
## DATADIR = directory where the train/dev/test data are
|
||||
## WORKDIR = directory used for training
|
||||
## TODO: MODELDIR still in use?
|
||||
## TODO: SPMDIR still in use? (monolingual sp models)
|
||||
|
||||
DATADIR = ${WORKHOME}/data
|
||||
WORKDIR = ${WORKHOME}/${LANGPAIRSTR}
|
||||
MODELDIR = ${WORKHOME}/models/${LANGPAIRSTR}
|
||||
SPMDIR = ${WORKHOME}/SentencePieceModels
|
||||
|
||||
@ -443,11 +458,19 @@ TEST_SRC ?= ${WORKDIR}/test/${TESTSET_NAME}.src
|
||||
TEST_TRG ?= ${WORKDIR}/test/${TESTSET_NAME}.trg
|
||||
|
||||
|
||||
## model basename and optional sub-dir
|
||||
## home directories for back and forward translation
|
||||
BACKTRANS_HOME ?= backtranslate
|
||||
FORWARDTRANS_HOME ?= ${BACKTRANS_HOME}
|
||||
PIVOTTRANS_HOME ?= pivoting
|
||||
|
||||
# MODEL_SUBDIR =
|
||||
# MODEL_VARIANT =
|
||||
MODEL = ${MODEL_SUBDIR}${DATASET}${TRAINSIZE}${MODEL_VARIANT}.${PRE_SRC}-${PRE_TRG}
|
||||
|
||||
|
||||
## model basename and optional sub-dir
|
||||
## NR is used to create model ensembles
|
||||
## NR is also used to generate a seed value for initialisation
|
||||
|
||||
MODEL = ${MODEL_SUBDIR}${DATASET}${TRAINSIZE}${MODEL_VARIANT}.${PRE_SRC}-${PRE_TRG}
|
||||
NR = 1
|
||||
|
||||
MODEL_BASENAME = ${MODEL}.${MODELTYPE}.model${NR}
|
||||
MODEL_VALIDLOG = ${MODEL}.${MODELTYPE}.valid${NR}.log
|
||||
@ -650,24 +673,12 @@ endif
|
||||
|
||||
|
||||
|
||||
# load model-specific configuration parameters
|
||||
# if they exist in the work directory
|
||||
|
||||
# MODELCONFIG ?= ${MODEL}.${MODELTYPE}.mk
|
||||
MODELCONFIG = ${DATASET}${MODEL_VARIANT}.${MODELTYPE}.mk
|
||||
ifneq ($(wildcard ${WORKDIR}/${MODELCONFIG}),)
|
||||
include ${WORKDIR}/${MODELCONFIG}
|
||||
endif
|
||||
|
||||
|
||||
|
||||
|
||||
## make some data size-specific configuration parameters
|
||||
## TODO: is it OK to delete LOCAL_TRAIN data?
|
||||
|
||||
.PHONY: config local-config
|
||||
config local-config: ${WORKDIR}/${MODELCONFIG}
|
||||
|
||||
SMALLEST_TRAINSIZE ?= 10000
|
||||
SMALL_TRAINSIZE ?= 100000
|
||||
MEDIUM_TRAINSIZE ?= 500000
|
||||
@ -761,8 +772,6 @@ ${WORKDIR}/${MODELCONFIG}:
|
||||
@echo "TESTSET = ${TESTSET}" >> $@
|
||||
@echo "PRE = ${PRE}" >> $@
|
||||
@echo "SUBWORDS = ${SUBWORDS}" >> $@
|
||||
@echo "MODEL_SRCVOCAB = ${MODEL_SRCVOCAB}" >> $@
|
||||
@echo "MODEL_TRGVOCAB = ${MODEL_TRGVOCAB}" >> $@
|
||||
ifdef SHUFFLE_DATA
|
||||
@echo "SHUFFLE_DATA = ${SHUFFLE_DATA}" >> $@
|
||||
endif
|
||||
|
@ -3,7 +3,6 @@
|
||||
TATOEBA_VERSION ?= v2021-08-07
|
||||
TATOEBA_VERSION_NOHYPHEN = $(subst -,,${TATOEBA_VERSION})
|
||||
|
||||
|
||||
ifeq (${SRCLANGS},)
|
||||
ifdef SRC
|
||||
SRCLANGS = ${SRC}
|
||||
@ -17,34 +16,43 @@ endif
|
||||
|
||||
# WORKHOME := ${PWD}/work-tatoeba
|
||||
|
||||
SMALLEST_TRAINSIZE = 1000
|
||||
USE_REST_DEVDATA = 0
|
||||
DATA_IS_SHUFFLED = 1
|
||||
DEVSIZE = 5000
|
||||
TESTSIZE = 10000
|
||||
DEVMINSIZE = 200
|
||||
|
||||
SMALLEST_TRAINSIZE ?= 1000
|
||||
DEVSIZE ?= 5000
|
||||
TESTSIZE ?= 10000
|
||||
DEVMINSIZE ?= 200
|
||||
# by default skip aligned data of the same language
|
||||
# don't use anything from dev-data
|
||||
# don't shuffle data because they are already shuffled
|
||||
# but shuffle multilingual data to mix languages
|
||||
|
||||
SKIP_SAME_LANG = 1
|
||||
USE_REST_DEVDATA = 0
|
||||
SHUFFLE_DATA = 0
|
||||
SHUFFLE_DEVDATA = 1
|
||||
SHUFFLE_MULTILINGUAL_DATA = 1
|
||||
DATA_IS_SHUFFLED = 1
|
||||
|
||||
## this will be the base name of the model file
|
||||
TATOEBA_DATASET := opusTC${TATOEBA_VERSION_NOHYPHEN}
|
||||
|
||||
TATOEBA_TRAINSET := Tatoeba-train-${TATOEBA_VERSION}
|
||||
TATOEBA_DEVSET := Tatoeba-dev-${TATOEBA_VERSION}
|
||||
TATOEBA_TESTSET := Tatoeba-test-${TATOEBA_VERSION}
|
||||
|
||||
DATASET = ${TATOEBA_DATASET}
|
||||
TRAINSET = ${TATOEBA_TRAINSET}
|
||||
DEVSET = ${TATOEBA_DEVSET}
|
||||
TESTSET = ${TATOEBA_TESTSET}
|
||||
DEVSET_NAME = ${TATOEBA_DEVSET}
|
||||
TESTSET_NAME = ${TATOEBA_TESTSET}
|
||||
TRAINSET_NAME = ${TATOEBA_TRAINSET}
|
||||
## Tatoeba specific data sets
|
||||
TATOEBA_DATASET := opusTC${TATOEBA_VERSION_NOHYPHEN}
|
||||
TATOEBA_TRAINSET := Tatoeba-train-${TATOEBA_VERSION}
|
||||
TATOEBA_DEVSET := Tatoeba-dev-${TATOEBA_VERSION}
|
||||
TATOEBA_TESTSET := Tatoeba-test-${TATOEBA_VERSION}
|
||||
|
||||
## change data set names
|
||||
## DATASET will also be the base name of the model file
|
||||
DATASET := ${TATOEBA_DATASET}
|
||||
TRAINSET := ${TATOEBA_TRAINSET}
|
||||
DEVSET := ${TATOEBA_DEVSET}
|
||||
TESTSET := ${TATOEBA_TESTSET}
|
||||
DEVSET_NAME := ${TATOEBA_DEVSET}
|
||||
TESTSET_NAME := ${TATOEBA_TESTSET}
|
||||
TRAINSET_NAME := ${TATOEBA_TRAINSET}
|
||||
|
||||
##
|
||||
BACKTRANS_HOME = ${PWD}/back-translate
|
||||
FORWARDTRANS_HOME = ${PWD}/forward-translate
|
||||
MODELSHOME = ${PWD}/models
|
||||
@ -93,14 +101,14 @@ RELEASED_TATOEBA_DATA_FILE = tatoeba/released-bitexts-${TATOEBA_VERSION}.txt
|
||||
## also extract all source languages that are available for a given target language
|
||||
## and vice versa
|
||||
TATOEBA_RELEASED_DATA := $(shell wget -qq -O - ${TATOEBA_DATA_COUNT_BASE}-min200.txt | cut -f1)
|
||||
TATOEBA_AVAILABLE_TRG := ${sort ${filter-out ${SRC},${subst -, ,${filter %-${SRC} ${SRC}-%,${TATOEBA_RELEASED_DATA}}}}}
|
||||
TATOEBA_AVAILABLE_SRC := ${sort ${filter-out ${TRG},${subst -, ,${filter %-${TRG} ${TRG}-%,${TATOEBA_RELEASED_DATA}}}}}
|
||||
TATOEBA_AVAILABLE_TRG = ${sort ${filter-out ${SRC},${subst -, ,${filter %-${SRC} ${SRC}-%,${TATOEBA_RELEASED_DATA}}}}}
|
||||
TATOEBA_AVAILABLE_SRC = ${sort ${filter-out ${TRG},${subst -, ,${filter %-${TRG} ${TRG}-%,${TATOEBA_RELEASED_DATA}}}}}
|
||||
|
||||
## extract language pairs for a specific subset
|
||||
TATOEBA_SUBSET := lower
|
||||
TATOEBA_RELEASED_SUBSET := $(shell wget -qq -O - ${TATOEBA_DATA_COUNT_BASE}-${TATOEBA_SUBSET}.txt | cut -f1)
|
||||
TATOEBA_AVAILABLE_SUBSET_TRG := ${sort ${filter-out ${SRC},${subst -, ,${filter %-${SRC} ${SRC}-%,${TATOEBA_RELEASED_SUBSET}}}}}
|
||||
TATOEBA_AVAILABLE_SUBSET_SRC := ${sort ${filter-out ${TRG},${subst -, ,${filter %-${TRG} ${TRG}-%,${TATOEBA_RELEASED_SUBSET}}}}}
|
||||
TATOEBA_AVAILABLE_SUBSET_TRG = ${sort ${filter-out ${SRC},${subst -, ,${filter %-${SRC} ${SRC}-%,${TATOEBA_RELEASED_SUBSET}}}}}
|
||||
TATOEBA_AVAILABLE_SUBSET_SRC = ${sort ${filter-out ${TRG},${subst -, ,${filter %-${TRG} ${TRG}-%,${TATOEBA_RELEASED_SUBSET}}}}}
|
||||
|
||||
|
||||
|
||||
@ -117,8 +125,8 @@ TATOEBA_LANGS := $(sort $(subst -, ,${TATOEBA_LANGPAIRS}))
|
||||
|
||||
## SRCLANGS converted to macro languages used in tatoeba releases
|
||||
## and all non-available languages filtered out
|
||||
MACRO_SRCLANGS := $(filter ${sort ${shell iso639 -m -n ${SRCLANGS}}},${TATOEBA_LANGS})
|
||||
MACRO_TRGLANGS := $(filter ${sort ${shell iso639 -m -n ${TRGLANGS}}},${TATOEBA_LANGS})
|
||||
MACRO_SRCLANGS = $(filter ${sort ${shell iso639 -m -n ${SRCLANGS}}},${TATOEBA_LANGS})
|
||||
MACRO_TRGLANGS = $(filter ${sort ${shell iso639 -m -n ${TRGLANGS}}},${TATOEBA_LANGS})
|
||||
|
||||
|
||||
|
||||
@ -138,14 +146,14 @@ TATOEBA_TRGLABELFILE = ${WORKHOME}/${LANGPAIRSTR}/${DATASET}-langlabels.trg
|
||||
## get source and target languages from the label files
|
||||
|
||||
ifneq (${wildcard ${TATOEBA_SRCLABELFILE}},)
|
||||
TATOEBA_SRCLANGS := ${shell cat ${TATOEBA_SRCLABELFILE}}
|
||||
TATOEBA_SRCLANGS = ${shell cat ${TATOEBA_SRCLABELFILE}}
|
||||
else
|
||||
TATOEBA_SRCLANGS := ${SRCLANGS}
|
||||
TATOEBA_SRCLANGS = ${SRCLANGS}
|
||||
endif
|
||||
ifneq (${wildcard ${TATOEBA_TRGLABELFILE}},)
|
||||
TATOEBA_TRGLANGS := ${shell cat ${TATOEBA_TRGLABELFILE}}
|
||||
TATOEBA_TRGLANGS = ${shell cat ${TATOEBA_TRGLABELFILE}}
|
||||
else
|
||||
TATOEBA_TRGLANGS := ${TRGLANGS}
|
||||
TATOEBA_TRGLANGS = ${TRGLANGS}
|
||||
endif
|
||||
|
||||
ifdef TATOEBA_TRGLANGS
|
||||
|
@ -55,9 +55,6 @@ endif
|
||||
## - use only the latest backtranslations
|
||||
## if such a subdir exists
|
||||
|
||||
BACKTRANS_HOME ?= backtranslate
|
||||
FORWARDTRANS_HOME ?= ${BACKTRANS_HOME}
|
||||
PIVOTTRANS_HOME ?= pivoting
|
||||
|
||||
|
||||
ifneq (${wildcard ${BACKTRANS_HOME}/${TRG}-${SRC}/latest},)
|
||||
|
@ -46,8 +46,8 @@ endif
|
||||
|
||||
ifdef NR_WORDS_RAWSRCTEST
|
||||
ifdef NR_WORDS_RAWTRGTEST
|
||||
WORD_RATIO_SRCTRG_RAWTEST = $$(( ${NR_WORDS_RAWSRCTEST} / ${NR_WORDS_RAWTRGTEST} ))
|
||||
WORD_RATIO_TRGSRC_RAWTEST = $$(( ${NR_WORDS_RAWTRGTEST} / ${NR_WORDS_RAWSRCTEST} ))
|
||||
WORD_RATIO_SRCTRG_RAWTEST = $$(( (${NR_WORDS_RAWSRCTEST} + 1) / (${NR_WORDS_RAWTRGTEST} + 1) ))
|
||||
WORD_RATIO_TRGSRC_RAWTEST = $$(( (${NR_WORDS_RAWTRGTEST} + 1) / (${NR_WORDS_RAWSRCTEST} + 1) ))
|
||||
WORD_RATIO_RAWTEST = ${shell printf "%s\n" ${WORD_RATIO_SRCTRG_RAWTEST} ${WORD_RATIO_TRGSRC_RAWTEST} | sort -nr | head -1}
|
||||
WORD_RATIO_THRESHOLD = $$(( ${WORD_RATIO_RAWTEST} + 1 ))
|
||||
endif
|
||||
@ -55,8 +55,8 @@ endif
|
||||
|
||||
ifdef NR_CHARS_RAWSRCTEST
|
||||
ifdef NR_CHARS_RAWTRGTEST
|
||||
CHAR_RATIO_SRCTRG_RAWTEST = $$(( ${NR_CHARS_RAWSRCTEST} / ${NR_CHARS_RAWTRGTEST} ))
|
||||
CHAR_RATIO_TRGSRC_RAWTEST = $$(( ${NR_CHARS_RAWTRGTEST} / ${NR_CHARS_RAWSRCTEST} ))
|
||||
CHAR_RATIO_SRCTRG_RAWTEST = $$(( (${NR_CHARS_RAWSRCTEST} + 1) / (${NR_CHARS_RAWTRGTEST} + 1) ))
|
||||
CHAR_RATIO_TRGSRC_RAWTEST = $$(( (${NR_CHARS_RAWTRGTEST} + 1) / (${NR_CHARS_RAWSRCTEST} + 1) ))
|
||||
CHAR_RATIO_RAWTEST = ${shell printf "%s\n" ${CHAR_RATIO_SRCTRG_RAWTEST} ${CHAR_RATIO_TRGSRC_RAWTEST} | sort -nr | head -1}
|
||||
CHAR_RATIO_THRESHOLD = $$(( ${CHAR_RATIO_RAWTEST} + 1 ))
|
||||
endif
|
||||
@ -64,8 +64,8 @@ endif
|
||||
|
||||
ifdef UNIQUE_CHARS_RAWSRCTEST
|
||||
ifdef UNIQUE_CHARS_RAWTRGTEST
|
||||
CHARSET_RATIO_SRCTRG_RAWTEST = $$(( ${UNIQUE_CHARS_RAWSRCTEST} / ${UNIQUE_CHARS_RAWTRGTEST} ))
|
||||
CHARSET_RATIO_TRGSRC_RAWTEST = $$(( ${UNIQUE_CHARS_RAWTRGTEST} / ${UNIQUE_CHARS_RAWSRCTEST} ))
|
||||
CHARSET_RATIO_SRCTRG_RAWTEST = $$(( (${UNIQUE_CHARS_RAWSRCTEST} + 1) / ( ${UNIQUE_CHARS_RAWTRGTEST} + 1) ))
|
||||
CHARSET_RATIO_TRGSRC_RAWTEST = $$(( (${UNIQUE_CHARS_RAWTRGTEST} + 1) / ( ${UNIQUE_CHARS_RAWSRCTEST} + 1) ))
|
||||
CHARSET_RATIO_RAWTEST = ${shell printf "%s\n" ${CHARSET_RATIO_SRCTRG_RAWTEST} ${CHARSET_RATIO_TRGSRC_RAWTEST} | sort -nr | head -1}
|
||||
CHARSET_RATIO_THRESHOLD = $$(( ${CHARSET_RATIO_RAWTEST} + 1 ))
|
||||
endif
|
||||
@ -102,27 +102,57 @@ print_data_thresholds:
|
||||
|
||||
STRICT_TRAIN_SRC = $(patsubst %.clean.${SRCEXT}.gz,%.strict.${SRCEXT}.gz,${CLEAN_TRAIN_SRC})
|
||||
|
||||
|
||||
strict-clean-data: ${STRICT_TRAIN_SRC}
|
||||
|
||||
%.strict.${SRCEXT}.gz: %.clean.${SRCEXT}.gz
|
||||
ifdef WORD_RATIO_THRESHOLD
|
||||
$(MOSESSCRIPTS)/training/clean-corpus-n.perl \
|
||||
if [ -e $< ]; then \
|
||||
$(MOSESSCRIPTS)/training/clean-corpus-n.perl \
|
||||
-ratio ${WORD_RATIO_THRESHOLD} \
|
||||
-max-word-length ${LONGEST_WORD_THRESHOLD} \
|
||||
$(<:.${SRCEXT}.gz=) \
|
||||
$(SRCEXT) $(TRGEXT) \
|
||||
$(@:.${SRCEXT}.gz=) \
|
||||
${MIN_NR_TOKENS} ${MAX_NR_TOKENS}
|
||||
${GZIP} -f $(@:.gz=) $(@:.${SRCEXT}.gz=.${TRGEXT})
|
||||
${MIN_NR_TOKENS} ${MAX_NR_TOKENS}; \
|
||||
${GZIP} -f $(@:.gz=) $(@:.${SRCEXT}.gz=.${TRGEXT}); \
|
||||
fi
|
||||
else
|
||||
-ln -s $< $@
|
||||
-ln -s $(<:.${SRCEXT}.gz=.${TRGEXT}.gz) $(@:.${SRCEXT}.gz=.${TRGEXT}.gz)
|
||||
-if [ -e $< ]; then \
|
||||
ln -s $< $@; \
|
||||
ln -s $(<:.${SRCEXT}.gz=.${TRGEXT}.gz) $(@:.${SRCEXT}.gz=.${TRGEXT}.gz); \
|
||||
fi
|
||||
endif
|
||||
|
||||
%.strict.${TRGEXT}.gz: %.strict.${SRCEXT}.gz
|
||||
@echo "done!"
|
||||
|
||||
|
||||
## yet another filter
|
||||
|
||||
STRICT2_TRAIN_SRC = $(patsubst %.clean.${SRCEXT}.gz,%.strict2.${SRCEXT}.gz,${CLEAN_TRAIN_SRC})
|
||||
strict2-clean-data: ${STRICT2_TRAIN_SRC}
|
||||
|
||||
%.strict2.${SRCEXT}.gz: %.strict.${SRCEXT}.gz
|
||||
ifdef CHAR_RATIO_THRESHOLD
|
||||
if [ -e $< ]; then \
|
||||
$(SCRIPTDIR)/bitext_filter.pl \
|
||||
-l ${CHAR_RATIO_THRESHOLD} \
|
||||
-c ${CHARSET_RATIO_THRESHOLD} \
|
||||
$(SRCEXT) $(TRGEXT) \
|
||||
$(<:.${SRCEXT}.gz=) \
|
||||
$(@:.${SRCEXT}.gz=); \
|
||||
fi
|
||||
else
|
||||
-if [ -e $< ]; then \
|
||||
ln -s $< $@; \
|
||||
ln -s $(<:.${SRCEXT}.gz=.${TRGEXT}.gz) $(@:.${SRCEXT}.gz=.${TRGEXT}.gz); \
|
||||
fi
|
||||
endif
|
||||
|
||||
%.strict2.${TRGEXT}.gz: %.strict2.${SRCEXT}.gz
|
||||
@echo "done!"
|
||||
|
||||
|
||||
|
||||
|
||||
@ -167,10 +197,13 @@ endif
|
||||
## - line 4: length-of-longest-word
|
||||
|
||||
%.stats: %.gz
|
||||
${GZCAT} $< | wc -lwmc > $@
|
||||
${GZCAT} $< | sed 's/./& /g' | tr ' ' "\n" | sort -u | wc -l >> $@
|
||||
${GZCAT} $< | wc -L >> $@
|
||||
${GZCAT} $< | tr ' ' "\n" | wc -L >> $@
|
||||
@if [ -e $< ]; then \
|
||||
echo ".... create some stats for $<"; \
|
||||
${GZCAT} $< | wc -lwmc > $@; \
|
||||
${GZCAT} $< | sed 's/./& /g' | tr ' ' "\n" | sort -u | wc -l >> $@; \
|
||||
${GZCAT} $< | wc -L >> $@; \
|
||||
${GZCAT} $< | tr ' ' "\n" | wc -L >> $@; \
|
||||
fi
|
||||
|
||||
|
||||
##----------------------------------------------
|
||||
|
@ -40,3 +40,8 @@ elg-eng2all:
|
||||
for l in ${ELG_EU_SELECTED}; do \
|
||||
${MAKE} MODELTYPE=transformer-big tatoeba-eng2$${l}-trainjob-bt; \
|
||||
done
|
||||
|
||||
elg-eng2missing:
|
||||
for l in est lav ron hbs sqi spa fra ita por zlw ara heb deu fin; do \
|
||||
${MAKE} MODELTYPE=transformer-big tatoeba-eng2$${l}-trainjob-bt; \
|
||||
done
|
||||
|
26
lib/tasks.mk
26
lib/tasks.mk
@ -17,20 +17,27 @@ include ${REPOHOME}lib/allas.mk
|
||||
include ${REPOHOME}lib/dist.mk
|
||||
|
||||
|
||||
#------------------------------------------------------------------------
|
||||
# create a model-specific config file
|
||||
#------------------------------------------------------------------------
|
||||
|
||||
.PHONY: config local-config
|
||||
config local-config: ${WORKDIR}/${MODELCONFIG}
|
||||
|
||||
#------------------------------------------------------------------------
|
||||
# make various data sets (and word alignment)
|
||||
#------------------------------------------------------------------------
|
||||
|
||||
.PHONY: data
|
||||
data:
|
||||
data:
|
||||
@${MAKE} rawdata
|
||||
@${MAKE} ${WORKDIR}/${MODELCONFIG}
|
||||
@${MAKE} ${TRAINDATA_SRC} ${TRAINDATA_TRG}
|
||||
@${MAKE} ${DEVDATA_SRC} ${DEVDATA_TRG}
|
||||
@${MAKE} ${TESTDATA_SRC} ${TESTDATA_TRG}
|
||||
@${MAKE} ${MODEL_SRCVOCAB} ${MODEL_TRGVOCAB}
|
||||
@${MAKE} local-config
|
||||
@${MAKE} traindata
|
||||
@${MAKE} devdata
|
||||
@${MAKE} testdata
|
||||
@${MAKE} vocab
|
||||
ifeq ($(filter align,${subst -, ,${MODELTYPE}}),align)
|
||||
@${MAKE} ${TRAIN_ALG}
|
||||
@${MAKE} wordalign
|
||||
endif
|
||||
|
||||
traindata: ${TRAINDATA_SRC} ${TRAINDATA_TRG}
|
||||
@ -160,7 +167,7 @@ endif
|
||||
#------------------------------------------------------------------------
|
||||
|
||||
## copy different HPC params for jobs that need to wordalign data or not
|
||||
ifeq ($(findstring align,${MODELTYPE}),)
|
||||
ifeq ($(findstring align,${MODELTYPE}),align)
|
||||
DATAJOB_HPCPARAMS = ${DATA_ALIGN_HPCPARAMS}
|
||||
ALLJOB_HPCPARAMS = ${DATA_ALIGN_HPCPARAMS} ${TRAINJOB_HPCPARAMS}
|
||||
else
|
||||
@ -168,7 +175,6 @@ else
|
||||
ALLJOB_HPCPARAMS = ${DATA_PREPARE_HPCPARAMS} ${TRAINJOB_HPCPARAMS}
|
||||
endif
|
||||
|
||||
|
||||
# all-job:
|
||||
# - check whether data files exist
|
||||
# - if not: create a CPU job that makes the data and starts a training job after that
|
||||
@ -202,8 +208,8 @@ ifdef SLURM_JOBID
|
||||
make ${TRAINJOB_HPCPARAMS} SBATCH_ARGS="-d afterok:${SLURM_JOBID}" train-and-eval.submit${GPUJOB_SUBMIT}
|
||||
${MAKE} data
|
||||
else
|
||||
${MAKE} ${TRAINJOB_HPCPARAMS} train-and-eval.submit${GPUJOB_SUBMIT}
|
||||
${MAKE} data
|
||||
${MAKE} ${TRAINJOB_HPCPARAMS} train-and-eval.submit${GPUJOB_SUBMIT}
|
||||
endif
|
||||
|
||||
|
||||
|
@ -226,42 +226,56 @@ ifneq ($(filter ${LANGPAIR},${TATOEBA_LANGPAIRS}),${LANGPAIR})
|
||||
else
|
||||
|
||||
%/${TATOEBA_TRAINSET}.${LANGPAIR}.clean.${SRCEXT}.gz:
|
||||
${MAKE} ${TMPWORKDIR}/$@.d/source.labels ${TMPWORKDIR}/$@.d/target.labels
|
||||
@if [ `cat ${TMPWORKDIR}/$@.d/source.labels ${TMPWORKDIR}/$@.d/target.labels | wc -w` -gt 1 ]; then \
|
||||
mkdir -p ${dir $@} ${TMPWORKDIR}/${notdir $@}.d
|
||||
ln -s ${TMPWORKDIR}/${notdir $@}.d $@.d
|
||||
@${MAKE} $@.d/source.labels $@.d/target.labels
|
||||
@if [ `cat $@.d/source.labels $@.d/target.labels | wc -w` -gt 1 ]; then \
|
||||
echo ".... found sublanguages in the data"; \
|
||||
b="${TMPWORKDIR}/$@.d/${TATOEBADATA}"; \
|
||||
for s in `cat ${TMPWORKDIR}/$@.d/source.labels`; do \
|
||||
for t in `cat ${TMPWORKDIR}/$@.d/target.labels`; do \
|
||||
if [ "$$s" \< "$$t" ]; then \
|
||||
echo ".... extract $$s-$$t data"; \
|
||||
for d in dev test train; do \
|
||||
paste <(gzip -cd $$b/$$d.id.gz) <(gzip -cd $$b/$$d.src.gz) <(gzip -cd $$b/$$d.trg.gz) | \
|
||||
grep -P "^$$s\t$$t\t" > ${TMPWORKDIR}/$@.d/$$d; \
|
||||
if [ -s ${TMPWORKDIR}/$@.d/$$d ]; then \
|
||||
cut -f1,2 ${TMPWORKDIR}/$@.d/$$d | ${GZIP} -c \
|
||||
> ${dir $@}Tatoeba-$$d-${TATOEBA_VERSION}.$$s-$$t.clean.id.gz; \
|
||||
cut -f3 ${TMPWORKDIR}/$@.d/$$d | ${GZIP} -c \
|
||||
> ${dir $@}Tatoeba-$$d-${TATOEBA_VERSION}.$$s-$$t.clean.$$s.gz; \
|
||||
cut -f4 ${TMPWORKDIR}/$@.d/$$d | ${GZIP} -c \
|
||||
> ${dir $@}Tatoeba-$$d-${TATOEBA_VERSION}.$$s-$$t.clean.$$t.gz; \
|
||||
fi \
|
||||
done; \
|
||||
if [ -e ${dir $@}Tatoeba-train-${TATOEBA_VERSION}.$$s-$$t.clean.id.gz ]; then \
|
||||
paste <(gzip -cd $$b/$$d.id.gz) <(gzip -cd $$b/$$d.domain.gz) | \
|
||||
grep -P "^$$s\t$$t\t" | cut -f3 | \
|
||||
${GZIP} -c > ${dir $@}Tatoeba-train-${TATOEBA_VERSION}.$$s-$$t.clean.domain.gz; \
|
||||
${ZCAT} ${dir $@}Tatoeba-train-${TATOEBA_VERSION}.$$s-$$t.clean.domain.gz |\
|
||||
sort -u > ${dir $@}Tatoeba-train-${TATOEBA_VERSION}.$$s-$$t.clean.domains; \
|
||||
echo "$$s" >> ${dir $@}Tatoeba-train-${TATOEBA_VERSION}.$$s-$$t.clean.$$s.labels; \
|
||||
echo "$$t" >> ${dir $@}Tatoeba-train-${TATOEBA_VERSION}.$$s-$$t.clean.$$t.labels; \
|
||||
b="$@.d/${TATOEBADATA}"; \
|
||||
for s in `cat $@.d/source.labels`; do \
|
||||
for t in `cat $@.d/target.labels`; do \
|
||||
echo ".... extract $$s-$$t data"; \
|
||||
for d in dev test train; do \
|
||||
paste <(gzip -cd $$b/$$d.id.gz) <(gzip -cd $$b/$$d.src.gz) <(gzip -cd $$b/$$d.trg.gz) | \
|
||||
grep -P "^$$s\t$$t\t" > $@.d/$$d; \
|
||||
if [ -s $@.d/$$d ]; then \
|
||||
if [ "$$s" \< "$$t" ]; then \
|
||||
c="${dir $@}Tatoeba-$$d-${TATOEBA_VERSION}.$$s-$$t.clean"; \
|
||||
cut -f1,2 $@.d/$$d | ${GZIP} -c > $$c.id.gz; \
|
||||
else \
|
||||
c="${dir $@}Tatoeba-$$d-${TATOEBA_VERSION}.$$t-$$s.clean"; \
|
||||
cut -f1,2 $@.d/$$d | \
|
||||
awk ' { t = $$1; $$1 = $$2; $$2 = t; print; } ' FS='\t' OFS='\t' |\
|
||||
${GZIP} -c > $$c.id.gz; \
|
||||
fi; \
|
||||
cut -f3 $@.d/$$d | ${GZIP} -c > $$c.$$s.gz; \
|
||||
cut -f4 $@.d/$$d | ${GZIP} -c > $$c.$$t.gz; \
|
||||
fi \
|
||||
done; \
|
||||
if [ -e ${dir $@}Tatoeba-train-${TATOEBA_VERSION}.$$s-$$t.clean.id.gz ]; then \
|
||||
paste <(gzip -cd $$b/$$d.id.gz) <(gzip -cd $$b/$$d.domain.gz) | \
|
||||
grep -P "^$$s\t$$t\t" | cut -f3 | \
|
||||
${GZIP} -c > ${dir $@}Tatoeba-train-${TATOEBA_VERSION}.$$s-$$t.clean.domain.gz; \
|
||||
${ZCAT} ${dir $@}Tatoeba-train-${TATOEBA_VERSION}.$$s-$$t.clean.domain.gz |\
|
||||
sort -u > ${dir $@}Tatoeba-train-${TATOEBA_VERSION}.$$s-$$t.clean.domains; \
|
||||
echo "$$s" >> ${dir $@}Tatoeba-train-${TATOEBA_VERSION}.$$s-$$t.clean.$$s.labels; \
|
||||
echo "$$t" >> ${dir $@}Tatoeba-train-${TATOEBA_VERSION}.$$s-$$t.clean.$$t.labels; \
|
||||
fi \
|
||||
done \
|
||||
done \
|
||||
fi
|
||||
## NOTE: always need to copy label files to keep all sublanguages
|
||||
## --> this is confusing if a sublanguage has the same ID as the macro-language
|
||||
## --> example: ron includes ron and mol
|
||||
## --> the label file for ron will include mol but the data files will not
|
||||
## TODO: can we do that in a better way somehow?
|
||||
@mv $@.d/source.labels \
|
||||
${dir $@}Tatoeba-train-${TATOEBA_VERSION}.${LANGPAIR}.clean.${SORTSRCEXT}.labels
|
||||
@mv $@.d/target.labels \
|
||||
${dir $@}Tatoeba-train-${TATOEBA_VERSION}.${LANGPAIR}.clean.${SORTTRGEXT}.labels
|
||||
@if [ ! -e ${dir $@}Tatoeba-train-${TATOEBA_VERSION}.${LANGPAIR}.clean.${SRCEXT}.gz ]; then \
|
||||
echo ".... move data files"; \
|
||||
b="${TMPWORKDIR}/$@.d/${TATOEBADATA}"; \
|
||||
b="$@.d/${TATOEBADATA}"; \
|
||||
for d in dev test train; do \
|
||||
mv $$b/$$d.src.gz ${dir $@}Tatoeba-$$d-${TATOEBA_VERSION}.${LANGPAIR}.clean.${SORTSRCEXT}.gz; \
|
||||
mv $$b/$$d.trg.gz ${dir $@}Tatoeba-$$d-${TATOEBA_VERSION}.${LANGPAIR}.clean.${SORTTRGEXT}.gz; \
|
||||
@ -270,13 +284,9 @@ else
|
||||
${ZCAT} $$b/train.domain.gz | sort -u | tr "\n" ' ' | sed 's/ *$$//' \
|
||||
> ${dir $@}Tatoeba-train-${TATOEBA_VERSION}.${LANGPAIR}.clean.domains; \
|
||||
mv $$b/train.domain.gz ${dir $@}Tatoeba-train-${TATOEBA_VERSION}.${LANGPAIR}.clean.domain.gz; \
|
||||
mv ${TMPWORKDIR}/$@.d/source.labels \
|
||||
${dir $@}Tatoeba-train-${TATOEBA_VERSION}.${LANGPAIR}.clean.${SORTSRCEXT}.labels; \
|
||||
mv ${TMPWORKDIR}/$@.d/target.labels \
|
||||
${dir $@}Tatoeba-train-${TATOEBA_VERSION}.${LANGPAIR}.clean.${SORTTRGEXT}.labels; \
|
||||
fi
|
||||
@echo ".... cleanup of temporary files"
|
||||
@rm -fr ${TMPWORKDIR}/$@.d
|
||||
@rm -fr ${TMPWORKDIR}/${notdir $@}.d $@.d
|
||||
|
||||
endif
|
||||
|
||||
@ -332,7 +342,8 @@ endif
|
||||
@echo ".... fix language codes"
|
||||
@mkdir -p ${dir $@}${TATOEBADATA}
|
||||
@if [ -e ${dir $@}${TATOEBADATA}/train.id.gz ]; then \
|
||||
${GZCAT} ${dir $@}${TATOEBADATA}/train.id.gz | cut -f2,3 $(FIXLANGIDS) | ${GZIP} -c > ${dir $@}train.id.gz; \
|
||||
${GZCAT} ${dir $@}${TATOEBADATA}/train.id.gz | cut -f2,3 $(FIXLANGIDS) | \
|
||||
${GZIP} -c > ${dir $@}train.id.gz; \
|
||||
${GZCAT} ${dir $@}${TATOEBADATA}/train.id.gz | cut -f1 | ${GZIP} -c > ${dir $@}train.domain.gz; \
|
||||
mv ${dir $@}train.id.gz ${dir $@}train.domain.gz ${dir $@}${TATOEBADATA}/; \
|
||||
else \
|
||||
|
100
scripts/bitext_filter.pl
Executable file
100
scripts/bitext_filter.pl
Executable file
@ -0,0 +1,100 @@
|
||||
#!/usr/bin/env perl
#
# bitext_filter.pl - filter a sentence-aligned bitext by two simple ratios
#
# USAGE: bitext_filter.pl [-l ratio] [-c ratio] [-v] srclang trglang inputbase outputbase
#
# assumes that data is in
#    inputbase.srclang.gz
#    inputbase.trglang.gz
# and writes the retained sentence pairs to
#    outputbase.srclang.gz
#    outputbase.trglang.gz
#
# OPTIONS:
#   -l ratio ... max ratio between the character lengths of both sides (default: 2)
#   -c ratio ... max ratio between the numbers of distinct characters (default: 2)
#   -v ......... report every skipped line on STDERR (instead of progress dots)
#
# A sentence pair is dropped if one of its sides is empty or if one of the
# ratios (larger value divided by smaller value) exceeds its threshold.
# Lengths and character sets are measured without the line terminator.
# Summary counts are printed to STDOUT.

use strict;
use warnings;
use Getopt::Std;

our ($opt_c, $opt_l, $opt_v);

getopts('c:l:v');

# command-line values take precedence; 2 is only the fallback
my $CharLengthRatio = defined($opt_l) ? $opt_l : 2;
my $UniqueCharRatio = defined($opt_c) ? $opt_c : 2;


die "USAGE: bitext_filter.pl [-l ratio] [-c ratio] [-v] srclang trglang inputbase outputbase"
    unless (@ARGV == 4);

my ($src, $trg, $inputbase, $outputbase) = @ARGV;

die "input and output have the same name" if ($inputbase eq $outputbase);


my $inputsrcfile  = "$inputbase.$src.gz";
my $inputtrgfile  = "$inputbase.$trg.gz";

my $outputsrcfile = "$outputbase.$src.gz";
my $outputtrgfile = "$outputbase.$trg.gz";


# A pipe open only fails if the fork fails, so a missing input file
# has to be detected explicitly before gzip is started.
foreach my $file ($inputsrcfile, $inputtrgfile){
    die "cannot read from $file\n" unless (-r $file);
}

my $srcin  = open_gzip_reader($inputsrcfile);
my $trgin  = open_gzip_reader($inputtrgfile);

my $srcout = open_gzip_writer($outputsrcfile);
my $trgout = open_gzip_writer($outputtrgfile);

binmode(STDOUT,":utf8");
binmode(STDERR,":utf8");

my $count           = 0;
my $skipped         = 0;
my $skippedLength   = 0;
my $skippedAlphabet = 0;
my $missingTarget   = 0;

while (my $s = <$srcin>){
    $count++;
    unless ($opt_v){
        print STDERR '.' unless ($count % 100000);
        print STDERR " $count\n" unless ($count % 5000000);
    }

    # the target file ran out of lines: nothing to pair this line with
    my $t = <$trgin>;
    unless (defined $t){
        $missingTarget++;
        $skipped++;
        next;
    }

    # measure the text itself, not the line terminator
    # (otherwise empty lines would have length 1 and never be skipped)
    chomp(my $stext = $s);
    chomp(my $ttext = $t);

    my $sl = length($stext);
    my $tl = length($ttext);
    unless ($sl && $tl){
        $skipped++;
        next;
    }

    my $LengthRatio = $sl > $tl ? $sl/$tl : $tl/$sl;
    if ($LengthRatio > $CharLengthRatio){
        print STDERR "skip line $count (length ratio $LengthRatio > $CharLengthRatio)\n" if ($opt_v);
        $skipped++;
        $skippedLength++;
        next;
    }

    # both sides are non-empty here, so both counts are at least 1
    my $sa = count_unique_chars($stext);
    my $ta = count_unique_chars($ttext);

    my $AlphabetRatio = $sa > $ta ? $sa/$ta : $ta/$sa;
    if ($AlphabetRatio > $UniqueCharRatio){
        print STDERR "skip line $count (unique char ratio $AlphabetRatio > $UniqueCharRatio)\n" if ($opt_v);
        $skipped++;
        $skippedAlphabet++;
        next;
    }

    # write the lines exactly as they were read
    print {$srcout} $s;
    print {$trgout} $t;
}

# read the rest of the target file to detect a misaligned bitext
# (this also lets gzip terminate normally before the pipe is closed)
my $extraTarget = 0;
$extraTarget++ while (<$trgin>);

close_gzip_pipe($srcin,  $inputsrcfile);
close_gzip_pipe($trgin,  $inputtrgfile);
close_gzip_pipe($srcout, $outputsrcfile);
close_gzip_pipe($trgout, $outputtrgfile);

print STDERR "WARNING: $inputtrgfile is $missingTarget line(s) shorter than $inputsrcfile\n"
    if ($missingTarget);
print STDERR "WARNING: $inputtrgfile is $extraTarget line(s) longer than $inputsrcfile\n"
    if ($extraTarget);

print "\noriginal: $count\n";
print "skipped : $skipped\n";
print "   skipped because of character length ratio: $skippedLength\n";
print "   skipped because of alphabet size ratio   : $skippedAlphabet\n";
print "retained: ",$count-$skipped,"\n";



# open_gzip_reader($file): return a UTF-8 read handle on the
# decompressed content of $file (decompression is done by gzip).
# The file name is passed to the shell as a positional parameter,
# so spaces and shell meta characters in it are harmless.
sub open_gzip_reader{
    my ($file) = @_;
    open(my $fh, '-|', 'sh', '-c', 'exec gzip -cd < "$1"', 'sh', $file)
        or die "cannot read from $file: $!\n";
    binmode($fh, ":utf8");
    return $fh;
}

# open_gzip_writer($file): return a UTF-8 write handle whose
# content is compressed by gzip and stored in $file.
sub open_gzip_writer{
    my ($file) = @_;
    open(my $fh, '|-', 'sh', '-c', 'exec gzip -c > "$1"', 'sh', $file)
        or die "cannot write to $file: $!\n";
    binmode($fh, ":utf8");
    return $fh;
}

# close_gzip_pipe($fh, $file): close a handle created by one of the
# open_gzip_* functions; die if the close itself fails or if gzip
# reported an error ($! is 0 when only the exit status is non-zero).
sub close_gzip_pipe{
    my ($fh, $file) = @_;
    return if (close($fh));
    die "error while closing the gzip pipe for $file: $!\n" if ($!);
    die "gzip failed for $file (wait status $?)\n";
}

# count_unique_chars($text): number of distinct characters in $text
sub count_unique_chars{
    my ($text) = @_;
    my %seen = ();
    @seen{ split(//, $text) } = ();
    return scalar(keys %seen);
}
|
@ -108,21 +108,10 @@ REPOHOME := ${PWD}/../
|
||||
# TATOEBA_VERSION = v2020-07-28
|
||||
TATOEBA_VERSION = v2021-08-07
|
||||
|
||||
SMALLEST_TRAINSIZE = 1000
|
||||
USE_REST_DEVDATA = 0
|
||||
DATA_IS_SHUFFLED = 1
|
||||
DEVSIZE = 5000
|
||||
TESTSIZE = 10000
|
||||
DEVMINSIZE = 200
|
||||
|
||||
# by default skip aligned data of the same language
|
||||
SKIP_SAME_LANG = 1
|
||||
|
||||
|
||||
|
||||
include ${REPOHOME}lib/env.mk
|
||||
include ${REPOHOME}lib/config.mk
|
||||
include ${REPOHOME}lib/config/tatoeba.mk
|
||||
include ${REPOHOME}lib/config.mk
|
||||
include ${REPOHOME}lib/tasks.mk
|
||||
include ${REPOHOME}lib/tasks/tatoeba/data.mk
|
||||
include ${REPOHOME}lib/tasks/tatoeba/tune.mk
|
||||
@ -133,11 +122,9 @@ include ${REPOHOME}lib/projects/elg.mk
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
.PHONY: all
|
||||
all:
|
||||
${MAKE} prepare
|
||||
${MAKE} tatoeba-prepare
|
||||
${MAKE} data-tatoeba
|
||||
${MAKE} train-tatoeba
|
||||
${MAKE} eval-tatoeba
|
||||
@ -188,7 +175,6 @@ prepare tatoeba-prepare: ${TATOEBA_LANGIDS_TRAINONLY}
|
||||
prepare-and-data tatoeba-prepare-and-data: ${TATOEBA_LANGIDS_TRAINONLY}
|
||||
${MAKE} fetch-datasets
|
||||
${MAKE} langlabel-files
|
||||
${MAKE} local-config-tatoeba
|
||||
${MAKE} data-tatoeba
|
||||
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user