diff --git a/ft-tatoeba/Makefile b/ft-tatoeba/Makefile index 9a6b718d..e64d20ba 100644 --- a/ft-tatoeba/Makefile +++ b/ft-tatoeba/Makefile @@ -4,6 +4,16 @@ # # only works with sentencepiece models! # +# TODO's +# +# - forward-translate monolingual data (re-use bt-data) +# - reconstruction filtering (score translation in opposite direction) +# (use weights? normalise-script from bergamot/students) +# - other kind of data filtering / selection? +# - create lexical shortlists (see bergamot) +# - finetune alphas in intgemm8 models (see bergamot) +# - benchmark distilled models +# PWD := ${shell pwd} REPOHOME := ${PWD}/../ @@ -112,7 +122,7 @@ translate: ${BITEXT_LATEST_README} ${BITEXT_LATEST_TRG} ## translate all parts .PHONY: translate-all-parts translate-all-parts: ${ALL_BITEXT_LATEST_TRG} - ${MAKE} ${ALL_BITEXT_LATEST_SRC} + ${MAKE} source-all-parts .PHONY: source-all-parts source-all-parts: ${ALL_BITEXT_LATEST_SRC} @@ -183,16 +193,14 @@ endif ## (Why? because we filter out some data from the original wiki text, see above) ${BITEXT_BASE}.${SRC}.%.gz: ${BITEXT_BASE}.${SRC}.spm.%.gz -ifneq ($(wildcard ${patsubst ${BITEXT_BASE}.${SRC}.%.gz,${BITEXT_BASE}.${TRG}.%.gz,$@}),) - mkdir -p ${dir $@} - ${GZCAT} $< |\ - sed 's/ //g;s/▁/ /g' | \ - sed 's/^ *//;s/ *$$//' |\ - sed 's/^>>[a-z]*<< //' |\ - gzip -c > $@ -endif - - + if [ -e ${patsubst ${BITEXT_BASE}.${SRC}.%.gz,${BITEXT_BASE}.${TRG}.%.gz,$@} ]; then \ + mkdir -p ${dir $@}; \ + ${GZCAT} $< |\ + sed 's/ //g;s/▁/ /g' | \ + sed 's/^ *//;s/ *$$//' |\ + sed 's/^>>[a-z]*<< //' |\ + gzip -c > $@; \ + fi ## overwrite the file with the latest translations diff --git a/lib/config.mk b/lib/config.mk index a5b62b9d..1e70f6a7 100644 --- a/lib/config.mk +++ b/lib/config.mk @@ -88,9 +88,12 @@ SKIP_SAME_LANG ?= 0 ## --> especially useful in connection with FIT_DATA_SIZE ## set DATA_IS_SHUFFLED=1 if the training data is already shuffled ## --> useful to avoid shuffling when training sentence piece model +## NEW (2021-12-16): 
SHUFFLE_DATA is now set by default +## --> can now also avoid sqlite and data shuffling inside MarianNMT +## --> is that a problem (would MarianNMT use different random shuffles / epoch?) ##---------------------------------------------------------------------- -# SHUFFLE_DATA = 1 +SHUFFLE_DATA = 1 # DATA_IS_SHUFFLED = 1 ## devtest data is shuffled by default @@ -142,9 +145,9 @@ SORTSRC = ${firstword ${SORTLANGS}} SORTTRG = ${lastword ${SORTLANGS}} LANGPAIR = ${SORTSRC}-${SORTTRG} SPACE = $(empty) $(empty) -LANGSRCSTR = ${subst ${SPACE},+,$(SRCLANGS)} -LANGTRGSTR = ${subst ${SPACE},+,$(TRGLANGS)} -LANGPAIRSTR = ${LANGSRCSTR}-${LANGTRGSTR} +LANGSRCSTR ?= ${subst ${SPACE},+,$(SRCLANGS)} +LANGTRGSTR ?= ${subst ${SPACE},+,$(TRGLANGS)} +LANGPAIRSTR ?= ${LANGSRCSTR}-${LANGTRGSTR} ## for monolingual things @@ -179,17 +182,17 @@ endif ## NEW default size = 2500 (keep more for training for small languages) ## NOTE: size will be increased to 5000 for Tatoeba -DEVSIZE = 2500 -TESTSIZE = 2500 +DEVSIZE ?= 2500 +TESTSIZE ?= 2500 ## set some additional thresholds for ## the size of test and dev data ## DEVMINSIZE is the absolute minimum we require ## to run any training procedures -DEVSMALLSIZE = 1000 -TESTSMALLSIZE = 1000 -DEVMINSIZE = 250 +DEVSMALLSIZE ?= 1000 +TESTSMALLSIZE ?= 1000 +DEVMINSIZE ?= 250 ## set additional argument options for opus_read (if it is used) @@ -486,12 +489,14 @@ MARIAN_CLIP_NORM ?= 5 ## default = shuffle data and batches ## (set to batches or none to change this) -MARIAN_SHUFFLE ?= data +# MARIAN_SHUFFLE ?= data +MARIAN_SHUFFLE ?= batches ## default: use sqlite database to store data ## remove this to use regular temp data ## set to --shuffle-in-ram to keep all shuffled data in RAM -MARIAN_DATA_STORAGE ?= --sqlite +# MARIAN_DATA_STORAGE ?= --sqlite + ## set to global for lower memory usage in multiprocess training ## TODO: does this parameter really work? 
@@ -596,11 +601,11 @@ endif .PHONY: config local-config config local-config: ${WORKDIR}/${MODELCONFIG} -SMALLEST_TRAINSIZE = 10000 -SMALL_TRAINSIZE = 100000 -MEDIUM_TRAINSIZE = 500000 -LARGE_TRAINSIZE = 1000000 -LARGEST_TRAINSIZE = 10000000 +SMALLEST_TRAINSIZE ?= 10000 +SMALL_TRAINSIZE ?= 100000 +MEDIUM_TRAINSIZE ?= 500000 +LARGE_TRAINSIZE ?= 1000000 +LARGEST_TRAINSIZE ?= 10000000 ${WORKDIR}/${MODELCONFIG}: mkdir -p ${dir $@} diff --git a/lib/data.mk b/lib/data.mk index e33af13a..c72d2dd9 100644 --- a/lib/data.mk +++ b/lib/data.mk @@ -55,9 +55,8 @@ endif ## - use only the latest backtranslations ## if such a subdir exists -BACKTRANS_HOME = backtranslate -FORWARDTRANS_HOME = ${BACKTRANS_HOME} -# FORWARDTRANS_HOME = ${BACKTRANS_HOME} +BACKTRANS_HOME ?= backtranslate +FORWARDTRANS_HOME ?= ${BACKTRANS_HOME} ifneq (${wildcard ${BACKTRANS_HOME}/${TRG}-${SRC}/latest},) BACKTRANS_DIR = ${BACKTRANS_HOME}/${TRG}-${SRC}/latest @@ -71,6 +70,13 @@ else FORWARDTRANS_DIR = ${FORWARDTRANS_HOME}/${SRC}-${TRG} endif +ifneq (${wildcard ${BACKTRANS_HOME}/${SRC}-${TRG}/latest},) + FORWARDTRANSMONO_DIR = ${BACKTRANS_HOME}/${SRC}-${TRG}/latest +else + FORWARDTRANSMONO_DIR = ${BACKTRANS_HOME}/${SRC}-${TRG} +endif + + ## TODO: make it possible to select only parts of the BT data ## ---> use TRAINDATA_SIZE to take max the same amount of all shuffled BT data @@ -85,6 +91,11 @@ ifeq (${USE_FORWARDTRANS},1) FORWARDTRANS_TRG = ${patsubst %.${SRCEXT}.gz,%.${TRGEXT}.gz,${FORWARDTRANS_SRC}} endif +ifeq (${USE_FORWARDTRANSMONO},1) + FORWARDTRANSMONO_SRC = ${sort ${wildcard ${FORWARDTRANSMONO_DIR}/*.${SRCEXT}.gz}} + FORWARDTRANSMONO_TRG = ${patsubst %.${SRCEXT}.gz,%.${TRGEXT}.gz,${FORWARDTRANSMONO_SRC}} +endif + ifeq (${USE_PIVOTING},1) PIVOTING_SRC = ${sort ${wildcard pivoting/${SRC}-${TRG}/latest/*.${SRCEXT}.gz} \ ${wildcard pivoting/${TRG}-${SRC}/latest/*.${SRCEXT}.gz}} @@ -95,6 +106,10 @@ print-ft-data: @echo ${FORWARDTRANS_SRC} @echo ${FORWARDTRANS_TRG} @echo ${FORWARDTRANS_DIR} + @echo 
${FORWARDTRANSMONO_SRC} + @echo ${FORWARDTRANSMONO_TRG} + @echo ${FORWARDTRANSMONO_DIR} + ##------------------------------------------------------------- ## data sets (train/dev/test) @@ -104,7 +119,7 @@ print-ft-data: ## with some basic pre-processing (see lib/preprocess.mk) CLEAN_TRAIN_SRC = ${patsubst %,${DATADIR}/${PRE}/%.${LANGPAIR}.clean.${SRCEXT}.gz,${TRAINSET}} \ - ${BACKTRANS_SRC} ${FORWARDTRANS_SRC} ${PIVOTING_SRC} + ${BACKTRANS_SRC} ${FORWARDTRANS_SRC} ${FORWARDTRANSMONO_SRC} ${PIVOTING_SRC} CLEAN_TRAIN_TRG = ${patsubst %.${SRCEXT}.gz,%.${TRGEXT}.gz,${CLEAN_TRAIN_SRC}} CLEAN_DEV_SRC = ${patsubst %,${DATADIR}/${PRE}/%.${LANGPAIR}.clean.${SRCEXT}.gz,${DEVSET}} @@ -239,6 +254,8 @@ MAX_WORDALIGN_SIZE = 5000000 ## (assuming that each of them occupies up to 6 cores NR_ALIGN_JOBS ?= $$(( ${CPU_CORES} / 6 + 1 )) +## job forcing doesn't work within recipes +# ${MAKE} -j ${NR_ALIGN_JOBS} $$a ${TRAIN_ALG}: ${TRAIN_SRC}.clean.${PRE_SRC}${TRAINSIZE}.gz \ ${TRAIN_TRG}.clean.${PRE_TRG}${TRAINSIZE}.gz @@ -250,7 +267,7 @@ ${TRAIN_ALG}: ${TRAIN_SRC}.clean.${PRE_SRC}${TRAINSIZE}.gz \ split -l ${MAX_WORDALIGN_SIZE} $(LOCAL_TRAIN_TRG).algtmp $(LOCAL_TRAIN_TRG).algtmp.d/; \ a=`ls $(LOCAL_TRAIN_SRC).algtmp.d/* | sed 's#$$#.alg#' | xargs`; \ if [ "$$a" != "" ]; then \ - ${MAKE} -j ${NR_ALIGN_JOBS} $$a; \ + ${MAKE} $$a; \ cat $(LOCAL_TRAIN_SRC).algtmp.d/*.alg | ${GZIP} -c > $@; \ rm -f ${LOCAL_TRAIN_SRC}.algtmp.d/*; \ rm -f ${LOCAL_TRAIN_TRG}.algtmp.d/*; \ @@ -449,7 +466,7 @@ endif # --> shuffle data for each langpair # --> do this when FIT_DATA_SIZE is set! ###################################### -ifneq (${SHUFFLE_DATA},1) +ifeq (${SHUFFLE_DATA},1) @echo "shuffle training data" @paste ${LOCAL_TRAIN_SRC}.${LANGPAIR}.src ${LOCAL_TRAIN_TRG}.${LANGPAIR}.trg |\ ${SHUFFLE} > ${LOCAL_TRAIN_SRC}.shuffled @@ -503,10 +520,10 @@ raw-devdata: ${DEV_SRC} ${DEV_TRG} ## maybe introduce over/undersampling of dev data like we have for train data? 
${DEV_SRC}.shuffled.gz: - mkdir -p ${dir $@} + mkdir -p ${sort ${dir $@} ${dir ${DEV_SRC}} ${dir ${DEV_TRG}}} rm -f ${DEV_SRC} ${DEV_TRG} - echo "# Validation data" > ${dir ${DEV_SRC}}/README.md - echo "" >> ${dir ${DEV_SRC}}/README.md + echo "# Validation data" > ${dir ${DEV_SRC}}README.md + echo "" >> ${dir ${DEV_SRC}}README.md -for s in ${SRCLANGS}; do \ for t in ${TRGLANGS}; do \ if [ ! `echo "$$s-$$t $$t-$$s" | egrep '${SKIP_LANGPAIRS}' | wc -l` -gt 0 ]; then \ diff --git a/lib/dist.mk b/lib/dist.mk index 89b87dc6..0a362af2 100644 --- a/lib/dist.mk +++ b/lib/dist.mk @@ -7,12 +7,12 @@ TODAY := ${shell date +%F} DATE ?= ${TODAY} -OBJECTSTORAGE = https://object.pouta.csc.fi -MODEL_CONTAINER = OPUS-MT-models -DEV_MODEL_CONTAINER = OPUS-MT-dev -MODELINDEX = ${OBJECTSTORAGE}/${MODEL_CONTAINER}/index.txt -MODELSHOME = ${WORKHOME}/models -RELEASEDIR = ${PWD}/models +OBJECTSTORAGE ?= https://object.pouta.csc.fi +MODEL_CONTAINER ?= OPUS-MT-models +DEV_MODEL_CONTAINER ?= OPUS-MT-dev +MODELINDEX ?= ${OBJECTSTORAGE}/${MODEL_CONTAINER}/index.txt +MODELSHOME ?= ${WORKHOME}/models +RELEASEDIR ?= ${PWD}/models ## TODO: better create a recipe for the yaml file and not the zip file @@ -41,7 +41,7 @@ find-model: ## minimum BLEU score for models to be accepted as distribution package -MIN_BLEU_SCORE = 20 +MIN_BLEU_SCORE ?= 20 .PHONY: dist local-dist global-dist release diff --git a/lib/env.mk b/lib/env.mk index 498ee3d1..bb32bb35 100644 --- a/lib/env.mk +++ b/lib/env.mk @@ -13,7 +13,7 @@ PWD ?= ${shell pwd} NR_GPUS = 1 HPC_NODES = 1 -HPC_DISK = 500 +# HPC_DISK = 500 HPC_QUEUE = serial HPC_GPUQUEUE = gpu @@ -81,8 +81,8 @@ TMPDIR ?= /tmp ## tools and their locations -SCRIPTDIR ?= ${PWD}/scripts -TOOLSDIR ?= ${PWD}/tools +SCRIPTDIR ?= ${REPOHOME}scripts +TOOLSDIR ?= ${REPOHOME}tools ISO639 ?= ${shell which iso639 2>/dev/null || echo 'perl ${TOOLSDIR}/LanguageCodes/ISO-639-3/bin/iso639'} PIGZ ?= ${shell which pigz 2>/dev/null || echo ${TOOLSDIR}/pigz/pigz} diff --git a/lib/generic.mk 
b/lib/generic.mk index b2b63cc7..eddcd412 100644 --- a/lib/generic.mk +++ b/lib/generic.mk @@ -274,8 +274,8 @@ endif ## --> make a new BPE/sentencepiece model ## --> make a new config file -DEFAULT_PIVOT_LANG = en -PIVOT_LANG ?= ${DEFAULT_PIVOT_LANG} +DEFAULT_PIVOT_LANG ?= en +PIVOT_LANG ?= ${DEFAULT_PIVOT_LANG} %-pivotlang: if [ "$(sort ${SRCLANGS} ${TRGLANGS} ${PIVOT_LANG})" != "$(sort ${SRCLANGS} ${TRGLANGS})" ]; then \ @@ -316,6 +316,11 @@ endif MARIAN_EARLY_STOPPING=${FT_MARIAN_EARLY_STOPPING} \ ${@:-ft=} +## add forward translation of monolingual data +%-ftmono: + ${MAKE} DATASET=${DATASET}+ftmono \ + USE_FORWARDTRANSMONO=1 \ + ${@:-ftmono=} ## train on back-translations only diff --git a/lib/projects/tatoeba.mk b/lib/projects/tatoeba.mk index ed49a16d..d5073f8f 100644 --- a/lib/projects/tatoeba.mk +++ b/lib/projects/tatoeba.mk @@ -188,7 +188,6 @@ TATOEBA_PARAMS := DATASET=${TATOEBA_DATASET} \ TESTSET_NAME=${TATOEBA_TESTSET_NAME} \ TRAINSET_NAME=${TATOEBA_TRAINSET_NAME} \ SMALLEST_TRAINSIZE=1000 \ - DATA_IS_SHUFFLED=1 \ USE_REST_DEVDATA=0 \ HELDOUTSIZE=0 \ DEVSIZE=5000 \ @@ -206,21 +205,24 @@ TATOEBA_PARAMS := DATASET=${TATOEBA_DATASET} \ DEFAULT_PIVOT_LANG=${TATOEBA_PIVOT} \ MIN_BLEU_SCORE=${TATOEBA_MIN_BLEU} -MARIAN_SHUFFLE=data -MARIAN_DATA_STORAGE=--sqlite -HPC_DISK=500 -## unless we have multilingual models: -## no need to shuffle data again, just shuffle batches -## no need to store data in sqlite databases -ifeq (${words ${SRCLANGS}},1) -ifeq (${words ${TRGLANGS}},1) -# TATOEBA_PARAMS += MARIAN_SHUFFLE=batches MARIAN_DATA_STORAGE= HPC_DISK= - MARIAN_SHUFFLE=batches - MARIAN_DATA_STORAGE= - HPC_DISK= -endif -endif +## NEW (2021-12-15): use default (always shuffle training data) +# +# DATA_IS_SHUFFLED = 1 +# MARIAN_SHUFFLE = data +# MARIAN_DATA_STORAGE = --sqlite +# HPC_DISK = 500 + +# ## unless we have multilingual models: +# ## no need to shuffle data again, just shuffle batches +# ## no need to store data in sqlite databases +# ifeq (${words 
${SRCLANGS}},1) +# ifeq (${words ${TRGLANGS}},1) +# MARIAN_SHUFFLE = batches +# MARIAN_DATA_STORAGE = +# HPC_DISK = +# endif +# endif diff --git a/lib/sentencepiece.mk b/lib/sentencepiece.mk index e87f89d7..6a611eea 100644 --- a/lib/sentencepiece.mk +++ b/lib/sentencepiece.mk @@ -126,16 +126,18 @@ SPMTRGVOCAB = ${SPMDIR}/${LANGTRGSTR}/${BPEMODELNAME}.spm${TRGBPESIZE:000=}k.voc mono-spm-vocab: ${SPMVOCAB} + ifneq (${SPMVOCAB},${SPMSRCVOCAB}) ${SPMSRCVOCAB}: - ${MAKE} LANGS=${SRCLANGS} BPESIZE=${SRCBPESIZE} mono-spm-vocab + ${MAKE} LANGS="${SRCLANGS}" BPESIZE=${SRCBPESIZE} mono-spm-vocab endif +ifneq (${SPMSRCVOCAB},${SPMTRGVOCAB}) ifneq (${SPMVOCAB},${SPMTRGVOCAB}) ${SPMTRGVOCAB}: - ${MAKE} LANGS=${TRGLANGS} BPESIZE=${TRGBPESIZE} mono-spm-vocab + ${MAKE} LANGS="${TRGLANGS}" BPESIZE=${TRGBPESIZE} mono-spm-vocab +endif endif - ${SPMVOCAB}: ${LOCAL_MONO_DATA}.${PRE} ${SPMMODEL} ifeq ($(wildcard ${SPMVOCAB}),) @@ -160,10 +162,12 @@ ifneq (${SPMMODEL},${SPMSRCMONO}) ${MAKE} LANGS=${SRCLANGS} BPESIZE=${SRCBPESIZE} mono-spm-model endif +ifneq (${SPMSRCMODEL},${SPMTRGMONO}) ifneq (${SPMMODEL},${SPMTRGMONO}) ${SPMTRGMONO}: ${MAKE} LANGS=${TRGLANGS} BPESIZE=${TRGBPESIZE} mono-spm-model endif +endif ${SPMMODEL}: ${LOCAL_MONO_DATA}.${PRE} diff --git a/lib/test.mk b/lib/test.mk index 49af7719..ccdcd9c5 100644 --- a/lib/test.mk +++ b/lib/test.mk @@ -79,8 +79,8 @@ endif %.eval: % ${TEST_TRG} paste ${TEST_SRC}.${PRE_SRC} ${TEST_TRG} | grep $$'.\t' | cut -f2 > $@.ref - cat $< | sacrebleu ${SACREBLEU_PARAMS} $@.ref > $@ - cat $< | sacrebleu ${SACREBLEU_PARAMS} --metrics=chrf --width=3 $@.ref >> $@ + cat $< | sacrebleu -f text ${SACREBLEU_PARAMS} $@.ref > $@ + cat $< | sacrebleu -f text ${SACREBLEU_PARAMS} --metrics=chrf --width=3 $@.ref >> $@ rm -f $@.ref diff --git a/lib/train.mk b/lib/train.mk index 23eb62e5..3011c6f7 100644 --- a/lib/train.mk +++ b/lib/train.mk @@ -180,12 +180,11 @@ endif ifeq ($(subst -align,,${MODELTYPE}),transformer-small) - MARIAN_ENC_DEPTH = 3 + 
MARIAN_ENC_DEPTH = 6 MARIAN_DEC_DEPTH = 2 MARIAN_ATT_HEADS = 8 - MARIAN_DIM_EMB = 256 - MARIAN_EXTRA += --transformer-decoder-autoreg rnn \ - --dec-cell ssru + MARIAN_DIM_EMB = 512 + MARIAN_EXTRA += --transformer-decoder-autoreg rnn --dec-cell ssru # --fp16 endif diff --git a/tatoeba/Makefile b/tatoeba/Makefile index 7d928d0a..d111b5e0 100644 --- a/tatoeba/Makefile +++ b/tatoeba/Makefile @@ -87,27 +87,68 @@ SHELL := bash PWD := ${shell pwd} REPOHOME := ${PWD}/../ -include ${REPOHOME}lib/env.mk -include ${REPOHOME}lib/config.mk -include ${REPOHOME}lib/tasks.mk -include ${REPOHOME}lib/projects.mk - - - -## general parameters for Tatoeba models ## Tatoeba Challenge Data release number # TATOEBA_VERSION ?= v2020-07-28 TATOEBA_VERSION ?= v2021-08-07 TATOEBA_VERSION_NOHYPHEN ?= $(subst -,,${TATOEBA_VERSION}) + +## this will be the base name of the model file +TATOEBA_DATASET := opusTC${TATOEBA_VERSION_NOHYPHEN} + +TATOEBA_TRAINSET := Tatoeba-train-${TATOEBA_VERSION} +TATOEBA_DEVSET := Tatoeba-dev-${TATOEBA_VERSION} +TATOEBA_TESTSET := Tatoeba-test-${TATOEBA_VERSION} + +DATASET = ${TATOEBA_DATASET} +TRAINSET = ${TATOEBA_TRAINSET} +DEVSET = ${TATOEBA_DEVSET} +TESTSET = ${TATOEBA_TESTSET} +DEVSET_NAME = ${TATOEBA_DEVSET} +TESTSET_NAME = ${TATOEBA_TESTSET} +TRAINSET_NAME = ${TATOEBA_TRAINSET} + +SMALLEST_TRAINSIZE = 1000 +USE_REST_DEVDATA = 0 +DATA_IS_SHUFFLED = 1 +DEVSIZE = 5000 +TESTSIZE = 10000 +DEVMINSIZE = 200 + +BACKTRANS_HOME = ${PWD}/back-translate +FORWARDTRANS_HOME = ${PWD}/forward-translate +MODELSHOME = ${PWD}/models +RELEASEDIR = ${PWD}/models + +MODELS_URL = https://object.pouta.csc.fi/${TATOEBA_MODEL_CONTAINER} +MODEL_CONTAINER = ${TATOEBA_MODEL_CONTAINER} + +SKIP_DATA_DETAILS = 1 +DEFAULT_PIVOT_LANG = eng +MIN_BLEU_SCORE = 10 +TATOEBA_PIVOT ?= ${DEFAULT_PIVOT_LANG} + +## overwrite the standard langpair string +## basically take ${SRC}-${TRG} +## backoff to first langs in SRCLANGS and TRGLANGS +LANGPAIRSTR := ${firstword ${SRC} ${SRCLANGS}}-${firstword ${TRG} 
${TRGLANGS}} + + +include ${REPOHOME}lib/env.mk +include ${REPOHOME}lib/config.mk +include ${REPOHOME}lib/tasks.mk + + + +## general parameters for Tatoeba models + TATOEBA_DATAURL := https://object.pouta.csc.fi/Tatoeba-Challenge TATOEBA_TEST_URL := ${TATOEBA_DATAURL}-${TATOEBA_VERSION} TATOEBA_TRAIN_URL := ${TATOEBA_DATAURL}-${TATOEBA_VERSION} TATOEBA_MONO_URL := ${TATOEBA_DATAURL}-${TATOEBA_VERSION} -TATOEBA_WORK ?= ${PWD}/work-tatoeba -TATOEBA_DATA ?= ${TATOEBA_WORK}/data/${PRE} -TATOEBA_MONO ?= ${TATOEBA_WORK}/data/mono +TATOEBA_DATA ?= ${WORKHOME}/data/${PRE} +TATOEBA_MONO ?= ${WORKHOME}/data/mono ## list of language IDs that only appear in the training data ## (fetched from Tatoeba github) @@ -141,35 +182,21 @@ WIKIMACROLANGS ?= $(sort ${shell ${GET_ISO_CODE} ${WIKILANGS}}) ## ObjectStorage container name for Tatoeba models TATOEBA_MODEL_CONTAINER := Tatoeba-MT-models -## this will be the base name of the model file -TATOEBA_DATASET := ${DATASET}TC${TATOEBA_VERSION_NOHYPHEN} - -TATOEBA_TRAINSET := Tatoeba-train-${TATOEBA_VERSION} -TATOEBA_DEVSET := Tatoeba-dev-${TATOEBA_VERSION} -TATOEBA_TESTSET := Tatoeba-test-${TATOEBA_VERSION} -TATOEBA_DEVSET_NAME := ${TATOEBA_DEVSET} -TATOEBA_TESTSET_NAME := ${TATOEBA_TESTSET} -TATOEBA_TRAINSET_NAME := ${TATOEBA_TRAINSET} -TATOEBA_RELEASEDIR := ${PWD}/models-tatoeba -TATOEBA_MODELSHOME := ${PWD}/models-tatoeba -TATOEBA_BTHOME := ${PWD}/bt-tatoeba -TATOEBA_FTHOME := ${PWD}/ft-tatoeba -TATOEBA_MIN_BLEU := 10 ## file with the source and target languages in the current model -TATOEBA_SRCLABELFILE = ${TATOEBA_WORK}/${LANGPAIRSTR}/${DATASET}-langlabels.src -TATOEBA_TRGLABELFILE = ${TATOEBA_WORK}/${LANGPAIRSTR}/${DATASET}-langlabels.trg +TATOEBA_SRCLABELFILE = ${WORKHOME}/${LANGPAIRSTR}/${DATASET}-langlabels.src +TATOEBA_TRGLABELFILE = ${WORKHOME}/${LANGPAIRSTR}/${DATASET}-langlabels.trg ## get source and target languages from the label files ifneq (${wildcard ${TATOEBA_SRCLABELFILE}},) - TATOEBA_SRCLANGS = ${shell cat 
${TATOEBA_SRCLABELFILE}} + SRCLANGS := ${shell cat ${TATOEBA_SRCLABELFILE}} endif ifneq (${wildcard ${TATOEBA_TRGLABELFILE}},) - TATOEBA_TRGLANGS = ${shell cat ${TATOEBA_TRGLABELFILE}} + TRGLANGS := ${shell cat ${TATOEBA_TRGLABELFILE}} endif -# ifndef USE_TARGET_LABELS + ifdef TATOEBA_TRGLANGS ifneq (${words ${TATOEBA_TRGLANGS}},1) USE_TARGET_LABELS = 1 @@ -177,53 +204,40 @@ ifneq (${words ${TATOEBA_TRGLANGS}},1) endif endif +ifeq (${SRCLANGS},) + SRCLANGS = ${SRC} +endif +ifeq (${TRGLANGS},) + TRGLANGS = ${TRG} +endif + + + ## default parameters for some recipes with language groups ## - modeltype ## - size balancing LANGGROUP_MODELTYPE ?= transformer LANGGROUP_FIT_DATA_SIZE ?= 1000000 -## set the default pivot language to eng - -TATOEBA_PIVOT = eng +## NEW (2021-12-15): use default (always shuffle training data) +# +# DATA_IS_SHUFFLED = 1 +# MARIAN_SHUFFLE = data +# MARIAN_DATA_STORAGE = --sqlite +# HPC_DISK = 500 -TATOEBA_PARAMS := DATASET=${TATOEBA_DATASET} \ - TATOEBA_DATASET=${TATOEBA_DATASET} \ - TRAINSET=${TATOEBA_TRAINSET} \ - DEVSET=${TATOEBA_DEVSET} \ - TESTSET=${TATOEBA_TESTSET} \ - DEVSET_NAME=${TATOEBA_DEVSET_NAME} \ - TESTSET_NAME=${TATOEBA_TESTSET_NAME} \ - TRAINSET_NAME=${TATOEBA_TRAINSET_NAME} \ - SMALLEST_TRAINSIZE=1000 \ - USE_REST_DEVDATA=0 \ - DATA_IS_SHUFFLED=1 \ - HELDOUTSIZE=0 \ - DEVSIZE=5000 \ - TESTSIZE=10000 \ - DEVMINSIZE=200 \ - WORKHOME=${TATOEBA_WORK} \ - BACKTRANS_HOME=${TATOEBA_BTHOME} \ - FORWARDTRANS_HOME=${TATOEBA_FTHOME} \ - MODELSHOME=${TATOEBA_MODELSHOME} \ - RELEASEDIR=${TATOEBA_RELEASEDIR} \ - MODELS_URL=https://object.pouta.csc.fi/${TATOEBA_MODEL_CONTAINER} \ - MODEL_CONTAINER=${TATOEBA_MODEL_CONTAINER} \ - ALT_MODEL_DIR=tatoeba \ - SKIP_DATA_DETAILS=1 \ - DEFAULT_PIVOT_LANG=${TATOEBA_PIVOT} \ - MIN_BLEU_SCORE=${TATOEBA_MIN_BLEU} - -## unless we have multilingual models: -## no need to shuffle data again, just shuffle batches -## no need to store data in sqlite databases -ifeq (${words ${SRCLANGS}},1) -ifeq (${words 
${TRGLANGS}},1) - TATOEBA_PARAMS += MARIAN_SHUFFLE=batches MARIAN_DATA_STORAGE= HPC_DISK= -endif -endif +# ## unless we have multilingual models: +# ## no need to shuffle data again, just shuffle batches +# ## no need to store data in sqlite databases +# ifeq (${words ${SRCLANGS}},1) +# ifeq (${words ${TRGLANGS}},1) +# MARIAN_SHUFFLE = batches +# MARIAN_DATA_STORAGE = +# HPC_DISK = +# endif +# endif @@ -245,22 +259,37 @@ OPUS_LANG_GRANDPARENTS := ${sort ${shell langgroup -p -n ${OPUS_LANG_PARENTS} 2> OPUS_LANG_GROUPS := ${sort ${OPUS_LANG_PARENTS} ${OPUS_LANG_GRANDPARENTS}} -.PHONY: tatoeba -tatoeba: - ${MAKE} tatoeba-prepare - ${MAKE} all-tatoeba +.PHONY: all +all: + ${MAKE} prepare + ${MAKE} train + ${MAKE} eval + ${MAKE} compare + ${MAKE} eval-testsets + +## prepare data (config, train.dev.test data, labels) +.PHONY: prepare tatoeba-prepare +prepare tatoeba-prepare: ${TATOEBA_LANGIDS_TRAINONLY} + ${MAKE} fetch-datasets + ${MAKE} langlabel-files + ${MAKE} local-config + ${MAKE} data + +.PHONY: trainonly_langids +trainonly_langids: ${TATOEBA_LANGIDS_TRAINONLY} + ## start unidirectional training job ## - make data first, then submit a job -.PHONY: tatoeba-job -tatoeba-job: +.PHONY: tatoeba-job job slurmjob +job slurmjob tatoeba-job: rm -f train-and-eval.submit ${MAKE} tatoeba-prepare ${MAKE} all-job-tatoeba ## start jobs in both translation directions -.PHONY: tatoeba-bidirectional-job -tatoeba-bidirectional-job: +.PHONY: tatoeba-bidirectional-job bidirectional-job +bidirectional-job tatoeba-bidirectional-job: ${MAKE} tatoeba-prepare ${MAKE} all-job-tatoeba ifneq (${SRCLANGS},${TRGLANGS}) @@ -270,43 +299,30 @@ ifneq (${SRCLANGS},${TRGLANGS}) endif -## prepare data (config, train.dev.test data, labels) -.PHONY: tatoeba-prepare -tatoeba-prepare: # ${TATOEBA_DATA}/${TATOEBA_TRAINSET}.${LANGPAIR}.clean.${SRCEXT}.gz - ${MAKE} local-config-tatoeba - ${MAKE} data-tatoeba -## train a model -.PHONY: tatoeba-train -tatoeba-train: - ${MAKE} train-tatoeba - -## evaluate a 
model -.PHONY: tatoeba-eval -tatoeba-eval: - ${MAKE} compare-tatoeba +## for compatibility: recipes with tatoeba prefix +.PHONY: tatoeba-data tatoeba-train tatoeba-eval tatoeba-compare +tatoeba-data: data +tatoeba-train: train +tatoeba-eval: eval +tatoeba-compare: compare ## fetch the essential data and get labels for language variants ## (this is done by the data targets above as well) .PHONY: tatoeba-data tatoeba-labels -# tatoeba-data: ${TATOEBA_DATA}/${TATOEBA_TRAINSET}.${LANGPAIR}.clean.${SRCEXT}.gz -tatoeba-data: data-tatoeba -tatoeba-labels: ${TATOEBA_WORK}/${LANGPAIRSTR}/${DATASET}-langlabels.src \ - ${TATOEBA_WORK}/${LANGPAIRSTR}/${DATASET}-langlabels.trg - -# ${TATOEBA_DATA}/${TATOEBA_TRAINSET}.${LANGPAIRSTR}.clean.${SRCEXT}.labels \ -# ${TATOEBA_DATA}/${TATOEBA_TRAINSET}.${LANGPAIRSTR}.clean.${TRGEXT}.labels +tatoeba-labels: ${WORKHOME}/${LANGPAIRSTR}/${DATASET}-langlabels.src \ + ${WORKHOME}/${LANGPAIRSTR}/${DATASET}-langlabels.trg ## restart all language pairs of models that have not yet converged ## TODO: takes only the first model found in the directory tatoeba-continue-unfinished: - for d in `find ${TATOEBA_WORK}/ -maxdepth 1 -type d -name '???-???' -printf " %f"`; do \ - if [ `find ${TATOEBA_WORK}/$$d -maxdepth 1 -name '*.valid1.log' | grep -v tuned | wc -l` -gt 0 ]; then \ - if [ ! `find ${TATOEBA_WORK}/$$d -maxdepth 1 -name '*.done' | grep -v tuned | wc -l` -gt 0 ]; then \ + for d in `find ${WORKHOME}/ -maxdepth 1 -type d -name '???-???' -printf " %f"`; do \ + if [ `find ${WORKHOME}/$$d -maxdepth 1 -name '*.valid1.log' | grep -v tuned | wc -l` -gt 0 ]; then \ + if [ ! 
`find ${WORKHOME}/$$d -maxdepth 1 -name '*.done' | grep -v tuned | wc -l` -gt 0 ]; then \ p=`echo $$d | sed 's/-/2/'`; \ - m=`ls ${TATOEBA_WORK}/$$d/*.valid1.log | head -1 | cut -f3 -d/ | cut -f3 -d.`; \ - t=`ls ${TATOEBA_WORK}/$$d/*.valid1.log | head -1 | cut -f3 -d/ | cut -f1 -d.`; \ + m=`ls ${WORKHOME}/$$d/*.valid1.log | head -1 | cut -f3 -d/ | cut -f3 -d.`; \ + t=`ls ${WORKHOME}/$$d/*.valid1.log | head -1 | cut -f3 -d/ | cut -f1 -d.`; \ ${MAKE} DATASET=$$t MODELTYPE=$$m tatoeba-$$p-train; \ fi \ fi \ @@ -316,14 +332,14 @@ tatoeba-continue-unfinished: ## unless they are converged already ## TODO: takes only the first model found in the directory tatoeba-continue-unreleased: - find ${TATOEBA_WORK}/ -maxdepth 1 -type d -name '???-???' -printf "%f\n" | sort > $@.tt1 + find ${WORKHOME}/ -maxdepth 1 -type d -name '???-???' -printf "%f\n" | sort > $@.tt1 find ${TATOEBA_MODELSHOME}/ -maxdepth 1 -type d -name '???-???' -printf "%f\n" | sort > $@.tt2 for d in `diff $@.tt1 $@.tt2 | grep '<' | cut -f2 -d' '`; do \ - if [ `find ${TATOEBA_WORK}/$$d -maxdepth 1 -name '*.valid1.log' | grep -v tuned | wc -l` -gt 0 ]; then \ - if [ ! `find ${TATOEBA_WORK}/$$d -maxdepth 1 -name '*.done' | grep -v tuned | wc -l` -gt 0 ]; then \ + if [ `find ${WORKHOME}/$$d -maxdepth 1 -name '*.valid1.log' | grep -v tuned | wc -l` -gt 0 ]; then \ + if [ ! 
`find ${WORKHOME}/$$d -maxdepth 1 -name '*.done' | grep -v tuned | wc -l` -gt 0 ]; then \ p=`echo $$d | sed 's/-/2/'`; \ - m=`ls ${TATOEBA_WORK}/$$d/*.valid1.log | head -1 | cut -f3 -d/ | cut -f3 -d.`; \ - t=`ls ${TATOEBA_WORK}/$$d/*.valid1.log | head -1 | cut -f3 -d/ | cut -f1 -d.`; \ + m=`ls ${WORKHOME}/$$d/*.valid1.log | head -1 | cut -f3 -d/ | cut -f3 -d.`; \ + t=`ls ${WORKHOME}/$$d/*.valid1.log | head -1 | cut -f3 -d/ | cut -f1 -d.`; \ ${MAKE} DATASET=$$t MODELTYPE=$$m tatoeba-$$p-train; \ fi \ fi \ @@ -334,8 +350,8 @@ tatoeba-continue-unreleased: ## release all language pairs ## (including lang-group models) tatoeba-release-all: - for d in `find ${TATOEBA_WORK}/ -maxdepth 1 -type d -name '???-???' -printf " %f"`; do \ - for f in `find ${TATOEBA_WORK}/$$d -maxdepth 1 -name '*.valid1.log' -printf " %f" | grep -v tuned`; do \ + for d in `find ${WORKHOME}/ -maxdepth 1 -type d -name '???-???' -printf " %f"`; do \ + for f in `find ${WORKHOME}/$$d -maxdepth 1 -name '*.valid1.log' -printf " %f" | grep -v tuned`; do \ p=`echo $$d | sed 's/-/2/'`; \ m=`echo $$f | cut -f3 -d.`; \ t=`echo $$f | cut -f1 -d.`; \ @@ -346,8 +362,8 @@ tatoeba-release-all: ## release all models that have converged tatoeba-release-finished: - for d in `find ${TATOEBA_WORK}/ -maxdepth 1 -type d -name '???-???' -printf " %f"`; do \ - for f in `find ${TATOEBA_WORK}/$$d -maxdepth 1 -name '*.done' -printf " %f" | grep -v tuned`; do \ + for d in `find ${WORKHOME}/ -maxdepth 1 -type d -name '???-???' -printf " %f"`; do \ + for f in `find ${WORKHOME}/$$d -maxdepth 1 -name '*.done' -printf " %f" | grep -v tuned`; do \ p=`echo $$d | sed 's/-/2/'`; \ m=`echo $$f | cut -f3 -d.`; \ t=`echo $$f | cut -f1 -d.`; \ @@ -359,10 +375,10 @@ tatoeba-release-finished: ## release all models that are not yet released tatoeba-release-unreleased: - find ${TATOEBA_WORK}/ -maxdepth 1 -type d -name '???-???' -printf "%f\n" | sort > $@.tt1 + find ${WORKHOME}/ -maxdepth 1 -type d -name '???-???' 
-printf "%f\n" | sort > $@.tt1 find ${TATOEBA_MODELSHOME}/ -maxdepth 1 -type d -name '???-???' -printf "%f\n" | sort > $@.tt2 for d in `diff $@.tt1 $@.tt2 | grep '<' | cut -f2 -d' '`; do \ - for f in `find ${TATOEBA_WORK}/$$d -maxdepth 1 -name '*.valid1.log' -printf " %f" | grep -v tuned`; do \ + for f in `find ${WORKHOME}/$$d -maxdepth 1 -name '*.valid1.log' -printf " %f" | grep -v tuned`; do \ p=`echo $$d | sed 's/-/2/'`; \ m=`echo $$f | cut -f3 -d.`; \ t=`echo $$f | cut -f1 -d.`; \ @@ -373,10 +389,10 @@ tatoeba-release-unreleased: rm -f $@.tt1 $@.tt2 tatoeba-release-unreleased-test: - find ${TATOEBA_WORK}/ -maxdepth 1 -type d -name '???-???' -printf "%f\n" | sort > $@.tt1 + find ${WORKHOME}/ -maxdepth 1 -type d -name '???-???' -printf "%f\n" | sort > $@.tt1 find ${TATOEBA_MODELSHOME}/ -maxdepth 1 -type d -name '???-???' -printf "%f\n" | sort > $@.tt2 for d in `diff $@.tt1 $@.tt2 | grep '<' | cut -f2 -d' '`; do \ - for f in `find ${TATOEBA_WORK}/$$d -maxdepth 1 -name '*.valid1.log' -printf " %f" | grep -v tuned`; do \ + for f in `find ${WORKHOME}/$$d -maxdepth 1 -name '*.valid1.log' -printf " %f" | grep -v tuned`; do \ p=`echo $$d | sed 's/-/2/'`; \ m=`echo $$f | cut -f3 -d.`; \ t=`echo $$f | cut -f1 -d.`; \ @@ -390,8 +406,8 @@ tatoeba-release-unreleased-test: ## ---> be aware of the danger of overwriting existing files ## ---> backups are stored in models-backup tatoeba-refresh-finished: - for d in `find ${TATOEBA_WORK}/ -maxdepth 1 -type d -name '???-???' -printf " %f"`; do \ - for f in `find ${TATOEBA_WORK}/$$d -maxdepth 1 -name '*.done' -printf "%A@\t%f\n" | sort -nr | cut -f2 | grep -v tuned | head -1`; do \ + for d in `find ${WORKHOME}/ -maxdepth 1 -type d -name '???-???' 
-printf " %f"`; do \ + for f in `find ${WORKHOME}/$$d -maxdepth 1 -name '*.done' -printf "%A@\t%f\n" | sort -nr | cut -f2 | grep -v tuned | head -1`; do \ p=`echo $$d | sed 's/-/2/'`; \ m=`echo $$f | cut -f3 -d.`; \ t=`echo $$f | cut -f1 -d.`; \ @@ -557,7 +573,7 @@ tatoeba-all2trg-small: tatoeba-wiki2eng: for l in ${WIKIMACROLANGS}; do \ - if [ ! `find ${TATOEBA_WORK}/$$l-eng -name '*.done' 2>/dev/null | wc -l` -gt 0 ]; then \ + if [ ! `find ${WORKHOME}/$$l-eng -name '*.done' 2>/dev/null | wc -l` -gt 0 ]; then \ ${MAKE} SRCLANGS=$$l TRGLANGS=eng tatoeba-job; \ fi \ done @@ -565,7 +581,7 @@ tatoeba-wiki2eng: ## macro-languages that we missed before tatoeba-wiki2eng-macro: for l in $(filter-out ${WIKILANGS},${WIKIMACROLANGS}); do \ - if [ ! `find ${TATOEBA_WORK}/$$l-eng -name '*.done' 2>/dev/null | wc -l` -gt 0 ]; then \ + if [ ! `find ${WORKHOME}/$$l-eng -name '*.done' 2>/dev/null | wc -l` -gt 0 ]; then \ ${MAKE} SRCLANGS=$$l TRGLANGS=eng tatoeba-job; \ fi \ done @@ -575,11 +591,11 @@ tatoeba-print-missing-wiki: tatoeba-wiki2eng-parent: for l in ${WIKIMACROLANGS}; do \ - if [ ! `find ${TATOEBA_WORK}/$$l-eng -name '*.done' 2>/dev/null | wc -l` -gt 0 ]; then \ + if [ ! 
`find ${WORKHOME}/$$l-eng -name '*.done' 2>/dev/null | wc -l` -gt 0 ]; then \ echo "check $$l-eng"; \ - if [ `find ${TATOEBA_WORK}/$$l-eng/train -name '${TATOEBA_DATASET}.src.clean.spm*.gz' 2>/dev/null | wc -l` -gt 0 ]; then \ + if [ `find ${WORKHOME}/$$l-eng/train -name '${TATOEBA_DATASET}.src.clean.spm*.gz' 2>/dev/null | wc -l` -gt 0 ]; then \ echo "check data size of $$l-eng"; \ - if [ `find ${TATOEBA_WORK}/$$l-eng/train -name '${TATOEBA_DATASET}.src.clean.spm*.gz' 2>/dev/null | xargs zcat | head -100000 | wc -l` -lt 100000 ]; then \ + if [ `find ${WORKHOME}/$$l-eng/train -name '${TATOEBA_DATASET}.src.clean.spm*.gz' 2>/dev/null | xargs zcat | head -100000 | wc -l` -lt 100000 ]; then \ p=`langgroup -p $$l`; \ echo "${MAKE} SRCLANGS=$$p TRGLANGS=eng tatoeba-$${p}2eng-train-1m"; \ fi \ @@ -591,14 +607,14 @@ tatoeba-wiki2eng-done: for l in ${WIKIMACROLANGS}; do \ if [ `find ${TATOEBA_MODELSHOME}/$$l-eng -name '*.zip' 2>/dev/null | wc -l` -gt 0 ]; then \ echo "model available for $$l-eng"; \ - elif [ `find ${TATOEBA_WORK}/$$l-eng -name '*.done' 2>/dev/null | wc -l` -gt 0 ]; then \ + elif [ `find ${WORKHOME}/$$l-eng -name '*.done' 2>/dev/null | wc -l` -gt 0 ]; then \ echo -n "model aivailbale for $$l-eng but not released"; \ - if [ `find ${TATOEBA_WORK}/$$l-eng -name '*.eval' 2>/dev/null | wc -l` -gt 0 ]; then \ + if [ `find ${WORKHOME}/$$l-eng -name '*.eval' 2>/dev/null | wc -l` -gt 0 ]; then \ echo -n ", BLEU = "; \ - grep BLEU ${TATOEBA_WORK}/$$l-eng/*eval | head -1 | cut -f3 -d' '; \ - elif [ ! -e ${TATOEBA_WORK}/$$l-eng/test/${TATOEBA_TESTSET}.src ]; then \ + grep BLEU ${WORKHOME}/$$l-eng/*eval | head -1 | cut -f3 -d' '; \ + elif [ ! 
-e ${WORKHOME}/$$l-eng/test/${TATOEBA_TESTSET}.src ]; then \ echo ", missing eval file"; \ - echo "make TATOEBA_WORK=${TATOEBA_WORK}-tmp SRCLANGS=$$l TRGLANGS=eng data-tatoeba"; \ + echo "make WORKHOME=${WORKHOME}-tmp SRCLANGS=$$l TRGLANGS=eng data-tatoeba"; \ else \ echo ", run 'make tatoeba-$${l}2eng-evalall'"; \ fi \ @@ -675,13 +691,13 @@ tatoeba-zle2langgroups-bt-4m: ## temporary recipe for evaluating all 4m multilingual models that are done tatoeba-eval-4m: - for p in `ls work-tatoeba/*/*4m*done | cut -f2 -d/ | sed 's/\-/2/'`; do \ + for p in `ls ${WORKHOME}/*/*4m*done | cut -f2 -d/ | sed 's/\-/2/'`; do \ make MODELTYPE=transformer tatoeba-$$p-multieval-bt-4m; \ make MODELTYPE=transformer tatoeba-$$p-eval-testsets-bt-4m; \ done tatoeba-dist-4m: - for p in `ls work-tatoeba/*/*4m*done | cut -f2 -d/ | sed 's/\-/2/'`; do \ + for p in `ls ${WORKHOME}/*/*4m*done | cut -f2 -d/ | sed 's/\-/2/'`; do \ make MODELTYPE=transformer tatoeba-$$p-dist-bt-4m; \ done @@ -737,7 +753,7 @@ tatoeba-dist-4m: ## new: only start this if there is a model all-tatoeba-group2eng-dist: for g in ${OPUS_LANG_GROUPS}; do \ - if [ `find ${TATOEBA_WORK}/$$g-eng -name '*.npz' | wc -l` -gt 0 ]; then \ + if [ `find ${WORKHOME}/$$g-eng -name '*.npz' | wc -l` -gt 0 ]; then \ ${MAKE} MODELTYPE=transformer tatoeba-$${g}2eng-eval; \ ${MAKE} MODELTYPE=transformer tatoeba-$${g}2eng-evalall; \ ${MAKE} MODELTYPE=transformer tatoeba-$${g}2eng-dist; \ @@ -746,7 +762,7 @@ all-tatoeba-group2eng-dist: all-tatoeba-eng2group-dist: for g in ${OPUS_LANG_GROUPS}; do \ - if [ `find ${TATOEBA_WORK}/eng-$$g -name '*.npz' | wc -l` -gt 0 ]; then \ + if [ `find ${WORKHOME}/eng-$$g -name '*.npz' | wc -l` -gt 0 ]; then \ ${MAKE} MODELTYPE=transformer tatoeba-eng2$${g}-eval; \ ${MAKE} MODELTYPE=transformer tatoeba-eng2$${g}-evalall; \ ${MAKE} MODELTYPE=transformer tatoeba-eng2$${g}-dist; \ @@ -755,7 +771,7 @@ all-tatoeba-eng2group-dist: all-tatoeba-langgroup-dist: for g in ${OPUS_LANG_GROUPS}; do \ - if [ `find 
${TATOEBA_WORK}/$$g-$$g -name '*.npz' | wc -l` -gt 0 ]; then \ + if [ `find ${WORKHOME}/$$g-$$g -name '*.npz' | wc -l` -gt 0 ]; then \ ${MAKE} MODELTYPE=transformer PIVOT=eng tatoeba-$${g}2$${g}-eval; \ ${MAKE} MODELTYPE=transformer PIVOT=eng tatoeba-$${g}2$${g}-evalall; \ ${MAKE} MODELTYPE=transformer PIVOT=eng tatoeba-$${g}2$${g}-dist; \ @@ -874,7 +890,7 @@ tatoeba-%-train: t=$(lastword $(subst 2, ,$(patsubst tatoeba-%-train,%,$@))); \ S="${call find-srclanggroup,${patsubst tatoeba-%-train,%,$@},${PIVOT}}"; \ T="${call find-trglanggroup,${patsubst tatoeba-%-train,%,$@},${PIVOT}}"; \ - if [ ! `find ${TATOEBA_WORK}/$$s-$$t -maxdepth 1 -name '${TATOEBA_DATASET}.*${MODELTYPE}.model${NR}.done' | wc -l` -gt 0 ]; then \ + if [ ! `find ${WORKHOME}/$$s-$$t -maxdepth 1 -name '${TATOEBA_DATASET}.*${MODELTYPE}.model${NR}.done' | wc -l` -gt 0 ]; then \ if [ `echo $$S | tr ' ' "\n" | wc -l` -ge ${MIN_SRCLANGS} ]; then \ if [ `echo $$T | tr ' ' "\n" | wc -l` -ge ${MIN_TRGLANGS} ]; then \ if [ `echo $$S | tr ' ' "\n" | wc -l` -le ${MAX_SRCLANGS} ]; then \ @@ -905,7 +921,7 @@ tatoeba-%-trainjob: t=$(lastword $(subst 2, ,$(patsubst tatoeba-%-trainjob,%,$@))); \ S="${call find-srclanggroup,${patsubst tatoeba-%-trainjob,%,$@},${PIVOT}}"; \ T="${call find-trglanggroup,${patsubst tatoeba-%-trainjob,%,$@},${PIVOT}}"; \ - if [ ! `find ${TATOEBA_WORK}/$$s-$$t -maxdepth 1 -name '${TATOEBA_DATASET}.*${MODELTYPE}.model${NR}.done' | wc -l` -gt 0 ]; then \ + if [ ! 
`find ${WORKHOME}/$$s-$$t -maxdepth 1 -name '${TATOEBA_DATASET}.*${MODELTYPE}.model${NR}.done' | wc -l` -gt 0 ]; then \ if [ `echo $$S | tr ' ' "\n" | wc -l` -ge ${MIN_SRCLANGS} ]; then \ if [ `echo $$T | tr ' ' "\n" | wc -l` -ge ${MIN_TRGLANGS} ]; then \ if [ `echo $$S | tr ' ' "\n" | wc -l` -le ${MAX_SRCLANGS} ]; then \ @@ -947,8 +963,8 @@ tatoeba-%-pivotlang: tatoeba-%-eval: ( s=$(firstword $(subst 2, ,$(patsubst tatoeba-%-eval,%,$@))); \ t=$(lastword $(subst 2, ,$(patsubst tatoeba-%-eval,%,$@))); \ - if [ -e ${TATOEBA_WORK}/$$s-$$t ]; then \ - if [ `find ${TATOEBA_WORK}/$$s-$$t/ -name '*.npz' | wc -l` -gt 0 ]; then \ + if [ -e ${WORKHOME}/$$s-$$t ]; then \ + if [ `find ${WORKHOME}/$$s-$$t/ -name '*.npz' | wc -l` -gt 0 ]; then \ ${MAKE} LANGPAIRSTR=$$s-$$t \ SRCLANGS="${call find-srclanggroup,${patsubst tatoeba-%-eval,%,$@},${PIVOT}}" \ TRGLANGS="${call find-trglanggroup,${patsubst tatoeba-%-eval,%,$@},${PIVOT}}" \ @@ -965,8 +981,8 @@ tatoeba-%-multieval: t=$(lastword $(subst 2, ,$(patsubst tatoeba-%-multieval,%,$@))); \ S="${call find-srclanggroup,${patsubst tatoeba-%-multieval,%,$@},${PIVOT}}"; \ T="${call find-trglanggroup,${patsubst tatoeba-%-multieval,%,$@},${PIVOT}}"; \ - if [ -e ${TATOEBA_WORK}/$$s-$$t ]; then \ - if [ `find ${TATOEBA_WORK}/$$s-$$t/ -name '*.npz' | wc -l` -gt 0 ]; then \ + if [ -e ${WORKHOME}/$$s-$$t ]; then \ + if [ `find ${WORKHOME}/$$s-$$t/ -name '*.npz' | wc -l` -gt 0 ]; then \ ${MAKE} LANGPAIRSTR=$$s-$$t SRCLANGS="$$S" TRGLANGS="$$T" \ TATOEBA_SRCLANG_GROUP="`langgroup -n $$s`" \ TATOEBA_TRGLANG_GROUP="`langgroup -n $$t`" \ @@ -982,8 +998,8 @@ tatoeba-%-multieval: tatoeba-%-eval-testsets: ( s=$(firstword $(subst 2, ,$(patsubst tatoeba-%-eval-testsets,%,$@))); \ t=$(lastword $(subst 2, ,$(patsubst tatoeba-%-eval-testsets,%,$@))); \ - if [ -e ${TATOEBA_WORK}/$$s-$$t ]; then \ - if [ `find ${TATOEBA_WORK}/$$s-$$t/ -name '*.npz' | wc -l` -gt 0 ]; then \ + if [ -e ${WORKHOME}/$$s-$$t ]; then \ + if [ `find ${WORKHOME}/$$s-$$t/ -name 
'*.npz' | wc -l` -gt 0 ]; then \ ${MAKE} LANGPAIRSTR=$$s-$$t \ SRCLANGS="${call find-srclanggroup,${patsubst tatoeba-%-eval-testsets,%,$@},${PIVOT}}" \ TRGLANGS="${call find-trglanggroup,${patsubst tatoeba-%-eval-testsets,%,$@},${PIVOT}}" \ @@ -997,8 +1013,8 @@ tatoeba-%-eval-testsets: tatoeba-%-testsets: ( s=$(firstword $(subst 2, ,$(patsubst tatoeba-%-testsets,%,$@))); \ t=$(lastword $(subst 2, ,$(patsubst tatoeba-%-testsets,%,$@))); \ - if [ -e ${TATOEBA_WORK}/$$s-$$t ]; then \ - if [ `find ${TATOEBA_WORK}/$$s-$$t/ -name '*.npz' | wc -l` -gt 0 ]; then \ + if [ -e ${WORKHOME}/$$s-$$t ]; then \ + if [ `find ${WORKHOME}/$$s-$$t/ -name '*.npz' | wc -l` -gt 0 ]; then \ ${MAKE} LANGPAIRSTR=$$s-$$t \ SRCLANGS="${call find-srclanggroup,${patsubst tatoeba-%-testsets,%,$@},${PIVOT}}" \ TRGLANGS="${call find-trglanggroup,${patsubst tatoeba-%-testsets,%,$@},${PIVOT}}" \ @@ -1029,7 +1045,7 @@ tatoeba-%-release: tatoeba-%-dist: ( s=$(firstword $(subst 2, ,$(patsubst tatoeba-%-dist,%,$@))); \ t=$(lastword $(subst 2, ,$(patsubst tatoeba-%-dist,%,$@))); \ - if [ -e ${TATOEBA_WORK}/$$s-$$t ]; then \ + if [ -e ${WORKHOME}/$$s-$$t ]; then \ ${MAKE} LANGPAIRSTR=$$s-$$t \ SRCLANGS="${call find-srclanggroup,${patsubst tatoeba-%-dist,%,$@},${PIVOT}}" \ TRGLANGS="${call find-trglanggroup,${patsubst tatoeba-%-dist,%,$@},${PIVOT}}" \ @@ -1051,7 +1067,7 @@ tatoeba-%-refresh: tatoeba-%-refresh-release-yml tatoeba-%-refresh-release-readm tatoeba-%-refresh-release-readme: ( s=$(firstword $(subst 2, ,$(patsubst tatoeba-%-refresh-release-readme,%,$@))); \ t=$(lastword $(subst 2, ,$(patsubst tatoeba-%-refresh-release-readme,%,$@))); \ - if [ -e ${TATOEBA_WORK}/$$s-$$t ]; then \ + if [ -e ${WORKHOME}/$$s-$$t ]; then \ ${MAKE} LANGPAIRSTR=$$s-$$t \ SRCLANGS="${call find-srclanggroup,${patsubst tatoeba-%-refresh-release-readme,%,$@},${PIVOT}}" \ TRGLANGS="${call find-trglanggroup,${patsubst tatoeba-%-refresh-release-readme,%,$@},${PIVOT}}" \ @@ -1064,7 +1080,7 @@ tatoeba-%-refresh-release-readme: 
tatoeba-%-refresh-release-yml: ( s=$(firstword $(subst 2, ,$(patsubst tatoeba-%-refresh-release-yml,%,$@))); \ t=$(lastword $(subst 2, ,$(patsubst tatoeba-%-refresh-release-yml,%,$@))); \ - if [ -e ${TATOEBA_WORK}/$$s-$$t ]; then \ + if [ -e ${WORKHOME}/$$s-$$t ]; then \ ${MAKE} LANGPAIRSTR=$$s-$$t \ SRCLANGS="${call find-srclanggroup,${patsubst tatoeba-%-refresh-release-yml,%,$@},${PIVOT}}" \ TRGLANGS="${call find-trglanggroup,${patsubst tatoeba-%-refresh-release-yml,%,$@},${PIVOT}}" \ @@ -1075,7 +1091,7 @@ tatoeba-%-refresh-release-yml: tatoeba-%-refresh-release: tatoeba-%-refresh-release-yml tatoeba-%-refresh-release-readme ( s=$(firstword $(subst 2, ,$(patsubst tatoeba-%-refresh-release,%,$@))); \ t=$(lastword $(subst 2, ,$(patsubst tatoeba-%-refresh-release,%,$@))); \ - if [ -e ${TATOEBA_WORK}/$$s-$$t ]; then \ + if [ -e ${WORKHOME}/$$s-$$t ]; then \ ${MAKE} LANGPAIRSTR=$$s-$$t \ SRCLANGS="${call find-srclanggroup,${patsubst tatoeba-%-refresh-release,%,$@},${PIVOT}}" \ TRGLANGS="${call find-trglanggroup,${patsubst tatoeba-%-refresh-release,%,$@},${PIVOT}}" \ @@ -1186,7 +1202,7 @@ tatoeba-langtunedist: tatoeba-%-langtune: ( s=$(firstword $(subst 2, ,$(patsubst tatoeba-%-langtune,%,$@))); \ t=$(lastword $(subst 2, ,$(patsubst tatoeba-%-langtune,%,$@))); \ - if [ -d ${TATOEBA_WORK}/$$s-$$t ]; then \ + if [ -d ${WORKHOME}/$$s-$$t ]; then \ ${MAKE} LANGPAIRSTR=$$s-$$t ${TATOEBA_LANGTUNE_PARAMS} tatoeba; \ fi ) @@ -1196,7 +1212,7 @@ tatoeba-%-langtune: tatoeba-%-langtunejob: ( s=$(firstword $(subst 2, ,$(patsubst tatoeba-%-langtunejob,%,$@))); \ t=$(lastword $(subst 2, ,$(patsubst tatoeba-%-langtunejob,%,$@))); \ - if [ -d ${TATOEBA_WORK}/$$s-$$t ]; then \ + if [ -d ${WORKHOME}/$$s-$$t ]; then \ ${MAKE} LANGPAIRSTR=$$s-$$t ${TATOEBA_LANGTUNE_PARAMS} tatoeba-job; \ fi ) @@ -1274,10 +1290,10 @@ tatoeba-distsubset-%: tatoeba-%.md for l in `grep '\[' $< | cut -f2 -d '[' | cut -f1 -d ']'`; do \ s=`echo $$l | cut -f1 -d '-'`; \ t=`echo $$l | cut -f2 -d '-'`; \ - if [ -d 
${TATOEBA_WORK}/$$s-$$t ]; then \ + if [ -d ${WORKHOME}/$$s-$$t ]; then \ ${MAKE} SRCLANGS=$$s TRGLANGS=$$t MIN_BLEU_SCORE=10 release-tatoeba; \ fi; \ - if [ -d ${TATOEBA_WORK}/$$t-$$s ]; then \ + if [ -d ${WORKHOME}/$$t-$$s ]; then \ ${MAKE} SRCLANGS=$$t TRGLANGS=$$s MIN_BLEU_SCORE=10 release-tatoeba; \ fi; \ done @@ -1288,14 +1304,14 @@ tatoeba-evalsubset-%: tatoeba-%.md for l in `grep '\[' $< | cut -f2 -d '[' | cut -f1 -d ']'`; do \ s=`echo $$l | cut -f1 -d '-'`; \ t=`echo $$l | cut -f2 -d '-'`; \ - if [ -d ${TATOEBA_WORK}/$$s-$$t ]; then \ - if [ `find ${TATOEBA_WORK}/$$s-$$t -name '*.best-perplexity.npz' | wc -l` -gt 0 ]; then \ + if [ -d ${WORKHOME}/$$s-$$t ]; then \ + if [ `find ${WORKHOME}/$$s-$$t -name '*.best-perplexity.npz' | wc -l` -gt 0 ]; then \ ${MAKE} SRCLANGS=$$s TRGLANGS=$$t compare-tatoeba; \ ${MAKE} SRCLANGS=$$s TRGLANGS=$$t eval-testsets-tatoeba; \ fi \ fi; \ - if [ -d ${TATOEBA_WORK}/$$t-$$s ]; then \ - if [ `find ${TATOEBA_WORK}/$$t-$$s -name '*.best-perplexity.npz' | wc -l` -gt 0 ]; then \ + if [ -d ${WORKHOME}/$$t-$$s ]; then \ + if [ `find ${WORKHOME}/$$t-$$s -name '*.best-perplexity.npz' | wc -l` -gt 0 ]; then \ ${MAKE} SRCLANGS=$$t TRGLANGS=$$s compare-tatoeba; \ ${MAKE} SRCLANGS=$$t TRGLANGS=$$s eval-testsets-tatoeba; \ fi \ @@ -1378,11 +1394,11 @@ tatoeba-trainsize-%.txt: tatoeba-%.md .PHONY: tatoeba-multilingual-eval tatoeba-multilingual-eval: - -${MAKE} ${TATOEBA_PARAMS} tatoeba-multilingual-testsets + -${MAKE} tatoeba-multilingual-testsets ifneq (${words ${SRCLANGS} ${TRGLANGS}},2) for s in ${SRCLANGS}; do \ for t in ${TRGLANGS}; do \ - if [ -e ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.src ]; then \ + if [ -e ${WORKHOME}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.src ]; then \ ${MAKE} SRC=$$s TRG=$$t \ TATOEBA_TESTSET=${TATOEBA_TESTSET}.$$s-$$t \ TATOEBA_TESTSET_NAME=${TATOEBA_TESTSET}.$$s-$$t \ @@ -1406,156 +1422,108 @@ tatoeba-sublang-eval: tatoeba-multilingual-eval-tatoeba -## copy testsets into the 
multilingual model's test directory -.PHONY: tatoeba-multilingual-testsets -tatoeba-multilingual-testsets: ${TATOEBA_WORK}/${LANGPAIRSTR}/test/Tatoeba${TATOEBA_VERSION_NOHYPHEN}-testsets.done - -# ${TATOEBA_WORK}/${LANGPAIRSTR}/test/Tatoeba-testsets.done-old: -# @for s in ${SRCLANGS}; do \ -# for t in ${TRGLANGS}; do \ -# if [ ! -e ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.src ]; then \ -# wget -q -O ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.txt \ -# ${TATOEBA_RAWGIT}/data/test/$$s-$$t/test.txt; \ -# if [ -s ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.txt ]; then \ -# echo "make ${TATOEBA_TESTSET}.$$s-$$t"; \ -# if [ "${USE_TARGET_LABELS}" == "1" ]; then \ -# cut -f2,3 ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.txt | \ -# sed 's/^\([^ ]*\) />>\1<< /' \ -# > ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.src; \ -# else \ -# cut -f3 ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.txt \ -# > ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.src; \ -# fi; \ -# cut -f4 ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.txt \ -# > ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.trg; \ -# else \ -# wget -q -O ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.txt \ -# ${TATOEBA_RAWGIT}/data/test/$$t-$$s/test.txt; \ -# if [ -s ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.txt ]; then \ -# echo "make ${TATOEBA_TESTSET}.$$s-$$t"; \ -# if [ "${USE_TARGET_LABELS}" == "1" ]; then \ -# cut -f1,4 ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.txt | \ -# sed 's/^\([^ ]*\) />>\1<< /' \ -# > ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.src; \ -# else \ -# cut -f4 ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.txt \ -# > ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.src; \ -# fi; \ -# cut -f3 
${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.txt \ -# > ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.trg; \ -# fi \ -# fi; \ -# rm -f ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.txt; \ -# fi \ -# done \ -# done -# if [ -d ${dir $@} ]; then \ -# touch $@; \ -# fi - - -## a rather complex recipe to create testsets for individual language pairs -## in multilingual models +## an overly complex recipe to create testsets for individual language pairs ## - extract test sets for all (macro-)language combinations ## - extract potential sub-language pairs from combinations involving macro-languages +## - store those testsets in the multilingual model's test directory +.PHONY: tatoeba-multilingual-testsets +tatoeba-multilingual-testsets: ${WORKHOME}/${LANGPAIRSTR}/test/Tatoeba${TATOEBA_VERSION_NOHYPHEN}-testsets.done +MULTILING_TESTSETS_DONE = ${WORKHOME}/${LANGPAIRSTR}/test/Tatoeba-testsets.done \ + ${WORKHOME}/${LANGPAIRSTR}/test/Tatoeba${TATOEBA_VERSION_NOHYPHEN}-testsets.done -# if [ ! 
-e ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.src ]; then \ -# fi \ - -${TATOEBA_WORK}/${LANGPAIRSTR}/test/Tatoeba-testsets.done \ -${TATOEBA_WORK}/${LANGPAIRSTR}/test/Tatoeba${TATOEBA_VERSION_NOHYPHEN}-testsets.done: - @mkdir -p ${TATOEBA_WORK}/${LANGPAIRSTR}/test +${MULTILING_TESTSETS_DONE}: + @mkdir -p ${WORKHOME}/${LANGPAIRSTR}/test @for s in ${SRCLANGS}; do \ for t in ${TRGLANGS}; do \ - wget -q -O ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.tmp \ + wget -q -O ${WORKHOME}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.tmp \ ${TATOEBA_RAWGIT_RELEASE}/data/test/$$s-$$t/test.txt; \ - if [ -s ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.tmp ]; then \ - cat ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.tmp $(FIXLANGIDS) \ - > ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.txt; \ + if [ -s ${WORKHOME}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.tmp ]; then \ + cat ${WORKHOME}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.tmp $(FIXLANGIDS) \ + > ${WORKHOME}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.txt; \ if [ "$$s-$$t" != ${LANGPAIRSTR} ]; then \ echo "make ${TATOEBA_TESTSET}.$$s-$$t"; \ if [ "${USE_TARGET_LABELS}" == "1" ]; then \ - cut -f2,3 ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.txt | \ + cut -f2,3 ${WORKHOME}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.txt | \ sed 's/^\([^ ]*\) />>\1<< /' \ - > ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.src; \ + > ${WORKHOME}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.src; \ else \ - cut -f3 ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.txt \ - > ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.src; \ + cut -f3 ${WORKHOME}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.txt \ + > ${WORKHOME}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.src; \ fi; \ - cut -f4 ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.txt \ - > 
${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.trg; \ + cut -f4 ${WORKHOME}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.txt \ + > ${WORKHOME}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.trg; \ fi; \ - S=`cut -f1 ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.txt |\ + S=`cut -f1 ${WORKHOME}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.txt |\ sort -u | tr "\n" ' ' | sed 's/ *$$//'`; \ - T=`cut -f2 ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.txt |\ + T=`cut -f2 ${WORKHOME}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.txt |\ sort -u | tr "\n" ' ' | sed 's/ *$$//'`; \ if [ `echo "$$S $$T" | tr ' ' "\n" | wc -l` -gt 2 ]; then \ echo "extracting test sets for individual sub-language pairs!"; \ for a in $$S; do \ for b in $$T; do \ if [ "$$a-$$b" != ${LANGPAIRSTR} ]; then \ - if [ ! -e ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$a-$$b.src ]; then \ + if [ ! -e ${WORKHOME}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$a-$$b.src ]; then \ echo "make ${TATOEBA_TESTSET}.$$a-$$b"; \ if [ "${USE_TARGET_LABELS}" == "1" ]; then \ - grep "$$a $$b " < ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.txt |\ + grep "$$a $$b " < ${WORKHOME}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.txt |\ cut -f2,3 | sed 's/^\([^ ]*\) />>\1<< /' \ - > ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$a-$$b.src; \ + > ${WORKHOME}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$a-$$b.src; \ else \ - grep "$$a $$b " < ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.txt |\ + grep "$$a $$b " < ${WORKHOME}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.txt |\ cut -f3 | sed 's/^\([^ ]*\) />>\1<< /' \ - > ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$a-$$b.src; \ + > ${WORKHOME}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$a-$$b.src; \ fi; \ - grep "$$a $$b " < ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.txt |\ + grep "$$a $$b " < 
${WORKHOME}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.txt |\ cut -f4 | sed 's/^\([^ ]*\) />>\1<< /' \ - > ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$a-$$b.trg; \ + > ${WORKHOME}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$a-$$b.trg; \ fi \ fi \ done \ done \ fi; \ else \ - wget -q -O ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.tmp \ + wget -q -O ${WORKHOME}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.tmp \ ${TATOEBA_RAWGIT_RELEASE}/data/test/$$t-$$s/test.txt; \ - if [ -s ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.tmp ]; then \ - cat ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.tmp $(FIXLANGIDS) \ - > ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.txt; \ + if [ -s ${WORKHOME}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.tmp ]; then \ + cat ${WORKHOME}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.tmp $(FIXLANGIDS) \ + > ${WORKHOME}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.txt; \ if [ "$$s-$$t" != ${LANGPAIRSTR} ]; then \ echo "make ${TATOEBA_TESTSET}.$$s-$$t"; \ if [ "${USE_TARGET_LABELS}" == "1" ]; then \ - cut -f1,4 ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.txt | \ + cut -f1,4 ${WORKHOME}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.txt | \ sed 's/^\([^ ]*\) />>\1<< /' \ - > ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.src; \ + > ${WORKHOME}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.src; \ else \ - cut -f4 ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.txt \ - > ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.src; \ + cut -f4 ${WORKHOME}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.txt \ + > ${WORKHOME}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.src; \ fi; \ - cut -f3 ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.txt \ - > ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.trg; \ + cut -f3 ${WORKHOME}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.txt \ + 
> ${WORKHOME}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.trg; \ fi; \ - S=`cut -f2 ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.txt |\ + S=`cut -f2 ${WORKHOME}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.txt |\ sort -u | tr "\n" ' ' | sed 's/ *$$//'`; \ - T=`cut -f1 ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.txt |\ + T=`cut -f1 ${WORKHOME}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.txt |\ sort -u | tr "\n" ' ' | sed 's/ *$$//'`; \ if [ `echo "$$S $$T" | tr ' ' "\n" | wc -l` -gt 2 ]; then \ echo "extracting test sets for individual sub-language pairs!"; \ for a in $$S; do \ for b in $$T; do \ if [ "$$a-$$b" != ${LANGPAIRSTR} ]; then \ - if [ ! -e ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$a-$$b.src ]; then \ + if [ ! -e ${WORKHOME}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$a-$$b.src ]; then \ echo "make ${TATOEBA_TESTSET}.$$a-$$b"; \ if [ "${USE_TARGET_LABELS}" == "1" ]; then \ - grep "$$b $$a " < ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.txt |\ + grep "$$b $$a " < ${WORKHOME}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.txt |\ cut -f1,4 | sed 's/^\([^ ]*\) />>\1<< /' \ - > ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$a-$$b.src; \ + > ${WORKHOME}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$a-$$b.src; \ else \ - grep "$$b $$a " < ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.txt |\ + grep "$$b $$a " < ${WORKHOME}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.txt |\ cut -f4 | sed 's/^\([^ ]*\) />>\1<< /' \ - > ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$a-$$b.src; \ + > ${WORKHOME}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$a-$$b.src; \ fi; \ - grep "$$b $$a " < ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.txt |\ + grep "$$b $$a " < ${WORKHOME}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.txt |\ cut -f3 | sed 's/^\([^ ]*\) />>\1<< /' \ - > ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$a-$$b.trg; \ + > 
${WORKHOME}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$a-$$b.trg; \ fi \ fi \ done \ @@ -1563,8 +1531,8 @@ ${TATOEBA_WORK}/${LANGPAIRSTR}/test/Tatoeba${TATOEBA_VERSION_NOHYPHEN}-testsets. fi; \ fi \ fi; \ - rm -f ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.tmp; \ - rm -f ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.txt; \ + rm -f ${WORKHOME}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.tmp; \ + rm -f ${WORKHOME}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.txt; \ done \ done if [ -d ${dir $@} ]; then \ @@ -1572,114 +1540,10 @@ ${TATOEBA_WORK}/${LANGPAIRSTR}/test/Tatoeba${TATOEBA_VERSION_NOHYPHEN}-testsets. fi - - - -# ${TATOEBA_WORK}/${LANGPAIRSTR}/test/Tatoeba-testsets-with-subsets.done: -# @for s in ${SRCLANGS}; do \ -# for t in ${TRGLANGS}; do \ -# wget -q -O ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.tmp \ -# ${TATOEBA_RAWGIT}/data/test/$$s-$$t/test.txt; \ -# if [ -s ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.tmp ]; then \ -# echo "make ${TATOEBA_TESTSET}.$$s-$$t"; \ -# cat ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.tmp $(FIXLANGIDS) \ -# > ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.txt; \ -# if [ "${USE_TARGET_LABELS}" == "1" ]; then \ -# cut -f2,3 ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.txt | \ -# sed 's/^\([^ ]*\) />>\1<< /' \ -# > ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.src; \ -# else \ -# cut -f3 ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.txt \ -# > ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.src; \ -# fi; \ -# cut -f4 ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.txt \ -# > ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.trg; \ -# S=`cut -f1 ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.txt |\ -# sort -u | tr "\n" ' ' | sed 's/ *$$//'`; \ -# T=`cut -f2 
${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.txt |\ -# sort -u | tr "\n" ' ' | sed 's/ *$$//'`; \ -# echo "languages found: $$S $$T"; \ -# if [ `echo "$$S $$T" | tr ' ' "\n" | wc -l` -gt 2 ]; then \ -# echo "extracting test sets for individual sub-language pairs!"; \ -# for a in $$S; do \ -# for b in $$T; do \ -# echo "make ${TATOEBA_TESTSET}.$$s-$$t.$$a-$$b"; \ -# if [ "${USE_TARGET_LABELS}" == "1" ]; then \ -# grep "$$a $$b " < ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.txt |\ -# cut -f2,3 | sed 's/^\([^ ]*\) />>\1<< /' \ -# > ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.$$a-$$b.src; \ -# else \ -# grep "$$a $$b " < ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.txt |\ -# cut -f3 | sed 's/^\([^ ]*\) />>\1<< /' \ -# > ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.$$a-$$b.src; \ -# fi; \ -# grep "$$a $$b " < ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.txt |\ -# cut -f4 | sed 's/^\([^ ]*\) />>\1<< /' \ -# > ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.$$a-$$b.trg; \ -# done \ -# done \ -# fi; \ -# else \ -# wget -q -O ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.tmp \ -# ${TATOEBA_RAWGIT}/data/test/$$t-$$s/test.txt; \ -# if [ -s ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.tmp ]; then \ -# echo "make ${TATOEBA_TESTSET}.$$s-$$t"; \ -# cat ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.tmp $(FIXLANGIDS) \ -# > ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.txt; \ -# if [ "${USE_TARGET_LABELS}" == "1" ]; then \ -# cut -f1,4 ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.txt | \ -# sed 's/^\([^ ]*\) />>\1<< /' \ -# > ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.src; \ -# else \ -# cut -f4 ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.txt \ -# > ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.src; \ -# fi; \ -# cut 
-f3 ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.txt \ -# > ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.trg; \ -# S=`cut -f2 ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.txt |\ -# sort -u | tr "\n" ' ' | sed 's/ *$$//'`; \ -# T=`cut -f1 ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.txt |\ -# sort -u | tr "\n" ' ' | sed 's/ *$$//'`; \ -# echo "languages found: $$S $$T"; \ -# if [ `echo "$$S $$T" | tr ' ' "\n" | wc -l` -gt 2 ]; then \ -# echo "extracting test sets for individual sub-language pairs!"; \ -# for a in $$S; do \ -# for b in $$T; do \ -# echo "make ${TATOEBA_TESTSET}.$$s-$$t.$$a-$$b"; \ -# if [ "${USE_TARGET_LABELS}" == "1" ]; then \ -# grep "$$b $$a " < ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.txt |\ -# cut -f1,4 | sed 's/^\([^ ]*\) />>\1<< /' \ -# > ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.$$a-$$b.src; \ -# else \ -# grep "$$b $$a " < ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.txt |\ -# cut -f4 | sed 's/^\([^ ]*\) />>\1<< /' \ -# > ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.$$a-$$b.src; \ -# fi; \ -# grep "$$b $$a " < ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.txt |\ -# cut -f3 | sed 's/^\([^ ]*\) />>\1<< /' \ -# > ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.$$a-$$b.trg; \ -# done \ -# done \ -# fi; \ -# fi \ -# fi; \ -# rm -f ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.tmp; \ -# rm -f ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.txt; \ -# done \ -# done -# if [ -d ${dir $@} ]; then \ -# touch $@; \ -# fi - - - - - - ## TODO: ## get test sets for sublanguages in sets of macro-languages -${TATOEBA_WORK}/${LANGPAIRSTR}/test/Tatoeba-testsets-langpairs.done: +${WORKHOME}/${LANGPAIRSTR}/test/Tatoeba-testsets-langpairs.done: @for s in ${SRCLANGS}; do \ for t in ${TRGLANGS}; do \ done \ @@ -1688,45 +1552,19 @@ 
${TATOEBA_WORK}/${LANGPAIRSTR}/test/Tatoeba-testsets-langpairs.done: -##---------------------------------------------------------------------------- -## TODO: we need some procedures to run evaluations -## for already released models -## the code below fails because of various dependencies etc ... -##---------------------------------------------------------------------------- - -RELEASED_TATOEBA_MODEL = fiu-cpp/opus-2021-02-18.zip -RELEASED_TATOEBA_SRC2TRG = $(subst -,2,$(subst /,,$(dir ${RELEASED_TATOEBA_MODEL}))) -RELEASED_TATOEBA_MODEL_URL = https://object.pouta.csc.fi/${TATOEBA_MODEL_CONTAINER}/${RELEASED_TATOEBA_MODEL} -EVAL_TATOEBA_WORKHOME = ${PWD}/work-eval -EVAL_TATOEBA_WORKDIR = ${EVAL_TATOEBA_WORKHOME}/$(dir ${RELEASED_TATOEBA_MODEL}) - -evaluate-released-tatoeba-model: - mkdir -p ${EVAL_TATOEBA_WORKDIR} - wget -O ${EVAL_TATOEBA_WORKHOME}/${RELEASED_TATOEBA_MODEL} ${RELEASED_TATOEBA_MODEL_URL} - cd ${EVAL_TATOEBA_WORKDIR} && unzip -o $(notdir ${RELEASED_TATOEBA_MODEL}) - ${MAKE} TATOEBA_WORK=${EVAL_TATOEBA_WORKHOME} \ - DECODER_CONFIG=${EVAL_TATOEBA_WORKDIR}decoder.yml \ - MODEL_FINAL=`grep .npz ${EVAL_TATOEBA_WORKDIR}decoder.yml | sed 's/^ *- *//'` \ - SPMSRCMODEL=${EVAL_TATOEBA_WORKDIR}source.spm \ - SPMTRGMODEL=${EVAL_TATOEBA_WORKDIR}target.spm \ - tatoeba-${RELEASED_TATOEBA_SRC2TRG}-testsets - -##---------------------------------------------------------------------------- - - - - ############################################################################### ## generic targets for tatoba models ############################################################################### -.PHONY: tatoeba-langlabel-files -tatoeba-langlabel-files: ${TATOEBA_WORK}/${LANGPAIRSTR}/${DATASET}-langlabels.src \ - ${TATOEBA_WORK}/${LANGPAIRSTR}/${DATASET}-langlabels.trg \ - ${TATOEBA_WORK}/${LANGPAIRSTR}/${DATASET}-languages.src \ - ${TATOEBA_WORK}/${LANGPAIRSTR}/${DATASET}-languages.trg +# ${TATOEBA_SRCLABELFILE} ${TATOEBA_TRGLABELFILE} 
-${TATOEBA_WORK}/${LANGPAIRSTR}/${DATASET}-languages.%: ${TATOEBA_WORK}/${LANGPAIRSTR}/${DATASET}-langlabels.% +.PHONY: tatoeba-langlabel-files langlabel-files +tatoeba-langlabel-files langlabel-files: ${WORKHOME}/${LANGPAIRSTR}/${DATASET}-langlabels.src \ + ${WORKHOME}/${LANGPAIRSTR}/${DATASET}-langlabels.trg \ + ${WORKHOME}/${LANGPAIRSTR}/${DATASET}-languages.src \ + ${WORKHOME}/${LANGPAIRSTR}/${DATASET}-languages.trg + +${WORKHOME}/${LANGPAIRSTR}/${DATASET}-languages.%: ${WORKHOME}/${LANGPAIRSTR}/${DATASET}-langlabels.% mkdir -p ${dir $@} cat $< | tr ' ' "\n" | cut -f1 -d'_' | cut -f1 -d'-' | \ sed 's/ *$$//;s/^ *//' | tr "\n" ' ' > $@ @@ -1735,8 +1573,7 @@ ${TATOEBA_WORK}/${LANGPAIRSTR}/${DATASET}-languages.%: ${TATOEBA_WORK}/${LANGPAI ## generic target for tatoeba challenge jobs %-tatoeba: ${TATOEBA_SRCLABELFILE} ${TATOEBA_TRGLABELFILE} ${MAKE} ${TATOEBA_LANGIDS_TRAINONLY} - ${MAKE} ${TATOEBA_PARAMS} \ - LANGPAIRSTR=${LANGPAIRSTR} \ + ${MAKE} LANGPAIRSTR=${LANGPAIRSTR} \ SRCLANGS="${shell cat ${word 1,$^}}" \ TRGLANGS="${shell cat ${word 2,$^}}" \ SRC=${SRC} TRG=${TRG} \ @@ -1744,8 +1581,8 @@ ${TATOEBA_WORK}/${LANGPAIRSTR}/${DATASET}-languages.%: ${TATOEBA_WORK}/${LANGPAI ${@:-tatoeba=} -%-bttatoeba: ${TATOEBA_WORK}/${LANGPAIRSTR}/${DATASET}-langlabels.src \ - ${TATOEBA_WORK}/${LANGPAIRSTR}/${DATASET}-langlabels.trg +%-bttatoeba: ${WORKHOME}/${LANGPAIRSTR}/${DATASET}-langlabels.src \ + ${WORKHOME}/${LANGPAIRSTR}/${DATASET}-langlabels.trg for s in ${shell cat ${word 1,$^}}; do \ for t in ${shell cat ${word 2,$^}}; do \ echo "${MAKE} -C backtranslate \ @@ -1767,10 +1604,10 @@ ${TATOEBA_WORK}/${LANGPAIRSTR}/${DATASET}-languages.%: ${TATOEBA_WORK}/${LANGPAI ## fetch data for all language combinations ## TODO: should we check whether we are supposed to skip some language pairs? 
-.PHONY: fetch-tatoeba-datasets -fetch-tatoeba-datasets: - -for s in ${SRCLANGS}; do \ - for t in ${TRGLANGS}; do \ +.PHONY: fetch-tatoeba-datasets fetch-datasets +fetch-datasets fetch-tatoeba-datasets: + -for s in ${sort ${SRC} ${SRCLANGS}}; do \ + for t in ${sort ${TRG} ${TRGLANGS}}; do \ if [ "$$s" \< "$$t" ]; then \ ${MAKE} SRCLANGS=$$s TRGLANGS=$$t \ ${TATOEBA_DATA}/${TATOEBA_TRAINSET}.$$s-$$t.clean.$$s.gz; \ @@ -1786,7 +1623,7 @@ fetch-tatoeba-datasets: ## (each language pair may include several language variants) ## --> this is necessary to set the languages that are present in a model -${TATOEBA_WORK}/${LANGPAIRSTR}/${DATASET}-langlabels.src: +${WORKHOME}/${LANGPAIRSTR}/${DATASET}-langlabels.src: ${MAKE} fetch-tatoeba-datasets mkdir -p ${dir $@} for s in ${SRCLANGS}; do \ @@ -1821,7 +1658,7 @@ ${TATOEBA_WORK}/${LANGPAIRSTR}/${DATASET}-langlabels.src: fi -${TATOEBA_WORK}/${LANGPAIRSTR}/${DATASET}-langlabels.trg: ${TATOEBA_WORK}/${LANGPAIRSTR}/${DATASET}-langlabels.src +${WORKHOME}/${LANGPAIRSTR}/${DATASET}-langlabels.trg: ${WORKHOME}/${LANGPAIRSTR}/${DATASET}-langlabels.src if [ ! -e $@ ]; then rm $<; ${MAKE} $<; fi echo "done" @@ -1923,6 +1760,7 @@ ${TATOEBA_MONO}/%.labels: rm -fr $@.d +##------------------------------------------------------------------------------------------- ## convert Tatoeba Challenge data into the format we need ## - move the data into the right location with the suitable name ## - create devset if not given (part of training data) @@ -1930,172 +1768,143 @@ ${TATOEBA_MONO}/%.labels: ## (if there is more than one language pair in the collection) ## ## TODO: should we do some filtering like bitext-match, OPUS-filter ... 
+##------------------------------------------------------------------------------------------- -TATOEBA_TMPDATADIR = data/release/${TATOEBA_VERSION}/${LANGPAIR} +## relative directory within the data distributions of Tatoeba MT data files +TATOEBADATA = data/release/${TATOEBA_VERSION}/${LANGPAIR} +## fetch and convert the data and check whether we should extract +## sub-language pairs from the collection %/${TATOEBA_TRAINSET}.${LANGPAIR}.clean.${SRCEXT}.gz: - @mkdir -p $@.d - -wget -q -O $@.d/train.tar ${TATOEBA_TRAIN_URL}/${LANGPAIR}.tar - -tar -C $@.d -xf $@.d/train.tar - @rm -f $@.d/train.tar - @if [ -e $@.d/${TATOEBA_TMPDATADIR}/test.src ]; then \ - echo "........ move test files to ${dir $@}${TATOEBA_TESTSET}.${LANGPAIR}.clean.*"; \ - mv $@.d/${TATOEBA_TMPDATADIR}/test.src ${dir $@}${TATOEBA_TESTSET}.${LANGPAIR}.clean.${SRCEXT}; \ - mv $@.d/${TATOEBA_TMPDATADIR}/test.trg ${dir $@}${TATOEBA_TESTSET}.${LANGPAIR}.clean.${TRGEXT}; \ - cat $@.d/${TATOEBA_TMPDATADIR}/test.id $(FIXLANGIDS) > ${dir $@}${TATOEBA_TESTSET}.${LANGPAIR}.clean.id; \ - fi - @if [ -e $@.d/${TATOEBA_TMPDATADIR}/dev.src ] && \ - [ `cat $@.d/${TATOEBA_TMPDATADIR}/dev.src | wc -l` -gt 50 ]; then \ - echo "........ move dev files to ${dir $@}${TATOEBA_DEVSET}.${LANGPAIR}.clean.*"; \ - mv $@.d/${TATOEBA_TMPDATADIR}/dev.src ${dir $@}${TATOEBA_DEVSET}.${LANGPAIR}.clean.${SRCEXT}; \ - mv $@.d/${TATOEBA_TMPDATADIR}/dev.trg ${dir $@}${TATOEBA_DEVSET}.${LANGPAIR}.clean.${TRGEXT}; \ - cat $@.d/${TATOEBA_TMPDATADIR}/dev.id $(FIXLANGIDS) > ${dir $@}${TATOEBA_DEVSET}.${LANGPAIR}.clean.id; \ - if [ -e $@.d/${TATOEBA_TMPDATADIR}/train.src.gz ]; then \ - echo "........ 
move train files to ${dir $@}${TATOEBA_TRAINSET}.${LANGPAIR}.clean.*"; \ - ${GZCAT} $@.d/${TATOEBA_TMPDATADIR}/train.src.gz > ${dir $@}${TATOEBA_TRAINSET}.${LANGPAIR}.clean.${SRCEXT}; \ - ${GZCAT} $@.d/${TATOEBA_TMPDATADIR}/train.trg.gz > ${dir $@}${TATOEBA_TRAINSET}.${LANGPAIR}.clean.${TRGEXT}; \ - ${GZCAT} $@.d/${TATOEBA_TMPDATADIR}/train.id.gz | cut -f2,3 $(FIXLANGIDS) > ${dir $@}${TATOEBA_TRAINSET}.${LANGPAIR}.clean.id; \ - ${GZCAT} $@.d/${TATOEBA_TMPDATADIR}/train.id.gz | cut -f1 > ${dir $@}${TATOEBA_TRAINSET}.${LANGPAIR}.clean.domain; \ - fi; \ - else \ - if [ -e $@.d/${TATOEBA_TMPDATADIR}/train.src.gz ]; then \ - echo "........ too little devdata available - get top 1000 from training data!"; \ - ${GZCAT} $@.d/${TATOEBA_TMPDATADIR}/train.src.gz | head -1000 > ${dir $@}${TATOEBA_DEVSET}.${LANGPAIR}.clean.${SRCEXT}; \ - ${GZCAT} $@.d/${TATOEBA_TMPDATADIR}/train.trg.gz | head -1000 > ${dir $@}${TATOEBA_DEVSET}.${LANGPAIR}.clean.${TRGEXT}; \ - ${GZCAT} $@.d/${TATOEBA_TMPDATADIR}/train.id.gz | head -1000 | cut -f2,3 $(FIXLANGIDS) > ${dir $@}${TATOEBA_DEVSET}.${LANGPAIR}.clean.id; \ - ${GZCAT} $@.d/${TATOEBA_TMPDATADIR}/train.id.gz | head -1000 | cut -f1 > ${dir $@}${TATOEBA_DEVSET}.${LANGPAIR}.clean.domain; \ - ${GZCAT} $@.d/${TATOEBA_TMPDATADIR}/train.src.gz | tail -n +1001 > ${dir $@}${TATOEBA_TRAINSET}.${LANGPAIR}.clean.${SRCEXT}; \ - ${GZCAT} $@.d/${TATOEBA_TMPDATADIR}/train.trg.gz | tail -n +1001 > ${dir $@}${TATOEBA_TRAINSET}.${LANGPAIR}.clean.${TRGEXT}; \ - ${GZCAT} $@.d/${TATOEBA_TMPDATADIR}/train.id.gz | tail -n +1001 | cut -f2,3 $(FIXLANGIDS) > ${dir $@}${TATOEBA_TRAINSET}.${LANGPAIR}.clean.id; \ - ${GZCAT} $@.d/${TATOEBA_TMPDATADIR}/train.id.gz | tail -n +1001 | cut -f1 > ${dir $@}${TATOEBA_TRAINSET}.${LANGPAIR}.clean.domain; \ - fi; \ - if [ -e $@.d/${TATOEBA_TMPDATADIR}/dev.src ]; then \ - echo "........ 
add dev files to ${dir $@}${TATOEBA_DEVSET}.${LANGPAIR}.clean.*"; \ - cat $@.d/${TATOEBA_TMPDATADIR}/dev.src >> ${dir $@}${TATOEBA_DEVSET}.${LANGPAIR}.clean.${SRCEXT}; \ - cat $@.d/${TATOEBA_TMPDATADIR}/dev.trg >> ${dir $@}${TATOEBA_DEVSET}.${LANGPAIR}.clean.${TRGEXT}; \ - cat $@.d/${TATOEBA_TMPDATADIR}/dev.id $(FIXLANGIDS) >> ${dir $@}${TATOEBA_DEVSET}.${LANGPAIR}.clean.id; \ - fi \ - fi -## make sure that training data file exists even if it is empty - @if [ -e ${dir $@}${TATOEBA_TESTSET}.${LANGPAIR}.clean.${SRCEXT} ]; then \ - touch ${dir $@}${TATOEBA_TRAINSET}.${LANGPAIR}.clean.${SRCEXT}; \ - touch ${dir $@}${TATOEBA_TRAINSET}.${LANGPAIR}.clean.${TRGEXT}; \ - fi -####################################### -# save all lang labels that appear in the data -####################################### - @cut -f1 ${dir $@}Tatoeba-*.${LANGPAIR}.clean.id | sort -u | \ - grep -v '${SKIP_LANGIDS_PATTERN}' | \ - tr "\n" ' ' | sed 's/^ *//;s/ *$$//' > $(@:.${SRCEXT}.gz=.${SORTSRCEXT}.labels) - @cut -f2 ${dir $@}Tatoeba-*.${LANGPAIR}.clean.id | sort -u | \ - grep -v '${SKIP_LANGIDS_PATTERN}' | \ - tr "\n" ' ' | sed 's/^ *//;s/ *$$//' > $(@:.${SRCEXT}.gz=.${SORTTRGEXT}.labels) - @cat ${dir $@}Tatoeba-*.${LANGPAIR}.clean.domain | sort -u |\ - tr "\n" ' ' | sed 's/^ *//;s/ *$$//' > $(@:.${SRCEXT}.gz=.domains) -####################################### -# cleanup temporary data -####################################### - @if [ -d $@.d/data ]; then \ - rm -f $@.d/${TATOEBA_TMPDATADIR}/*; \ - rmdir $@.d/${TATOEBA_TMPDATADIR}; \ - rmdir $@.d/data/release/${TATOEBA_VERSION}; \ - rmdir $@.d/data/release; \ - rmdir $@.d/data; \ - fi - @rm -f $@.d/train.tar - @rmdir $@.d -####################################### -# make data sets for individual -# language pairs from the Tatoeba data -####################################### - @if [ -e $(@:.${SRCEXT}.gz=.${SRCEXT}.labels) ]; then \ - for s in `cat $(@:.${SRCEXT}.gz=.${SRCEXT}.labels)`; do \ - for t in `cat $(@:.${SRCEXT}.gz=.${TRGEXT}.labels)`; 
do \ + ${MAKE} $@.d/source.labels $@.d/target.labels + @if [ `cat $@.d/source.labels $@.d/target.labels | wc -w` -gt 2 ]; then \ + echo ".... found sublanguages in the data"; \ + b="$@.d/${TATOEBADATA}"; \ + for s in `cat $@.d/source.labels`; do \ + for t in `cat $@.d/target.labels`; do \ if [ "$$s" \< "$$t" ]; then \ - echo "extract $$s-$$t data"; \ + echo ".... extract $$s-$$t data"; \ for d in dev test train; do \ - if [ -e ${dir $@}Tatoeba-$$d-${TATOEBA_VERSION}.${LANGPAIR}.clean.id ]; then \ - paste ${dir $@}Tatoeba-$$d-${TATOEBA_VERSION}.${LANGPAIR}.clean.id \ - ${dir $@}Tatoeba-$$d-${TATOEBA_VERSION}.${LANGPAIR}.clean.${SRCEXT} \ - ${dir $@}Tatoeba-$$d-${TATOEBA_VERSION}.${LANGPAIR}.clean.${TRGEXT} |\ - grep -P "$$s\t$$t\t" | cut -f3,4 |\ - scripts/filter/filter-korean.sh ${SRC} ${TRG} $$d > ${dir $@}Tatoeba-$$d-${TATOEBA_VERSION}.$$s-$$t; \ - if [ -s ${dir $@}Tatoeba-$$d-${TATOEBA_VERSION}.$$s-$$t ]; then \ - echo "........ make ${dir $@}Tatoeba-$$d-${TATOEBA_VERSION}.$$s-$$t.clean.*.gz"; \ - cut -f1 ${dir $@}Tatoeba-$$d-${TATOEBA_VERSION}.$$s-$$t | ${GZIP} -c > ${dir $@}Tatoeba-$$d-${TATOEBA_VERSION}.$$s-$$t.clean.$$s.gz; \ - cut -f2 ${dir $@}Tatoeba-$$d-${TATOEBA_VERSION}.$$s-$$t | ${GZIP} -c > ${dir $@}Tatoeba-$$d-${TATOEBA_VERSION}.$$s-$$t.clean.$$t.gz; \ - fi; \ - rm -f ${dir $@}Tatoeba-$$d-${TATOEBA_VERSION}.$$s-$$t; \ - fi \ - done \ - else \ - echo "extract $$t-$$s data"; \ - for d in dev test train; do \ - if [ -e ${dir $@}Tatoeba-$$d-${TATOEBA_VERSION}.${LANGPAIR}.clean.id ]; then \ - paste ${dir $@}Tatoeba-$$d-${TATOEBA_VERSION}.${LANGPAIR}.clean.id \ - ${dir $@}Tatoeba-$$d-${TATOEBA_VERSION}.${LANGPAIR}.clean.${TRGEXT} \ - ${dir $@}Tatoeba-$$d-${TATOEBA_VERSION}.${LANGPAIR}.clean.${SRCEXT} |\ - grep -P "$$s\t$$t\t" | cut -f3,4 |\ - scripts/filter/filter-korean.sh ${TRG} ${SRC} $$d > ${dir $@}Tatoeba-$$d-${TATOEBA_VERSION}.$$t-$$s; \ - if [ -s ${dir $@}Tatoeba-$$d-${TATOEBA_VERSION}.$$t-$$s ]; then \ - echo "........ 
make ${dir $@}Tatoeba-$$d-${TATOEBA_VERSION}.$$t-$$s.clean.*.gz"; \ - cut -f1 ${dir $@}Tatoeba-$$d-${TATOEBA_VERSION}.$$t-$$s | ${GZIP} -c > ${dir $@}Tatoeba-$$d-${TATOEBA_VERSION}.$$t-$$s.clean.$$t.gz; \ - cut -f2 ${dir $@}Tatoeba-$$d-${TATOEBA_VERSION}.$$t-$$s | ${GZIP} -c > ${dir $@}Tatoeba-$$d-${TATOEBA_VERSION}.$$t-$$s.clean.$$s.gz; \ - fi; \ - rm -f ${dir $@}Tatoeba-$$d-${TATOEBA_VERSION}.$$t-$$s; \ - fi \ - done \ + paste <(gzip -cd $$b/$$d.id.gz) <(gzip -cd $$b/$$d.src.gz) <(gzip -cd $$b/$$d.trg.gz) | \ + grep -P "^$$s\t$$t\t" > $@.d/$$d; \ + if [ -s $@.d/$$d ]; then \ + cut -f1,2 $@.d/$$d | ${GZIP} -c > ${dir $@}Tatoeba-$$d-${TATOEBA_VERSION}.$$s-$$t.clean.id.gz; \ + cut -f3 $@.d/$$d | ${GZIP} -c > ${dir $@}Tatoeba-$$d-${TATOEBA_VERSION}.$$s-$$t.clean.$$s.gz; \ + cut -f4 $@.d/$$d | ${GZIP} -c > ${dir $@}Tatoeba-$$d-${TATOEBA_VERSION}.$$s-$$t.clean.$$t.gz; \ + fi \ + done; \ + if [ -e ${dir $@}Tatoeba-train-${TATOEBA_VERSION}.$$s-$$t.clean.id.gz ]; then \ + paste <(gzip -cd $$b/$$d.id.gz) <(gzip -cd $$b/$$d.domain.gz) | grep -P "^$$s\t$$t\t" | cut -f3 | \ + ${GZIP} -c > ${dir $@}Tatoeba-train-${TATOEBA_VERSION}.$$s-$$t.clean.domain.gz; \ + ${ZCAT} ${dir $@}Tatoeba-train-${TATOEBA_VERSION}.$$s-$$t.clean.domain.gz |\ + sort -u > ${dir $@}Tatoeba-train-${TATOEBA_VERSION}.$$s-$$t.clean.domains; \ + echo "$$s" >> ${dir $@}Tatoeba-train-${TATOEBA_VERSION}.$$s-$$t.clean.$$s.labels; \ + echo "$$t" >> ${dir $@}Tatoeba-train-${TATOEBA_VERSION}.$$s-$$t.clean.$$t.labels; \ + fi \ fi \ done \ done \ fi -####################################### -# Finally, compress the big files with -# all the different language variants. -# If the code is the same as one of the -# variants then remove the file instead. -####################################### - @for d in dev test train; do \ - if [ -e ${dir $@}Tatoeba-$$d-${TATOEBA_VERSION}.${LANGPAIR}.clean.${SRCEXT} ]; then \ - if [ ! 
-e ${dir $@}Tatoeba-$$d-${TATOEBA_VERSION}.${LANGPAIR}.clean.${SRCEXT}.gz ]; then \ - echo "........... compress ${dir $@}Tatoeba-$$d-${TATOEBA_VERSION}.${LANGPAIR}.clean.${SRCEXT}"; \ - ${GZIP} ${dir $@}Tatoeba-$$d-${TATOEBA_VERSION}.${LANGPAIR}.clean.${SRCEXT}; \ - else \ - rm -f ${dir $@}Tatoeba-$$d-${TATOEBA_VERSION}.${LANGPAIR}.clean.${SRCEXT}; \ - fi \ - fi; \ - if [ -e ${dir $@}Tatoeba-$$d-${TATOEBA_VERSION}.${LANGPAIR}.clean.${TRGEXT} ]; then \ - if [ ! -e ${dir $@}Tatoeba-$$d-${TATOEBA_VERSION}.${LANGPAIR}.clean.${TRGEXT}.gz ]; then \ - echo "........... compress ${dir $@}Tatoeba-$$d-${TATOEBA_VERSION}.${LANGPAIR}.clean.${TRGEXT}"; \ - ${GZIP} ${dir $@}Tatoeba-$$d-${TATOEBA_VERSION}.${LANGPAIR}.clean.${TRGEXT}; \ - else \ - rm -f ${dir $@}Tatoeba-$$d-${TATOEBA_VERSION}.${LANGPAIR}.clean.${TRGEXT}; \ - fi \ - fi; \ - if [ -e ${dir $@}Tatoeba-$$d-${TATOEBA_VERSION}.${LANGPAIR}.clean.domain ]; then \ - if [ ! -e ${dir $@}Tatoeba-$$d-${TATOEBA_VERSION}.${LANGPAIR}.clean.domain.gz ]; then \ - ${GZIP} ${dir $@}Tatoeba-$$d-${TATOEBA_VERSION}.${LANGPAIR}.clean.domain; \ - else \ - rm -f ${dir $@}Tatoeba-$$d-${TATOEBA_VERSION}.${LANGPAIR}.clean.domain; \ - fi \ - fi; \ - rm -f ${dir $@}Tatoeba-$$d-${TATOEBA_VERSION}.${LANGPAIR}.clean.id; \ - done + @if [ ! -e ${dir $@}Tatoeba-train-${TATOEBA_VERSION}.${LANGPAIR}.clean.${SRCEXT}.gz ]; then \ + echo ".... 
move data files"; \ + b="$@.d/${TATOEBADATA}"; \ + for d in dev test train; do \ + mv $$b/$$d.src.gz ${dir $@}Tatoeba-$$d-${TATOEBA_VERSION}.${LANGPAIR}.clean.${SORTSRCEXT}.gz; \ + mv $$b/$$d.trg.gz ${dir $@}Tatoeba-$$d-${TATOEBA_VERSION}.${LANGPAIR}.clean.${SORTTRGEXT}.gz; \ + mv $$b/$$d.id.gz ${dir $@}Tatoeba-$$d-${TATOEBA_VERSION}.${LANGPAIR}.clean.id.gz; \ + done; \ + ${ZCAT} $$b/train.domain.gz | sort -u | tr "\n" ' ' | sed 's/ *$$//' \ + > ${dir $@}Tatoeba-train-${TATOEBA_VERSION}.${LANGPAIR}.clean.domains; \ + mv $$b/train.domain.gz ${dir $@}Tatoeba-train-${TATOEBA_VERSION}.${LANGPAIR}.clean.domain.gz; \ + mv $@.d/source.labels ${dir $@}Tatoeba-train-${TATOEBA_VERSION}.${LANGPAIR}.clean.${SORTSRCEXT}.labels; \ + mv $@.d/target.labels ${dir $@}Tatoeba-train-${TATOEBA_VERSION}.${LANGPAIR}.clean.${SORTTRGEXT}.labels; \ + fi + @echo ".... cleanup of temporary files" + @rm -fr $@.d +## fetch data +%.gz.d/data.fetched: + @echo ".... fetch data (${LANGPAIR}.tar)" + @mkdir -p ${dir $@} + -wget -q -O ${dir $@}train.tar ${TATOEBA_TRAIN_URL}/${LANGPAIR}.tar + -tar -C ${dir $@} -xf ${dir $@}train.tar + @rm -f ${dir $@}train.tar + @touch $@ + + +## make dev data (extract additional examples from the training data if neccessary) +%.gz.d/devdata.created: %.gz.d/data.fetched + @if [ -e ${dir $@}/${TATOEBADATA}/dev.src ]; then \ + if [ `cat ${dir $@}/${TATOEBADATA}/dev.src | wc -l` -gt 50 ]; then \ + touch $@; \ + else \ + mv ${dir $@}/${TATOEBADATA}/dev.src $@.dev.src; \ + mv ${dir $@}/${TATOEBADATA}/dev.trg $@.dev.trg; \ + mv ${dir $@}/${TATOEBADATA}/dev.id $@.dev.id; \ + fi \ + fi + @if [ ! -e $@ ]; then \ + if [ -e ${dir $@}/${TATOEBADATA}/train.src.gz ]; then \ + echo "........ 
too little devdata available - get top 1000 from training data!"; \ + ${GZCAT} $@.d/${TATOEBADATA}/train.src.gz | head -1000 >> $@.dev.src; \ + ${GZCAT} $@.d/${TATOEBADATA}/train.trg.gz | head -1000 >> $@.dev.trg; \ + ${GZCAT} $@.d/${TATOEBADATA}/train.id.gz | head -1000 | cut -f2,3 >> $@.dev.id; \ + ${GZCAT} $@.d/${TATOEBADATA}/train.src.gz | tail -n +1001 | ${GZIP} -f > $@.src.gz; \ + ${GZCAT} $@.d/${TATOEBADATA}/train.trg.gz | tail -n +1001 | ${GZIP} -f > $@.trg.gz; \ + ${GZCAT} $@.d/${TATOEBADATA}/train.id.gz | tail -n +1001 | ${GZIP} -f > $@.id.gz; \ + mv $@.src.gz $@.d/${TATOEBADATA}/train.src.gz; \ + mv $@.trg.gz $@.d/${TATOEBADATA}/train.trg.gz; \ + mv $@.id.gz $@.d/${TATOEBADATA}/train.id.gz; \ + fi; \ + mv $@.dev.src ${dir $@}/${TATOEBADATA}/dev.src; \ + mv $@.dev.trg ${dir $@}/${TATOEBADATA}/dev.trg; \ + mv $@.dev.id ${dir $@}/${TATOEBADATA}/dev.id; \ + touch $@; \ + fi + +## fix language IDs and make sure that dev/test/train exist +%.gz.d/data.fixed: %.gz.d/devdata.created + @echo ".... 
fix language codes" + @if [ -e ${dir $@}/${TATOEBADATA}/train.id.gz ]; then \ + ${GZCAT} ${dir $@}${TATOEBADATA}/train.id.gz | cut -f2,3 $(FIXLANGIDS) | ${GZIP} -c > ${dir $@}train.id.gz; \ + ${GZCAT} ${dir $@}${TATOEBADATA}/train.id.gz | cut -f1 | ${GZIP} -c > ${dir $@}train.domain.gz; \ + mv ${dir $@}train.id.gz ${dir $@}train.domain.gz ${dir $@}${TATOEBADATA}/; \ + else \ + touch ${dir $@}${TATOEBADATA}/train.src ${dir $@}${TATOEBADATA}/train.trg; \ + touch ${dir $@}${TATOEBADATA}/train.id ${dir $@}${TATOEBADATA}/train.domain; \ + ${GZIP} -cd ${dir $@}${TATOEBADATA}/train.*; \ + fi + @touch ${dir $@}/${TATOEBADATA}/test.id ${dir $@}/${TATOEBADATA}/test.src ${dir $@}/${TATOEBADATA}/test.trg + @touch ${dir $@}/${TATOEBADATA}/dev.id ${dir $@}/${TATOEBADATA}/dev.src ${dir $@}/${TATOEBADATA}/dev.trg + @cat ${dir $@}${TATOEBADATA}/dev.id $(FIXLANGIDS) > ${dir $@}dev.id + @cat ${dir $@}${TATOEBADATA}/test.id $(FIXLANGIDS) > ${dir $@}test.id + @mv ${dir $@}dev.id ${dir $@}test.id ${dir $@}${TATOEBADATA}/ + @${GZIP} -f ${dir $@}${TATOEBADATA}/dev.* ${dir $@}${TATOEBADATA}/test.* + + +## get source language labels +%.gz.d/source.labels: %.gz.d/data.fixed + @${ZCAT} ${dir $@}${TATOEBADATA}/*.id.gz | cut -f1 | sort -u | \ + grep -v '${SKIP_LANGIDS_PATTERN}' | tr "\n" ' ' | sed 's/^ *//;s/ *$$//' > $@ + +## get target language labels +%.gz.d/target.labels: %.gz.d/data.fixed + @${ZCAT} ${dir $@}${TATOEBADATA}/*.id.gz | cut -f2 | sort -u | \ + grep -v '${SKIP_LANGIDS_PATTERN}' | tr "\n" ' ' | sed 's/^ *//;s/ *$$//' > $@ ## all the following data sets are created in the target of the #@ source language training data %/${TATOEBA_TRAINSET}.${LANGPAIR}.clean.${TRGEXT}.gz: %/${TATOEBA_TRAINSET}.${LANGPAIR}.clean.${SRCEXT}.gz - echo "done!" + @echo "done!" %/${TATOEBA_DEVSET}.${LANGPAIR}.clean.${SRCEXT}.gz %/${TATOEBA_DEVSET}.${LANGPAIR}.clean.${TRGEXT}.gz: %/${TATOEBA_TRAINSET}.${LANGPAIR}.clean.${TRGEXT}.gz - echo "done!" + @echo "done!" 
%/${TATOEBA_TESTSET}.${LANGPAIR}.clean.${SRCEXT}.gz %/${TATOEBA_TESTSET}.${LANGPAIR}.clean.${TRGEXT}.gz: %/${TATOEBA_TRAINSET}.${LANGPAIR}.clean.${TRGEXT}.gz - echo "done!" + @echo "done!" @@ -2103,7 +1912,7 @@ TATOEBA_TMPDATADIR = data/release/${TATOEBA_VERSION}/${LANGPAIR} test-tune-data: make SRCEXT=bre TRGEXT=eng LANGPAIR=bre-eng \ - ${TATOEBA_WORK}-test/data/simple/Tatoeba-OpenSubtitles-train.bre-eng.clean.bre.gz + ${WORKHOME}-test/data/simple/Tatoeba-OpenSubtitles-train.bre-eng.clean.bre.gz ## TODO: should we split into train/dev/test @@ -2128,338 +1937,6 @@ testsets/${LANGPAIR}/${TATOEBA_TESTSET}.${LANGPAIR}.%: ${TATOEBA_DATA}/${TATOEBA -# ############################################################################### -# ## generate result tables -# ############################################################################### - -# TATOEBA_READMES = $(wildcard ${TATOEBA_MODELSHOME}/*/README.md) - -# # RESULT_MDTABLE_HEADER = | Model | Language Pair | Test Set | chrF2 | BLEU | BP | Reference Length |\n|:---|----|----|----:|---:|----:|---:|\n -# # ADD_MDHEADER = perl -pe '@a=split;print "\n${RESULT_MDTABLE_HEADER}" if ($$b ne $$a[1]);$$b=$$a[1];' - -# results/tatoeba-results-al%.md: tatoeba-results-al% -# mkdir -p ${dir $@} -# echo "# Tatoeba translation results" >$@ -# echo "" >>$@ -# echo "Note that some links to the actual models below are broken" >> $@ -# echo "because the models are not yet released or their performance is too poor" >> $@ -# echo "to be useful for anything." >> $@ -# echo "" >> $@ -# echo '| Model | Test Set | chrF2 | BLEU | BP | Reference Length |' >> $@ -# echo '|:--|---|--:|--:|--:|--:|' >> $@ -# grep -v '^model' $< | grep -v -- '----' | grep . 
| sort -k2,2 -k3,3 -k4,4nr |\ -# perl -pe '@a=split;print "| lang = $$a[1] | | | |\n" if ($$b ne $$a[1]);$$b=$$a[1];' |\ -# cut -f1,3- |\ -# perl -pe '/^(\S*)\/(\S*)\t/;if (-d "${TATOEBA_MODELSHOME}/$$1"){s/^(\S*)\/(\S*)\t/[$$1\/$$2](..\/models\/$$1)\t/;}' |\ -# sed 's/ / | /g;s/^/| /;s/$$/ |/;s/${TATOEBA_TESTSET}/tatoeba/' |\ -# sed 's/\(news[^ ]*\)-...... /\1 /;s/\(news[^ ]*\)-.... /\1 /;' >> $@ - -# # sed 's#^\([^ ]*\)/\([^ ]*\)#[\1/\2](../models/\1)#' |\ - - -# results/tatoeba-models-all.md: tatoeba-models-all -# mkdir -p ${dir $@} -# echo "# Tatoeba translation models" >$@ -# echo "" >>$@ -# echo "The scores refer to results on ${TATOEBA_TESTSET} data" >> $@ -# echo "For multilingual models, it is a mix of all language pairs" >> $@ -# echo "" >> $@ -# echo '| Model | chrF2 | BLEU | BP | Reference Length |' >> $@ -# echo '|:--|--:|--:|--:|--:|' >> $@ -# cut -f1,4- $< | \ -# perl -pe '/^(\S*)\/(\S*)\t/;if (-d "${TATOEBA_MODELSHOME}/$$1"){s/^(\S*)\/(\S*)\t/[$$1\/$$2](..\/models\/$$1)\t/;}' |\ -# sed 's/ / | /g;s/^/| /;s/$$/ |/' >> $@ - - -# ## update files in the workdir -# ## (to be included in the git repository) - -# ${TATOEBA_WORK}/tatoeba-results%: tatoeba-results% -# mkdir -p ${dir $@} -# -cat $@ > $@.old -# cp $< $@.new -# cat $@.old $@.new | sort | uniq > $@ -# rm -f $@.old $@.new - -# ${TATOEBA_WORK}/tatoeba-models-all: tatoeba-models-all -# mkdir -p ${dir $@} -# -cat $@ > $@.old -# cp $< $@.new -# cat $@.old $@.new | sort | uniq > $@ -# rm -f $@.old $@.new - -# ## get all results for all models and test sets -# tatoeba-results-all: ${TATOEBA_READMES} -# find ${TATOEBA_WORK} -name '*.eval' | sort | xargs grep chrF2 > $@.1 -# find ${TATOEBA_WORK} -name '*.eval' | sort | xargs grep BLEU > $@.2 -# cut -f3 -d '/' $@.1 | sed 's/^.*\.\([^\.]*\)\.\([^\.]*\)\.eval:.*$$/\1-\2/' > $@.langpair -# cut -f3 -d '/' $@.1 | sed 's/\.\([^\.]*\)\.spm.*$$//;s/${TATOEBA_TESTSET}[^ ]*/${TATOEBA_TESTSET}/' > $@.testset -# cut -f3 -d '/' $@.1 | sed 
's/^.*\.\([^\.]*\)\.spm.*$$/\1/' > $@.dataset -# cut -f2 -d '/' $@.1 | sed 's/^.*\.\([^\.]*\)\.spm.*$$/\1/' > $@.modeldir -# cut -f2 -d '=' $@.1 | cut -f2 -d ' ' > $@.chrF2 -# cut -f2 -d '=' $@.2 | cut -f2 -d ' ' > $@.bleu -# cut -f3 -d '=' $@.2 | cut -f2 -d ' ' > $@.bp -# cut -f6 -d '=' $@.2 | cut -f2 -d ' ' | cut -f1 -d')' > $@.reflen -# paste -d'/' $@.modeldir $@.dataset > $@.model -# paste $@.model $@.langpair $@.testset $@.chrF2 $@.bleu $@.bp $@.reflen > $@ -# rm -f $@.model $@.langpair $@.testset $@.chrF2 $@.bleu $@.bp $@.reflen -# rm -f $@.modeldir $@.dataset $@.1 $@.2 - -# tatoeba-models-all: ${TATOEBA_READMES} -# find ${TATOEBA_WORK} -name '${TATOEBA_TESTSET}.opus*.eval' | sort | xargs grep chrF2 > $@.1 -# find ${TATOEBA_WORK} -name '${TATOEBA_TESTSET}.opus*.eval' | sort | xargs grep BLEU > $@.2 -# cut -f3 -d '/' $@.1 | sed 's/^.*\.\([^\.]*\)\.\([^\.]*\)\.eval:.*$$/\1-\2/' > $@.langpair -# cut -f3 -d '/' $@.1 | sed 's/\.\([^\.]*\)\.spm.*$$//;s/${TATOEBA_TESTSET}[^ ]*/${TATOEBA_TESTSET}/' > $@.testset -# cut -f3 -d '/' $@.1 | sed 's/^.*\.\([^\.]*\)\.spm.*$$/\1/' > $@.dataset -# cut -f2 -d '/' $@.1 | sed 's/^.*\.\([^\.]*\)\.spm.*$$/\1/' > $@.modeldir -# cut -f2 -d '=' $@.1 | cut -f2 -d ' ' > $@.chrF2 -# cut -f2 -d '=' $@.2 | cut -f2 -d ' ' > $@.bleu -# cut -f3 -d '=' $@.2 | cut -f2 -d ' ' > $@.bp -# cut -f6 -d '=' $@.2 | cut -f2 -d ' ' | cut -f1 -d')' > $@.reflen -# paste -d'/' $@.modeldir $@.dataset > $@.model -# paste $@.model $@.langpair $@.testset $@.chrF2 $@.bleu $@.bp $@.reflen > $@ -# rm -f $@.model $@.langpair $@.testset $@.chrF2 $@.bleu $@.bp $@.reflen -# rm -f $@.modeldir $@.dataset $@.1 $@.2 - -# ${TATOEBA_MODELSHOME}/released-models.txt: ${TATOEBA_READMES} -# -cat $@ > $@.old -# find ${TATOEBA_MODELSHOME}/ -name '*.eval.txt' | sort | xargs grep chrF2 > $@.1 -# find ${TATOEBA_MODELSHOME}/ -name '*.eval.txt' | sort | xargs grep BLEU > $@.2 -# cut -f3 -d '/' $@.1 | sed 's/\.eval.txt.*$$/.zip/' > $@.zip -# cut -f2 -d '/' $@.1 > $@.iso639-3 -# paste 
-d '/' $@.iso639-3 $@.zip | sed 's#^#${TATOEBA_DATAURL}/#' > $@.url -# cut -f2 -d '/' $@.1 | xargs iso639 -2 -k -p | tr ' ' "\n" > $@.iso639-1 -# cut -f2 -d '=' $@.1 | cut -f2 -d ' ' > $@.chrF2 -# cut -f2 -d '=' $@.2 | cut -f2 -d ' ' > $@.bleu -# cut -f3 -d '=' $@.2 | cut -f2 -d ' ' > $@.bp -# cut -f6 -d '=' $@.2 | cut -f2 -d ' ' | cut -f1 -d')' > $@.reflen -# cut -f2 -d '/' $@.1 | sed 's/^\([^ \-]*\)$$/\1-\1/g' | tr '-' ' ' | \ -# xargs iso639 -k | sed 's/$$/ /' |\ -# sed -e 's/\" \"\([^\"]*\)\" /\t\1\n/g' | sed 's/^\"//g' > $@.langs -# paste $@.url $@.iso639-3 $@.iso639-1 $@.chrF2 $@.bleu $@.bp $@.reflen $@.langs > $@ -# rm -f $@.url $@.iso639-3 $@.iso639-1 $@.chrF2 $@.bleu $@.bp $@.reflen $@.1 $@.2 $@.langs $@.zip -# cat $@.old $@.new | sort | uniq > $@ -# rm -f $@.old $@.new - -# ${TATOEBA_MODELSHOME}/released-model-results.txt: ${TATOEBA_READMES} -# -cat $@ > $@.old -# find ${TATOEBA_MODELSHOME}/ -name 'README.md' | sort | \ -# xargs egrep -h '^(# |\| ${TATOEBA_TESTSET}|\* download:)' |\ -# tr "\t" " " | tr "\n" "\t" | sed "s/# /\n# /g" |\ -# perl -e 'while (<>){s/^.*\((.*)\)/\1/;@_=split(/\t/);$$m=shift(@_);for (@_){print "$$_\t$$m\n";}}' |\ -# grep -v '.multi.' |\ -# sed -e 's/${TATOEBA_TESTSET}.\S*\(...\....\) /\1/' |\ -# grep '^|' |\ -# sed -e 's/ *| */\t/g' | cut -f2,3,4,6 > $@.new -# cat $@.old $@.new | sort -k1,1 -k3,3nr -k2,2nr -k4,4 | uniq > $@ -# rm -f $@.old $@.new - -# ## new: also consider the opposite translation direction! 
-# tatoeba-results-all-subset-%: tatoeba-%.md tatoeba-results-all-sorted-langpair -# ( l="${shell grep '\[' $< | cut -f2 -d '[' | cut -f1 -d ']' | sort -u | tr "\n" '|' | sed 's/|$$//;s/\-/\\\-/g'}"; \ -# r="${shell grep '\[' $< | cut -f2 -d '[' | cut -f1 -d ']' | sort -u | sed 's/\(...\)-\(...\)/\2-\1/' | tr "\n" '|' | sed 's/|$$//;s/\-/\\\-/g'}"; \ -# grep -P "$$l|$$r" ${word 2,$^} |\ -# perl -pe '@a=split;print "\n${RESULT_TABLE_HEADER}" if ($$b ne $$a[1]);$$b=$$a[1];' > $@ ) - -# tatoeba-results-all-langgroup: tatoeba-results-all -# grep -P "${subst ${SPACE},-eng|,${OPUS_LANG_GROUPS}}-eng" $< >> $@ -# grep -P "eng-${subst ${SPACE},|eng-,${OPUS_LANG_GROUPS}}" $< >> $@ -# grep -P "`echo '${OPUS_LANG_GROUPS}' | sed 's/\([^ ][^ ]*\)/\1-\1/g;s/ /\|/g'`" $< >> $@ - - -# RESULT_TABLE_HEADER=model\tlanguage-pair\ttestset\tchrF2\tBLEU\tBP\treference-length\n--------------------------------------------------------------------------\n - -# tatoeba-results-all-sorted-langpair: tatoeba-results-all -# sort -k2,2 -k3,3 -k4,4nr < $< |\ -# perl -pe '@a=split;print "\n${RESULT_TABLE_HEADER}" if ($$b ne $$a[1]);$$b=$$a[1];' \ -# > $@ - -# tatoeba-results-all-sorted-chrf2: tatoeba-results-all -# sort -k3,3 -k4,4nr < $< > $@ - -# tatoeba-results-all-sorted-bleu: tatoeba-results-all -# sort -k3,3 -k5,5nr < $< > $@ - - - - -# ############# -# ## OLD ones -# ############# - - -# results/tatoeba-results-langgroup.md: tatoeba-results-langgroup -# mkdir -p ${dir $@} -# echo "# Tatoeba translation results" > $@ -# echo "" >> $@ -# echo "Multilingual models for language groups according to ISO639-5." >> $@ -# echo "" >> $@ -# echo "Note that some links to the actual models below are broken" >> $@ -# echo "because the models are not yet released or their performance is too poor" >> $@ -# echo "to be useful for anything." 
>> $@ -# echo "" >> $@ -# echo "| Source | Target | Model | Test Set | chrF2 | BLEU |" >> $@ -# echo "|--------|--------|------:|---------------|------:|-----:|" >> $@ -# grep multi $< | cut -f1 | xargs iso639 -p | tr '"' "\n" | \ -# grep [a-z] | \ -# sed 's/based\-/based | /' |\ -# sed 's/languages\-/languages | /' |\ -# sed 's/English\-/English | /;s/^/| /;s/$$/ /' > $@.langpair -# grep multi $< |\ -# sed 's#^\([^ ]*\)#[\1](../models/\1)#' |\ -# sed 's/ / | /g;s/^/| /;s/$$/ |/' > $@.rest -# paste $@.langpair $@.rest -d ' ' >> $@ -# echo "" >> $@ -# echo "## Performance on individual language pairs" >> $@ -# echo "" >> $@ -# echo "Note that some of the test sets are way too small to be reliable!" >> $@ -# echo "" >> $@ -# echo "| Source | Target | Model | Test Set | chrF2 | BLEU |" >> $@ -# echo "|--------|--------|------:|---------------|------:|-----:|" >> $@ -# grep -v multi $< | cut -f1 | xargs iso639 -p | tr '"' "\n" | \ -# grep [a-z] | \ -# sed 's/based\-/based | /' |\ -# sed 's/languages\-/languages | /' |\ -# sed 's/English\-/English | /;s/^/| /;s/$$/ /' > $@.langpair -# grep -v multi $< |\ -# sed 's#^\([^ ]*\)#[\1](../models/\1)#' |\ -# sed 's/ / | /g;s/^/| /;s/$$/ |/' > $@.rest -# paste $@.langpair $@.rest -d ' ' >> $@ -# rm -f $@.langpair $@.rest - - -# results/tatoeba-results-%.md: tatoeba-results-% tatoeba-results-BLEU-sorted-model -# mkdir -p ${dir $@} -# echo "# Tatoeba translation results" >$@ -# echo "" >>$@ -# echo "Note that some links to the actual models below are broken" >>$@ -# echo "because the models are not yet released or their performance is too poor" >> $@ -# echo "to be useful for anything." 
>> $@ -# echo "" >>$@ -# echo "| Model | Test Set | chrF2 | BLEU |" >> $@ -# echo "|----------------------:|------------|-----------:|---------:|" >> $@ -# ( p=`grep -P 'ref_len = 1?[0-9]?[0-9]\)' tatoeba-results-BLEU-sorted-model | cut -f2 | sort -u | tr "\n" '|' | sed 's/|$$//'`; \ -# grep -v -P "\t($$p)\t" $< |\ -# sed 's#^\([^ ]*\)#[\1](../models/\1)#' |\ -# sed 's/ / | /g;s/^/| /;s/$$/ |/' >> $@ ) - - - -# results/tatoeba-results-chrF2%.md: tatoeba-results-chrF2% tatoeba-results-BLEU-sorted-model -# mkdir -p ${dir $@} -# echo "# Tatoeba translation results" >$@ -# echo "" >>$@ -# echo "| Model | Test Set | chrF2 |" >> $@ -# echo "|-----------------:|------------|-----------:|" >> $@ -# ( p=`grep -P 'ref_len = 1?[0-9]?[0-9]\)' tatoeba-results-BLEU-sorted-model | cut -f2 | sort -u | tr "\n" '|' | sed 's/|$$//'`; \ -# grep -v -P "\t($$p)\t" $< |\ -# sed 's/ / | /g;s/^/| /;s/$$/ |/' >> $@ ) - -# results/tatoeba-results-BLEU%.md: tatoeba-results-BLEU% tatoeba-results-BLEU-sorted-model -# mkdir -p ${dir $@} -# echo "# Tatoeba translation results" >$@ -# echo "" >>$@ -# echo "| Model | Test Set | BLEU | Details |" >> $@ -# echo "|-----------------:|------------|-----------:|---------:|" >> $@ -# ( p=`grep -P 'ref_len = 1?[0-9]?[0-9]\)' tatoeba-results-BLEU-sorted-model | cut -f2 | sort -u | tr "\n" '|' | sed 's/|$$//'`; \ -# grep -v -P "\t($$p)\t" $< |\ -# sed 's/ / | /g;s/^/| /;s/$$/ |/' >> $@ ) - -# tatoeba-results-sorted: -# grep chrF2 ${TATOEBA_WORK}/*/${TATOEBA_TESTSET}.*eval | \ -# sed 's/chrF2.*1.4.2//' | cut -f2- -d'/' | \ -# sed 's/Tatoeba.*\(transformer-align\.\|transformer\.\)/\./' | \ -# sed "s#/.#\t#" | \ -# sed 's#.eval: = #\t#' > $@.1 -# grep BLEU ${TATOEBA_WORK}/*/${TATOEBA_TESTSET}.*eval | \ -# cut -f3 -d' ' > $@.2 -# paste $@.1 $@.2 | sort -k3,3nr > $@ -# rm -f $@.1 $@.2 - -# ## results with chrF and BLEU scores sorted by language pair -# tatoeba-results-sorted-langpair: -# grep chrF2 ${TATOEBA_WORK}/*/${TATOEBA_TESTSET}.*eval | \ -# sed 
's/chrF2.*1.4.2//' | cut -f2- -d'/' | \ -# sed 's/Tatoeba.*\(transformer-align\.\|transformer\.\)/\./' | \ -# sed "s#/.#\t#" | \ -# sed 's#.eval: = #\t#' > $@.1 -# grep BLEU ${TATOEBA_WORK}/*/${TATOEBA_TESTSET}.*eval | \ -# cut -f3 -d' ' > $@.2 -# paste $@.1 $@.2 | sort -k2,2 -k3,3nr > $@ -# rm -f $@.1 $@.2 - -# tatoeba-results-sorted-model: -# grep chrF2 ${TATOEBA_WORK}/*/${TATOEBA_TESTSET}.*eval | \ -# sed 's/chrF2.*1.4.2//' | cut -f2- -d'/' | \ -# sed 's/Tatoeba.*\(transformer-align\.\|transformer\.\)/\./' | \ -# sed "s#/.#\t#" | \ -# sed 's#.eval: = #\t#' > $@.1 -# grep BLEU ${TATOEBA_WORK}/*/${TATOEBA_TESTSET}.*eval | \ -# cut -f3 -d' ' > $@.2 -# paste $@.1 $@.2 | sort -k1,1 -k3,3nr > $@ -# rm -f $@.1 $@.2 - -# tatoeba-results-BLEU-sorted: -# grep BLEU ${TATOEBA_WORK}/*/${TATOEBA_TESTSET}.*eval | \ -# sed 's/BLEU.*1.4.2//' | cut -f2- -d'/' |sort -k3,3nr | \ -# sed 's/Tatoeba.*\(transformer-align\.\|transformer\.\)/\./' | \ -# sed "s#/.#\t#" | \ -# sed 's#.eval: = #\t#' | sed 's/\([0-9]\) /\1 /' | grep -v eval > $@ - -# tatoeba-results-BLEU-sorted-model: -# grep BLEU ${TATOEBA_WORK}/*/${TATOEBA_TESTSET}.*eval | \ -# sed 's/BLEU.*1.4.2//' | cut -f2- -d'/' | \ -# sed 's/Tatoeba.*\(transformer-align\.\|transformer\.\)/\./' | \ -# sed "s#/.#\t#" | \ -# sed 's#.eval: = #\t#' | sed 's/\([0-9]\) /\1 /' | \ -# grep -v eval | sort -k1,1 -k3,3nr > $@ - -# tatoeba-results-BLEU-sorted-langpair: -# grep BLEU ${TATOEBA_WORK}/*/${TATOEBA_TESTSET}.*eval | \ -# sed 's/BLEU.*1.4.2//' | cut -f2- -d'/' | \ -# sed 's/Tatoeba.*\(transformer-align\.\|transformer\.\)/\./' | \ -# sed "s#/.#\t#" | \ -# sed 's#.eval: = #\t#' | sed 's/\([0-9]\) /\1 /' | \ -# grep -v eval | sort -k2,2 -k3,3nr > $@ - -# tatoeba-results-chrF2-sorted: -# grep chrF2 ${TATOEBA_WORK}/*/${TATOEBA_TESTSET}.*eval | \ -# sed 's/chrF2.*1.4.2//' | cut -f2- -d'/' |sort -k3,3nr | \ -# sed 's/Tatoeba.*\(transformer-align\.\|transformer\.\)/\./' | \ -# sed "s#/.#\t#" | \ -# sed 's#.eval: = #\t#' > $@ - -# 
tatoeba-results-chrF2-sorted-model: -# grep chrF2 ${TATOEBA_WORK}/*/${TATOEBA_TESTSET}.*eval | \ -# sed 's/chrF.*1.4.2//' | cut -f2- -d'/' | \ -# sed 's/Tatoeba.*\(transformer-align\.\|transformer\.\)/\./' | \ -# sed "s#/.#\t#" | \ -# sed 's#.eval: = #\t#' | sort -k1,1 -k3,3nr > $@ - -# tatoeba-results-chrF2-sorted-langpair: -# grep chrF2 ${TATOEBA_WORK}/*/${TATOEBA_TESTSET}.*eval | \ -# sed 's/chrF2.*1.4.2//' | cut -f2- -d'/' | \ -# sed 's/Tatoeba.*\(transformer-align\.\|transformer\.\)/\./' | \ -# sed "s#/.#\t#" | \ -# sed 's#.eval: = #\t#' | sort -k2,2 -k3,3nr > $@ - -# ## scores per subset -# tatoeba-results-subset-%: tatoeba-%.md tatoeba-results-sorted-langpair -# ( l="${shell grep '\[' $< | cut -f2 -d '[' | cut -f1 -d ']' | sort -u | tr "\n" '|' | tr '-' '.' | sed 's/|$$//;s/\./\\\./g'}"; \ -# grep -P "$$l" ${word 2,$^} > $@ ) - -# tatoeba-results-langgroup: tatoeba-results-sorted-model -# grep -P "${subst ${SPACE},-eng|,${OPUS_LANG_GROUPS}}-eng" $< >> $@ -# grep -P "eng-${subst ${SPACE},|eng-,${OPUS_LANG_GROUPS}}" $< >> $@ -# grep -P "`echo '${OPUS_LANG_GROUPS}' | sed 's/\([^ ][^ ]*\)/\1-\1/g;s/ /\|/g'`" $< >> $@ - - - - - - - - @@ -2468,7 +1945,34 @@ testsets/${LANGPAIR}/${TATOEBA_TESTSET}.${LANGPAIR}.%: ${TATOEBA_DATA}/${TATOEBA ############################################################################### -WRONGFILES = ${patsubst %.eval,%,${wildcard ${TATOEBA_WORK}/*/${TATOEBA_TESTSET}.opus*.eval}} +##---------------------------------------------------------------------------- +## TODO: we need some procedures to run evaluations +## for already released models +## the code below fails because of various dependencies etc ... +## --> moved evaluation to sub-dir eval!!! 
+##---------------------------------------------------------------------------- + +RELEASED_TATOEBA_MODEL = fiu-cpp/opus-2021-02-18.zip +RELEASED_TATOEBA_SRC2TRG = $(subst -,2,$(subst /,,$(dir ${RELEASED_TATOEBA_MODEL}))) +RELEASED_TATOEBA_MODEL_URL = https://object.pouta.csc.fi/${TATOEBA_MODEL_CONTAINER}/${RELEASED_TATOEBA_MODEL} +EVAL_WORKHOMEHOME = ${PWD}/work-eval +EVAL_WORKHOMEDIR = ${EVAL_WORKHOMEHOME}/$(dir ${RELEASED_TATOEBA_MODEL}) + +evaluate-released-tatoeba-model: + mkdir -p ${EVAL_WORKHOMEDIR} + wget -O ${EVAL_WORKHOMEHOME}/${RELEASED_TATOEBA_MODEL} ${RELEASED_TATOEBA_MODEL_URL} + cd ${EVAL_WORKHOMEDIR} && unzip -o $(notdir ${RELEASED_TATOEBA_MODEL}) + ${MAKE} WORKHOME=${EVAL_WORKHOMEHOME} \ + DECODER_CONFIG=${EVAL_WORKHOMEDIR}decoder.yml \ + MODEL_FINAL=`grep .npz ${EVAL_WORKHOMEDIR}decoder.yml | sed 's/^ *- *//'` \ + SPMSRCMODEL=${EVAL_WORKHOMEDIR}source.spm \ + SPMTRGMODEL=${EVAL_WORKHOMEDIR}target.spm \ + tatoeba-${RELEASED_TATOEBA_SRC2TRG}-testsets + +##---------------------------------------------------------------------------- + + +WRONGFILES = ${patsubst %.eval,%,${wildcard ${WORKHOME}/*/${TATOEBA_TESTSET}.opus*.eval}} move-wrong: for f in ${WRONGFILES}; do \ @@ -2487,16 +1991,16 @@ move-wrong: remove-old-groupeval: for g in ${OPUS_LANG_GROUPS}; do \ - rm -f ${TATOEBA_WORK}/$$g-eng/${TATOEBA_TESTSET}.${TATOEBA_DATASET}.spm32k-spm32k1.transformer.???.eng*; \ - rm -f ${TATOEBA_WORK}/eng-$$g/${TATOEBA_TESTSET}.${TATOEBA_DATASET}.spm32k-spm32k1.transformer.eng.???; \ - rm -f ${TATOEBA_WORK}/eng-$$g/${TATOEBA_TESTSET}.${TATOEBA_DATASET}.spm32k-spm32k1.transformer.eng.???.*; \ + rm -f ${WORKHOME}/$$g-eng/${TATOEBA_TESTSET}.${TATOEBA_DATASET}.spm32k-spm32k1.transformer.???.eng*; \ + rm -f ${WORKHOME}/eng-$$g/${TATOEBA_TESTSET}.${TATOEBA_DATASET}.spm32k-spm32k1.transformer.eng.???; \ + rm -f ${WORKHOME}/eng-$$g/${TATOEBA_TESTSET}.${TATOEBA_DATASET}.spm32k-spm32k1.transformer.eng.???.*; \ done remove-old-group: for g in ${OPUS_LANG_GROUPS}; do \ - if [ 
-e ${TATOEBA_WORK}/$$g-eng ]; then mv ${TATOEBA_WORK}/$$g-eng ${TATOEBA_WORK}/$$g-eng-old3; fi; \ - if [ -e ${TATOEBA_WORK}/eng-$$g ]; then mv ${TATOEBA_WORK}/eng-$$g ${TATOEBA_WORK}/eng-$$g-old3; fi; \ + if [ -e ${WORKHOME}/$$g-eng ]; then mv ${WORKHOME}/$$g-eng ${WORKHOME}/$$g-eng-old3; fi; \ + if [ -e ${WORKHOME}/eng-$$g ]; then mv ${WORKHOME}/eng-$$g ${WORKHOME}/eng-$$g-old3; fi; \ done @@ -2505,22 +2009,22 @@ remove-old-group: ## resume training for all bilingual models that are not yet converged .PHONY: tatoeba-resume-all tatoeba-continue-all tatoeba-resume-all tatoeba-continue-all: - for l in `find ${TATOEBA_WORK}/ -maxdepth 1 -mindepth 1 -type d -printf '%f '`; do \ + for l in `find ${WORKHOME}/ -maxdepth 1 -mindepth 1 -type d -printf '%f '`; do \ s=`echo $$l | cut -f1 -d'-'`; \ t=`echo $$l | cut -f2 -d'-'`; \ if [ -d ${HOME}/research/Tatoeba-Challenge/data/$$s-$$t ] || \ [ -d ${HOME}/research/Tatoeba-Challenge/data/$$t-$$s ]; then \ - if [ -d ${TATOEBA_WORK}/$$l ]; then \ - if [ ! `find ${TATOEBA_WORK}/$$l/ -maxdepth 1 -name '*.done' | wc -l` -gt 0 ]; then \ - if [ `find ${TATOEBA_WORK}/$$l/ -maxdepth 1 -name '*.npz' | wc -l` -gt 0 ]; then \ - echo "resume ${TATOEBA_WORK}/$$l"; \ + if [ -d ${WORKHOME}/$$l ]; then \ + if [ ! 
`find ${WORKHOME}/$$l/ -maxdepth 1 -name '*.done' | wc -l` -gt 0 ]; then \ + if [ `find ${WORKHOME}/$$l/ -maxdepth 1 -name '*.npz' | wc -l` -gt 0 ]; then \ + echo "resume ${WORKHOME}/$$l"; \ make SRCLANGS=$$s TRGLANGS=$$t all-job-tatoeba; \ else \ - echo "resume ${TATOEBA_WORK}/$$l"; \ + echo "resume ${WORKHOME}/$$l"; \ make SRCLANGS=$$s TRGLANGS=$$t tatoeba-job; \ fi \ else \ - echo "done ${TATOEBA_WORK}/$$l"; \ + echo "done ${WORKHOME}/$$l"; \ fi \ fi \ fi \ @@ -2530,18 +2034,18 @@ tatoeba-resume-all tatoeba-continue-all: ## make release package for all bilingual models that are converged .PHONY: tatoeba-dist-all tatoeba-dist-all: - for l in `find ${TATOEBA_WORK}/ -maxdepth 1 -mindepth 1 -type d -printf '%f '`; do \ + for l in `find ${WORKHOME}/ -maxdepth 1 -mindepth 1 -type d -printf '%f '`; do \ s=`echo $$l | cut -f1 -d'-'`; \ t=`echo $$l | cut -f2 -d'-'`; \ if [ -d ${HOME}/research/Tatoeba-Challenge/data/$$s-$$t ] || \ [ -d ${HOME}/research/Tatoeba-Challenge/data/$$t-$$s ]; then \ - if [ -d ${TATOEBA_WORK}/$$l ]; then \ - if [ `find ${TATOEBA_WORK}/$$l/ -maxdepth 1 -name '*transformer-align.model1.done' | wc -l` -gt 0 ]; then \ - echo "make release for ${TATOEBA_WORK}/$$l"; \ + if [ -d ${WORKHOME}/$$l ]; then \ + if [ `find ${WORKHOME}/$$l/ -maxdepth 1 -name '*transformer-align.model1.done' | wc -l` -gt 0 ]; then \ + echo "make release for ${WORKHOME}/$$l"; \ make SRCLANGS=$$s TRGLANGS=$$t MODELTYPE=transformer-align release-tatoeba; \ fi; \ - if [ `find ${TATOEBA_WORK}/$$l/ -maxdepth 1 -name '*transformer.model1.done' | wc -l` -gt 0 ]; then \ - echo "make release for ${TATOEBA_WORK}/$$l"; \ + if [ `find ${WORKHOME}/$$l/ -maxdepth 1 -name '*transformer.model1.done' | wc -l` -gt 0 ]; then \ + echo "make release for ${WORKHOME}/$$l"; \ make SRCLANGS=$$s TRGLANGS=$$t MODELTYPE=transformer release-tatoeba; \ fi; \ fi \ @@ -2551,23 +2055,23 @@ tatoeba-dist-all: fixlabels.sh: - for l in `find ${TATOEBA_WORK}-old/ -maxdepth 1 -mindepth 1 -type d -printf '%f '`; do \ 
+ for l in `find ${WORKHOME}-old/ -maxdepth 1 -mindepth 1 -type d -printf '%f '`; do \ s=`echo $$l | cut -f1 -d'-'`; \ t=`echo $$l | cut -f2 -d'-'`; \ if [ -d ${HOME}/research/Tatoeba-Challenge/data/$$s-$$t ] || \ [ -d ${HOME}/research/Tatoeba-Challenge/data/$$t-$$s ]; then \ - if [ -d ${TATOEBA_WORK}/$$l ]; then \ - echo "# ${TATOEBA_WORK}/$$l exists --- skip it!" >> $@; \ - echo "mv ${TATOEBA_WORK}-old/$$l ${TATOEBA_WORK}-double/$$l" >> $@; \ + if [ -d ${WORKHOME}/$$l ]; then \ + echo "# ${WORKHOME}/$$l exists --- skip it!" >> $@; \ + echo "mv ${WORKHOME}-old/$$l ${WORKHOME}-double/$$l" >> $@; \ else \ ${MAKE} SRCLANGS=$$s TRGLANGS=$$t tatoeba-labels; \ - o=`grep '*' ${TATOEBA_WORK}-old/$$l/train/README.md | cut -f1 -d: | grep '-' | sed 's/\* //g' | cut -f1 -d- | sort -u | tr "\n" ' '`; \ - O=`grep '*' ${TATOEBA_WORK}-old/$$l/train/README.md | cut -f1 -d: | grep '-' | sed 's/\* //g' | cut -f2 -d- | sort -u | tr "\n" ' '`; \ - n=`cat ${TATOEBA_WORK}/data/simple/${TATOEBA_TRAINSET}.$$l.clean.$$s.labels | tr ' ' "\n" | sort | grep . | tr "\n" ' '`; \ - N=`cat ${TATOEBA_WORK}/data/simple/${TATOEBA_TRAINSET}.$$l.clean.$$t.labels | tr ' ' "\n" | sort | grep . | tr "\n" ' '`; \ + o=`grep '*' ${WORKHOME}-old/$$l/train/README.md | cut -f1 -d: | grep '-' | sed 's/\* //g' | cut -f1 -d- | sort -u | tr "\n" ' '`; \ + O=`grep '*' ${WORKHOME}-old/$$l/train/README.md | cut -f1 -d: | grep '-' | sed 's/\* //g' | cut -f2 -d- | sort -u | tr "\n" ' '`; \ + n=`cat ${WORKHOME}/data/simple/${TATOEBA_TRAINSET}.$$l.clean.$$s.labels | tr ' ' "\n" | sort | grep . | tr "\n" ' '`; \ + N=`cat ${WORKHOME}/data/simple/${TATOEBA_TRAINSET}.$$l.clean.$$t.labels | tr ' ' "\n" | sort | grep . 
| tr "\n" ' '`; \ if [ "$$o" != "$$n" ] || [ "$$O" != "$$N" ] ; then \ echo "# labels in $$l are different ($$o / $$O - $$n / $$N)" >> $@; \ - if [ -d ${TATOEBA_WORK}-old/$$l ]; then \ + if [ -d ${WORKHOME}-old/$$l ]; then \ if [ "$$n" != " " ] && [ "$$n" != "" ]; then \ if [ "$$N" != " " ] && [ "$$N" != "" ]; then \ echo "# re-run $$l from scratch!" >> $@; \ @@ -2576,8 +2080,8 @@ fixlabels.sh: fi \ fi; \ else \ - if [ -d ${TATOEBA_WORK}-old/$$l ]; then \ - echo "mv ${TATOEBA_WORK}-old/$$l ${TATOEBA_WORK}/$$l" >> $@; \ + if [ -d ${WORKHOME}-old/$$l ]; then \ + echo "mv ${WORKHOME}-old/$$l ${WORKHOME}/$$l" >> $@; \ fi; \ fi; \ fi \ @@ -2586,27 +2090,27 @@ fixlabels.sh: tatoeba-missing-test: - for d in `find ${TATOEBA_WORK}/ -maxdepth 1 -type d -name '???-???' | cut -f2 -d/`; do \ - if [ ! -e ${TATOEBA_WORK}/$$d/test/${TATOEBA_TESTSET}.src ]; then \ - if [ `find ${TATOEBA_WORK}/$$d/train -name '*-model' | wc -l` -gt 0 ]; then \ + for d in `find ${WORKHOME}/ -maxdepth 1 -type d -name '???-???' | cut -f2 -d/`; do \ + if [ ! 
-e ${WORKHOME}/$$d/test/${TATOEBA_TESTSET}.src ]; then \ + if [ `find ${WORKHOME}/$$d/train -name '*-model' | wc -l` -gt 0 ]; then \ p=`echo $$d | sed 's/-/2/'`; \ echo "missing eval file for $$d"; \ - mkdir -p ${TATOEBA_WORK}-tmp/$$d/train; \ - rsync -av ${TATOEBA_WORK}/$$d/train/*model* ${TATOEBA_WORK}-tmp/$$d/train/; \ - make FIT_DATA_SIZE=1000 LANGGROUP_FIT_DATA_SIZE=1000 TATOEBA_WORK=${TATOEBA_WORK}-tmp tatoeba-$$p-data; \ - cp ${TATOEBA_WORK}-tmp/$$d/test/${TATOEBA_TESTSET}.* ${TATOEBA_WORK}/$$d/test/; \ - rm -fr ${TATOEBA_WORK}-tmp/$$d; \ + mkdir -p ${WORKHOME}-tmp/$$d/train; \ + rsync -av ${WORKHOME}/$$d/train/*model* ${WORKHOME}-tmp/$$d/train/; \ + make FIT_DATA_SIZE=1000 LANGGROUP_FIT_DATA_SIZE=1000 WORKHOME=${WORKHOME}-tmp tatoeba-$$p-data; \ + cp ${WORKHOME}-tmp/$$d/test/${TATOEBA_TESTSET}.* ${WORKHOME}/$$d/test/; \ + rm -fr ${WORKHOME}-tmp/$$d; \ fi \ fi \ done tatoeba-touch-test: - for d in `find ${TATOEBA_WORK}/ -maxdepth 1 -type d -name '???-???' | cut -f2 -d/`; do \ - if [ -e ${TATOEBA_WORK}/$$d/test/${TATOEBA_TESTSET}.src ]; then \ - if [ -e ${TATOEBA_WORK}/$$d/val/${TATOEBA_DEVSET}.src ]; then \ - touch -r ${TATOEBA_WORK}/$$d/val/${TATOEBA_DEVSET}.src ${TATOEBA_WORK}/$$d/test/${TATOEBA_TESTSET}.src*; \ - touch -r ${TATOEBA_WORK}/$$d/val/${TATOEBA_DEVSET}.src ${TATOEBA_WORK}/$$d/test/${TATOEBA_TESTSET}.trg*; \ + for d in `find ${WORKHOME}/ -maxdepth 1 -type d -name '???-???' 
| cut -f2 -d/`; do \ + if [ -e ${WORKHOME}/$$d/test/${TATOEBA_TESTSET}.src ]; then \ + if [ -e ${WORKHOME}/$$d/val/${TATOEBA_DEVSET}.src ]; then \ + touch -r ${WORKHOME}/$$d/val/${TATOEBA_DEVSET}.src ${WORKHOME}/$$d/test/${TATOEBA_TESTSET}.src*; \ + touch -r ${WORKHOME}/$$d/val/${TATOEBA_DEVSET}.src ${WORKHOME}/$$d/test/${TATOEBA_TESTSET}.trg*; \ fi \ fi \ done diff --git a/tatoeba/back-translate/Makefile b/tatoeba/back-translate/Makefile new file mode 100644 index 00000000..40578a06 --- /dev/null +++ b/tatoeba/back-translate/Makefile @@ -0,0 +1,624 @@ +# +# backtranslate wiki data with Tatoeba-MT challenge data +# +# only works with sentencepiece models! +# + +PWD := ${shell pwd} +REPOHOME := ${PWD}/../../ +TOOLSDIR := ${REPOHOME}tools + +include ${REPOHOME}lib/env.mk +include ${REPOHOME}lib/config.mk +include ${REPOHOME}lib/slurm.mk + + +SRC = fin +TRG = eng + + +## TODO: should use unshuffled versions and split into individual languages +## ---> otherwise we don't know the input language in case there are multiple ones + +TATOEBA_RELEASE = v2020-07-28 +TATOEBA_STORAGE = https://object.pouta.csc.fi/Tatoeba-Challenge-${TATOEBA_RELEASE} +TATOEBA_WIKI_STORAGE = https://object.pouta.csc.fi/Tatoeba-Challenge-WikiShuffled +TATOEBA_GITRAW = https://raw.githubusercontent.com/Helsinki-NLP/Tatoeba-Challenge/master +TATOEBA_RELEASED = ${TATOEBA_GITRAW}/models/released-model-results.txt +TATOEBA_RELEASED_ALL = ${TATOEBA_GITRAW}/models/released-model-results-all.txt +TATOEBA_RELEASED_BT = https://object.pouta.csc.fi/Tatoeba-MT-bt/released-data.txt +TATOEBA_MODEL_STORAGE = https://object.pouta.csc.fi/Tatoeba-MT-models + + + + +## container for storing backtranslations +BT_CONTAINER = Tatoeba-MT-bt +BT_CWORK_ONTAINER = project-Tatoeba-MT-bt + +## various sources are available +## can be general wikipedia, wikinews, wikibooks, ... 
+WIKISOURCE ?= wikipedia +# WIKISOURCE ?= wiki + +## split size in nr-of-lines +## default part to be selected = aa +SPLIT_SIZE ?= 1000000 + + +## maximum input length (number sentence piece segments) +## maximum number of sentences to be translated (top N lines) +MAX_LENGTH ?= 100 +MAX_SENTENCES ?= ${SPLIT_SIZE} + + +LANGPAIR = ${SRC}-${TRG} + +PWD := $(shell pwd) + + + +# MODELZIP = https://object.pouta.csc.fi/Tatoeba-Challenge/ang.tar +MODELZIP := ${shell wget -qq -O - ${TATOEBA_RELEASED_ALL} | grep '^${LANGPAIR}' | head -1 | cut -f4} +MODELINFO = ${patsubst ${TATOEBA_MODEL_STORAGE}/%.zip,${TATOEBA_GITRAW}/models/%.yml,${MODELZIP}} +MODELNAME = ${patsubst %.zip,%,${notdir ${MODELZIP}}} + +MULTI_TARGET_MODEL := ${shell wget -qq -O - ${MODELINFO} | grep 'use-target-labels' | wc -l} +ifneq (${MULTI_TARGET_MODEL},0) + TARGET_LANG_LABEL := ${shell wget -qq -O - ${MODELINFO} | grep -o '>>${TRG}.*<<'} +endif + + +## macro-language IDs +## TODO: need to do something better than hard-coding this here +TATOEBA_MACRO_LANGS = hbs nor msa + + +## target languages of reliable models for current source language +## reliable is defined as BLEU scores above 20.0 +## +TATOEBA_RELIABLE_TRG_BLEU := ${shell wget -qq -O - ${TATOEBA_RELEASED} | grep '^${SRC}-' | \ + egrep '\s[2-9][0-9]\.' | cut -f1 | sort -u | cut -f2 -d-} + +## alternative: chr-F2 >= 0.4 +TATOEBA_RELIABLE_TRG_CHRF := ${shell wget -qq -O - ${TATOEBA_RELEASED} | grep '^${SRC}-' | \ + egrep '[a-z]\s0\.[4-9]' | cut -f1 | sort -u | cut -f2 -d-} + +## accept both +TATOEBA_RELIABLE_TRG = $(filter-out ${TATOEBA_MACRO_LANGS},$(sort ${TATOEBA_RELIABLE_TRG_BLEU} ${TATOEBA_RELIABLE_TRG_CHRF})) + + +##################################################################################### +#### TODO: find wiki languages that we can translate +#### PROBLEM: a wiki release may include several languages (like hbs, nor, ...) 
+#####################################################################################
+
+## all "reliable" released translation models
+# TATOEBA_AVAILABLE_NMT := ${shell wget -qq -O - ${TATOEBA_RELEASED} | egrep '\s[2-9][0-9]\.' | cut -f1 | sort -u}
+
+TATOEBA_RELIABLE_SRC_BLEU := ${shell wget -qq -O - ${TATOEBA_RELEASED} | grep -- '-${TRG} ' | \
+			egrep '\s[2-9][0-9]\.' | cut -f1 | sort -u | cut -f1 -d-}
+
+TATOEBA_RELIABLE_SRC_CHRF := ${shell wget -qq -O - ${TATOEBA_RELEASED} | grep -- '-${TRG} ' | \
+			egrep '[a-z]\s0\.[4-9]' | cut -f1 | sort -u | cut -f1 -d-}
+
+TATOEBA_RELIABLE_SRC = $(sort ${TATOEBA_RELIABLE_SRC_BLEU} ${TATOEBA_RELIABLE_SRC_CHRF})
+
+
+## TODO: is it OK to turn zho into cmn?
+## NOTE: also needs to fix the grep pattern in recipe for ${WIKI_DIR}/${SRC} !!!!
+TATOEBA_WIKILANGS := ${shell wget -qq -O - ${TATOEBA_GITRAW}/data/release/${TATOEBA_RELEASE}/wiki.langs.txt | \
+			cut -f2 | sed 's/zho/cmn/' | sed 's/nor.*/nob/' | sort -u }
+
+TATOEBA_TRANSLATABLE_WIKILANGS := ${filter ${TATOEBA_RELIABLE_SRC},${TATOEBA_WIKILANGS}}
+TATOEBA_TRANSLATABLE_WIKILANGS3 := ${sort ${shell iso639 -m -n ${TATOEBA_TRANSLATABLE_WIKILANGS}}}
+
+print-wikilangs:
+	@echo ${TATOEBA_RELIABLE_TRG}
+
+#	@echo ${TATOEBA_RELIABLE_SRC}
+#	@echo ${TATOEBA_TRANSLATABLE_WIKILANGS}
+#	@echo ${TATOEBA_TRANSLATABLE_WIKILANGS3}
+
+
+#####################################################################################
+#####################################################################################
+#####################################################################################
+
+
+### OBSOLETE?? 
+## languages of released wikis +RELEASED_WIKIS := $(patsubst %.tar,%,${shell wget -qq -O - ${TATOEBA_GITRAW}/Wiki.md | \ + grep -o 'WikiShuffled/...\.tar' | cut -f2 -d'/'}) + +## reverse list +RELEASED_WIKIS_REV = ${shell (for d in ${RELEASED_WIKIS}; do echo $$d; done) | tac} + + + + + + +WIKI_DIR = ${PWD}/wiki +LANGID = ${SRC} +PART = aa +OUTPUT_DIR = ${LANGPAIR} +WIKI_TXT = ${WIKI_DIR}/${LANGID}/${WIKISOURCE}.${PART}.gz +WIKI_SRC = ${OUTPUT_DIR}/${WIKISOURCE}.${PART}_${MODELNAME}.${LANGPAIR}.${SRC}.gz +WIKI_PRE = ${OUTPUT_DIR}/${WIKISOURCE}.${PART}_${MODELNAME}.${LANGPAIR}.${SRC}.spm.gz +WIKI_TRG = ${OUTPUT_DIR}/${WIKISOURCE}.${PART}_${MODELNAME}.${LANGPAIR}.${TRG}.gz + +WIKI_LATEST_SRC = ${OUTPUT_DIR}/latest/${WIKISOURCE}.${PART}.${LANGPAIR}.${SRC}.gz +WIKI_LATEST_TRG = ${OUTPUT_DIR}/latest/${WIKISOURCE}.${PART}.${LANGPAIR}.${TRG}.gz +WIKI_LATEST_README = ${OUTPUT_DIR}/latest/README.md + +## all parts of this wiki +PARTS = ${sort ${patsubst ${WIKI_DIR}/${LANGID}/${WIKISOURCE}.%.gz,%,\ + ${wildcard ${WIKI_DIR}/${LANGID}/${WIKISOURCE}.??.gz}}} + +# WIKISOURCES = wiki wikibooks wikinews wikiquote wikisource wiktionary +WIKISOURCES = ${sort $(patsubst %.txt.gz,%,$(notdir ${wildcard ${WIKI_DIR}/${LANGID}/*.txt.gz})) \ + $(patsubst %.${PART}.gz,%,$(notdir ${wildcard ${WIKI_DIR}/${LANGID}/*.${PART}.gz}))} + + +## targets for all parts of the current wiki source + +ALLWIKIPARTS_TXT = ${patsubst %,${WIKI_DIR}/${LANGID}/${WIKISOURCE}.%.gz,${PARTS}} +ALLWIKIPARTS_SRC = ${patsubst %,${OUTPUT_DIR}/${WIKISOURCE}.%_${MODELNAME}.${LANGPAIR}.${SRC}.gz,${PARTS}} +ALLWIKIPARTS_PRE = ${patsubst %,${OUTPUT_DIR}/${WIKISOURCE}.%_${MODELNAME}.${LANGPAIR}.${SRC}.spm.gz,${PARTS}} +ALLWIKIPARTS_TRG = ${patsubst %,${OUTPUT_DIR}/${WIKISOURCE}.%_${MODELNAME}.${LANGPAIR}.${TRG}.gz,${PARTS}} + +ALLWIKIPARTS_LATEST_SRC = ${patsubst %,${OUTPUT_DIR}/latest/${WIKISOURCE}.%.${LANGPAIR}.${SRC}.gz,${PARTS}} +ALLWIKIPARTS_LATEST_TRG = ${patsubst 
%,${OUTPUT_DIR}/latest/${WIKISOURCE}.%.${LANGPAIR}.${TRG}.gz,${PARTS}} + + +## all wiki sources for the selected part + +ALLWIKIS_TXT = ${patsubst %,${WIKI_DIR}/${LANGID}/%.${PART}.gz,${WIKISOURCES}} +ALLWIKIS_SRC = ${patsubst %,${OUTPUT_DIR}/%.${PART}_${MODELNAME}.${LANGPAIR}.${SRC}.gz,${WIKISOURCES}} +ALLWIKIS_PRE = ${patsubst %,${OUTPUT_DIR}/%.${PART}_${MODELNAME}.${LANGPAIR}.${SRC}.spm.gz,${WIKISOURCES}} +ALLWIKIS_TRG = ${patsubst %,${OUTPUT_DIR}/%.${PART}_${MODELNAME}.${LANGPAIR}.${TRG}.gz,${WIKISOURCES}} + + + +## don't delete translated text if the process crashes +.PRECIOUS: ${WIKI_TRG} + + +ifdef LOCAL_SCRATCH + TMPDIR = ${LOCAL_SCRATCH} +endif + +ifeq (${shell hostname --domain 2>/dev/null},bullx) + LOAD_MODULES = module use -a /projappl/nlpl/software/modules/etc/ && \ + module load nlpl-udpipe nlpl-opus && +endif + + +.PHONY: all +all: translate + +all-jobs: download + ${MAKE} prepare-allwikis + ${MAKE} translate-all-jobs + +# all2eng: +# for w in ${filter-out eng,${RELEASED_WIKIS}}; do \ +# make EMAIL= HPC_CORES=128 HPC_MEM=160g HPC_TIME=24:00 SRC=$$w TRG=eng all-jobs; \ +# done + + +## do only the ones that we do not have already! + +new2trg: + for s in ${TATOEBA_TRANSLATABLE_WIKILANGS}; do \ + if [ ! -d $$s-eng ]; then \ + ${MAKE} EMAIL= HPC_CORES=128 HPC_MEM=160g HPC_TIME=24:00 SRC=$$s TRG=${TRG} all-jobs; \ + fi \ + done + +all2eng: + ${MAKE} SRC=fin TRG=eng all2trg + +all2trg: + for s in ${TATOEBA_TRANSLATABLE_WIKILANGS}; do \ + ${MAKE} EMAIL= HPC_CORES=128 HPC_MEM=160g HPC_TIME=24:00 SRC=$$s TRG=${TRG} all-jobs; \ + done + + +## translate English to all reliable target languages +eng2all: + ${MAKE} SRC=eng TRG=fin src2all + + +## translate current source language to all reliable target languages +src2all: + for t in ${TATOEBA_RELIABLE_TRG}; do \ + if [ ! 
-e ${SRC}-$$t/latest/${WIKISOURCE}.${PART}.${SRC}-$$t.$$t.gz ]; then \ + ${MAKE} EMAIL= HPC_CORES=128 HPC_MEM=160g HPC_TIME=24:00 SRC=${SRC} TRG=$$t prepare; \ + ${MAKE} EMAIL= HPC_CORES=128 HPC_MEM=160g HPC_TIME=24:00 SRC=${SRC} TRG=$$t translate.${SUBMIT_PREFIX}; \ + fi \ + done + + + + +RELEASED_BT_ALL := ${shell wget -qq -O - ${TATOEBA_RELEASED_BT}} +RELEASED_BT := ${shell wget -qq -O - ${TATOEBA_RELEASED_BT} | grep '^${LANGPAIR}/'} + +fetch-bt: + for d in ${RELEASED_BT}; do \ + echo "fetch $$d"; \ + mkdir -p `dirname $$d`; \ + wget -qq -O $$d https://object.pouta.csc.fi/Tatoeba-MT-bt/$$d; \ + done + +fetch-all-bt: + for d in ${RELEASED_BT_ALL}; do \ + echo "fetch $$d"; \ + mkdir -p `dirname $$d`; \ + wget -qq -O $$d https://object.pouta.csc.fi/Tatoeba-MT-bt/$$d; \ + done + + +#--------------------------------------------------------------- +# release data +#--------------------------------------------------------------- + +release-all: upload-all + ${MAKE} released-data.txt released-data-size.txt + +.PHONY: upload release +release upload: ${WIKI_LATEST_README} + swift upload ${BT_CONTAINER} --changed --skip-identical ${LANGPAIR}/latest + ${MAKE} released-data.txt + swift post ${BT_CONTAINER} --read-acl ".r:*" + +.PHONY: upload-all +upload-all: + for d in `find . -maxdepth 1 -type d -name '*-*' -printf "%f "`; do \ + s=`echo $$d | cut -f1 -d'-'`; \ + t=`echo $$d | cut -f2 -d'-'`; \ + make SRC=$$s TRG=$$t ${@:-all=}; \ + done + +released-data.txt: . + swift list ${BT_CONTAINER} | grep -v README.md | grep -v '.txt' > $@ + swift upload ${BT_CONTAINER} $@ + +released-data-size.txt: . 
+ ${MAKE} check-latest-all | grep '^[0-9]' > $@ + cat $@ | awk '{ sum += $$1 } END { print sum }' > $@.tmp + cat $@.tmp >> $@ + rm -f cat $@.tmp + swift upload ${BT_CONTAINER} released-data-size.txt + +# download released data + +.PHONY: download +download: ${WIKI_DIR}/${SRC} + + +#--------------------------------------------------------------- +# store / fetch translations +# (this is for storing work files and not for releasing data!) +#--------------------------------------------------------------- + +.PHONY: store +store: + a-put -b ${BT_WORK_CONTAINER} --nc --follow-links --override ${LANGPAIR} + +.PHONY: store-all +store-all: + for d in `find . -maxdepth 1 -type d -name '*-*' -printf "%f "`; do \ + s=`echo $$d | cut -f1 -d'-'`; \ + t=`echo $$d | cut -f2 -d'-'`; \ + make SRC=$$s TRG=$$t ${@:-all=}; \ + done + +.PHONY: retrieve fetch +retrieve fetch: + cd ${WORK_DESTDIR} && a-get ${WORK_CONTAINER}/${LANGPAIR}.tar + + + + + + +.PHONY: prepare +prepare: ${LANGPAIR}/${MODELNAME}/decoder.yml ${WIKI_TXT} + +.PHONY: prepare-allwikis +prepare-allwikis: ${LANGPAIR}/${MODELNAME}/decoder.yml ${ALLWIKIS_TXT} + +.PHONY: translate +translate: ${WIKI_LATEST_README} ${WIKI_LATEST_TRG} + ${MAKE} ${WIKI_LATEST_SRC} + +## translate all parts +.PHONY: translate-all-parts +translate-all-parts: ${ALLWIKIPARTS_LATEST_TRG} + ${MAKE} ${ALLWIKIPARTS_LATEST_SRC} + +## translate all wikis and all parts +.PHONY: translate-all +translate-all: + for s in ${WIKISOURCES}; do \ + ${MAKE} translate-allparts; \ + done + +## create jobs for translating all parts +## (only start the job if the file does not exist yet) +.PHONY: translate-all-parts-jobs +translate-all-parts-jobs: ${LANGPAIR}/${MODELNAME}/decoder.yml + for p in ${PARTS}; do \ + if [ ! 
-e ${OUTPUT_DIR}/${WIKISOURCE}.$${p}_${MODELNAME}.${LANGPAIR}.${TRG}.gz ]; then \ + rm -f translate.${SUBMIT_PREFIX}; \ + ${MAKE} PART=$$p translate.${SUBMIT_PREFIX}; \ + fi \ + done + +## create jobs for translating all parts of all wikis +.PHONY: translate-all-jobs +translate-all-jobs: ${LANGPAIR}/${MODELNAME}/decoder.yml + for s in ${WIKISOURCES}; do \ + ${MAKE} WIKISOURCE=$$s translate-all-parts-jobs; \ + done + + + + + + +.PHONY: print-modelinfo +print-modelinfo: + @echo ${MODELNAME} + @echo ${MODELZIP} + @echo ${MODELINFO} + @echo "multi-target model: ${MULTI_TARGET_MODEL}" + @echo "target language label: ${TARGET_LANG_LABEL}" + + + + + +## fetch the latest model +## ---> TODO: should we fetch from ObjectStorage instead? + +${LANGPAIR}/${MODELNAME}/decoder.yml: +ifneq (${MODELZIP},) + mkdir -p ${dir $@} + wget -O ${dir $@}/model.zip ${MODELZIP} + cd ${dir $@} && unzip model.zip + rm -f ${dir $@}/model.zip + mv ${dir $@}/preprocess.sh ${dir $@}/preprocess-old.sh + sed 's#perl -C -pe.*$$#perl -C -pe "s/(?!\\n)\\p{C}/ /g;" |#' \ + < ${dir $@}/preprocess-old.sh > ${dir $@}/preprocess.sh + chmod +x ${dir $@}/preprocess.sh +endif + + +## pre-process data + +ifeq (${MULTI_TARGET_MODEL},1) + PREPROCESS_ARGS = ${SRC} ${TRG} ${LANGPAIR}/${MODELNAME}/source.spm +else + PREPROCESS_ARGS = ${SRC} ${LANGPAIR}/${MODELNAME}/source.spm +endif + + + + +${WIKI_DIR}/${SRC}/%.${PART}.gz: | ${WIKI_DIR}/${SRC}/.done + ${GZCAT} ${@:.${PART}.gz=.txt.gz} |\ + split -l ${SPLIT_SIZE} - ${patsubst %${PART}.gz,%,$@} + ${GZIP} -f ${patsubst %${PART}.gz,%,$@}?? + rm -f ${@:.${PART}.gz=.txt.gz} + +${WIKI_DIR}/${SRC}/%.txt.gz: ${WIKI_DIR}/${SRC}/.done + echo "done!" 
+ + +## NEW: get proper released WIKI data and extract the languages +## --> multiple languages can be included in one release (like nno in nor) +## --> shuffle the data as well + +# fetch +${WIKI_DIR}/${SRC}/data: + mkdir -p ${dir $@} + wget -O $@.tar ${TATOEBA_STORAGE}/${shell iso639 -m -n ${SRC}}.tar + tar -C ${dir $@} -xf $@.tar + rm -f $@.tar + +# de-duplicate and shuffle +${WIKI_DIR}/${SRC}/${WIKISOURCE}.txt.gz: + ${MAKE} ${WIKI_DIR}/${SRC}/data + for f in `find ${dir $@} -name '*.id.gz'`; do \ + t=`echo $$f | sed 's/\.id\.gz/.txt.gz/'`; \ + l=`echo ${SRC} | sed 's/cmn/zho/;s/nob/nor.*/'`; \ + paste <(${GZIP} -cd $$f) <(${GZIP} -cd $$t) |\ + grep "^$$l " | cut -f2 | grep . | \ + ${UNIQ} | ${SHUFFLE} | ${GZIP} -c > ${dir $@}`basename $$t`; \ + done + rm -fr ${WIKI_DIR}/${SRC}/data + +# remove empty files +${WIKI_DIR}/${SRC}/.done: + mkdir -p ${dir $@} + ${MAKE} ${WIKI_DIR}/${SRC}/${WIKISOURCE}.txt.gz + for f in `find ${dir $@} -name '*.txt.gz'`; do \ + if [ ! `${GZIP} -cd $$f | head | wc -l` -gt 0 ]; then \ + rm -f $$f; \ + fi \ + done + touch $@ + + + + +## OLD: retrieve the old shuffled wiki release +## + +# ${WIKI_DIR}/${SRC}: +# mkdir -p $@ +# wget -O $@.tar ${TATOEBA_WIKI_STORAGE}/${SRC}.tar +# tar -C ${dir $@} -xf $@.tar +# if [ -d ${WIKI_DIR}/data/${SRC} ]; then \ +# mv ${WIKI_DIR}/data/${SRC}/*.txt.gz $@/;\ +# rm -f ${WIKI_DIR}/data/${SRC}/*;\ +# rmdir ${WIKI_DIR}/data/${SRC};\ +# rmdir ${WIKI_DIR}/data;\ +# fi +# if [ -e ${WIKI_DIR}/${SRC}/wiki.${SRC}.aa.gz ]; then \ +# for f in `ls ${WIKI_DIR}/${SRC}/*.${SRC}.*`; do \ +# mv $$f `echo $$f | sed 's/\.${SRC}\././'`; \ +# done \ +# fi +# rm -f $@.tar + + + +${OUTPUT_DIR}/%.${PART}_${MODELNAME}.${LANGPAIR}.${SRC}.spm.gz: ${WIKI_DIR}/${SRC}/%.${PART}.gz +ifneq (${MODELZIP},) + mkdir -p ${dir $@} + ${MAKE} ${LANGPAIR}/${MODELNAME}/decoder.yml + ${GZCAT} $< |\ + grep -v '[<>{}]' |\ + ${LANGPAIR}/${MODELNAME}/preprocess.sh ${PREPROCESS_ARGS} |\ + perl -e 'while (<>){next if 
(split(/\s+/)>${MAX_LENGTH});print;}' |\ + gzip -f > $@ +endif + + + +## merge SentencePiece segments in the source text +## (Why? because we filter out some data from the original wiki text, see above) + +${WIKI_SRC}: ${WIKI_PRE} +ifneq (${MODELZIP},) + mkdir -p ${dir $@} + ${GZCAT} $< |\ + sed 's/ //g;s/▁/ /g' | \ + sed 's/^ *//;s/ *$$//' |\ + sed 's/^>>[a-z]*<< //' |\ + gzip -c > $@ +endif + + + + +## overwrite the file with the latest translations +## --> this allows multiple translation iterations +## without duplicating the data we want to use in MT training + +${WIKI_LATEST_SRC}: ${WIKI_SRC} + mkdir -p ${dir $@} + cp $< $@ + +${WIKI_LATEST_TRG}: ${WIKI_TRG} + mkdir -p ${dir $@} + cp $< $@ + +${WIKI_LATEST_README}: ${LANGPAIR}/${MODELNAME}/README.md + mkdir -p ${dir $@} + cp $< $@ + + +## translate + +%.${LANGPAIR}.${TRG}.gz: %.${LANGPAIR}.${SRC}.spm.gz +ifneq (${MODELZIP},) + mkdir -p ${dir $@} + ${MAKE} ${LANGPAIR}/${MODELNAME}/decoder.yml + ${LOAD_ENV} && cd ${LANGPAIR}/${MODELNAME} && ${MARIAN_DECODER} \ + -i ${PWD}/$< \ + -c decoder.yml \ + -d ${MARIAN_GPUS} \ + --quiet-translation \ + ${MARIAN_DECODER_FLAGS} |\ + sed 's/ //g;s/▁/ /g' | sed 's/^ *//;s/ *$$//' |\ + gzip -c > ${PWD}/$@ +#ifneq (${LANGPAIR},) +#ifneq (${MODELNAME},) +# rm -fr ${LANGPAIR}/${MODELNAME} +#endif +#endif +endif + + +check-latest: + @if [ -d ${LANGPAIR}/latest ]; then \ + for S in `ls ${LANGPAIR}/latest/*.${SRC}.gz`; do \ + T=`echo $$S | sed 's/.${SRC}.gz/.${TRG}.gz/'`; \ + a=`${GZCAT} $$S | wc -l`; \ + b=`${GZCAT} $$T | wc -l`; \ + if [ $$a != $$b ]; then \ + echo "$$a != $$b $$S $$T"; \ + else \ + echo "$$a $$S $$T"; \ + fi \ + done \ + fi + +check-translated: + @for S in `ls ${LANGPAIR}/*.${SRC}.gz`; do \ + T=`echo $$S | sed 's/.${SRC}.gz/.${TRG}.gz/'`; \ + a=`${GZCAT} $$S | wc -l`; \ + b=`${GZCAT} $$T | wc -l`; \ + if [ $$a != $$b ]; then \ + echo "$$a != $$b $$S $$T"; \ + else \ + echo "$$a $$S $$T"; \ + fi \ + done + +check-length: + @echo "check ${LANGPAIR}" + @${MAKE} 
check-translated + @${MAKE} check-latest + + +remove-%-all check-%-all: + for d in `find . -maxdepth 1 -type d -name '*-*' -printf "%f "`; do \ + s=`echo $$d | cut -f1 -d'-'`; \ + t=`echo $$d | cut -f2 -d'-'`; \ + make SRC=$$s TRG=$$t ${@:-all=}; \ + done + + + +remove-incomplete: + ${MAKE} remove-incomplete-translated + ${MAKE} remove-incomplete-latest + +remove-incomplete-translated: + @echo "check ${LANGPAIR}" + @mkdir -p ${LANGPAIR}/incomplete + @for S in `ls ${LANGPAIR}/*.${SRC}.gz`; do \ + T=`echo $$S | sed 's/.${SRC}.gz/.${TRG}.gz/'`; \ + a=`${GZCAT} $$S | wc -l`; \ + b=`${GZCAT} $$T | wc -l`; \ + if [ $$a != $$b ]; then \ + echo "$$a != $$b $$S $$T"; \ + mv $$S ${LANGPAIR}/incomplete/; \ + mv $$T ${LANGPAIR}/incomplete/; \ + fi \ + done + + +remove-incomplete-latest: + @echo "check ${LANGPAIR}" + @mkdir -p ${LANGPAIR}/incomplete/latest + @if [ -d ${LANGPAIR}/latest ]; then \ + for S in `ls ${LANGPAIR}/latest/*.${SRC}.gz`; do \ + T=`echo $$S | sed 's/.${SRC}.gz/.${TRG}.gz/'`; \ + a=`${GZCAT} $$S | wc -l`; \ + b=`${GZCAT} $$T | wc -l`; \ + if [ $$a != $$b ]; then \ + echo "$$a != $$b $$S $$T"; \ + mv $$S ${LANGPAIR}/incomplete/latest/; \ + mv $$T ${LANGPAIR}/incomplete/latest/; \ + fi \ + done \ + fi + diff --git a/tatoeba/forward-translate/Makefile b/tatoeba/forward-translate/Makefile new file mode 100644 index 00000000..121cca61 --- /dev/null +++ b/tatoeba/forward-translate/Makefile @@ -0,0 +1,313 @@ +# +# forward translation to be used for +# knowledge distillation +# +# only works with sentencepiece models! +# +# TODO's +# +# - forward-translate monolingual data (re-use bt-data) +# - reconstruction filtering (score translation in opposite direction) +# (use weights? normalise-script from bergamot/students) +# - other kind of data filtering / selection? 
+# - create lexical shortlists (see bergamot) +# - finetune alphas in intgemm8 models (see bergamot) +# - benchmark distilled models +# + +PWD := ${shell pwd} +REPOHOME := ${PWD}/../../ + +include ${REPOHOME}lib/env.mk +include ${REPOHOME}lib/config.mk +include ${REPOHOME}lib/slurm.mk + + +SRC = fin +TRG = eng + + +## change decoder settings +## TODO: do we need this? + +MARIAN_BEAM_SIZE=1 +MARIAN_MINI_BATCH=100 +MARIAN_MAXI_BATCH=100 +MARIAN_MAX_LENGTH=200 +MARIAN_WORKSPACE=12000 + + +TATOEBA_VERSION ?= v2021-08-07 +TATOEBA_VERSION_NOHYPHEN ?= $(subst -,,${TATOEBA_VERSION}) + +TATOEBA_GITRAW = https://raw.githubusercontent.com/Helsinki-NLP/Tatoeba-Challenge/master +TATOEBA_RELEASED = ${TATOEBA_GITRAW}/models/released-model-results-all.txt +TATOEBA_RELEASED_BT = https://object.pouta.csc.fi/Tatoeba-MT-bt/released-data.txt +TATOEBA_MODEL_STORAGE = https://object.pouta.csc.fi/Tatoeba-MT-models + +## container for storing backtranslations +BT_CONTAINER = Tatoeba-MT-bt +BT_CWORK_ONTAINER = project-Tatoeba-MT-bt + +## split size in nr-of-lines +## default part to be selected = aa +SPLIT_SIZE ?= 1000000 + +## maximum input length (number sentence piece segments) +## maximum number of sentences to be translated (top N lines) +MAX_LENGTH ?= 200 +MAX_SENTENCES ?= ${SPLIT_SIZE} + +SORTLANGS = $(sort ${SRC} ${TRG}) +LANGPAIR = ${SRC}-${TRG} +SORTED_LANGPAIR = ${firstword ${SORTLANGS}}-${lastword ${SORTLANGS}} + +PWD := $(shell pwd) + + + +# MODELZIP = https://object.pouta.csc.fi/Tatoeba-Challenge/ang.tar +MODELZIP := ${shell wget -qq -O - ${TATOEBA_RELEASED} | grep '^${LANGPAIR}' | head -1 | cut -f4} +MODELINFO = ${patsubst ${TATOEBA_MODEL_STORAGE}/%.zip,${TATOEBA_GITRAW}/models/%.yml,${MODELZIP}} +MODELNAME = ${patsubst %.zip,%,${notdir ${MODELZIP}}} + +MULTI_TARGET_MODEL := ${shell wget -qq -O - ${MODELINFO} | grep 'use-target-labels' | wc -l} +ifneq (${MULTI_TARGET_MODEL},0) + TARGET_LANG_LABEL := ${shell wget -qq -O - ${MODELINFO} | grep -o '>>${TRG}.*<<'} +endif + 
+RELEASED_BITEXTS := $(patsubst %.tar,%,${shell wget -qq -O - ${TATOEBA_GITRAW}/Wiki.md | \ + grep -o 'WikiShuffled/...\.tar' | cut -f2 -d'/'}) + +RELEASED_BITEXTS_REV = ${shell (for d in ${RELEASED_BITEXTS}; do echo $$d; done) | tac} + + +PART ?= aa +OUTPUT_DIR ?= ${LANGPAIR} + +BITEXT_DATADIR = ${PWD}/../work/data/simple +MODEL_WORKDIR = ${PWD}/../work/${LANGPAIR} +BITEXT_SRCRAW = ${BITEXT_DATADIR}/Tatoeba-train-${TATOEBA_VERSION}.${SORTED_LANGPAIR}.clean.${SRC}.gz +BITEXT_SRCPRE = ${wildcard ${MODEL_WORKDIR}/train/opusTC${TATOEBA_VERSION_NOHYPHEN}.src.clean.spm*.gz} + +BITEXT_BASE = ${OUTPUT_DIR}/Tatoeba-train.${MODELNAME}.${LANGPAIR} +BITEXT_SRC = ${BITEXT_BASE}.${SRC}.${PART}.gz +BITEXT_PRE = ${BITEXT_BASE}.${SRC}.spm.${PART}.gz +BITEXT_TRG = ${BITEXT_BASE}.${TRG}.${PART}.gz + +BITEXT_LATEST_SRC = ${OUTPUT_DIR}/latest/Tatoeba-train.${PART}.${LANGPAIR}.${SRC}.gz +BITEXT_LATEST_TRG = ${OUTPUT_DIR}/latest/Tatoeba-train.${PART}.${LANGPAIR}.${TRG}.gz +BITEXT_LATEST_README = ${OUTPUT_DIR}/latest/README.md + + +## all parts of the bitext +PARTS = $(subst .,,${suffix ${basename ${wildcard ${BITEXT_PRE:${PART}.gz=}??.gz}}}) +ALL_BITEXT_LATEST_SRC = ${patsubst %,${OUTPUT_DIR}/latest/Tatoeba-train.%.${LANGPAIR}.${SRC}.gz,${PARTS}} +ALL_BITEXT_LATEST_TRG = ${patsubst %,${OUTPUT_DIR}/latest/Tatoeba-train.%.${LANGPAIR}.${TRG}.gz,${PARTS}} + + +## don't delete translated text even if the process crashes +.PRECIOUS: ${BITEXT_BASE}.${TRG}.%.gz + +.PHONY: all +all: translate + +.PHONY: prepare +prepare: ${LANGPAIR}/${MODELNAME}/decoder.yml ${BITEXT_PRE} + +.PHONY: translate +translate: ${BITEXT_LATEST_README} ${BITEXT_LATEST_TRG} + ${MAKE} ${BITEXT_LATEST_SRC} + +## translate all parts +.PHONY: translate-all-parts +translate-all-parts: ${ALL_BITEXT_LATEST_TRG} + ${MAKE} source-all-parts + +.PHONY: source-all-parts +source-all-parts: ${ALL_BITEXT_LATEST_SRC} + + +.PHONY: print-modelinfo +print-modelinfo: + @echo ${MODELNAME} + @echo ${MODELZIP} + @echo ${MODELINFO} + @echo 
"multi-target model: ${MULTI_TARGET_MODEL}" + @echo "target language label: ${TARGET_LANG_LABEL}" + +## fetch the latest model + +${LANGPAIR}/${MODELNAME}/decoder.yml: +ifneq (${MODELZIP},) + mkdir -p ${dir $@} + wget -O ${dir $@}/model.zip ${MODELZIP} + cd ${dir $@} && unzip model.zip + rm -f ${dir $@}/model.zip + mv ${dir $@}/preprocess.sh ${dir $@}/preprocess-old.sh + sed 's#perl -C -pe.*$$#perl -C -pe "s/(?!\\n)\\p{C}/ /g;" |#' \ + < ${dir $@}/preprocess-old.sh > ${dir $@}/preprocess.sh + chmod +x ${dir $@}/preprocess.sh +endif + + +## pre-process data + +ifeq (${MULTI_TARGET_MODEL},1) + PREPROCESS_ARGS = ${SRC} ${TRG} ${LANGPAIR}/${MODELNAME}/source.spm +else + PREPROCESS_ARGS = ${SRC} ${LANGPAIR}/${MODELNAME}/source.spm +endif + + +ifeq (${BITEXT_SRCPRE},) + +${BITEXT_SRCRAW}: + ${MAKE} -C .. SRCLANGS=${SRC} TRGLANGS=${TRG} clean-data-tatoeba + +else + +${BITEXT_SRCRAW}: ${BITEXT_SRCPRE} + sed 's/ //g;s/▁/ /g' < $< | sed 's/^ *//;s/ *$$//' | ${GZIP} -f > $@ + +endif + + +${BITEXT_PRE}: ${BITEXT_SRCRAW} +ifneq (${MODELZIP},) + mkdir -p ${dir $@} + ${MAKE} ${LANGPAIR}/${MODELNAME}/decoder.yml + ${GZCAT} $< |\ + grep -v '[<>{}]' |\ + ${LANGPAIR}/${MODELNAME}/preprocess.sh ${PREPROCESS_ARGS} |\ + perl -e 'while (<>){next if (split(/\s+/)>${MAX_LENGTH});print;}' |\ + split -l ${SPLIT_SIZE} - ${patsubst %${PART}.gz,%,$@} + ${GZIP} -f ${patsubst %${PART}.gz,%,$@}?? +endif + + +## merge SentencePiece segments in the source text +## (Why? 
because we filter out some data from the original wiki text, see above) + +${BITEXT_BASE}.${SRC}.%.gz: ${BITEXT_BASE}.${SRC}.spm.%.gz + if [ -e ${patsubst ${BITEXT_BASE}.${SRC}.%.gz,${BITEXT_BASE}.${TRG}.%.gz,$@} ]; then \ + mkdir -p ${dir $@}; \ + ${GZCAT} $< |\ + sed 's/ //g;s/▁/ /g' | \ + sed 's/^ *//;s/ *$$//' |\ + sed 's/^>>[a-z]*<< //' |\ + gzip -c > $@; \ + fi + + +## overwrite the file with the latest translations +## --> this allows multiple translation iterations +## without duplicating the data we want to use in MT training + +${OUTPUT_DIR}/latest/Tatoeba-train.%.${LANGPAIR}.${SRC}.gz: ${BITEXT_BASE}.${SRC}.%.gz + mkdir -p ${dir $@} + cp $< $@ + +${OUTPUT_DIR}/latest/Tatoeba-train.%.${LANGPAIR}.${TRG}.gz: ${BITEXT_BASE}.${TRG}.%.gz + mkdir -p ${dir $@} + cp $< $@ + +${BITEXT_LATEST_README}: ${LANGPAIR}/${MODELNAME}/README.md + mkdir -p ${dir $@} + cp $< $@ + + +## translate + +${BITEXT_BASE}.${TRG}.%.gz: ${BITEXT_BASE}.${SRC}.spm.%.gz +ifneq (${MODELZIP},) + mkdir -p ${dir $@} + ${MAKE} ${LANGPAIR}/${MODELNAME}/decoder.yml + ${LOAD_ENV} && cd ${LANGPAIR}/${MODELNAME} && \ + ${MARIAN_DECODER} \ + -c decoder.yml \ + -i ${PWD}/$< \ + -d ${MARIAN_GPUS} \ + ${MARIAN_DECODER_FLAGS} |\ + sed 's/ //g;s/▁/ /g' | sed 's/^ *//;s/ *$$//' |\ + gzip -c > ${PWD}/$@ +endif + + + +check-latest: + @if [ -d ${LANGPAIR}/latest ]; then \ + for S in `ls ${LANGPAIR}/latest/*.${SRC}.gz`; do \ + T=`echo $$S | sed 's/.${SRC}.gz/.${TRG}.gz/'`; \ + a=`${GZCAT} $$S | wc -l`; \ + b=`${GZCAT} $$T | wc -l`; \ + if [ $$a != $$b ]; then \ + echo "$$a != $$b $$S $$T"; \ + else \ + echo "$$a $$S $$T"; \ + fi \ + done \ + fi + +check-translated: + @for S in `ls ${LANGPAIR}/*.${SRC}.spm.gz`; do \ + T=`echo $$S | sed 's/.${SRC}.gz/.${TRG}.gz/'`; \ + a=`${GZCAT} $$S | wc -l`; \ + b=`${GZCAT} $$T | wc -l`; \ + if [ $$a != $$b ]; then \ + echo "$$a != $$b $$S $$T"; \ + else \ + echo "$$a $$S $$T"; \ + fi \ + done + +check-length: + @echo "check ${LANGPAIR}" + @${MAKE} check-translated + @${MAKE} 
check-latest + + +remove-%-all check-%-all: + for d in `find . -maxdepth 1 -type d -name '*-*' -printf "%f "`; do \ + s=`echo $$d | cut -f1 -d'-'`; \ + t=`echo $$d | cut -f2 -d'-'`; \ + make SRC=$$s TRG=$$t ${@:-all=}; \ + done + + + +remove-incomplete: + ${MAKE} remove-incomplete-translated + ${MAKE} remove-incomplete-latest + +remove-incomplete-translated: + @echo "check ${LANGPAIR}" + @mkdir -p ${LANGPAIR}/incomplete + @for S in `ls ${LANGPAIR}/*.${SRC}.gz`; do \ + T=`echo $$S | sed 's/.${SRC}.gz/.${TRG}.gz/'`; \ + a=`${GZCAT} $$S | wc -l`; \ + b=`${GZCAT} $$T | wc -l`; \ + if [ $$a != $$b ]; then \ + echo "$$a != $$b $$S $$T"; \ + mv $$S ${LANGPAIR}/incomplete/; \ + mv $$T ${LANGPAIR}/incomplete/; \ + fi \ + done + + +remove-incomplete-latest: + @echo "check ${LANGPAIR}" + @mkdir -p ${LANGPAIR}/incomplete/latest + @if [ -d ${LANGPAIR}/latest ]; then \ + for S in `ls ${LANGPAIR}/latest/*.${SRC}.gz`; do \ + T=`echo $$S | sed 's/.${SRC}.gz/.${TRG}.gz/'`; \ + a=`${GZCAT} $$S | wc -l`; \ + b=`${GZCAT} $$T | wc -l`; \ + if [ $$a != $$b ]; then \ + echo "$$a != $$b $$S $$T"; \ + mv $$S ${LANGPAIR}/incomplete/latest/; \ + mv $$T ${LANGPAIR}/incomplete/latest/; \ + fi \ + done \ + fi + diff --git a/tatoeba/langids-train-only-v2020-07-28.txt b/tatoeba/langids-train-only-v2020-07-28.txt new file mode 100644 index 00000000..e69de29b diff --git a/tatoeba/langids-train-only-v2021-08-07.txt b/tatoeba/langids-train-only-v2021-08-07.txt new file mode 100644 index 00000000..376438a0 --- /dev/null +++ b/tatoeba/langids-train-only-v2021-08-07.txt @@ -0,0 +1,1314 @@ +abk_Latn +aeb_Latn +afr_Brai +afr_Cyrl +afr_Kana +amh_Arab +amh_Cyrl +amh_Latn +ang +ang_Arab +ang_Beng +ang_Cyrl +ang_Deva +ang_Grek +ang_Gujr +ang_Guru +ang_Hang +ang_Hani +ang_Mlym +ang_Orya +ang_Taml +ara_Armn +ara_Beng +ara_Cyrl +ara_Deva +ara_Ethi +ara_Grek +ara_Hang +ara_Hani +ara_Hebr +ara_Hira +ara_Kana +ara_Latn_TN +ara_SY +ara_Syrc +ara_Thai +ara_TN +ara_Zinh +arg_Arab +arg_Cyrl +arg_Grek +arg_Hani +arg_Hebr 
+arq_Cyrl +ary_Latn +arz_Cyrl +arz_Grek +arz_Hani +arz_Hebr +arz_Latn +arz_Rohg +arz_Syrc +arz_Thaa +asm_Arab +asm_Cyrl +asm_Deva +asm_Gujr +asm_Tibt +ast_Arab +ast_Armn +ast_Beng +ast_Bopo +ast_Cyrl +ast_Deva +ast_Geor +ast_Grek +ast_Hang +ast_Hani +ast_Hebr +ast_Hira +ast_Kana +ast_Khmr +ast_Syrc +ast_Thaa +ast_Thai +ast_Tibt +ast_Yiii +ast_Zinh +azb +azb_Arab +azb_Cyrl +azb_Latn +aze +aze_Arab +aze_Arab_IR +aze_Hebr +aze_Latn_IR +bak_Arab +bak_Hebr +bak_Latn +bak_Mand +bak_Rohg +bak_Syrc +bak_Thaa +bal_Latn +bam_Telu +bar_Arab +bar_Cyrl +bar_Grek +bar_Hani +bar_Hebr +bar_Kana +bar_Rohg +bel_Arab +bel_Brai +bel_Deva +bel_Hani +bel_Hebr +ben_Arab +ben_Arab_IN +ben_Brai +ben_Cyrl +ben_Cyrl_IN +ben_Deva +ben_Deva_IN +ben_Ethi +ben_Gong +ben_Gran +ben_Gujr +ben_Gujr_IN +ben_Guru +ben_Hani +ben_Hebr +ben_Hira +ben_IN +ben_Kana +ben_Knda +ben_Latn +ben_Latn_IN +ben_Mlym +ben_Orya +ben_Phlp +ben_Rohg +ben_Sinh +ben_Syrc +ben_Takr +ben_Thai +ben_Tibt +ben_Zinh +ber_Kana +bho_Latn +bod_Latn +bos +bos_Adlm +bos_Arab +bos_Cyrl +bos_Deva +bos_Hani +bos_Hebr +bos_Hira +bos_Mani +bos_Phlp +bos_Syrc +bua_Latn +bul_Adlm +bul_Arab +bul_Deva +bul_Grek +bul_Hani +bul_Hebr +bul_Hira +bul_Kana +bul_Mand +bul_Mani +bul_Phlp +bxr_Cyrl +bxr_Latn +cat_Arab +cat_Brai +cat_Cyrl +cat_Grek +cat_Hani +cat_Hebr +cdo +cdo_Hans +cdo_Hant +cdo_Latn +ces_Arab +ces_Brai +ces_Cyrl +ces_Geor +ces_Grek +ces_Hang +ces_Hani +ces_Hebr +ces_Hira +ces_Kana +ces_Zinh +cha_Hang +che_Latn +chv_Latn +ckb_Latn +cmn-hans_Adlm_CN +cmn-hans_Arab_CN +cmn-hans_Bopo_CN +cmn-hans_CN +cmn-hans_Cyrl_CN +cmn-hans_Deva_CN +cmn-hans_Geor_CN +cmn-hans_Grek_CN +cmn-hans_Hani_CN +cmn-hans_Hans_CN +cmn-hans_Hant_CN +cmn-hans_Hebr_CN +cmn-hans_Hira_CN +cmn-hans_Kana_CN +cmn-hans_Latn_CN +cmn-hans_Mand_CN +cmn-hans_Mani_CN +cmn-hans_Phlp_CN +cmn-hans_Rohg_CN +cmn-hans_Sogd_CN +cmn-hans_Syrc_CN +cmn-hans_Thaa_CN +cmn-hans_Thai_CN +cmn-hant_Bopo_TW +cmn-hant_Cyrl_TW +cmn-hant_Geor_TW +cmn-hant_Grek_TW +cmn-hant_Hani_TW 
+cmn-hant_Hans_TW +cmn-hant_Hant_TW +cmn-hant_Hebr_TW +cmn-hant_Hira_TW +cmn-hant_Kana_TW +cmn-hant_Latn_TW +cmn-hant_Thai_TW +cmn-hant_TW +cnr +cnr_Latn +csb +cym_Cyrl +cym_Zinh +dan_Arab +dan_Cyrl +dan_Deva +dan_Grek +dan_Hani +dan_Hebr +dan_Hira +dan_Kana +deu_Arab +deu_AT +deu_Beng +deu_Brai +deu_CH +deu_Cyrl +deu_Deva +deu_Geor +deu_Grek +deu_Hang +deu_Hani +deu_Hebr +deu_Hira +deu_Kana +deu_Thai +deu_Zinh +div_Arab +div_Cyrl +div_Deva +div_Latn +dty +dty_Latn +ell_Adlm +ell_Arab +ell_Beng +ell_Cyrl +ell_Deva +ell_Ethi +ell_Hani +ell_Hebr +ell_Hira +ell_Kana +ell_Mand +ell_Mani +ell_Phlp +ell_Sogd +ell_Syrc +ell_Thai +eng_Arab +eng_Armn +eng_AU +eng_Beng +eng_Brai +eng_CA +eng_Deva +eng_Deva_CA +eng_Deva_GB +eng_GB +eng_Geor +eng_Grek +eng_Guru +eng_Hang +eng_Hani +eng_Hani_GB +eng_Hebr +eng_Hira +eng_Java +eng_Kana +eng_Khmr +eng_Knda +eng_Laoo +eng_Mand +eng_Mani +eng_Mlym +eng_NZ +eng_Phlp +eng_Rohg +eng_Sinh +eng_Syrc +eng_Taml +eng_Telu +eng_Thai +eng_Tibt +eng_Yiii +eng_ZA +eng_Zinh +epo_Arab +epo_Cyrl +epo_Deva +epo_Grek +epo_Hebr +epo_Syrc +est_Brai +est_Cyrl +est_Hang +eus_Cyrl +fas +fas_Beng +fas_Cyrl +fas_Deva +fas_Grek +fas_Hani +fas_Hebr +fas_Hira +fas_Kana +fas_Latn +fas_Zinh +fin_Arab +fin_Brai +fin_Cyrl +fin_Hang +fin_Hani +fin_Hebr +fin_Hira +fin_Kana +fin_Khmr +fin_Thai +fra_Arab +fra_Beng +fra_Brai +fra_CA +fra_Cyrl +fra_Deva +fra_Geor +fra_Grek +fra_Hang +fra_Hani +fra_Hebr +fra_Hira +fra_Kana +fra_Khmr +fra_Mlym +fra_Thai +fra_Zinh +ful +ful_Arab +ful_Cyrl +ful_Grek +ful_Hang +ful_Hani +ful_Hebr +ful_Rohg +ful_Syrc +fur +gan_Latn +gcf +glg_Cyrl +glg_Grek +glg_Hira +gom_Knda +gom_Latn +got +got_Latn +gug +guj_Arab +guj_Cyrl +guj_Latn +hak_Latn +hau_Arab +hau_Cyrl +hau_Hebr +hau_Zinh +hbs +hbs_Arab +hbs_Armn +hbs_Beng +hbs_Copt +hbs_Cyrl +hbs_Deva +hbs_Geor +hbs_Grek +hbs_Gujr +hbs_Hang +hbs_Hani +hbs_Hebr +hbs_Hira +hbs_Kana +hbs_Khmr +hbs_Latn +hbs_Mlym +hbs_Nkoo +hbs_Sogd +hbs_Syrc +hbs_Thai +heb_Adlm +heb_Arab +heb_Armn +heb_Beng 
+heb_Cans +heb_Cyrl +heb_Deva +heb_Ethi +heb_Geor +heb_Grek +heb_Guru +heb_Hang +heb_Hani +heb_Hira +heb_Kana +heb_Knda +heb_Mand +heb_Mani +heb_Mlym +heb_Mymr +heb_Orya +heb_Phlp +heb_Rohg +heb_Samr +heb_Sinh +heb_Sogd +heb_Syrc +heb_Taml +heb_Thai +heb_Tibt +heb_Yiii +heb_Zinh +hif +hin_Arab +hin_Beng +hin_Cyrl +hin_Grek +hin_Hani +hin_Hebr +hin_Hira +hin_Kana +hin_Mahj +hin_Tibt +hmn +hrv_Arab +hrv_Brai +hrv_Cyrl +hrv_Hani +hrv_Hira +hrv_Kana +hun_Arab +hun_Brai +hun_Cyrl +hun_Deva +hun_Hang +hun_Hani +hun_Hebr +hun_Hira +hun_Kana +hun_Zinh +hye_Arab +hye_Cyrl +hye_Hani +hye_Thaa +ido_Adlm +ido_Arab +ido_Cyrl +ido_Deva +ido_Grek +ido_Hang +ido_Hani +ido_Hebr +ido_Hira +ido_Mand +ido_Phlp +ido_Rohg +ido_Syrc +iku_Cans +iku_Latn +ilo_Arab +ilo_Cyrl +ilo_Deva +ilo_Hang +ilo_Hani +ilo_Taml +ind_Arab +ind_Brai +ind_Cyrl +ind_Deva +ind_Grek +ind_Hang +ind_Hani +ind_Hebr +ind_Hira +ind_Kana +ind_Yiii +ind_Zinh +isl_Cyrl +isl_Geor +isl_Hani +ita_Arab +ita_Beng +ita_Brai +ita_Cyrl +ita_Deva +ita_Grek +ita_Hang +ita_Hani +ita_Hebr +ita_Kana +ita_Thai +ita_Zinh +jak_Latn +jav_Arab +jav_Armn +jav_Cyrl +jav_Deva +jav_Ethi +jav_Geor +jav_Grek +jav_Gujr +jav_Hang +jav_Hani +jav_Hebr +jav_Hira +jav_Kana +jav_Thai +jav_Zinh +jpn_Adlm +jpn_Arab +jpn_Armn +jpn_Beng +jpn_Brai +jpn_Cans +jpn_Cyrl +jpn_Deva +jpn_Dogr +jpn_Ethi +jpn_Geor +jpn_Glag +jpn_Gong +jpn_Gonm +jpn_Gran +jpn_Grek +jpn_Gujr +jpn_Guru +jpn_Hebr +jpn_Java +jpn_Khmr +jpn_Knda +jpn_Laoo +jpn_Mahj +jpn_Mand +jpn_Mani +jpn_Mlym +jpn_Mong +jpn_Mult +jpn_Mymr +jpn_Nand +jpn_Nkoo +jpn_Orya +jpn_Phlp +jpn_Rohg +jpn_Shrd +jpn_Sind +jpn_Sinh +jpn_Sogd +jpn_Sylo +jpn_Syrc +jpn_Takr +jpn_Taml +jpn_Telu +jpn_Thaa +jpn_Thai +jpn_Tibt +jpn_Tirh +jpn_Vaii +jpn_Zinh +kab_Grek +kam +kan_Cyrl +kan_Latn +kat_Arab +kat_Cyrl +kat_Deva +kat_Grek +kat_Hani +kat_Hebr +kat_Kana +kaz +kaz_Arab +kaz_Hani +kaz_Hebr +khm_Arab +khm_Cyrl +khm_Hani +khm_Kana +kir_Arab +kir_Deva +kir_Latn +kmr +kok_Latn +kom +kor_Adlm +kor_Arab +kor_Armn +kor_Beng 
+kor_Bopo +kor_Brai +kor_Cans +kor_Cher +kor_Copt +kor_Cyrl +kor_Deva +kor_Dogr +kor_Ethi +kor_Geor +kor_Gong +kor_Gonm +kor_Gran +kor_Grek +kor_Gujr +kor_Guru +kor_Hebr +kor_Hira +kor_Java +kor_Kana +kor_Khmr +kor_Khoj +kor_Knda +kor_Kthi +kor_Mahj +kor_Mand +kor_Mani +kor_Mlym +kor_Mong +kor_Mult +kor_Mymr +kor_Nand +kor_Orya +kor_Phlp +kor_Rohg +kor_Sind +kor_Sinh +kor_Sylo +kor_Syrc +kor_Takr +kor_Taml +kor_Telu +kor_Thaa +kor_Thai +kor_Tibt +kor_Tirh +kor_Yiii +kor_Zinh +kpv_Latn +kur_Cyrl +kur_Hebr +lao_Arab +lao_Cyrl +lao_Hani +lat_Arab +lat_Hebr +lav_Arab +lav_Cyrl +lav_Hang +lav_Hani +lav_Zinh +lit_Arab +lit_Brai +lit_Cyrl +lit_Geor +lit_Hang +lit_Hani +lit_Hebr +lit_Hira +lit_Thai +lkt_Hang +lld +lmo_Arab +lmo_Cyrl +lmo_Grek +ltz_Hani +lzh_Grek +mai_Latn +mal_Arab +mal_Beng +mal_Cyrl +mal_Deva +mal_Ethi +mal_Hani +mal_Hebr +mal_Hira +mal_Latn +mal_Syrc +mal_Zinh +mar_Arab +mar_Beng +mar_Brai +mar_Cyrl +mar_Hani +mar_Hebr +mar_Kana +mar_Latn +mar_Zinh +mhr_Latn +mkd_Arab +mkd_Deva +mkd_Grek +mkd_Hani +mkd_Hebr +mkd_Latn +mkd_Mani +mkd_Syrc +mkd_Thaa +mlg_Zinh +mlt_Grek +mni_Mtei +mnw_Latn +mol +mon_Arab +mon_Hang +mon_Hani +mon_Hebr +mon_Latn +mon_Mong +mon_Phag +mrj +mrj_Latn +msa_Arab +msa_Brai +msa_Cyrl +msa_Deva +msa_Grek +msa_Hang +msa_Hani +msa_Hebr +msa_Hira +msa_Latn +mus_Grek +mwl_Hani +mya_Arab +mya_Cakm +mya_Cyrl +mya_Ethi +mya_Hani +mya_Hebr +mya_Hira +mya_Latn +mya_Tale +myv_Latn +nan_Kana +nds_Arab +nds_Beng +nds_Cyrl +nds_Deva +nds_Grek +nds_Guru +nds_Hani +nds_Hebr +nds_Hira +nds_Kana +nds_Mlym +nds_NL +nds_Orya +nds_Taml +nep +nep_Arab +nep_Beng +nep_Cyrl +nep_Hang +nep_Hebr +nep_Kana +nep_Latn +nep_Zinh +ngu +nld_Arab +nld_Brai +nld_Cyrl +nld_Geor +nld_Grek +nld_Hang +nld_Hani +nld_Hebr +nld_Hira +nld_Kana +nld_Zinh +nno_Deva +nob_Arab +nob_Beng +nob_Cyrl +nob_Deva +nob_Geor +nob_Grek +nob_Guru +nob_Hani +nob_Hira +nob_Mlym +nob_Orya +nob_Taml +nor_Arab +nor_Cyrl +nor_Deva +nor_Geor +nor_Grek +nor_Hang +nor_Hani +nor_Hebr +nor_Hira 
+nor_Kana +nor_Latn +nor_Zinh +ori_Cyrl +ori_Ethi +ori_Latn +oss_Latn +pan +pan_Arab +pan_Beng +pan_Cyrl +pan_Latn +plt +pnb_Guru +pob +pob_Arab +pob_Cyrl +pob_Grek +pob_Hang +pob_Hani +pob_Hebr +pob_Hira +pol_Arab +pol_Brai +pol_Cyrl +pol_Deva +pol_Geor +pol_Grek +pol_Hang +pol_Hani +pol_Hebr +pol_Hira +pol_Kana +pol_Khmr +pol_Syrc +pol_Zinh +por_Arab +por_Brai +por_Cyrl +por_Deva +por_Grek +por_Hang +por_Hebr +por_Kana +por_PT +por_Syrc +por_Zinh +prs +prs_Latn +pus_Cyrl +pus_Deva +pus_Ethi +pus_Hani +pus_Hebr +pus_Latn +pus_Syrc +quw +quy +quz +qvi +rmc_SK +rmn +rmn_Cyrl +rmn_Grek +rmy +rmy_Cyrl +ron_Adlm +ron_Arab +ron_Brai +ron_Cyrl +ron_Hang +ron_Hani +ron_Hebr +ron_Hira +ron_Mani +ron_Phlp +ron_Rohg +ron_Sogd +ron_Syrc +ron_Tibt +rus_Adlm +rus_Arab +rus_Armn +rus_Beng +rus_Bopo +rus_Brai +rus_Deva +rus_Ethi +rus_Geor +rus_Grek +rus_Guru +rus_Hang +rus_Hani +rus_Hebr +rus_Hira +rus_Kana +rus_Khmr +rus_Mand +rus_Mani +rus_Mong +rus_Phlp +rus_Rohg +rus_Runr +rus_Sogd +rus_Syrc +rus_Thaa +rus_Tibt +rus_Zinh +sah_Latn +san +san_Latn +sin_Arab +sin_Cyrl +sin_Deva +sin_Hani +sin_Hebr +sin_Latn +sin_Zinh +slv_Brai +slv_Cyrl +slv_Deva +slv_Hang +slv_Hani +slv_Hebr +slv_Hira +snd_Cyrl +snd_Latn +spa_AR +spa_Arab +spa_Armn +spa_Beng +spa_Brai +spa_CL +spa_CO +spa_CR +spa_Cyrl +spa_Deva +spa_DO +spa_EC +spa_Grek +spa_GT +spa_Hang +spa_Hani +spa_Hebr +spa_Hira +spa_HN +spa_Kana +spa_Khmr +spa_Mand +spa_MX +spa_NI +spa_PA +spa_PE +spa_Phlp +spa_PR +spa_Rohg +spa_SV +spa_Syrc +spa_Taml +spa_Telu +spa_Thai +spa_UY +spa_VE +spa_Zinh +sqi_Brai +sqi_Kana +sqi_Tibt +sqi_Zinh +srp +srp_Arab +srp_Beng +srp_Brai +srp_Deva +srp_Ethi +srp_Hani +srp_Hebr +srp_Hira +srp_Kana +srp_Latn_ME +srp_Mani +srp_Syrc +srp_Zinh +swa +swa_Cyrl +swa_Hani +swe_Arab +swe_Brai +swe_Cyrl +swe_Deva +swe_Grek +swe_Hani +swe_Hebr +swe_Hira +swe_Kana +syr_Syrc +tam_Arab +tam_Cyrl +tam_Deva +tam_Deva_LK +tam_Grek +tam_Hani +tam_Hebr +tam_Latn +tam_Latn_LK +tam_LK +tam_Syrc +tam_Zinh +tat_Adlm +tat_Deva 
+tat_Grek +tat_Hang +tat_Hebr +tat_Mand +tat_Phlp +tat_Rohg +tat_Syrc +tat_Thaa +tel_Arab +tel_Cyrl +tel_Latn +tgk_Arab +tgk_Latn +tgl_Arab +tgl_Armn +tgl_Beng +tgl_Bopo +tgl_Brai +tgl_Cans +tgl_Cher +tgl_Copt +tgl_Cyrl +tgl_Deva +tgl_Ethi +tgl_Geor +tgl_Grek +tgl_Gujr +tgl_Guru +tgl_Hang +tgl_Hani +tgl_Hebr +tgl_Hira +tgl_Java +tgl_Kana +tgl_Laoo +tgl_Lepc +tgl_Mani +tgl_Nkoo +tgl_Phlp +tgl_Rohg +tgl_Syrc +tgl_Telu +tgl_Thaa +tgl_Thai +tgl_Tibt +tgl_Yiii +tgl_Zinh +tha_Adlm +tha_Arab +tha_Beng +tha_Cyrl +tha_Deva +tha_Ethi +tha_Grek +tha_Hani +tha_Hebr +tha_Hira +tha_Kana +tha_Khmr +tha_Latn +tha_Mand +tha_Mani +tha_Phlp +tha_Rohg +tha_Syrc +tha_Tibt +tha_Zinh +tir_Arab +tir_Latn +tmh +toi +tur_Arab +tur_Armn +tur_Brai +tur_Deva +tur_Geor +tur_Grek +tur_Hang +tur_Hani +tur_Hebr +tur_Hira +tur_Kana +tur_Thai +tur_Zinh +udm_Latn +uig_Hang +uig_Hebr +ukr_Adlm +ukr_Arab +ukr_Beng +ukr_Brai +ukr_Deva +ukr_Ethi +ukr_Geor +ukr_Grek +ukr_Hang +ukr_Hani +ukr_Hebr +ukr_Hira +ukr_Kana +ukr_Latn +ukr_Mand +ukr_Mani +ukr_Phlp +ukr_Sogd +ukr_Syrc +umb_Cyrl +urd_Beng +urd_Cyrl +urd_Deva +urd_Ethi +urd_Hani +urd_Hebr +urd_Hira +urd_Latn +urd_Zinh +uzb_Arab +vie_Arab +vie_Beng +vie_Brai +vie_Cyrl +vie_Geor +vie_Hang +vie_Hebr +vie_Hira +vie_Kana +vie_Khmr +vie_Thaa +vie_Thai +vie_Zinh +wuu_Adlm +wuu_Arab +wuu_Beng +wuu_Cyrl +wuu_Deva +wuu_Grek +wuu_Hebr +wuu_Laoo +wuu_Mani +wuu_Mymr +wuu_Phlp +wuu_Taml +wuu_Thaa +wuu_Thai +xal_Latn +xho_Cyrl +xmf_Latn +yid_Adlm +yid_Arab +yid_Cyrl +yid_Hani +yid_Latn +yid_Phlp +yor_Brai +yue +yue_Hira +yue_Kana +ze_zh +ze_zh_Bopo +ze_zh_Cyrl +ze_zh_Grek +ze_zh_Hang +ze_zh_Hani +ze_zh_Hans +ze_zh_Hant +ze_zh_Hira +ze_zh_Kana +ze_zh_Latn +ze_zh_Thai +ze_zh_Yiii +ze_zh_Zinh +zho +zho_Arab +zho_Arab_CN +zho_Arab_HK +zho_Arab_TW +zho_Armn +zho_Armn_CN +zho_Armn_TW +zho_Beng +zho_Beng_CN +zho_Beng_TW +zho_Bopo +zho_Bopo_CN +zho_Bopo_HK +zho_Bopo_TW +zho_Brai +zho_Brai_CN +zho_Brai_TW +zho_CN +zho_Cyrl +zho_Cyrl_CN +zho_Cyrl_HK +zho_Cyrl_TW +zho_Deva 
+zho_Deva_CN +zho_Deva_HK +zho_Deva_TW +zho_Ethi +zho_Ethi_TW +zho_Grek +zho_Grek_CN +zho_Grek_TW +zho_Gujr_CN +zho_Gujr_HK +zho_Gujr_TW +zho_Hang +zho_Hang_CN +zho_Hang_TW +zho_Hani +zho_Hani_CN +zho_Hani_HK +zho_Hani_TW +zho_Hans +zho_Hans_CN +zho_Hans_HK +zho_Hans_TW +zho_Hant +zho_Hant_CN +zho_Hant_HK +zho_Hant_TW +zho_Hebr +zho_Hebr_CN +zho_Hebr_TW +zho_Hira +zho_Hira_CN +zho_Hira_TW +zho_Kana +zho_Latn +zho_Latn_CN +zho_Latn_HK +zho_Latn_TW +zho_Nkoo_TW +zho_Phag +zho_Phag_CN +zho_Phlp_CN +zho_Rohg_TW +zho_Sogd_CN +zho_Syrc +zho_Syrc_CN +zho_Syrc_TW +zho_Thaa +zho_Thai +zho_Thai_CN +zho_Thai_TW +zho_Tibt +zho_Tibt_CN +zho_Tibt_TW +zho_TW +zho_Zinh +zho_Zinh_CN +zho_Zinh_TW +zhs +zhs_Arab +zhs_Beng +zhs_Bopo +zhs_Cyrl +zhs_Hang +zhs_Hani +zhs_Hans +zhs_Hant +zhs_Hira +zhs_Kana +zhs_Latn +zhs_Mand +zht +zht_Arab +zht_Bopo +zht_Cyrl +zht_Geor +zht_Hang +zht_Hani +zht_Hans +zht_Hant +zht_Hira +zht_Kana +zht_Latn +zht_Rohg +zht_Thai +zht_Yiii +zlm diff --git a/tatoeba/pivoting/Makefile b/tatoeba/pivoting/Makefile new file mode 100644 index 00000000..be83e1e4 --- /dev/null +++ b/tatoeba/pivoting/Makefile @@ -0,0 +1,302 @@ +# +# translate PIVOT language into SRC language +# to make a synthetic SRC-TRG corpus from another +# PIVOT-TRG corpus + + +PWD := ${shell pwd} +REPOHOME := ${PWD}/../../ + +include ${REPOHOME}lib/env.mk +include ${REPOHOME}lib/config.mk +include ${REPOHOME}lib/slurm.mk + + +SRC = swe +TRG = fin +PIVOT = eng + + +## change decoder settings +## TODO: do we need this? 
+
+MARIAN_BEAM_SIZE=1
+MARIAN_MINI_BATCH=100
+MARIAN_MAXI_BATCH=100
+MARIAN_MAX_LENGTH=200
+MARIAN_WORKSPACE=12000
+
+
+TATOEBA_VERSION ?= v2021-08-07
+TATOEBA_VERSION_NOHYPHEN ?= $(subst -,,${TATOEBA_VERSION})
+
+TATOEBA_GITRAW = https://raw.githubusercontent.com/Helsinki-NLP/Tatoeba-Challenge/master
+TATOEBA_RELEASED = ${TATOEBA_GITRAW}/models/released-model-results-all.txt
+TATOEBA_RELEASED_BT = https://object.pouta.csc.fi/Tatoeba-MT-bt/released-data.txt
+TATOEBA_MODEL_STORAGE = https://object.pouta.csc.fi/Tatoeba-MT-models
+
+## container for storing backtranslations
+BT_CONTAINER = Tatoeba-MT-bt
+BT_WORK_CONTAINER = project-Tatoeba-MT-bt
+
+## split size in nr-of-lines
+## default part to be selected = aa
+SPLIT_SIZE ?= 1000000
+
+## maximum input length (number sentence piece segments)
+## maximum number of sentences to be translated (top N lines)
+MAX_LENGTH ?= 200
+MAX_SENTENCES ?= ${SPLIT_SIZE}
+
+TRANSLATE_LANGPAIR = ${PIVOT}-${SRC}
+ORIGINAL_LANGPAIR = ${PIVOT}-${TRG}
+NEW_LANGPAIR = ${SRC}-${TRG}
+
+SORTLANGS = $(sort ${PIVOT} ${TRG})
+SORTED_LANGPAIR = ${firstword ${SORTLANGS}}-${lastword ${SORTLANGS}}
+
+PART ?= aa
+OUTPUT_DIR ?= ${NEW_LANGPAIR}
+
+
+# MODELZIP = https://object.pouta.csc.fi/Tatoeba-Challenge/ang.tar
+MODELZIP := ${shell wget -qq -O - ${TATOEBA_RELEASED} | grep '^${TRANSLATE_LANGPAIR}' | head -1 | cut -f4}
+MODELINFO = ${patsubst ${TATOEBA_MODEL_STORAGE}/%.zip,${TATOEBA_GITRAW}/models/%.yml,${MODELZIP}}
+MODELNAME = ${patsubst %.zip,%,${notdir ${MODELZIP}}}
+MODELDIR = ${OUTPUT_DIR}/${TRANSLATE_LANGPAIR}/${MODELNAME}
+
+MULTI_TARGET_MODEL := ${shell wget -qq -O - ${MODELINFO} | grep 'use-target-labels' | wc -l}
+ifneq (${MULTI_TARGET_MODEL},0)
+  TARGET_LANG_LABEL := ${shell wget -qq -O - ${MODELINFO} | grep -o '>>${TRG}.*<<'}
+endif
+
+
+
+BITEXT_DATADIR = ${PWD}/../work/data/simple
+MODEL_WORKDIR = ${PWD}/../work/${PIVOT}-${TRG}
+BITEXT_SRCRAW = ${BITEXT_DATADIR}/Tatoeba-train-${TATOEBA_VERSION}.${SORTED_LANGPAIR}.clean.${PIVOT}.gz
+BITEXT_SRCPRE = ${wildcard ${MODEL_WORKDIR}/train/opusTC${TATOEBA_VERSION_NOHYPHEN}.src.clean.spm*.gz} + +BITEXT_BASE = ${OUTPUT_DIR}/Tatoeba-train.${MODELNAME}.${PIVOT}-${SRC}-${TRG} +BITEXT_SRC = ${BITEXT_BASE}.${SRC}.${PART}.gz +BITEXT_PRE = ${BITEXT_BASE}.${SRC}.spm.${PART}.gz +BITEXT_TRG = ${BITEXT_BASE}.${TRG}.${PART}.gz + +BITEXT_LATEST_SRC = ${OUTPUT_DIR}/latest/Tatoeba-train.${PIVOT}-${SRC}-${TRG}.${PART}.${SRC}.gz +BITEXT_LATEST_TRG = ${OUTPUT_DIR}/latest/Tatoeba-train.${PIVOT}-${SRC}-${TRG}.${PART}.${TRG}.gz +BITEXT_LATEST_README = ${OUTPUT_DIR}/latest/README.md + + +## all parts of the bitext +PARTS = $(subst .,,${suffix ${basename ${wildcard ${BITEXT_PRE:${PART}.gz=}??.gz}}}) +ALL_BITEXT_LATEST_SRC = ${patsubst %,${OUTPUT_DIR}/latest/Tatoeba-train.${PIVOT}-${SRC}-${TRG}.%.${SRC}.gz,${PARTS}} +ALL_BITEXT_LATEST_TRG = ${patsubst %,${OUTPUT_DIR}/latest/Tatoeba-train.${PIVOT}-${SRC}-${TRG}.%.${TRG}.gz,${PARTS}} + + +## don't delete translated text even if the process crashes +.PRECIOUS: ${BITEXT_BASE}.${TRG}.%.gz + +.PHONY: all +all: translate + +.PHONY: prepare +prepare: ${MODELDIR}/decoder.yml ${BITEXT_PRE} + +.PHONY: translate +translate: ${BITEXT_LATEST_README} ${BITEXT_LATEST_TRG} + ${MAKE} ${BITEXT_LATEST_SRC} + +## translate all parts +.PHONY: translate-all-parts +translate-all-parts: ${ALL_BITEXT_LATEST_TRG} + ${MAKE} source-all-parts + +.PHONY: source-all-parts +source-all-parts: ${ALL_BITEXT_LATEST_SRC} + + +.PHONY: print-modelinfo +print-modelinfo: + @echo ${MODELNAME} + @echo ${MODELZIP} + @echo ${MODELINFO} + @echo "multi-target model: ${MULTI_TARGET_MODEL}" + @echo "target language label: ${TARGET_LANG_LABEL}" + +## fetch the latest model + +${MODELDIR}/decoder.yml: +ifneq (${MODELZIP},) + mkdir -p ${dir $@} + wget -O ${dir $@}/model.zip ${MODELZIP} + cd ${dir $@} && unzip model.zip + rm -f ${dir $@}/model.zip + mv ${dir $@}/preprocess.sh ${dir $@}/preprocess-old.sh + sed 's#perl -C -pe.*$$#perl -C -pe "s/(?!\\n)\\p{C}/ /g;" |#' \ + < ${dir 
$@}/preprocess-old.sh > ${dir $@}/preprocess.sh + chmod +x ${dir $@}/preprocess.sh +endif + + +## pre-process data + +ifeq (${MULTI_TARGET_MODEL},1) + PREPROCESS_ARGS = ${SRC} ${TRG} ${MODELDIR}/source.spm +else + PREPROCESS_ARGS = ${SRC} ${MODELDIR}/source.spm +endif + + + +ifeq (${BITEXT_SRCPRE},) + +${BITEXT_SRCRAW}: + ${MAKE} -C .. SRCLANGS=${PIVOT} TRGLANGS=${TRG} clean-data + +else + +${BITEXT_SRCRAW}: ${BITEXT_SRCPRE} + sed 's/ //g;s/▁/ /g' < $< | sed 's/^ *//;s/ *$$//' | ${GZIP} -f > $@ + +endif + + +${BITEXT_PRE}: ${BITEXT_SRCRAW} +ifneq (${MODELZIP},) + mkdir -p ${dir $@} + ${MAKE} ${MODELDIR}/decoder.yml + ${GZCAT} $< |\ + grep -v '[<>{}]' |\ + ${MODELDIR}/preprocess.sh ${PREPROCESS_ARGS} |\ + perl -e 'while (<>){next if (split(/\s+/)>${MAX_LENGTH});print;}' |\ + split -l ${SPLIT_SIZE} - ${patsubst %${PART}.gz,%,$@} + ${GZIP} -f ${patsubst %${PART}.gz,%,$@}?? +endif + + + +## merge SentencePiece segments in the source text +## (Why? because we filter out some data from the original wiki text, see above) + +${BITEXT_BASE}.${SRC}.%.gz: ${BITEXT_BASE}.${SRC}.spm.%.gz + if [ -e ${patsubst ${BITEXT_BASE}.${SRC}.%.gz,${BITEXT_BASE}.${TRG}.%.gz,$@} ]; then \ + mkdir -p ${dir $@}; \ + ${GZCAT} $< |\ + sed 's/ //g;s/▁/ /g' | \ + sed 's/^ *//;s/ *$$//' |\ + sed 's/^>>[a-z]*<< //' |\ + gzip -c > $@; \ + fi + + +## overwrite the file with the latest translations +## --> this allows multiple translation iterations +## without duplicating the data we want to use in MT training + +${OUTPUT_DIR}/latest/Tatoeba-train.${PIVOT}-${SRC}-${TRG}.%.${SRC}.gz: ${BITEXT_BASE}.${SRC}.%.gz + mkdir -p ${dir $@} + cp $< $@ + +${OUTPUT_DIR}/latest/Tatoeba-train.${PIVOT}-${SRC}-${TRG}.%.${TRG}.gz: ${BITEXT_BASE}.${TRG}.%.gz + mkdir -p ${dir $@} + cp $< $@ + +${BITEXT_LATEST_README}: ${MODELDIR}/README.md + mkdir -p ${dir $@} + cp $< $@ + + +## translate + +${BITEXT_BASE}.${TRG}.%.gz: ${BITEXT_BASE}.${SRC}.spm.%.gz +ifneq (${MODELZIP},) + mkdir -p ${dir $@} + ${MAKE} 
${MODELDIR}/decoder.yml + ${LOAD_ENV} && cd ${MODELDIR} && \ + ${MARIAN_DECODER} \ + -c decoder.yml \ + -i ${PWD}/$< \ + -d ${MARIAN_GPUS} \ + ${MARIAN_DECODER_FLAGS} |\ + sed 's/ //g;s/▁/ /g' | sed 's/^ *//;s/ *$$//' |\ + gzip -c > ${PWD}/$@ +endif + + + +check-latest: + @if [ -d ${OUTPUT_DIR}/latest ]; then \ + for S in `ls ${OUTPUT_DIR}/latest/*.${SRC}.gz`; do \ + T=`echo $$S | sed 's/.${SRC}.gz/.${TRG}.gz/'`; \ + a=`${GZCAT} $$S | wc -l`; \ + b=`${GZCAT} $$T | wc -l`; \ + if [ $$a != $$b ]; then \ + echo "$$a != $$b $$S $$T"; \ + else \ + echo "$$a $$S $$T"; \ + fi \ + done \ + fi + +check-translated: + @for S in `ls ${OUTPUT_DIR}/*.${SRC}.spm.gz`; do \ + T=`echo $$S | sed 's/.${SRC}.gz/.${TRG}.gz/'`; \ + a=`${GZCAT} $$S | wc -l`; \ + b=`${GZCAT} $$T | wc -l`; \ + if [ $$a != $$b ]; then \ + echo "$$a != $$b $$S $$T"; \ + else \ + echo "$$a $$S $$T"; \ + fi \ + done + +check-length: + @echo "check ${OUTPUT_DIR}" + @${MAKE} check-translated + @${MAKE} check-latest + + +remove-%-all check-%-all: + for d in `find . 
-maxdepth 1 -type d -name '*-*' -printf "%f "`; do \ + s=`echo $$d | cut -f1 -d'-'`; \ + t=`echo $$d | cut -f2 -d'-'`; \ + make SRC=$$s TRG=$$t ${@:-all=}; \ + done + + + +remove-incomplete: + ${MAKE} remove-incomplete-translated + ${MAKE} remove-incomplete-latest + +remove-incomplete-translated: + @echo "check ${OUTPUT_DIR}" + @mkdir -p ${OUTPUT_DIR}/incomplete + @for S in `ls ${OUTPUT_DIR}/*.${SRC}.gz`; do \ + T=`echo $$S | sed 's/.${SRC}.gz/.${TRG}.gz/'`; \ + a=`${GZCAT} $$S | wc -l`; \ + b=`${GZCAT} $$T | wc -l`; \ + if [ $$a != $$b ]; then \ + echo "$$a != $$b $$S $$T"; \ + mv $$S ${OUTPUT_DIR}/incomplete/; \ + mv $$T ${OUTPUT_DIR}/incomplete/; \ + fi \ + done + + +remove-incomplete-latest: + @echo "check ${OUTPUT_DIR}" + @mkdir -p ${OUTPUT_DIR}/incomplete/latest + @if [ -d ${OUTPUT_DIR}/latest ]; then \ + for S in `ls ${OUTPUT_DIR}/latest/*.${SRC}.gz`; do \ + T=`echo $$S | sed 's/.${SRC}.gz/.${TRG}.gz/'`; \ + a=`${GZCAT} $$S | wc -l`; \ + b=`${GZCAT} $$T | wc -l`; \ + if [ $$a != $$b ]; then \ + echo "$$a != $$b $$S $$T"; \ + mv $$S ${OUTPUT_DIR}/incomplete/latest/; \ + mv $$T ${OUTPUT_DIR}/incomplete/latest/; \ + fi \ + done \ + fi +