cleanup in tatoeba data recipes

This commit is contained in:
Joerg Tiedemann 2021-12-18 00:27:04 +02:00
parent 5fc902b020
commit d617a63c76
16 changed files with 3091 additions and 994 deletions

View File

@ -4,6 +4,16 @@
#
# only works with sentencepiece models!
#
# TODOs
#
# - forward-translate monolingual data (re-use bt-data)
# - reconstruction filtering (score translation in opposite direction)
# (use weights? normalise-script from bergamot/students)
# - other kinds of data filtering / selection?
# - create lexical shortlists (see bergamot)
# - finetune alphas in intgemm8 models (see bergamot)
# - benchmark distilled models
#
PWD := ${shell pwd}
REPOHOME := ${PWD}/../
@ -112,7 +122,7 @@ translate: ${BITEXT_LATEST_README} ${BITEXT_LATEST_TRG}
## translate all parts
.PHONY: translate-all-parts
translate-all-parts: ${ALL_BITEXT_LATEST_TRG}
${MAKE} ${ALL_BITEXT_LATEST_SRC}
${MAKE} source-all-parts
.PHONY: source-all-parts
source-all-parts: ${ALL_BITEXT_LATEST_SRC}
@ -183,16 +193,14 @@ endif
## (Why? because we filter out some data from the original wiki text, see above)
${BITEXT_BASE}.${SRC}.%.gz: ${BITEXT_BASE}.${SRC}.spm.%.gz
ifneq ($(wildcard ${patsubst ${BITEXT_BASE}.${SRC}.%.gz,${BITEXT_BASE}.${TRG}.%.gz,$@}),)
mkdir -p ${dir $@}
${GZCAT} $< |\
sed 's/ //g;s/▁/ /g' | \
sed 's/^ *//;s/ *$$//' |\
sed 's/^>>[a-z]*<< //' |\
gzip -c > $@
endif
if [ -e ${patsubst ${BITEXT_BASE}.${SRC}.%.gz,${BITEXT_BASE}.${TRG}.%.gz,$@} ]; then \
mkdir -p ${dir $@}; \
${GZCAT} $< |\
sed 's/ //g;s/▁/ /g' | \
sed 's/^ *//;s/ *$$//' |\
sed 's/^>>[a-z]*<< //' |\
gzip -c > $@; \
fi
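## a worked example of the de-segmentation pipe on one hypothetical
## SPM-encoded line (drop inter-piece spaces, turn '▁' markers back into
## spaces, trim, and strip a leading >>lang<< target label):
##
##   echo '>>eng<< ▁Hel lo ▁wor ld' \
##     | sed 's/ //g;s/▁/ /g' | sed 's/^ *//;s/ *$//' | sed 's/^>>[a-z]*<< //'
##   --> Hello world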
## overwrite the file with the latest translations

View File

@ -88,9 +88,12 @@ SKIP_SAME_LANG ?= 0
## --> especially useful in connection with FIT_DATA_SIZE
## set DATA_IS_SHUFFLED=1 if the training data is already shuffled
## --> useful to avoid shuffling when training sentence piece model
## NEW (2021-12-16): SHUFFLE_DATA is now set by default
## --> can now also avoid sqlite and data shuffling inside MarianNMT
## --> is that a problem (would MarianNMT use different random shuffles per epoch?)
##----------------------------------------------------------------------
# SHUFFLE_DATA = 1
SHUFFLE_DATA = 1
# DATA_IS_SHUFFLED = 1
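## e.g. to keep pre-shuffled training data and skip the extra shuffling
## pass (hypothetical invocation; any data-building goal works):
##
##   make DATA_IS_SHUFFLED=1 SHUFFLE_DATA=0 data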
## devtest data is shuffled by default
@ -142,9 +145,9 @@ SORTSRC = ${firstword ${SORTLANGS}}
SORTTRG = ${lastword ${SORTLANGS}}
LANGPAIR = ${SORTSRC}-${SORTTRG}
SPACE = $(empty) $(empty)
LANGSRCSTR = ${subst ${SPACE},+,$(SRCLANGS)}
LANGTRGSTR = ${subst ${SPACE},+,$(TRGLANGS)}
LANGPAIRSTR = ${LANGSRCSTR}-${LANGTRGSTR}
LANGSRCSTR ?= ${subst ${SPACE},+,$(SRCLANGS)}
LANGTRGSTR ?= ${subst ${SPACE},+,$(TRGLANGS)}
LANGPAIRSTR ?= ${LANGSRCSTR}-${LANGTRGSTR}
## for monolingual things
@ -179,17 +182,17 @@ endif
## NEW default size = 2500 (keep more for training for small languages)
## NOTE: size will be increased to 5000 for Tatoeba
DEVSIZE = 2500
TESTSIZE = 2500
DEVSIZE ?= 2500
TESTSIZE ?= 2500
## set some additional thresholds for
## the size of test and dev data
## DEVMINSIZE is the absolute minimum we require
## to run any training procedures
DEVSMALLSIZE = 1000
TESTSMALLSIZE = 1000
DEVMINSIZE = 250
DEVSMALLSIZE ?= 1000
TESTSMALLSIZE ?= 1000
DEVMINSIZE ?= 250
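## note: ?= only assigns if the variable is still undefined, so values
## preset by a calling makefile or the environment are respected, e.g.
##
##   make DEVSIZE=5000 TESTSIZE=5000 config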
## set additional argument options for opus_read (if it is used)
@ -486,12 +489,14 @@ MARIAN_CLIP_NORM ?= 5
## default = shuffle data and batches
## (set to batches or none to change this)
MARIAN_SHUFFLE ?= data
# MARIAN_SHUFFLE ?= data
MARIAN_SHUFFLE ?= batches
## default: use sqlite database to store data
## remove this to use regular temp data
## set to --shuffle-in-ram to keep all shuffled data in RAM
MARIAN_DATA_STORAGE ?= --sqlite
# MARIAN_DATA_STORAGE ?= --sqlite
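## sketch of the effect (assuming these feed marian's --shuffle option
## and the --sqlite flag in the training command):
##
##   MARIAN_SHUFFLE=batches  -->  --shuffle batches  (don't re-shuffle data)
##   MARIAN_DATA_STORAGE=    -->  no --sqlite: keep temporary data on disk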
## set to global for lower memory usage in multiprocess training
## TODO: does this parameter really work?
@ -596,11 +601,11 @@ endif
.PHONY: config local-config
config local-config: ${WORKDIR}/${MODELCONFIG}
SMALLEST_TRAINSIZE = 10000
SMALL_TRAINSIZE = 100000
MEDIUM_TRAINSIZE = 500000
LARGE_TRAINSIZE = 1000000
LARGEST_TRAINSIZE = 10000000
SMALLEST_TRAINSIZE ?= 10000
SMALL_TRAINSIZE ?= 100000
MEDIUM_TRAINSIZE ?= 500000
LARGE_TRAINSIZE ?= 1000000
LARGEST_TRAINSIZE ?= 10000000
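## e.g. a language pair with 750k training sentence pairs falls into the
## MEDIUM bucket (>= 500k, < 1M); these thresholds steer the defaults
## written into ${WORKDIR}/${MODELCONFIG} below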
${WORKDIR}/${MODELCONFIG}:
mkdir -p ${dir $@}

View File

@ -55,9 +55,8 @@ endif
## - use only the latest backtranslations
## if such a subdir exists
BACKTRANS_HOME = backtranslate
FORWARDTRANS_HOME = ${BACKTRANS_HOME}
# FORWARDTRANS_HOME = ${BACKTRANS_HOME}
BACKTRANS_HOME ?= backtranslate
FORWARDTRANS_HOME ?= ${BACKTRANS_HOME}
ifneq (${wildcard ${BACKTRANS_HOME}/${TRG}-${SRC}/latest},)
BACKTRANS_DIR = ${BACKTRANS_HOME}/${TRG}-${SRC}/latest
@ -71,6 +70,13 @@ else
FORWARDTRANS_DIR = ${FORWARDTRANS_HOME}/${SRC}-${TRG}
endif
ifneq (${wildcard ${BACKTRANS_HOME}/${SRC}-${TRG}/latest},)
FORWARDTRANSMONO_DIR = ${BACKTRANS_HOME}/${SRC}-${TRG}/latest
else
FORWARDTRANSMONO_DIR = ${BACKTRANS_HOME}/${SRC}-${TRG}
endif
## TODO: make it possible to select only parts of the BT data
## ---> use TRAINDATA_SIZE to take max the same amount of all shuffled BT data
@ -85,6 +91,11 @@ ifeq (${USE_FORWARDTRANS},1)
FORWARDTRANS_TRG = ${patsubst %.${SRCEXT}.gz,%.${TRGEXT}.gz,${FORWARDTRANS_SRC}}
endif
ifeq (${USE_FORWARDTRANSMONO},1)
FORWARDTRANSMONO_SRC = ${sort ${wildcard ${FORWARDTRANSMONO_DIR}/*.${SRCEXT}.gz}}
FORWARDTRANSMONO_TRG = ${patsubst %.${SRCEXT}.gz,%.${TRGEXT}.gz,${FORWARDTRANSMONO_SRC}}
endif
ifeq (${USE_PIVOTING},1)
PIVOTING_SRC = ${sort ${wildcard pivoting/${SRC}-${TRG}/latest/*.${SRCEXT}.gz} \
${wildcard pivoting/${TRG}-${SRC}/latest/*.${SRCEXT}.gz}}
@ -95,6 +106,10 @@ print-ft-data:
@echo ${FORWARDTRANS_SRC}
@echo ${FORWARDTRANS_TRG}
@echo ${FORWARDTRANS_DIR}
@echo ${FORWARDTRANSMONO_SRC}
@echo ${FORWARDTRANSMONO_TRG}
@echo ${FORWARDTRANSMONO_DIR}
##-------------------------------------------------------------
## data sets (train/dev/test)
@ -104,7 +119,7 @@ print-ft-data:
## with some basic pre-processing (see lib/preprocess.mk)
CLEAN_TRAIN_SRC = ${patsubst %,${DATADIR}/${PRE}/%.${LANGPAIR}.clean.${SRCEXT}.gz,${TRAINSET}} \
${BACKTRANS_SRC} ${FORWARDTRANS_SRC} ${PIVOTING_SRC}
${BACKTRANS_SRC} ${FORWARDTRANS_SRC} ${FORWARDTRANSMONO_SRC} ${PIVOTING_SRC}
CLEAN_TRAIN_TRG = ${patsubst %.${SRCEXT}.gz,%.${TRGEXT}.gz,${CLEAN_TRAIN_SRC}}
CLEAN_DEV_SRC = ${patsubst %,${DATADIR}/${PRE}/%.${LANGPAIR}.clean.${SRCEXT}.gz,${DEVSET}}
@ -239,6 +254,8 @@ MAX_WORDALIGN_SIZE = 5000000
## (assuming that each of them occupies up to 6 cores)
NR_ALIGN_JOBS ?= $$(( ${CPU_CORES} / 6 + 1 ))
## job forcing doesn't work within recipes
# ${MAKE} -j ${NR_ALIGN_JOBS} $$a
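## arithmetic sketch: with CPU_CORES=24, NR_ALIGN_JOBS = $$(( 24/6 + 1 )) = 5,
## i.e. one alignment job per ~6 cores; the explicit -j was dropped in the
## recipe below, presumably because a sub-make started inside a recipe
## shares the parent's jobserver and cannot force its own job count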
${TRAIN_ALG}: ${TRAIN_SRC}.clean.${PRE_SRC}${TRAINSIZE}.gz \
${TRAIN_TRG}.clean.${PRE_TRG}${TRAINSIZE}.gz
@ -250,7 +267,7 @@ ${TRAIN_ALG}: ${TRAIN_SRC}.clean.${PRE_SRC}${TRAINSIZE}.gz \
split -l ${MAX_WORDALIGN_SIZE} $(LOCAL_TRAIN_TRG).algtmp $(LOCAL_TRAIN_TRG).algtmp.d/; \
a=`ls $(LOCAL_TRAIN_SRC).algtmp.d/* | sed 's#$$#.alg#' | xargs`; \
if [ "$$a" != "" ]; then \
${MAKE} -j ${NR_ALIGN_JOBS} $$a; \
${MAKE} $$a; \
cat $(LOCAL_TRAIN_SRC).algtmp.d/*.alg | ${GZIP} -c > $@; \
rm -f ${LOCAL_TRAIN_SRC}.algtmp.d/*; \
rm -f ${LOCAL_TRAIN_TRG}.algtmp.d/*; \
@ -449,7 +466,7 @@ endif
# --> shuffle data for each langpair
# --> do this when FIT_DATA_SIZE is set!
######################################
ifneq (${SHUFFLE_DATA},1)
ifeq (${SHUFFLE_DATA},1)
@echo "shuffle training data"
@paste ${LOCAL_TRAIN_SRC}.${LANGPAIR}.src ${LOCAL_TRAIN_TRG}.${LANGPAIR}.trg |\
${SHUFFLE} > ${LOCAL_TRAIN_SRC}.shuffled
@ -503,10 +520,10 @@ raw-devdata: ${DEV_SRC} ${DEV_TRG}
## maybe introduce over/undersampling of dev data like we have for train data?
${DEV_SRC}.shuffled.gz:
mkdir -p ${dir $@}
mkdir -p ${sort ${dir $@} ${dir ${DEV_SRC}} ${dir ${DEV_TRG}}}
rm -f ${DEV_SRC} ${DEV_TRG}
echo "# Validation data" > ${dir ${DEV_SRC}}/README.md
echo "" >> ${dir ${DEV_SRC}}/README.md
echo "# Validation data" > ${dir ${DEV_SRC}}README.md
echo "" >> ${dir ${DEV_SRC}}README.md
-for s in ${SRCLANGS}; do \
for t in ${TRGLANGS}; do \
if [ ! `echo "$$s-$$t $$t-$$s" | egrep '${SKIP_LANGPAIRS}' | wc -l` -gt 0 ]; then \

View File

@ -7,12 +7,12 @@
TODAY := ${shell date +%F}
DATE ?= ${TODAY}
OBJECTSTORAGE = https://object.pouta.csc.fi
MODEL_CONTAINER = OPUS-MT-models
DEV_MODEL_CONTAINER = OPUS-MT-dev
MODELINDEX = ${OBJECTSTORAGE}/${MODEL_CONTAINER}/index.txt
MODELSHOME = ${WORKHOME}/models
RELEASEDIR = ${PWD}/models
OBJECTSTORAGE ?= https://object.pouta.csc.fi
MODEL_CONTAINER ?= OPUS-MT-models
DEV_MODEL_CONTAINER ?= OPUS-MT-dev
MODELINDEX ?= ${OBJECTSTORAGE}/${MODEL_CONTAINER}/index.txt
MODELSHOME ?= ${WORKHOME}/models
RELEASEDIR ?= ${PWD}/models
## TODO: better create a recipe for the yaml file and not the zip file
@ -41,7 +41,7 @@ find-model:
## minimum BLEU score for models to be accepted as distribution package
MIN_BLEU_SCORE = 20
MIN_BLEU_SCORE ?= 20
.PHONY: dist local-dist global-dist release

View File

@ -13,7 +13,7 @@ PWD ?= ${shell pwd}
NR_GPUS = 1
HPC_NODES = 1
HPC_DISK = 500
# HPC_DISK = 500
HPC_QUEUE = serial
HPC_GPUQUEUE = gpu
@ -81,8 +81,8 @@ TMPDIR ?= /tmp
## tools and their locations
SCRIPTDIR ?= ${PWD}/scripts
TOOLSDIR ?= ${PWD}/tools
SCRIPTDIR ?= ${REPOHOME}scripts
TOOLSDIR ?= ${REPOHOME}tools
ISO639 ?= ${shell which iso639 2>/dev/null || echo 'perl ${TOOLSDIR}/LanguageCodes/ISO-639-3/bin/iso639'}
PIGZ ?= ${shell which pigz 2>/dev/null || echo ${TOOLSDIR}/pigz/pigz}

View File

@ -274,8 +274,8 @@ endif
## --> make a new BPE/sentencepiece model
## --> make a new config file
DEFAULT_PIVOT_LANG = en
PIVOT_LANG ?= ${DEFAULT_PIVOT_LANG}
DEFAULT_PIVOT_LANG ?= en
PIVOT_LANG ?= ${DEFAULT_PIVOT_LANG}
%-pivotlang:
if [ "$(sort ${SRCLANGS} ${TRGLANGS} ${PIVOT_LANG})" != "$(sort ${SRCLANGS} ${TRGLANGS})" ]; then \
@ -316,6 +316,11 @@ endif
MARIAN_EARLY_STOPPING=${FT_MARIAN_EARLY_STOPPING} \
${@:-ft=}
## add forward translation of monolingual data
%-ftmono:
${MAKE} DATASET=${DATASET}+ftmono \
USE_FORWARDTRANSMONO=1 \
${@:-ftmono=}
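## hedged usage sketch: append -ftmono to a training goal to mix the
## forward-translated monolingual data into DATASET (goal name illustrative):
##
##   make SRCLANGS=deu TRGLANGS=eng train-ftmono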
## train on back-translations only

View File

@ -188,7 +188,6 @@ TATOEBA_PARAMS := DATASET=${TATOEBA_DATASET} \
TESTSET_NAME=${TATOEBA_TESTSET_NAME} \
TRAINSET_NAME=${TATOEBA_TRAINSET_NAME} \
SMALLEST_TRAINSIZE=1000 \
DATA_IS_SHUFFLED=1 \
USE_REST_DEVDATA=0 \
HELDOUTSIZE=0 \
DEVSIZE=5000 \
@ -206,21 +205,24 @@ TATOEBA_PARAMS := DATASET=${TATOEBA_DATASET} \
DEFAULT_PIVOT_LANG=${TATOEBA_PIVOT} \
MIN_BLEU_SCORE=${TATOEBA_MIN_BLEU}
MARIAN_SHUFFLE=data
MARIAN_DATA_STORAGE=--sqlite
HPC_DISK=500
## unless we have multilingual models:
## no need to shuffle data again, just shuffle batches
## no need to store data in sqlite databases
ifeq (${words ${SRCLANGS}},1)
ifeq (${words ${TRGLANGS}},1)
# TATOEBA_PARAMS += MARIAN_SHUFFLE=batches MARIAN_DATA_STORAGE= HPC_DISK=
MARIAN_SHUFFLE=batches
MARIAN_DATA_STORAGE=
HPC_DISK=
endif
endif
## NEW (2021-12-15): use default (always shuffle training data)
#
# DATA_IS_SHUFFLED = 1
# MARIAN_SHUFFLE = data
# MARIAN_DATA_STORAGE = --sqlite
# HPC_DISK = 500
# ## unless we have multilingual models:
# ## no need to shuffle data again, just shuffle batches
# ## no need to store data in sqlite databases
# ifeq (${words ${SRCLANGS}},1)
# ifeq (${words ${TRGLANGS}},1)
# MARIAN_SHUFFLE = batches
# MARIAN_DATA_STORAGE =
# HPC_DISK =
# endif
# endif

View File

@ -126,16 +126,18 @@ SPMTRGVOCAB = ${SPMDIR}/${LANGTRGSTR}/${BPEMODELNAME}.spm${TRGBPESIZE:000=}k.voc
mono-spm-vocab: ${SPMVOCAB}
ifneq (${SPMVOCAB},${SPMSRCVOCAB})
${SPMSRCVOCAB}:
${MAKE} LANGS=${SRCLANGS} BPESIZE=${SRCBPESIZE} mono-spm-vocab
${MAKE} LANGS="${SRCLANGS}" BPESIZE=${SRCBPESIZE} mono-spm-vocab
endif
ifneq (${SPMSRCVOCAB},${SPMTRGVOCAB})
ifneq (${SPMVOCAB},${SPMTRGVOCAB})
${SPMTRGVOCAB}:
${MAKE} LANGS=${TRGLANGS} BPESIZE=${TRGBPESIZE} mono-spm-vocab
${MAKE} LANGS="${TRGLANGS}" BPESIZE=${TRGBPESIZE} mono-spm-vocab
endif
endif
${SPMVOCAB}: ${LOCAL_MONO_DATA}.${PRE} ${SPMMODEL}
ifeq ($(wildcard ${SPMVOCAB}),)
@ -160,10 +162,12 @@ ifneq (${SPMMODEL},${SPMSRCMONO})
${MAKE} LANGS=${SRCLANGS} BPESIZE=${SRCBPESIZE} mono-spm-model
endif
ifneq (${SPMSRCMODEL},${SPMTRGMONO})
ifneq (${SPMMODEL},${SPMTRGMONO})
${SPMTRGMONO}:
${MAKE} LANGS=${TRGLANGS} BPESIZE=${TRGBPESIZE} mono-spm-model
endif
endif
${SPMMODEL}: ${LOCAL_MONO_DATA}.${PRE}

View File

@ -79,8 +79,8 @@ endif
%.eval: % ${TEST_TRG}
paste ${TEST_SRC}.${PRE_SRC} ${TEST_TRG} | grep $$'.\t' | cut -f2 > $@.ref
cat $< | sacrebleu ${SACREBLEU_PARAMS} $@.ref > $@
cat $< | sacrebleu ${SACREBLEU_PARAMS} --metrics=chrf --width=3 $@.ref >> $@
cat $< | sacrebleu -f text ${SACREBLEU_PARAMS} $@.ref > $@
cat $< | sacrebleu -f text ${SACREBLEU_PARAMS} --metrics=chrf --width=3 $@.ref >> $@
rm -f $@.ref

View File

@ -180,12 +180,11 @@ endif
ifeq ($(subst -align,,${MODELTYPE}),transformer-small)
MARIAN_ENC_DEPTH = 3
MARIAN_ENC_DEPTH = 6
MARIAN_DEC_DEPTH = 2
MARIAN_ATT_HEADS = 8
MARIAN_DIM_EMB = 256
MARIAN_EXTRA += --transformer-decoder-autoreg rnn \
--dec-cell ssru
MARIAN_DIM_EMB = 512
MARIAN_EXTRA += --transformer-decoder-autoreg rnn --dec-cell ssru
# --fp16
endif

File diff suppressed because it is too large.

View File

@ -0,0 +1,624 @@
#
# backtranslate wiki data with Tatoeba-MT challenge data
#
# only works with sentencepiece models!
#
PWD := ${shell pwd}
REPOHOME := ${PWD}/../../
TOOLSDIR := ${REPOHOME}tools
include ${REPOHOME}lib/env.mk
include ${REPOHOME}lib/config.mk
include ${REPOHOME}lib/slurm.mk
SRC = fin
TRG = eng
## TODO: should use unshuffled versions and split into individual languages
## ---> otherwise we don't know the input language in case there are multiple ones
TATOEBA_RELEASE = v2020-07-28
TATOEBA_STORAGE = https://object.pouta.csc.fi/Tatoeba-Challenge-${TATOEBA_RELEASE}
TATOEBA_WIKI_STORAGE = https://object.pouta.csc.fi/Tatoeba-Challenge-WikiShuffled
TATOEBA_GITRAW = https://raw.githubusercontent.com/Helsinki-NLP/Tatoeba-Challenge/master
TATOEBA_RELEASED = ${TATOEBA_GITRAW}/models/released-model-results.txt
TATOEBA_RELEASED_ALL = ${TATOEBA_GITRAW}/models/released-model-results-all.txt
TATOEBA_RELEASED_BT = https://object.pouta.csc.fi/Tatoeba-MT-bt/released-data.txt
TATOEBA_MODEL_STORAGE = https://object.pouta.csc.fi/Tatoeba-MT-models
## container for storing backtranslations
BT_CONTAINER = Tatoeba-MT-bt
BT_WORK_CONTAINER = project-Tatoeba-MT-bt
## various sources are available
## can be general wikipedia, wikinews, wikibooks, ...
WIKISOURCE ?= wikipedia
# WIKISOURCE ?= wiki
## split size in nr-of-lines
## default part to be selected = aa
SPLIT_SIZE ?= 1000000
## maximum input length (number of sentencepiece segments)
## maximum number of sentences to be translated (top N lines)
MAX_LENGTH ?= 100
MAX_SENTENCES ?= ${SPLIT_SIZE}
LANGPAIR = ${SRC}-${TRG}
PWD := $(shell pwd)
# MODELZIP = https://object.pouta.csc.fi/Tatoeba-Challenge/ang.tar
MODELZIP := ${shell wget -qq -O - ${TATOEBA_RELEASED_ALL} | grep '^${LANGPAIR}' | head -1 | cut -f4}
MODELINFO = ${patsubst ${TATOEBA_MODEL_STORAGE}/%.zip,${TATOEBA_GITRAW}/models/%.yml,${MODELZIP}}
MODELNAME = ${patsubst %.zip,%,${notdir ${MODELZIP}}}
MULTI_TARGET_MODEL := ${shell wget -qq -O - ${MODELINFO} | grep 'use-target-labels' | wc -l}
ifneq (${MULTI_TARGET_MODEL},0)
TARGET_LANG_LABEL := ${shell wget -qq -O - ${MODELINFO} | grep -o '>>${TRG}.*<<'}
endif
## macro-language IDs
## TODO: need to do something better than hard-coding this here
TATOEBA_MACRO_LANGS = hbs nor msa
## target languages of reliable models for current source language
## reliable is defined as BLEU scores above 20.0
##
TATOEBA_RELIABLE_TRG_BLEU := ${shell wget -qq -O - ${TATOEBA_RELEASED} | grep '^${SRC}-' | \
egrep '\s[2-9][0-9]\.' | cut -f1 | sort -u | cut -f2 -d-}
## alternative: chr-F2 >= 0.4
TATOEBA_RELIABLE_TRG_CHRF := ${shell wget -qq -O - ${TATOEBA_RELEASED} | grep '^${SRC}-' | \
egrep '[a-z]\s0\.[4-9]' | cut -f1 | sort -u | cut -f2 -d-}
## accept both
TATOEBA_RELIABLE_TRG = $(filter-out ${TATOEBA_MACRO_LANGS},$(sort ${TATOEBA_RELIABLE_TRG_BLEU} ${TATOEBA_RELIABLE_TRG_CHRF}))
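## filter sketch (scores in released-model-results lines; values illustrative):
##
##   fin-eng <TAB> 34.2  ...   matched by '\s[2-9][0-9]\.'  (BLEU 20-99)
##   fin-epo <TAB> 0.512 ...   matched by '[a-z]\s0\.[4-9]' (chrF2 >= 0.4)
##
## both lists are merged above and macro-languages are filtered out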
#####################################################################################
#### TODO: find wiki languages that we can translate
#### PROBLEM: a wiki release may include several languages (like hbs, nor, ...)
#####################################################################################
## all "reliable" released translation models
# TATOEBA_AVAILABLE_NMT := ${shell wget -qq -O - ${TATOEBA_RELEASED} | egrep '\s[2-9][0-9]\.' | cut -f1 | sort -u}
TATOEBA_RELIABLE_SRC_BLEU := ${shell wget -qq -O - ${TATOEBA_RELEASED} | grep -- '-${TRG} ' | \
egrep '\s[2-9][0-9]\.' | cut -f1 | sort -u | cut -f1 -d-}
TATOEBA_RELIABLE_SRC_CHRF := ${shell wget -qq -O - ${TATOEBA_RELEASED} | grep -- '-${TRG} ' | \
egrep '[a-z]\s0\.[4-9]' | cut -f1 | sort -u | cut -f1 -d-}
TATOEBA_RELIABLE_SRC = $(sort ${TATOEBA_RELIABLE_SRC_BLEU} ${TATOEBA_RELIABLE_SRC_CHRF})
## TODO: is it OK to turn zho into cmn?
## NOTE: also needs to fix the grep pattern in recipe for ${WIKI_DIR}/${SRC} !!!!
TATOEBA_WIKILANGS := ${shell wget -qq -O - ${TATOEBA_GITRAW}/data/release/${TATOEBA_RELEASE}/wiki.langs.txt | \
cut -f2 | sed 's/zho/cmn/' | sed 's/nor.*/nob/' | sort -u }
TATOEBA_TRANSLATABLE_WIKILANGS := ${filter ${TATOEBA_RELIABLE_SRC},${TATOEBA_WIKILANGS}}
TATOEBA_TRANSLATABLE_WIKILANGS3 := ${sort ${shell iso639 -m -n ${TATOEBA_TRANSLATABLE_WIKILANGS}}}
print-wikilangs:
@echo ${TATOEBA_RELIABLE_TRG}
# @echo ${TATOEBA_RELIABLE_SRC}
# @echo ${TATOEBA_TRANSLATABLE_WIKILANGS}
# @echo ${TATOEBA_TRANSLATABLE_WIKILANGS3}
#####################################################################################
#####################################################################################
#####################################################################################
### OBSOLETE??
## languages of released wikis
RELEASED_WIKIS := $(patsubst %.tar,%,${shell wget -qq -O - ${TATOEBA_GITRAW}/Wiki.md | \
grep -o 'WikiShuffled/...\.tar' | cut -f2 -d'/'})
## reverse list
RELEASED_WIKIS_REV = ${shell (for d in ${RELEASED_WIKIS}; do echo $$d; done) | tac}
WIKI_DIR = ${PWD}/wiki
LANGID = ${SRC}
PART = aa
OUTPUT_DIR = ${LANGPAIR}
WIKI_TXT = ${WIKI_DIR}/${LANGID}/${WIKISOURCE}.${PART}.gz
WIKI_SRC = ${OUTPUT_DIR}/${WIKISOURCE}.${PART}_${MODELNAME}.${LANGPAIR}.${SRC}.gz
WIKI_PRE = ${OUTPUT_DIR}/${WIKISOURCE}.${PART}_${MODELNAME}.${LANGPAIR}.${SRC}.spm.gz
WIKI_TRG = ${OUTPUT_DIR}/${WIKISOURCE}.${PART}_${MODELNAME}.${LANGPAIR}.${TRG}.gz
WIKI_LATEST_SRC = ${OUTPUT_DIR}/latest/${WIKISOURCE}.${PART}.${LANGPAIR}.${SRC}.gz
WIKI_LATEST_TRG = ${OUTPUT_DIR}/latest/${WIKISOURCE}.${PART}.${LANGPAIR}.${TRG}.gz
WIKI_LATEST_README = ${OUTPUT_DIR}/latest/README.md
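## expansion sketch with the defaults above (SRC=fin TRG=eng, PART=aa,
## WIKISOURCE=wikipedia; MODELNAME comes from the model zip and is
## hypothetical here):
##
##   WIKI_TXT = ${PWD}/wiki/fin/wikipedia.aa.gz
##   WIKI_SRC = fin-eng/wikipedia.aa_opus-2021-02-19.fin-eng.fin.gz
##   WIKI_TRG = fin-eng/wikipedia.aa_opus-2021-02-19.fin-eng.eng.gz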
## all parts of this wiki
PARTS = ${sort ${patsubst ${WIKI_DIR}/${LANGID}/${WIKISOURCE}.%.gz,%,\
${wildcard ${WIKI_DIR}/${LANGID}/${WIKISOURCE}.??.gz}}}
# WIKISOURCES = wiki wikibooks wikinews wikiquote wikisource wiktionary
WIKISOURCES = ${sort $(patsubst %.txt.gz,%,$(notdir ${wildcard ${WIKI_DIR}/${LANGID}/*.txt.gz})) \
$(patsubst %.${PART}.gz,%,$(notdir ${wildcard ${WIKI_DIR}/${LANGID}/*.${PART}.gz}))}
## targets for all parts of the current wiki source
ALLWIKIPARTS_TXT = ${patsubst %,${WIKI_DIR}/${LANGID}/${WIKISOURCE}.%.gz,${PARTS}}
ALLWIKIPARTS_SRC = ${patsubst %,${OUTPUT_DIR}/${WIKISOURCE}.%_${MODELNAME}.${LANGPAIR}.${SRC}.gz,${PARTS}}
ALLWIKIPARTS_PRE = ${patsubst %,${OUTPUT_DIR}/${WIKISOURCE}.%_${MODELNAME}.${LANGPAIR}.${SRC}.spm.gz,${PARTS}}
ALLWIKIPARTS_TRG = ${patsubst %,${OUTPUT_DIR}/${WIKISOURCE}.%_${MODELNAME}.${LANGPAIR}.${TRG}.gz,${PARTS}}
ALLWIKIPARTS_LATEST_SRC = ${patsubst %,${OUTPUT_DIR}/latest/${WIKISOURCE}.%.${LANGPAIR}.${SRC}.gz,${PARTS}}
ALLWIKIPARTS_LATEST_TRG = ${patsubst %,${OUTPUT_DIR}/latest/${WIKISOURCE}.%.${LANGPAIR}.${TRG}.gz,${PARTS}}
## all wiki sources for the selected part
ALLWIKIS_TXT = ${patsubst %,${WIKI_DIR}/${LANGID}/%.${PART}.gz,${WIKISOURCES}}
ALLWIKIS_SRC = ${patsubst %,${OUTPUT_DIR}/%.${PART}_${MODELNAME}.${LANGPAIR}.${SRC}.gz,${WIKISOURCES}}
ALLWIKIS_PRE = ${patsubst %,${OUTPUT_DIR}/%.${PART}_${MODELNAME}.${LANGPAIR}.${SRC}.spm.gz,${WIKISOURCES}}
ALLWIKIS_TRG = ${patsubst %,${OUTPUT_DIR}/%.${PART}_${MODELNAME}.${LANGPAIR}.${TRG}.gz,${WIKISOURCES}}
## don't delete translated text if the process crashes
.PRECIOUS: ${WIKI_TRG}
ifdef LOCAL_SCRATCH
TMPDIR = ${LOCAL_SCRATCH}
endif
ifeq (${shell hostname --domain 2>/dev/null},bullx)
LOAD_MODULES = module use -a /projappl/nlpl/software/modules/etc/ && \
module load nlpl-udpipe nlpl-opus &&
endif
.PHONY: all
all: translate
all-jobs: download
${MAKE} prepare-allwikis
${MAKE} translate-all-jobs
# all2eng:
# for w in ${filter-out eng,${RELEASED_WIKIS}}; do \
# make EMAIL= HPC_CORES=128 HPC_MEM=160g HPC_TIME=24:00 SRC=$$w TRG=eng all-jobs; \
# done
## do only the ones that we do not have already!
new2trg:
for s in ${TATOEBA_TRANSLATABLE_WIKILANGS}; do \
if [ ! -d $$s-eng ]; then \
${MAKE} EMAIL= HPC_CORES=128 HPC_MEM=160g HPC_TIME=24:00 SRC=$$s TRG=${TRG} all-jobs; \
fi \
done
all2eng:
${MAKE} SRC=fin TRG=eng all2trg
all2trg:
for s in ${TATOEBA_TRANSLATABLE_WIKILANGS}; do \
${MAKE} EMAIL= HPC_CORES=128 HPC_MEM=160g HPC_TIME=24:00 SRC=$$s TRG=${TRG} all-jobs; \
done
## translate English to all reliable target languages
eng2all:
${MAKE} SRC=eng TRG=fin src2all
## translate current source language to all reliable target languages
src2all:
for t in ${TATOEBA_RELIABLE_TRG}; do \
if [ ! -e ${SRC}-$$t/latest/${WIKISOURCE}.${PART}.${SRC}-$$t.$$t.gz ]; then \
${MAKE} EMAIL= HPC_CORES=128 HPC_MEM=160g HPC_TIME=24:00 SRC=${SRC} TRG=$$t prepare; \
${MAKE} EMAIL= HPC_CORES=128 HPC_MEM=160g HPC_TIME=24:00 SRC=${SRC} TRG=$$t translate.${SUBMIT_PREFIX}; \
fi \
done
RELEASED_BT_ALL := ${shell wget -qq -O - ${TATOEBA_RELEASED_BT}}
RELEASED_BT := ${shell wget -qq -O - ${TATOEBA_RELEASED_BT} | grep '^${LANGPAIR}/'}
fetch-bt:
for d in ${RELEASED_BT}; do \
echo "fetch $$d"; \
mkdir -p `dirname $$d`; \
wget -qq -O $$d https://object.pouta.csc.fi/Tatoeba-MT-bt/$$d; \
done
fetch-all-bt:
for d in ${RELEASED_BT_ALL}; do \
echo "fetch $$d"; \
mkdir -p `dirname $$d`; \
wget -qq -O $$d https://object.pouta.csc.fi/Tatoeba-MT-bt/$$d; \
done
#---------------------------------------------------------------
# release data
#---------------------------------------------------------------
release-all: upload-all
${MAKE} released-data.txt released-data-size.txt
.PHONY: upload release
release upload: ${WIKI_LATEST_README}
swift upload ${BT_CONTAINER} --changed --skip-identical ${LANGPAIR}/latest
${MAKE} released-data.txt
swift post ${BT_CONTAINER} --read-acl ".r:*"
.PHONY: upload-all
upload-all:
for d in `find . -maxdepth 1 -type d -name '*-*' -printf "%f "`; do \
s=`echo $$d | cut -f1 -d'-'`; \
t=`echo $$d | cut -f2 -d'-'`; \
make SRC=$$s TRG=$$t ${@:-all=}; \
done
released-data.txt: .
swift list ${BT_CONTAINER} | grep -v README.md | grep -v '.txt' > $@
swift upload ${BT_CONTAINER} $@
released-data-size.txt: .
${MAKE} check-latest-all | grep '^[0-9]' > $@
cat $@ | awk '{ sum += $$1 } END { print sum }' > $@.tmp
cat $@.tmp >> $@
rm -f $@.tmp
swift upload ${BT_CONTAINER} released-data-size.txt
# download released data
.PHONY: download
download: ${WIKI_DIR}/${SRC}
#---------------------------------------------------------------
# store / fetch translations
# (this is for storing work files and not for releasing data!)
#---------------------------------------------------------------
.PHONY: store
store:
a-put -b ${BT_WORK_CONTAINER} --nc --follow-links --override ${LANGPAIR}
.PHONY: store-all
store-all:
for d in `find . -maxdepth 1 -type d -name '*-*' -printf "%f "`; do \
s=`echo $$d | cut -f1 -d'-'`; \
t=`echo $$d | cut -f2 -d'-'`; \
make SRC=$$s TRG=$$t ${@:-all=}; \
done
.PHONY: retrieve fetch
retrieve fetch:
cd ${WORK_DESTDIR} && a-get ${WORK_CONTAINER}/${LANGPAIR}.tar
.PHONY: prepare
prepare: ${LANGPAIR}/${MODELNAME}/decoder.yml ${WIKI_TXT}
.PHONY: prepare-allwikis
prepare-allwikis: ${LANGPAIR}/${MODELNAME}/decoder.yml ${ALLWIKIS_TXT}
.PHONY: translate
translate: ${WIKI_LATEST_README} ${WIKI_LATEST_TRG}
${MAKE} ${WIKI_LATEST_SRC}
## translate all parts
.PHONY: translate-all-parts
translate-all-parts: ${ALLWIKIPARTS_LATEST_TRG}
${MAKE} ${ALLWIKIPARTS_LATEST_SRC}
## translate all wikis and all parts
.PHONY: translate-all
translate-all:
for s in ${WIKISOURCES}; do \
${MAKE} WIKISOURCE=$$s translate-all-parts; \
done
## create jobs for translating all parts
## (only start the job if the file does not exist yet)
.PHONY: translate-all-parts-jobs
translate-all-parts-jobs: ${LANGPAIR}/${MODELNAME}/decoder.yml
for p in ${PARTS}; do \
if [ ! -e ${OUTPUT_DIR}/${WIKISOURCE}.$${p}_${MODELNAME}.${LANGPAIR}.${TRG}.gz ]; then \
rm -f translate.${SUBMIT_PREFIX}; \
${MAKE} PART=$$p translate.${SUBMIT_PREFIX}; \
fi \
done
## create jobs for translating all parts of all wikis
.PHONY: translate-all-jobs
translate-all-jobs: ${LANGPAIR}/${MODELNAME}/decoder.yml
for s in ${WIKISOURCES}; do \
${MAKE} WIKISOURCE=$$s translate-all-parts-jobs; \
done
.PHONY: print-modelinfo
print-modelinfo:
@echo ${MODELNAME}
@echo ${MODELZIP}
@echo ${MODELINFO}
@echo "multi-target model: ${MULTI_TARGET_MODEL}"
@echo "target language label: ${TARGET_LANG_LABEL}"
## fetch the latest model
## ---> TODO: should we fetch from ObjectStorage instead?
${LANGPAIR}/${MODELNAME}/decoder.yml:
ifneq (${MODELZIP},)
mkdir -p ${dir $@}
wget -O ${dir $@}/model.zip ${MODELZIP}
cd ${dir $@} && unzip model.zip
rm -f ${dir $@}/model.zip
mv ${dir $@}/preprocess.sh ${dir $@}/preprocess-old.sh
sed 's#perl -C -pe.*$$#perl -C -pe "s/(?!\\n)\\p{C}/ /g;" |#' \
< ${dir $@}/preprocess-old.sh > ${dir $@}/preprocess.sh
chmod +x ${dir $@}/preprocess.sh
endif
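## the rewritten preprocess line replaces every control character except
## newline with a space; a minimal shell check of the same perl expression:
##
##   printf 'tab\there\n' | perl -C -pe 's/(?!\n)\p{C}/ /g;'
##   --> tab here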
## pre-process data
ifeq (${MULTI_TARGET_MODEL},1)
PREPROCESS_ARGS = ${SRC} ${TRG} ${LANGPAIR}/${MODELNAME}/source.spm
else
PREPROCESS_ARGS = ${SRC} ${LANGPAIR}/${MODELNAME}/source.spm
endif
${WIKI_DIR}/${SRC}/%.${PART}.gz: | ${WIKI_DIR}/${SRC}/.done
${GZCAT} ${@:.${PART}.gz=.txt.gz} |\
split -l ${SPLIT_SIZE} - ${patsubst %${PART}.gz,%,$@}
${GZIP} -f ${patsubst %${PART}.gz,%,$@}??
rm -f ${@:.${PART}.gz=.txt.gz}
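## split(1) names the chunks with two-letter suffixes (aa, ab, ac, ...),
## which is where the PART=aa default and the ??.gz globs come from:
##
##   split -l 1000000 - wikipedia.   # writes wikipedia.aa, wikipedia.ab, ...
##   gzip -f wikipedia.??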
${WIKI_DIR}/${SRC}/%.txt.gz: ${WIKI_DIR}/${SRC}/.done
echo "done!"
## NEW: get proper released WIKI data and extract the languages
## --> multiple languages can be included in one release (like nno in nor)
## --> shuffle the data as well
# fetch
${WIKI_DIR}/${SRC}/data:
mkdir -p ${dir $@}
wget -O $@.tar ${TATOEBA_STORAGE}/${shell iso639 -m -n ${SRC}}.tar
tar -C ${dir $@} -xf $@.tar
rm -f $@.tar
# de-duplicate and shuffle
${WIKI_DIR}/${SRC}/${WIKISOURCE}.txt.gz:
${MAKE} ${WIKI_DIR}/${SRC}/data
for f in `find ${dir $@} -name '*.id.gz'`; do \
t=`echo $$f | sed 's/\.id\.gz/.txt.gz/'`; \
l=`echo ${SRC} | sed 's/cmn/zho/;s/nob/nor.*/'`; \
paste <(${GZIP} -cd $$f) <(${GZIP} -cd $$t) |\
grep "^$$l " | cut -f2 | grep . | \
${UNIQ} | ${SHUFFLE} | ${GZIP} -c > ${dir $@}`basename $$t`; \
done
rm -fr ${WIKI_DIR}/${SRC}/data
# remove empty files
${WIKI_DIR}/${SRC}/.done:
mkdir -p ${dir $@}
${MAKE} ${WIKI_DIR}/${SRC}/${WIKISOURCE}.txt.gz
for f in `find ${dir $@} -name '*.txt.gz'`; do \
if [ ! `${GZIP} -cd $$f | head | wc -l` -gt 0 ]; then \
rm -f $$f; \
fi \
done
touch $@
## OLD: retrieve the old shuffled wiki release
##
# ${WIKI_DIR}/${SRC}:
# mkdir -p $@
# wget -O $@.tar ${TATOEBA_WIKI_STORAGE}/${SRC}.tar
# tar -C ${dir $@} -xf $@.tar
# if [ -d ${WIKI_DIR}/data/${SRC} ]; then \
# mv ${WIKI_DIR}/data/${SRC}/*.txt.gz $@/;\
# rm -f ${WIKI_DIR}/data/${SRC}/*;\
# rmdir ${WIKI_DIR}/data/${SRC};\
# rmdir ${WIKI_DIR}/data;\
# fi
# if [ -e ${WIKI_DIR}/${SRC}/wiki.${SRC}.aa.gz ]; then \
# for f in `ls ${WIKI_DIR}/${SRC}/*.${SRC}.*`; do \
# mv $$f `echo $$f | sed 's/\.${SRC}\././'`; \
# done \
# fi
# rm -f $@.tar
${OUTPUT_DIR}/%.${PART}_${MODELNAME}.${LANGPAIR}.${SRC}.spm.gz: ${WIKI_DIR}/${SRC}/%.${PART}.gz
ifneq (${MODELZIP},)
mkdir -p ${dir $@}
${MAKE} ${LANGPAIR}/${MODELNAME}/decoder.yml
${GZCAT} $< |\
grep -v '[<>{}]' |\
${LANGPAIR}/${MODELNAME}/preprocess.sh ${PREPROCESS_ARGS} |\
perl -e 'while (<>){next if (split(/\s+/)>${MAX_LENGTH});print;}' |\
gzip -f > $@
endif
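## the perl filter above drops lines with more than MAX_LENGTH
## whitespace-separated sentencepiece tokens; e.g. with MAX_LENGTH=2:
##
##   printf 'a b\na b c\n' | perl -e 'while (<>){next if (split(/\s+/)>2);print;}'
##   --> a b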
## merge SentencePiece segments in the source text
## (Why? because we filter out some data from the original wiki text, see above)
${WIKI_SRC}: ${WIKI_PRE}
ifneq (${MODELZIP},)
mkdir -p ${dir $@}
${GZCAT} $< |\
sed 's/ //g;s/▁/ /g' | \
sed 's/^ *//;s/ *$$//' |\
sed 's/^>>[a-z]*<< //' |\
gzip -c > $@
endif
## overwrite the file with the latest translations
## --> this allows multiple translation iterations
## without duplicating the data we want to use in MT training
${WIKI_LATEST_SRC}: ${WIKI_SRC}
mkdir -p ${dir $@}
cp $< $@
${WIKI_LATEST_TRG}: ${WIKI_TRG}
mkdir -p ${dir $@}
cp $< $@
${WIKI_LATEST_README}: ${LANGPAIR}/${MODELNAME}/README.md
mkdir -p ${dir $@}
cp $< $@
## translate
%.${LANGPAIR}.${TRG}.gz: %.${LANGPAIR}.${SRC}.spm.gz
ifneq (${MODELZIP},)
mkdir -p ${dir $@}
${MAKE} ${LANGPAIR}/${MODELNAME}/decoder.yml
${LOAD_ENV} && cd ${LANGPAIR}/${MODELNAME} && ${MARIAN_DECODER} \
-i ${PWD}/$< \
-c decoder.yml \
-d ${MARIAN_GPUS} \
--quiet-translation \
${MARIAN_DECODER_FLAGS} |\
sed 's/ //g;s/▁/ /g' | sed 's/^ *//;s/ *$$//' |\
gzip -c > ${PWD}/$@
#ifneq (${LANGPAIR},)
#ifneq (${MODELNAME},)
# rm -fr ${LANGPAIR}/${MODELNAME}
#endif
#endif
endif
check-latest:
@if [ -d ${LANGPAIR}/latest ]; then \
for S in `ls ${LANGPAIR}/latest/*.${SRC}.gz`; do \
T=`echo $$S | sed 's/.${SRC}.gz/.${TRG}.gz/'`; \
a=`${GZCAT} $$S | wc -l`; \
b=`${GZCAT} $$T | wc -l`; \
if [ $$a != $$b ]; then \
echo "$$a != $$b $$S $$T"; \
else \
echo "$$a $$S $$T"; \
fi \
done \
fi
check-translated:
@for S in `ls ${LANGPAIR}/*.${SRC}.gz`; do \
T=`echo $$S | sed 's/.${SRC}.gz/.${TRG}.gz/'`; \
a=`${GZCAT} $$S | wc -l`; \
b=`${GZCAT} $$T | wc -l`; \
if [ $$a != $$b ]; then \
echo "$$a != $$b $$S $$T"; \
else \
echo "$$a $$S $$T"; \
fi \
done
check-length:
@echo "check ${LANGPAIR}"
@${MAKE} check-translated
@${MAKE} check-latest
remove-%-all check-%-all:
for d in `find . -maxdepth 1 -type d -name '*-*' -printf "%f "`; do \
s=`echo $$d | cut -f1 -d'-'`; \
t=`echo $$d | cut -f2 -d'-'`; \
make SRC=$$s TRG=$$t ${@:-all=}; \
done
remove-incomplete:
${MAKE} remove-incomplete-translated
${MAKE} remove-incomplete-latest
remove-incomplete-translated:
@echo "check ${LANGPAIR}"
@mkdir -p ${LANGPAIR}/incomplete
@for S in `ls ${LANGPAIR}/*.${SRC}.gz`; do \
T=`echo $$S | sed 's/.${SRC}.gz/.${TRG}.gz/'`; \
a=`${GZCAT} $$S | wc -l`; \
b=`${GZCAT} $$T | wc -l`; \
if [ $$a != $$b ]; then \
echo "$$a != $$b $$S $$T"; \
mv $$S ${LANGPAIR}/incomplete/; \
mv $$T ${LANGPAIR}/incomplete/; \
fi \
done
remove-incomplete-latest:
@echo "check ${LANGPAIR}"
@mkdir -p ${LANGPAIR}/incomplete/latest
@if [ -d ${LANGPAIR}/latest ]; then \
for S in `ls ${LANGPAIR}/latest/*.${SRC}.gz`; do \
T=`echo $$S | sed 's/.${SRC}.gz/.${TRG}.gz/'`; \
a=`${GZCAT} $$S | wc -l`; \
b=`${GZCAT} $$T | wc -l`; \
if [ $$a != $$b ]; then \
echo "$$a != $$b $$S $$T"; \
mv $$S ${LANGPAIR}/incomplete/latest/; \
mv $$T ${LANGPAIR}/incomplete/latest/; \
fi \
done \
fi

View File

@ -0,0 +1,313 @@
#
# forward translation to be used for
# knowledge distillation
#
# only works with sentencepiece models!
#
# TODOs
#
# - forward-translate monolingual data (re-use bt-data)
# - reconstruction filtering (score translation in opposite direction)
# (use weights? normalise-script from bergamot/students)
# - other kinds of data filtering / selection?
# - create lexical shortlists (see bergamot)
# - finetune alphas in intgemm8 models (see bergamot)
# - benchmark distilled models
#
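#
# rough flow (a sketch, not prescriptive): re-translate the ${SRC} side of
# the Tatoeba training bitext with a released (teacher) ${SRC}-${TRG} model;
# the synthetic bitext (original source, teacher output) is what a smaller
# student model can then be distilled on
#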
PWD := ${shell pwd}
REPOHOME := ${PWD}/../../
include ${REPOHOME}lib/env.mk
include ${REPOHOME}lib/config.mk
include ${REPOHOME}lib/slurm.mk
SRC = fin
TRG = eng
## change decoder settings
## TODO: do we need this?
MARIAN_BEAM_SIZE=1
MARIAN_MINI_BATCH=100
MARIAN_MAXI_BATCH=100
MARIAN_MAX_LENGTH=200
MARIAN_WORKSPACE=12000
TATOEBA_VERSION ?= v2021-08-07
TATOEBA_VERSION_NOHYPHEN ?= $(subst -,,${TATOEBA_VERSION})
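## e.g. $(subst -,,v2021-08-07) --> v20210807
## (matches the opusTC* work-file names used for BITEXT_SRCPRE below)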
TATOEBA_GITRAW = https://raw.githubusercontent.com/Helsinki-NLP/Tatoeba-Challenge/master
TATOEBA_RELEASED = ${TATOEBA_GITRAW}/models/released-model-results-all.txt
TATOEBA_RELEASED_BT = https://object.pouta.csc.fi/Tatoeba-MT-bt/released-data.txt
TATOEBA_MODEL_STORAGE = https://object.pouta.csc.fi/Tatoeba-MT-models
## container for storing backtranslations
BT_CONTAINER = Tatoeba-MT-bt
BT_WORK_CONTAINER = project-Tatoeba-MT-bt
## split size in nr-of-lines
## default part to be selected = aa
SPLIT_SIZE ?= 1000000
## maximum input length (number of sentencepiece segments)
## maximum number of sentences to be translated (top N lines)
MAX_LENGTH ?= 200
MAX_SENTENCES ?= ${SPLIT_SIZE}
SORTLANGS = $(sort ${SRC} ${TRG})
LANGPAIR = ${SRC}-${TRG}
SORTED_LANGPAIR = ${firstword ${SORTLANGS}}-${lastword ${SORTLANGS}}
PWD := $(shell pwd)
# MODELZIP = https://object.pouta.csc.fi/Tatoeba-Challenge/ang.tar
MODELZIP := ${shell wget -qq -O - ${TATOEBA_RELEASED} | grep '^${LANGPAIR}' | head -1 | cut -f4}
MODELINFO = ${patsubst ${TATOEBA_MODEL_STORAGE}/%.zip,${TATOEBA_GITRAW}/models/%.yml,${MODELZIP}}
MODELNAME = ${patsubst %.zip,%,${notdir ${MODELZIP}}}
MULTI_TARGET_MODEL := ${shell wget -qq -O - ${MODELINFO} | grep 'use-target-labels' | wc -l}
ifneq (${MULTI_TARGET_MODEL},0)
TARGET_LANG_LABEL := ${shell wget -qq -O - ${MODELINFO} | grep -o '>>${TRG}.*<<'}
endif
RELEASED_BITEXTS := $(patsubst %.tar,%,${shell wget -qq -O - ${TATOEBA_GITRAW}/Wiki.md | \
grep -o 'WikiShuffled/...\.tar' | cut -f2 -d'/'})
RELEASED_BITEXTS_REV = ${shell (for d in ${RELEASED_BITEXTS}; do echo $$d; done) | tac}
PART ?= aa
OUTPUT_DIR ?= ${LANGPAIR}
BITEXT_DATADIR = ${PWD}/../work/data/simple
MODEL_WORKDIR = ${PWD}/../work/${LANGPAIR}
BITEXT_SRCRAW = ${BITEXT_DATADIR}/Tatoeba-train-${TATOEBA_VERSION}.${SORTED_LANGPAIR}.clean.${SRC}.gz
BITEXT_SRCPRE = ${wildcard ${MODEL_WORKDIR}/train/opusTC${TATOEBA_VERSION_NOHYPHEN}.src.clean.spm*.gz}
BITEXT_BASE = ${OUTPUT_DIR}/Tatoeba-train.${MODELNAME}.${LANGPAIR}
BITEXT_SRC = ${BITEXT_BASE}.${SRC}.${PART}.gz
BITEXT_PRE = ${BITEXT_BASE}.${SRC}.spm.${PART}.gz
BITEXT_TRG = ${BITEXT_BASE}.${TRG}.${PART}.gz
BITEXT_LATEST_SRC = ${OUTPUT_DIR}/latest/Tatoeba-train.${PART}.${LANGPAIR}.${SRC}.gz
BITEXT_LATEST_TRG = ${OUTPUT_DIR}/latest/Tatoeba-train.${PART}.${LANGPAIR}.${TRG}.gz
BITEXT_LATEST_README = ${OUTPUT_DIR}/latest/README.md
## all parts of the bitext
PARTS = $(subst .,,${suffix ${basename ${wildcard ${BITEXT_PRE:${PART}.gz=}??.gz}}})
ALL_BITEXT_LATEST_SRC = ${patsubst %,${OUTPUT_DIR}/latest/Tatoeba-train.%.${LANGPAIR}.${SRC}.gz,${PARTS}}
ALL_BITEXT_LATEST_TRG = ${patsubst %,${OUTPUT_DIR}/latest/Tatoeba-train.%.${LANGPAIR}.${TRG}.gz,${PARTS}}
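## PARTS sketch: ${BITEXT_PRE:${PART}.gz=}??.gz globs the split files
## (....spm.aa.gz, ....spm.ab.gz, ...); basename strips .gz, suffix keeps
## the .aa part, and subst removes the dot --> "aa ab ..."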
## don't delete translated text even if the process crashes
.PRECIOUS: ${BITEXT_BASE}.${TRG}.%.gz
.PHONY: all
all: translate
.PHONY: prepare
prepare: ${LANGPAIR}/${MODELNAME}/decoder.yml ${BITEXT_PRE}
.PHONY: translate
translate: ${BITEXT_LATEST_README} ${BITEXT_LATEST_TRG}
${MAKE} ${BITEXT_LATEST_SRC}
## translate all parts
.PHONY: translate-all-parts
translate-all-parts: ${ALL_BITEXT_LATEST_TRG}
${MAKE} source-all-parts
.PHONY: source-all-parts
source-all-parts: ${ALL_BITEXT_LATEST_SRC}
.PHONY: print-modelinfo
print-modelinfo:
@echo ${MODELNAME}
@echo ${MODELZIP}
@echo ${MODELINFO}
@echo "multi-target model: ${MULTI_TARGET_MODEL}"
@echo "target language label: ${TARGET_LANG_LABEL}"
## fetch the latest model
${LANGPAIR}/${MODELNAME}/decoder.yml:
ifneq (${MODELZIP},)
mkdir -p ${dir $@}
wget -O ${dir $@}/model.zip ${MODELZIP}
cd ${dir $@} && unzip model.zip
rm -f ${dir $@}/model.zip
mv ${dir $@}/preprocess.sh ${dir $@}/preprocess-old.sh
sed 's#perl -C -pe.*$$#perl -C -pe "s/(?!\\n)\\p{C}/ /g;" |#' \
< ${dir $@}/preprocess-old.sh > ${dir $@}/preprocess.sh
chmod +x ${dir $@}/preprocess.sh
endif
## pre-process data
ifeq (${MULTI_TARGET_MODEL},1)
PREPROCESS_ARGS = ${SRC} ${TRG} ${LANGPAIR}/${MODELNAME}/source.spm
else
PREPROCESS_ARGS = ${SRC} ${LANGPAIR}/${MODELNAME}/source.spm
endif
ifeq (${BITEXT_SRCPRE},)
${BITEXT_SRCRAW}:
${MAKE} -C .. SRCLANGS=${SRC} TRGLANGS=${TRG} clean-data-tatoeba
else
${BITEXT_SRCRAW}: ${BITEXT_SRCPRE}
sed 's/ //g;s/▁/ /g' < $< | sed 's/^ *//;s/ *$$//' | ${GZIP} -f > $@
endif
${BITEXT_PRE}: ${BITEXT_SRCRAW}
ifneq (${MODELZIP},)
mkdir -p ${dir $@}
${MAKE} ${LANGPAIR}/${MODELNAME}/decoder.yml
${GZCAT} $< |\
grep -v '[<>{}]' |\
${LANGPAIR}/${MODELNAME}/preprocess.sh ${PREPROCESS_ARGS} |\
perl -e 'while (<>){next if (split(/\s+/)>${MAX_LENGTH});print;}' |\
split -l ${SPLIT_SIZE} - ${patsubst %${PART}.gz,%,$@}
${GZIP} -f ${patsubst %${PART}.gz,%,$@}??
endif
## merge SentencePiece segments in the source text
## (Why? because we filter out some data from the original wiki text, see above)
${BITEXT_BASE}.${SRC}.%.gz: ${BITEXT_BASE}.${SRC}.spm.%.gz
if [ -e ${patsubst ${BITEXT_BASE}.${SRC}.%.gz,${BITEXT_BASE}.${TRG}.%.gz,$@} ]; then \
mkdir -p ${dir $@}; \
${GZCAT} $< |\
sed 's/ //g;s/▁/ /g' | \
sed 's/^ *//;s/ *$$//' |\
sed 's/^>>[a-z]*<< //' |\
gzip -c > $@; \
fi
## overwrite the file with the latest translations
## --> this allows multiple translation iterations
## without duplicating the data we want to use in MT training
${OUTPUT_DIR}/latest/Tatoeba-train.%.${LANGPAIR}.${SRC}.gz: ${BITEXT_BASE}.${SRC}.%.gz
mkdir -p ${dir $@}
cp $< $@
${OUTPUT_DIR}/latest/Tatoeba-train.%.${LANGPAIR}.${TRG}.gz: ${BITEXT_BASE}.${TRG}.%.gz
mkdir -p ${dir $@}
cp $< $@
${BITEXT_LATEST_README}: ${LANGPAIR}/${MODELNAME}/README.md
mkdir -p ${dir $@}
cp $< $@
## translate
${BITEXT_BASE}.${TRG}.%.gz: ${BITEXT_BASE}.${SRC}.spm.%.gz
ifneq (${MODELZIP},)
mkdir -p ${dir $@}
${MAKE} ${LANGPAIR}/${MODELNAME}/decoder.yml
${LOAD_ENV} && cd ${LANGPAIR}/${MODELNAME} && \
${MARIAN_DECODER} \
-c decoder.yml \
-i ${PWD}/$< \
-d ${MARIAN_GPUS} \
${MARIAN_DECODER_FLAGS} |\
sed 's/ //g;s/▁/ /g' | sed 's/^ *//;s/ *$$//' |\
gzip -c > ${PWD}/$@
endif
check-latest:
@if [ -d ${LANGPAIR}/latest ]; then \
for S in `ls ${LANGPAIR}/latest/*.${SRC}.gz`; do \
T=`echo $$S | sed 's/.${SRC}.gz/.${TRG}.gz/'`; \
a=`${GZCAT} $$S | wc -l`; \
b=`${GZCAT} $$T | wc -l`; \
if [ $$a != $$b ]; then \
echo "$$a != $$b $$S $$T"; \
else \
echo "$$a $$S $$T"; \
fi \
done \
fi
check-translated:
@for S in `ls ${LANGPAIR}/*.${SRC}.spm.*.gz`; do \
T=`echo $$S | sed 's/.${SRC}.spm./.${TRG}./'`; \
a=`${GZCAT} $$S | wc -l`; \
b=`${GZCAT} $$T | wc -l`; \
if [ $$a != $$b ]; then \
echo "$$a != $$b $$S $$T"; \
else \
echo "$$a $$S $$T"; \
fi \
done
check-length:
@echo "check ${LANGPAIR}"
@${MAKE} check-translated
@${MAKE} check-latest
remove-%-all check-%-all:
for d in `find . -maxdepth 1 -type d -name '*-*' -printf "%f "`; do \
s=`echo $$d | cut -f1 -d'-'`; \
t=`echo $$d | cut -f2 -d'-'`; \
make SRC=$$s TRG=$$t ${@:-all=}; \
done
remove-incomplete:
${MAKE} remove-incomplete-translated
${MAKE} remove-incomplete-latest
remove-incomplete-translated:
@echo "check ${LANGPAIR}"
@mkdir -p ${LANGPAIR}/incomplete
@for S in `ls ${LANGPAIR}/*.${SRC}.gz`; do \
T=`echo $$S | sed 's/.${SRC}.gz/.${TRG}.gz/'`; \
a=`${GZCAT} $$S | wc -l`; \
b=`${GZCAT} $$T | wc -l`; \
if [ $$a != $$b ]; then \
echo "$$a != $$b $$S $$T"; \
mv $$S ${LANGPAIR}/incomplete/; \
mv $$T ${LANGPAIR}/incomplete/; \
fi \
done
remove-incomplete-latest:
@echo "check ${LANGPAIR}"
@mkdir -p ${LANGPAIR}/incomplete/latest
@if [ -d ${LANGPAIR}/latest ]; then \
for S in `ls ${LANGPAIR}/latest/*.${SRC}.gz`; do \
T=`echo $$S | sed 's/.${SRC}.gz/.${TRG}.gz/'`; \
a=`${GZCAT} $$S | wc -l`; \
b=`${GZCAT} $$T | wc -l`; \
if [ $$a != $$b ]; then \
echo "$$a != $$b $$S $$T"; \
mv $$S ${LANGPAIR}/incomplete/latest/; \
mv $$T ${LANGPAIR}/incomplete/latest/; \
fi \
done \
fi

File diff suppressed because it is too large.

tatoeba/pivoting/Makefile Normal file
View File

@ -0,0 +1,302 @@
#
# translate PIVOT language into SRC language
# to make a synthetic SRC-TRG corpus from another
# PIVOT-TRG corpus
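#
# sketch with the defaults below (SRC=swe, TRG=fin, PIVOT=eng): take the
# eng side of an existing eng-fin corpus, translate it into swe with a
# released eng-swe model (TRANSLATE_LANGPAIR), and pair the output with
# the original fin side --> synthetic swe-fin training data
#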
PWD := ${shell pwd}
REPOHOME := ${PWD}/../../
include ${REPOHOME}lib/env.mk
include ${REPOHOME}lib/config.mk
include ${REPOHOME}lib/slurm.mk
SRC = swe
TRG = fin
PIVOT = eng
## change decoder settings
## TODO: do we need this?
MARIAN_BEAM_SIZE=1
MARIAN_MINI_BATCH=100
MARIAN_MAXI_BATCH=100
MARIAN_MAX_LENGTH=200
MARIAN_WORKSPACE=12000
TATOEBA_VERSION ?= v2021-08-07
TATOEBA_VERSION_NOHYPHEN ?= $(subst -,,${TATOEBA_VERSION})
TATOEBA_GITRAW = https://raw.githubusercontent.com/Helsinki-NLP/Tatoeba-Challenge/master
TATOEBA_RELEASED = ${TATOEBA_GITRAW}/models/released-model-results-all.txt
TATOEBA_RELEASED_BT = https://object.pouta.csc.fi/Tatoeba-MT-bt/released-data.txt
TATOEBA_MODEL_STORAGE = https://object.pouta.csc.fi/Tatoeba-MT-models
## container for storing backtranslations
BT_CONTAINER = Tatoeba-MT-bt
BT_WORK_CONTAINER = project-Tatoeba-MT-bt
## split size in nr-of-lines
## default part to be selected = aa
SPLIT_SIZE ?= 1000000
## maximum input length (number of sentencepiece segments)
## maximum number of sentences to be translated (top N lines)
MAX_LENGTH ?= 200
MAX_SENTENCES ?= ${SPLIT_SIZE}
TRANSLATE_LANGPAIR = ${PIVOT}-${SRC}
ORIGINAL_LANGPAIR = ${PIVOT}-${TRG}
NEW_LANGPAIR = ${SRC}-${TRG}
SORTLANGS = $(sort ${PIVOT} ${TRG})
SORTED_LANGPAIR = ${firstword ${SORTLANGS}}-${lastword ${SORTLANGS}}
PART ?= aa
OUTPUT_DIR ?= ${NEW_LANGPAIR}
# MODELZIP = https://object.pouta.csc.fi/Tatoeba-Challenge/ang.tar
MODELZIP := ${shell wget -qq -O - ${TATOEBA_RELEASED} | grep '^${TRANSLATE_LANGPAIR}' | head -1 | cut -f4}
MODELINFO = ${patsubst ${TATOEBA_MODEL_STORAGE}/%.zip,${TATOEBA_GITRAW}/models/%.yml,${MODELZIP}}
MODELNAME = ${patsubst %.zip,%,${notdir ${MODELZIP}}}
MODELDIR = ${OUTPUT_DIR}/${TRANSLATE_LANGPAIR}/${MODELNAME}
MULTI_TARGET_MODEL := ${shell wget -qq -O - ${MODELINFO} | grep 'use-target-labels' | wc -l}
ifneq (${MULTI_TARGET_MODEL},0)
TARGET_LANG_LABEL := ${shell wget -qq -O - ${MODELINFO} | grep -o '>>${TRG}.*<<'}
endif
BITEXT_DATADIR = ${PWD}/../work/data/simple
MODEL_WORKDIR = ${PWD}/../work/${PIVOT}-${TRG}
BITEXT_SRCRAW = ${BITEXT_DATADIR}/Tatoeba-train-${TATOEBA_VERSION}.${SORTED_LANGPAIR}.clean.${PIVOT}.gz
BITEXT_SRCPRE = ${wildcard ${MODEL_WORKDIR}/train/opusTC${TATOEBA_VERSION_NOHYPHEN}.src.clean.spm*.gz}
BITEXT_BASE = ${OUTPUT_DIR}/Tatoeba-train.${MODELNAME}.${PIVOT}-${SRC}-${TRG}
BITEXT_SRC = ${BITEXT_BASE}.${SRC}.${PART}.gz
BITEXT_PRE = ${BITEXT_BASE}.${SRC}.spm.${PART}.gz
BITEXT_TRG = ${BITEXT_BASE}.${TRG}.${PART}.gz
BITEXT_LATEST_SRC = ${OUTPUT_DIR}/latest/Tatoeba-train.${PIVOT}-${SRC}-${TRG}.${PART}.${SRC}.gz
BITEXT_LATEST_TRG = ${OUTPUT_DIR}/latest/Tatoeba-train.${PIVOT}-${SRC}-${TRG}.${PART}.${TRG}.gz
BITEXT_LATEST_README = ${OUTPUT_DIR}/latest/README.md
## all parts of the bitext
PARTS = $(subst .,,${suffix ${basename ${wildcard ${BITEXT_PRE:${PART}.gz=}??.gz}}})
ALL_BITEXT_LATEST_SRC = ${patsubst %,${OUTPUT_DIR}/latest/Tatoeba-train.${PIVOT}-${SRC}-${TRG}.%.${SRC}.gz,${PARTS}}
ALL_BITEXT_LATEST_TRG = ${patsubst %,${OUTPUT_DIR}/latest/Tatoeba-train.${PIVOT}-${SRC}-${TRG}.%.${TRG}.gz,${PARTS}}
## don't delete translated text even if the process crashes
.PRECIOUS: ${BITEXT_BASE}.${TRG}.%.gz
.PHONY: all
all: translate
.PHONY: prepare
prepare: ${MODELDIR}/decoder.yml ${BITEXT_PRE}
.PHONY: translate
translate: ${BITEXT_LATEST_README} ${BITEXT_LATEST_TRG}
${MAKE} ${BITEXT_LATEST_SRC}
## translate all parts
.PHONY: translate-all-parts
translate-all-parts: ${ALL_BITEXT_LATEST_TRG}
${MAKE} source-all-parts
.PHONY: source-all-parts
source-all-parts: ${ALL_BITEXT_LATEST_SRC}
.PHONY: print-modelinfo
print-modelinfo:
@echo ${MODELNAME}
@echo ${MODELZIP}
@echo ${MODELINFO}
@echo "multi-target model: ${MULTI_TARGET_MODEL}"
@echo "target language label: ${TARGET_LANG_LABEL}"
## fetch the latest model
${MODELDIR}/decoder.yml:
ifneq (${MODELZIP},)
mkdir -p ${dir $@}
wget -O ${dir $@}/model.zip ${MODELZIP}
cd ${dir $@} && unzip model.zip
rm -f ${dir $@}/model.zip
mv ${dir $@}/preprocess.sh ${dir $@}/preprocess-old.sh
sed 's#perl -C -pe.*$$#perl -C -pe "s/(?!\\n)\\p{C}/ /g;" |#' \
< ${dir $@}/preprocess-old.sh > ${dir $@}/preprocess.sh
chmod +x ${dir $@}/preprocess.sh
endif
## pre-process data
ifeq (${MULTI_TARGET_MODEL},1)
PREPROCESS_ARGS = ${SRC} ${TRG} ${MODELDIR}/source.spm
else
PREPROCESS_ARGS = ${SRC} ${MODELDIR}/source.spm
endif
ifeq (${BITEXT_SRCPRE},)
${BITEXT_SRCRAW}:
${MAKE} -C .. SRCLANGS=${PIVOT} TRGLANGS=${TRG} clean-data
else
${BITEXT_SRCRAW}: ${BITEXT_SRCPRE}
sed 's/ //g;s/▁/ /g' < $< | sed 's/^ *//;s/ *$$//' | ${GZIP} -f > $@
endif
${BITEXT_PRE}: ${BITEXT_SRCRAW}
ifneq (${MODELZIP},)
mkdir -p ${dir $@}
${MAKE} ${MODELDIR}/decoder.yml
${GZCAT} $< |\
grep -v '[<>{}]' |\
${MODELDIR}/preprocess.sh ${PREPROCESS_ARGS} |\
perl -e 'while (<>){next if (split(/\s+/)>${MAX_LENGTH});print;}' |\
split -l ${SPLIT_SIZE} - ${patsubst %${PART}.gz,%,$@}
${GZIP} -f ${patsubst %${PART}.gz,%,$@}??
endif
## merge SentencePiece segments in the source text
## (Why? because we filter out some data from the original wiki text, see above)
${BITEXT_BASE}.${SRC}.%.gz: ${BITEXT_BASE}.${SRC}.spm.%.gz
if [ -e ${patsubst ${BITEXT_BASE}.${SRC}.%.gz,${BITEXT_BASE}.${TRG}.%.gz,$@} ]; then \
mkdir -p ${dir $@}; \
${GZCAT} $< |\
sed 's/ //g;s/▁/ /g' | \
sed 's/^ *//;s/ *$$//' |\
sed 's/^>>[a-z]*<< //' |\
gzip -c > $@; \
fi
## overwrite the file with the latest translations
## --> this allows multiple translation iterations
## without duplicating the data we want to use in MT training
${OUTPUT_DIR}/latest/Tatoeba-train.${PIVOT}-${SRC}-${TRG}.%.${SRC}.gz: ${BITEXT_BASE}.${SRC}.%.gz
mkdir -p ${dir $@}
cp $< $@
${OUTPUT_DIR}/latest/Tatoeba-train.${PIVOT}-${SRC}-${TRG}.%.${TRG}.gz: ${BITEXT_BASE}.${TRG}.%.gz
mkdir -p ${dir $@}
cp $< $@
${BITEXT_LATEST_README}: ${MODELDIR}/README.md
mkdir -p ${dir $@}
cp $< $@
## translate
${BITEXT_BASE}.${TRG}.%.gz: ${BITEXT_BASE}.${SRC}.spm.%.gz
ifneq (${MODELZIP},)
mkdir -p ${dir $@}
${MAKE} ${MODELDIR}/decoder.yml
${LOAD_ENV} && cd ${MODELDIR} && \
${MARIAN_DECODER} \
-c decoder.yml \
-i ${PWD}/$< \
-d ${MARIAN_GPUS} \
${MARIAN_DECODER_FLAGS} |\
sed 's/ //g;s/▁/ /g' | sed 's/^ *//;s/ *$$//' |\
gzip -c > ${PWD}/$@
endif
check-latest:
@if [ -d ${OUTPUT_DIR}/latest ]; then \
for S in `ls ${OUTPUT_DIR}/latest/*.${SRC}.gz`; do \
T=`echo $$S | sed 's/.${SRC}.gz/.${TRG}.gz/'`; \
a=`${GZCAT} $$S | wc -l`; \
b=`${GZCAT} $$T | wc -l`; \
if [ $$a != $$b ]; then \
echo "$$a != $$b $$S $$T"; \
else \
echo "$$a $$S $$T"; \
fi \
done \
fi
check-translated:
@for S in `ls ${OUTPUT_DIR}/*.${SRC}.spm.*.gz`; do \
T=`echo $$S | sed 's/.${SRC}.spm./.${TRG}./'`; \
a=`${GZCAT} $$S | wc -l`; \
b=`${GZCAT} $$T | wc -l`; \
if [ $$a != $$b ]; then \
echo "$$a != $$b $$S $$T"; \
else \
echo "$$a $$S $$T"; \
fi \
done
check-length:
@echo "check ${OUTPUT_DIR}"
@${MAKE} check-translated
@${MAKE} check-latest
remove-%-all check-%-all:
for d in `find . -maxdepth 1 -type d -name '*-*' -printf "%f "`; do \
s=`echo $$d | cut -f1 -d'-'`; \
t=`echo $$d | cut -f2 -d'-'`; \
make SRC=$$s TRG=$$t ${@:-all=}; \
done
remove-incomplete:
${MAKE} remove-incomplete-translated
${MAKE} remove-incomplete-latest
remove-incomplete-translated:
@echo "check ${OUTPUT_DIR}"
@mkdir -p ${OUTPUT_DIR}/incomplete
@for S in `ls ${OUTPUT_DIR}/*.${SRC}.gz`; do \
T=`echo $$S | sed 's/.${SRC}.gz/.${TRG}.gz/'`; \
a=`${GZCAT} $$S | wc -l`; \
b=`${GZCAT} $$T | wc -l`; \
if [ $$a != $$b ]; then \
echo "$$a != $$b $$S $$T"; \
mv $$S ${OUTPUT_DIR}/incomplete/; \
mv $$T ${OUTPUT_DIR}/incomplete/; \
fi \
done
remove-incomplete-latest:
@echo "check ${OUTPUT_DIR}"
@mkdir -p ${OUTPUT_DIR}/incomplete/latest
@if [ -d ${OUTPUT_DIR}/latest ]; then \
for S in `ls ${OUTPUT_DIR}/latest/*.${SRC}.gz`; do \
T=`echo $$S | sed 's/.${SRC}.gz/.${TRG}.gz/'`; \
a=`${GZCAT} $$S | wc -l`; \
b=`${GZCAT} $$T | wc -l`; \
if [ $$a != $$b ]; then \
echo "$$a != $$b $$S $$T"; \
mv $$S ${OUTPUT_DIR}/incomplete/latest/; \
mv $$T ${OUTPUT_DIR}/incomplete/latest/; \
fi \
done \
fi