mirror of
https://github.com/Helsinki-NLP/OPUS-MT-train.git
synced 2024-10-03 23:57:47 +03:00
cleanup in tatoeba data recipes
This commit is contained in:
parent
5fc902b020
commit
d617a63c76
@ -4,6 +4,16 @@
|
||||
#
|
||||
# only works with sentencepiece models!
|
||||
#
|
||||
# TODO's
|
||||
#
|
||||
# - forward-translate monolingual data (re-use bt-data)
|
||||
# - reconstruction filtering (score translation in opposite direction)
|
||||
# (use weights? normalise-script from bergamot/students)
|
||||
# - other kind of data filtering / selection?
|
||||
# - create lexical shortlists (see bergamot)
|
||||
# - finetune alphas in intgemm8 models (see bergamot)
|
||||
# - benchmark distilled models
|
||||
#
|
||||
|
||||
PWD := ${shell pwd}
|
||||
REPOHOME := ${PWD}/../
|
||||
@ -112,7 +122,7 @@ translate: ${BITEXT_LATEST_README} ${BITEXT_LATEST_TRG}
|
||||
## translate all parts
|
||||
.PHONY: translate-all-parts
|
||||
translate-all-parts: ${ALL_BITEXT_LATEST_TRG}
|
||||
${MAKE} ${ALL_BITEXT_LATEST_SRC}
|
||||
${MAKE} source-all-parts
|
||||
|
||||
.PHONY: source-all-parts
|
||||
source-all-parts: ${ALL_BITEXT_LATEST_SRC}
|
||||
@ -183,16 +193,14 @@ endif
|
||||
## (Why? because we filter out some data from the original wiki text, see above)
|
||||
|
||||
${BITEXT_BASE}.${SRC}.%.gz: ${BITEXT_BASE}.${SRC}.spm.%.gz
|
||||
ifneq ($(wildcard ${patsubst ${BITEXT_BASE}.${SRC}.%.gz,${BITEXT_BASE}.${TRG}.%.gz,$@}),)
|
||||
mkdir -p ${dir $@}
|
||||
${GZCAT} $< |\
|
||||
sed 's/ //g;s/▁/ /g' | \
|
||||
sed 's/^ *//;s/ *$$//' |\
|
||||
sed 's/^>>[a-z]*<< //' |\
|
||||
gzip -c > $@
|
||||
endif
|
||||
|
||||
|
||||
if [ -e ${patsubst ${BITEXT_BASE}.${SRC}.%.gz,${BITEXT_BASE}.${TRG}.%.gz,$@} ]; then \
|
||||
mkdir -p ${dir $@}; \
|
||||
${GZCAT} $< |\
|
||||
sed 's/ //g;s/▁/ /g' | \
|
||||
sed 's/^ *//;s/ *$$//' |\
|
||||
sed 's/^>>[a-z]*<< //' |\
|
||||
gzip -c > $@; \
|
||||
fi
|
||||
|
||||
|
||||
## overwrite the file with the latest translations
|
||||
|
@ -88,9 +88,12 @@ SKIP_SAME_LANG ?= 0
|
||||
## --> especially useful in connection with FIT_DATA_SIZE
|
||||
## set DATA_IS_SHUFFLED=1 if the training data is already shuffled
|
||||
## --> useful to avoid shuffling when training sentence piece model
|
||||
## NEW (2021-12-16): SHUFFLE_DATA is now set by default
|
||||
## --> can now also avoid sqlite and data shuffling inside MarianNMT
|
||||
## --> is that a problem (would MarianNMT use different random shuffles / epoch?)
|
||||
##----------------------------------------------------------------------
|
||||
|
||||
# SHUFFLE_DATA = 1
|
||||
SHUFFLE_DATA = 1
|
||||
# DATA_IS_SHUFFLED = 1
|
||||
|
||||
## devtest data is shuffled by default
|
||||
@ -142,9 +145,9 @@ SORTSRC = ${firstword ${SORTLANGS}}
|
||||
SORTTRG = ${lastword ${SORTLANGS}}
|
||||
LANGPAIR = ${SORTSRC}-${SORTTRG}
|
||||
SPACE = $(empty) $(empty)
|
||||
LANGSRCSTR = ${subst ${SPACE},+,$(SRCLANGS)}
|
||||
LANGTRGSTR = ${subst ${SPACE},+,$(TRGLANGS)}
|
||||
LANGPAIRSTR = ${LANGSRCSTR}-${LANGTRGSTR}
|
||||
LANGSRCSTR ?= ${subst ${SPACE},+,$(SRCLANGS)}
|
||||
LANGTRGSTR ?= ${subst ${SPACE},+,$(TRGLANGS)}
|
||||
LANGPAIRSTR ?= ${LANGSRCSTR}-${LANGTRGSTR}
|
||||
|
||||
|
||||
## for monolingual things
|
||||
@ -179,17 +182,17 @@ endif
|
||||
## NEW default size = 2500 (keep more for training for small languages)
|
||||
## NOTE: size will be increased to 5000 for Tatoeba
|
||||
|
||||
DEVSIZE = 2500
|
||||
TESTSIZE = 2500
|
||||
DEVSIZE ?= 2500
|
||||
TESTSIZE ?= 2500
|
||||
|
||||
## set some additional thresholds for
|
||||
## the size of test and dev data
|
||||
## DEVMINSIZE is the absolute minimum we require
|
||||
## to run any training procedures
|
||||
|
||||
DEVSMALLSIZE = 1000
|
||||
TESTSMALLSIZE = 1000
|
||||
DEVMINSIZE = 250
|
||||
DEVSMALLSIZE ?= 1000
|
||||
TESTSMALLSIZE ?= 1000
|
||||
DEVMINSIZE ?= 250
|
||||
|
||||
|
||||
## set additional argument options for opus_read (if it is used)
|
||||
@ -486,12 +489,14 @@ MARIAN_CLIP_NORM ?= 5
|
||||
|
||||
## default = shuffle data and batches
|
||||
## (set to batches or none to change this)
|
||||
MARIAN_SHUFFLE ?= data
|
||||
# MARIAN_SHUFFLE ?= data
|
||||
MARIAN_SHUFFLE ?= batches
|
||||
|
||||
## default: use sqlite database to store data
|
||||
## remove this to use regular temp data
|
||||
## set to --shuffle-in-ram to keep all shuffled data in RAM
|
||||
MARIAN_DATA_STORAGE ?= --sqlite
|
||||
# MARIAN_DATA_STORAGE ?= --sqlite
|
||||
|
||||
|
||||
## set to global for lower memory usage in multiprocess training
|
||||
## TODO: does this parameter really work?
|
||||
@ -596,11 +601,11 @@ endif
|
||||
.PHONY: config local-config
|
||||
config local-config: ${WORKDIR}/${MODELCONFIG}
|
||||
|
||||
SMALLEST_TRAINSIZE = 10000
|
||||
SMALL_TRAINSIZE = 100000
|
||||
MEDIUM_TRAINSIZE = 500000
|
||||
LARGE_TRAINSIZE = 1000000
|
||||
LARGEST_TRAINSIZE = 10000000
|
||||
SMALLEST_TRAINSIZE ?= 10000
|
||||
SMALL_TRAINSIZE ?= 100000
|
||||
MEDIUM_TRAINSIZE ?= 500000
|
||||
LARGE_TRAINSIZE ?= 1000000
|
||||
LARGEST_TRAINSIZE ?= 10000000
|
||||
|
||||
${WORKDIR}/${MODELCONFIG}:
|
||||
mkdir -p ${dir $@}
|
||||
|
35
lib/data.mk
35
lib/data.mk
@ -55,9 +55,8 @@ endif
|
||||
## - use only the latest backtranslations
|
||||
## if such a subdir exists
|
||||
|
||||
BACKTRANS_HOME = backtranslate
|
||||
FORWARDTRANS_HOME = ${BACKTRANS_HOME}
|
||||
# FORWARDTRANS_HOME = ${BACKTRANS_HOME}
|
||||
BACKTRANS_HOME ?= backtranslate
|
||||
FORWARDTRANS_HOME ?= ${BACKTRANS_HOME}
|
||||
|
||||
ifneq (${wildcard ${BACKTRANS_HOME}/${TRG}-${SRC}/latest},)
|
||||
BACKTRANS_DIR = ${BACKTRANS_HOME}/${TRG}-${SRC}/latest
|
||||
@ -71,6 +70,13 @@ else
|
||||
FORWARDTRANS_DIR = ${FORWARDTRANS_HOME}/${SRC}-${TRG}
|
||||
endif
|
||||
|
||||
ifneq (${wildcard ${BACKTRANS_HOME}/${SRC}-${TRG}/latest},)
|
||||
FORWARDTRANSMONO_DIR = ${BACKTRANS_HOME}/${SRC}-${TRG}/latest
|
||||
else
|
||||
FORWARDTRANSMONO_DIR = ${BACKTRANS_HOME}/${SRC}-${TRG}
|
||||
endif
|
||||
|
||||
|
||||
|
||||
## TODO: make it possible to select only parts of the BT data
|
||||
## ---> use TRAINDATA_SIZE to take max the same amount of all shuffled BT data
|
||||
@ -85,6 +91,11 @@ ifeq (${USE_FORWARDTRANS},1)
|
||||
FORWARDTRANS_TRG = ${patsubst %.${SRCEXT}.gz,%.${TRGEXT}.gz,${FORWARDTRANS_SRC}}
|
||||
endif
|
||||
|
||||
ifeq (${USE_FORWARDTRANSMONO},1)
|
||||
FORWARDTRANSMONO_SRC = ${sort ${wildcard ${FORWARDTRANSMONO_DIR}/*.${SRCEXT}.gz}}
|
||||
FORWARDTRANSMONO_TRG = ${patsubst %.${SRCEXT}.gz,%.${TRGEXT}.gz,${FORWARDTRANSMONO_SRC}}
|
||||
endif
|
||||
|
||||
ifeq (${USE_PIVOTING},1)
|
||||
PIVOTING_SRC = ${sort ${wildcard pivoting/${SRC}-${TRG}/latest/*.${SRCEXT}.gz} \
|
||||
${wildcard pivoting/${TRG}-${SRC}/latest/*.${SRCEXT}.gz}}
|
||||
@ -95,6 +106,10 @@ print-ft-data:
|
||||
@echo ${FORWARDTRANS_SRC}
|
||||
@echo ${FORWARDTRANS_TRG}
|
||||
@echo ${FORWARDTRANS_DIR}
|
||||
@echo ${FORWARDTRANSMONO_SRC}
|
||||
@echo ${FORWARDTRANSMONO_TRG}
|
||||
@echo ${FORWARDTRANSMONO_DIR}
|
||||
|
||||
|
||||
##-------------------------------------------------------------
|
||||
## data sets (train/dev/test)
|
||||
@ -104,7 +119,7 @@ print-ft-data:
|
||||
## with some basic pre-processing (see lib/preprocess.mk)
|
||||
|
||||
CLEAN_TRAIN_SRC = ${patsubst %,${DATADIR}/${PRE}/%.${LANGPAIR}.clean.${SRCEXT}.gz,${TRAINSET}} \
|
||||
${BACKTRANS_SRC} ${FORWARDTRANS_SRC} ${PIVOTING_SRC}
|
||||
${BACKTRANS_SRC} ${FORWARDTRANS_SRC} ${FORWARDTRANSMONO_SRC} ${PIVOTING_SRC}
|
||||
CLEAN_TRAIN_TRG = ${patsubst %.${SRCEXT}.gz,%.${TRGEXT}.gz,${CLEAN_TRAIN_SRC}}
|
||||
|
||||
CLEAN_DEV_SRC = ${patsubst %,${DATADIR}/${PRE}/%.${LANGPAIR}.clean.${SRCEXT}.gz,${DEVSET}}
|
||||
@ -239,6 +254,8 @@ MAX_WORDALIGN_SIZE = 5000000
|
||||
## (assuming that each of them occupies up to 6 cores)
|
||||
NR_ALIGN_JOBS ?= $$(( ${CPU_CORES} / 6 + 1 ))
|
||||
|
||||
## job forcing doesn't work within recipes
|
||||
# ${MAKE} -j ${NR_ALIGN_JOBS} $$a
|
||||
|
||||
${TRAIN_ALG}: ${TRAIN_SRC}.clean.${PRE_SRC}${TRAINSIZE}.gz \
|
||||
${TRAIN_TRG}.clean.${PRE_TRG}${TRAINSIZE}.gz
|
||||
@ -250,7 +267,7 @@ ${TRAIN_ALG}: ${TRAIN_SRC}.clean.${PRE_SRC}${TRAINSIZE}.gz \
|
||||
split -l ${MAX_WORDALIGN_SIZE} $(LOCAL_TRAIN_TRG).algtmp $(LOCAL_TRAIN_TRG).algtmp.d/; \
|
||||
a=`ls $(LOCAL_TRAIN_SRC).algtmp.d/* | sed 's#$$#.alg#' | xargs`; \
|
||||
if [ "$$a" != "" ]; then \
|
||||
${MAKE} -j ${NR_ALIGN_JOBS} $$a; \
|
||||
${MAKE} $$a; \
|
||||
cat $(LOCAL_TRAIN_SRC).algtmp.d/*.alg | ${GZIP} -c > $@; \
|
||||
rm -f ${LOCAL_TRAIN_SRC}.algtmp.d/*; \
|
||||
rm -f ${LOCAL_TRAIN_TRG}.algtmp.d/*; \
|
||||
@ -449,7 +466,7 @@ endif
|
||||
# --> shuffle data for each langpair
|
||||
# --> do this when FIT_DATA_SIZE is set!
|
||||
######################################
|
||||
ifneq (${SHUFFLE_DATA},1)
|
||||
ifeq (${SHUFFLE_DATA},1)
|
||||
@echo "shuffle training data"
|
||||
@paste ${LOCAL_TRAIN_SRC}.${LANGPAIR}.src ${LOCAL_TRAIN_TRG}.${LANGPAIR}.trg |\
|
||||
${SHUFFLE} > ${LOCAL_TRAIN_SRC}.shuffled
|
||||
@ -503,10 +520,10 @@ raw-devdata: ${DEV_SRC} ${DEV_TRG}
|
||||
## maybe introduce over/undersampling of dev data like we have for train data?
|
||||
|
||||
${DEV_SRC}.shuffled.gz:
|
||||
mkdir -p ${dir $@}
|
||||
mkdir -p ${sort ${dir $@} ${dir ${DEV_SRC}} ${dir ${DEV_TRG}}}
|
||||
rm -f ${DEV_SRC} ${DEV_TRG}
|
||||
echo "# Validation data" > ${dir ${DEV_SRC}}/README.md
|
||||
echo "" >> ${dir ${DEV_SRC}}/README.md
|
||||
echo "# Validation data" > ${dir ${DEV_SRC}}README.md
|
||||
echo "" >> ${dir ${DEV_SRC}}README.md
|
||||
-for s in ${SRCLANGS}; do \
|
||||
for t in ${TRGLANGS}; do \
|
||||
if [ ! `echo "$$s-$$t $$t-$$s" | egrep '${SKIP_LANGPAIRS}' | wc -l` -gt 0 ]; then \
|
||||
|
14
lib/dist.mk
14
lib/dist.mk
@ -7,12 +7,12 @@
|
||||
TODAY := ${shell date +%F}
|
||||
DATE ?= ${TODAY}
|
||||
|
||||
OBJECTSTORAGE = https://object.pouta.csc.fi
|
||||
MODEL_CONTAINER = OPUS-MT-models
|
||||
DEV_MODEL_CONTAINER = OPUS-MT-dev
|
||||
MODELINDEX = ${OBJECTSTORAGE}/${MODEL_CONTAINER}/index.txt
|
||||
MODELSHOME = ${WORKHOME}/models
|
||||
RELEASEDIR = ${PWD}/models
|
||||
OBJECTSTORAGE ?= https://object.pouta.csc.fi
|
||||
MODEL_CONTAINER ?= OPUS-MT-models
|
||||
DEV_MODEL_CONTAINER ?= OPUS-MT-dev
|
||||
MODELINDEX ?= ${OBJECTSTORAGE}/${MODEL_CONTAINER}/index.txt
|
||||
MODELSHOME ?= ${WORKHOME}/models
|
||||
RELEASEDIR ?= ${PWD}/models
|
||||
|
||||
|
||||
## TODO: better create a recipe for the yaml file and not the zip file
|
||||
@ -41,7 +41,7 @@ find-model:
|
||||
|
||||
|
||||
## minimum BLEU score for models to be accepted as distribution package
|
||||
MIN_BLEU_SCORE = 20
|
||||
MIN_BLEU_SCORE ?= 20
|
||||
|
||||
.PHONY: dist local-dist global-dist release
|
||||
|
||||
|
@ -13,7 +13,7 @@ PWD ?= ${shell pwd}
|
||||
|
||||
NR_GPUS = 1
|
||||
HPC_NODES = 1
|
||||
HPC_DISK = 500
|
||||
# HPC_DISK = 500
|
||||
HPC_QUEUE = serial
|
||||
HPC_GPUQUEUE = gpu
|
||||
|
||||
@ -81,8 +81,8 @@ TMPDIR ?= /tmp
|
||||
|
||||
## tools and their locations
|
||||
|
||||
SCRIPTDIR ?= ${PWD}/scripts
|
||||
TOOLSDIR ?= ${PWD}/tools
|
||||
SCRIPTDIR ?= ${REPOHOME}scripts
|
||||
TOOLSDIR ?= ${REPOHOME}tools
|
||||
|
||||
ISO639 ?= ${shell which iso639 2>/dev/null || echo 'perl ${TOOLSDIR}/LanguageCodes/ISO-639-3/bin/iso639'}
|
||||
PIGZ ?= ${shell which pigz 2>/dev/null || echo ${TOOLSDIR}/pigz/pigz}
|
||||
|
@ -274,8 +274,8 @@ endif
|
||||
## --> make a new BPE/sentencepiece model
|
||||
## --> make a new config file
|
||||
|
||||
DEFAULT_PIVOT_LANG = en
|
||||
PIVOT_LANG ?= ${DEFAULT_PIVOT_LANG}
|
||||
DEFAULT_PIVOT_LANG ?= en
|
||||
PIVOT_LANG ?= ${DEFAULT_PIVOT_LANG}
|
||||
|
||||
%-pivotlang:
|
||||
if [ "$(sort ${SRCLANGS} ${TRGLANGS} ${PIVOT_LANG})" != "$(sort ${SRCLANGS} ${TRGLANGS})" ]; then \
|
||||
@ -316,6 +316,11 @@ endif
|
||||
MARIAN_EARLY_STOPPING=${FT_MARIAN_EARLY_STOPPING} \
|
||||
${@:-ft=}
|
||||
|
||||
## add forward translation of monolingual data
|
||||
%-ftmono:
|
||||
${MAKE} DATASET=${DATASET}+ftmono \
|
||||
USE_FORWARDTRANSMONO=1 \
|
||||
${@:-ftmono=}
|
||||
|
||||
|
||||
## train on back-translations only
|
||||
|
@ -188,7 +188,6 @@ TATOEBA_PARAMS := DATASET=${TATOEBA_DATASET} \
|
||||
TESTSET_NAME=${TATOEBA_TESTSET_NAME} \
|
||||
TRAINSET_NAME=${TATOEBA_TRAINSET_NAME} \
|
||||
SMALLEST_TRAINSIZE=1000 \
|
||||
DATA_IS_SHUFFLED=1 \
|
||||
USE_REST_DEVDATA=0 \
|
||||
HELDOUTSIZE=0 \
|
||||
DEVSIZE=5000 \
|
||||
@ -206,21 +205,24 @@ TATOEBA_PARAMS := DATASET=${TATOEBA_DATASET} \
|
||||
DEFAULT_PIVOT_LANG=${TATOEBA_PIVOT} \
|
||||
MIN_BLEU_SCORE=${TATOEBA_MIN_BLEU}
|
||||
|
||||
MARIAN_SHUFFLE=data
|
||||
MARIAN_DATA_STORAGE=--sqlite
|
||||
HPC_DISK=500
|
||||
|
||||
## unless we have multilingual models:
|
||||
## no need to shuffle data again, just shuffle batches
|
||||
## no need to store data in sqlite databases
|
||||
ifeq (${words ${SRCLANGS}},1)
|
||||
ifeq (${words ${TRGLANGS}},1)
|
||||
# TATOEBA_PARAMS += MARIAN_SHUFFLE=batches MARIAN_DATA_STORAGE= HPC_DISK=
|
||||
MARIAN_SHUFFLE=batches
|
||||
MARIAN_DATA_STORAGE=
|
||||
HPC_DISK=
|
||||
endif
|
||||
endif
|
||||
## NEW (2021-12-15): use default (always shuffle training data)
|
||||
#
|
||||
# DATA_IS_SHUFFLED = 1
|
||||
# MARIAN_SHUFFLE = data
|
||||
# MARIAN_DATA_STORAGE = --sqlite
|
||||
# HPC_DISK = 500
|
||||
|
||||
# ## unless we have multilingual models:
|
||||
# ## no need to shuffle data again, just shuffle batches
|
||||
# ## no need to store data in sqlite databases
|
||||
# ifeq (${words ${SRCLANGS}},1)
|
||||
# ifeq (${words ${TRGLANGS}},1)
|
||||
# MARIAN_SHUFFLE = batches
|
||||
# MARIAN_DATA_STORAGE =
|
||||
# HPC_DISK =
|
||||
# endif
|
||||
# endif
|
||||
|
||||
|
||||
|
||||
|
@ -126,16 +126,18 @@ SPMTRGVOCAB = ${SPMDIR}/${LANGTRGSTR}/${BPEMODELNAME}.spm${TRGBPESIZE:000=}k.voc
|
||||
|
||||
mono-spm-vocab: ${SPMVOCAB}
|
||||
|
||||
|
||||
ifneq (${SPMVOCAB},${SPMSRCVOCAB})
|
||||
${SPMSRCVOCAB}:
|
||||
${MAKE} LANGS=${SRCLANGS} BPESIZE=${SRCBPESIZE} mono-spm-vocab
|
||||
${MAKE} LANGS="${SRCLANGS}" BPESIZE=${SRCBPESIZE} mono-spm-vocab
|
||||
endif
|
||||
|
||||
ifneq (${SPMSRCVOCAB},${SPMTRGVOCAB})
|
||||
ifneq (${SPMVOCAB},${SPMTRGVOCAB})
|
||||
${SPMTRGVOCAB}:
|
||||
${MAKE} LANGS=${TRGLANGS} BPESIZE=${TRGBPESIZE} mono-spm-vocab
|
||||
${MAKE} LANGS="${TRGLANGS}" BPESIZE=${TRGBPESIZE} mono-spm-vocab
|
||||
endif
|
||||
endif
|
||||
|
||||
|
||||
${SPMVOCAB}: ${LOCAL_MONO_DATA}.${PRE} ${SPMMODEL}
|
||||
ifeq ($(wildcard ${SPMVOCAB}),)
|
||||
@ -160,10 +162,12 @@ ifneq (${SPMMODEL},${SPMSRCMONO})
|
||||
${MAKE} LANGS=${SRCLANGS} BPESIZE=${SRCBPESIZE} mono-spm-model
|
||||
endif
|
||||
|
||||
ifneq (${SPMSRCMODEL},${SPMTRGMONO})
|
||||
ifneq (${SPMMODEL},${SPMTRGMONO})
|
||||
${SPMTRGMONO}:
|
||||
${MAKE} LANGS=${TRGLANGS} BPESIZE=${TRGBPESIZE} mono-spm-model
|
||||
endif
|
||||
endif
|
||||
|
||||
|
||||
${SPMMODEL}: ${LOCAL_MONO_DATA}.${PRE}
|
||||
|
@ -79,8 +79,8 @@ endif
|
||||
|
||||
%.eval: % ${TEST_TRG}
|
||||
paste ${TEST_SRC}.${PRE_SRC} ${TEST_TRG} | grep $$'.\t' | cut -f2 > $@.ref
|
||||
cat $< | sacrebleu ${SACREBLEU_PARAMS} $@.ref > $@
|
||||
cat $< | sacrebleu ${SACREBLEU_PARAMS} --metrics=chrf --width=3 $@.ref >> $@
|
||||
cat $< | sacrebleu -f text ${SACREBLEU_PARAMS} $@.ref > $@
|
||||
cat $< | sacrebleu -f text ${SACREBLEU_PARAMS} --metrics=chrf --width=3 $@.ref >> $@
|
||||
rm -f $@.ref
|
||||
|
||||
|
||||
|
@ -180,12 +180,11 @@ endif
|
||||
|
||||
|
||||
ifeq ($(subst -align,,${MODELTYPE}),transformer-small)
|
||||
MARIAN_ENC_DEPTH = 3
|
||||
MARIAN_ENC_DEPTH = 6
|
||||
MARIAN_DEC_DEPTH = 2
|
||||
MARIAN_ATT_HEADS = 8
|
||||
MARIAN_DIM_EMB = 256
|
||||
MARIAN_EXTRA += --transformer-decoder-autoreg rnn \
|
||||
--dec-cell ssru
|
||||
MARIAN_DIM_EMB = 512
|
||||
MARIAN_EXTRA += --transformer-decoder-autoreg rnn --dec-cell ssru
|
||||
# --fp16
|
||||
endif
|
||||
|
||||
|
1348
tatoeba/Makefile
1348
tatoeba/Makefile
File diff suppressed because it is too large
Load Diff
624
tatoeba/back-translate/Makefile
Normal file
624
tatoeba/back-translate/Makefile
Normal file
@ -0,0 +1,624 @@
|
||||
#
|
||||
# backtranslate wiki data with Tatoeba-MT challenge data
|
||||
#
|
||||
# only works with sentencepiece models!
|
||||
#
|
||||
|
||||
PWD := ${shell pwd}
|
||||
REPOHOME := ${PWD}/../../
|
||||
TOOLSDIR := ${REPOHOME}tools
|
||||
|
||||
include ${REPOHOME}lib/env.mk
|
||||
include ${REPOHOME}lib/config.mk
|
||||
include ${REPOHOME}lib/slurm.mk
|
||||
|
||||
|
||||
SRC = fin
|
||||
TRG = eng
|
||||
|
||||
|
||||
## TODO: should use unshuffled versions and split into individual languages
|
||||
## ---> otherwise we don't know the input language in case there are multiple ones
|
||||
|
||||
TATOEBA_RELEASE = v2020-07-28
|
||||
TATOEBA_STORAGE = https://object.pouta.csc.fi/Tatoeba-Challenge-${TATOEBA_RELEASE}
|
||||
TATOEBA_WIKI_STORAGE = https://object.pouta.csc.fi/Tatoeba-Challenge-WikiShuffled
|
||||
TATOEBA_GITRAW = https://raw.githubusercontent.com/Helsinki-NLP/Tatoeba-Challenge/master
|
||||
TATOEBA_RELEASED = ${TATOEBA_GITRAW}/models/released-model-results.txt
|
||||
TATOEBA_RELEASED_ALL = ${TATOEBA_GITRAW}/models/released-model-results-all.txt
|
||||
TATOEBA_RELEASED_BT = https://object.pouta.csc.fi/Tatoeba-MT-bt/released-data.txt
|
||||
TATOEBA_MODEL_STORAGE = https://object.pouta.csc.fi/Tatoeba-MT-models
|
||||
|
||||
|
||||
|
||||
|
||||
## containers for storing backtranslations
##   BT_CONTAINER      = released data (used by the upload/release recipes)
##   BT_WORK_CONTAINER = work files (used by the store recipe)
## NOTE: the second variable used to be misspelled as BT_CWORK_ONTAINER,
##       so ${BT_WORK_CONTAINER} was never defined in this file
BT_CONTAINER = Tatoeba-MT-bt
BT_WORK_CONTAINER = project-Tatoeba-MT-bt
|
||||
|
||||
## various sources are available
|
||||
## can be general wikipedia, wikinews, wikibooks, ...
|
||||
WIKISOURCE ?= wikipedia
|
||||
# WIKISOURCE ?= wiki
|
||||
|
||||
## split size in nr-of-lines
|
||||
## default part to be selected = aa
|
||||
SPLIT_SIZE ?= 1000000
|
||||
|
||||
|
||||
## maximum input length (number of sentence piece segments)
|
||||
## maximum number of sentences to be translated (top N lines)
|
||||
MAX_LENGTH ?= 100
|
||||
MAX_SENTENCES ?= ${SPLIT_SIZE}
|
||||
|
||||
|
||||
LANGPAIR = ${SRC}-${TRG}
|
||||
|
||||
PWD := $(shell pwd)
|
||||
|
||||
|
||||
|
||||
# MODELZIP = https://object.pouta.csc.fi/Tatoeba-Challenge/ang.tar
|
||||
MODELZIP := ${shell wget -qq -O - ${TATOEBA_RELEASED_ALL} | grep '^${LANGPAIR}' | head -1 | cut -f4}
|
||||
MODELINFO = ${patsubst ${TATOEBA_MODEL_STORAGE}/%.zip,${TATOEBA_GITRAW}/models/%.yml,${MODELZIP}}
|
||||
MODELNAME = ${patsubst %.zip,%,${notdir ${MODELZIP}}}
|
||||
|
||||
MULTI_TARGET_MODEL := ${shell wget -qq -O - ${MODELINFO} | grep 'use-target-labels' | wc -l}
|
||||
ifneq (${MULTI_TARGET_MODEL},0)
|
||||
TARGET_LANG_LABEL := ${shell wget -qq -O - ${MODELINFO} | grep -o '>>${TRG}.*<<'}
|
||||
endif
|
||||
|
||||
|
||||
## macro-language IDs
|
||||
## TODO: need to do something better than hard-coding this here
|
||||
TATOEBA_MACRO_LANGS = hbs nor msa
|
||||
|
||||
|
||||
## target languages of reliable models for current source language
|
||||
## reliable is defined as BLEU scores above 20.0
|
||||
##
|
||||
TATOEBA_RELIABLE_TRG_BLEU := ${shell wget -qq -O - ${TATOEBA_RELEASED} | grep '^${SRC}-' | \
|
||||
egrep '\s[2-9][0-9]\.' | cut -f1 | sort -u | cut -f2 -d-}
|
||||
|
||||
## alternative: chr-F2 >= 0.4
|
||||
TATOEBA_RELIABLE_TRG_CHRF := ${shell wget -qq -O - ${TATOEBA_RELEASED} | grep '^${SRC}-' | \
|
||||
egrep '[a-z]\s0\.[4-9]' | cut -f1 | sort -u | cut -f2 -d-}
|
||||
|
||||
## accept both
|
||||
TATOEBA_RELIABLE_TRG = $(filter-out ${TATOEBA_MACRO_LANGS},$(sort ${TATOEBA_RELIABLE_TRG_BLEU} ${TATOEBA_RELIABLE_TRG_CHRF}))
|
||||
|
||||
|
||||
#####################################################################################
|
||||
#### TODO: find wiki languages that we can translate
|
||||
#### PROBLEM: a wiki release may include several languages (like hbs, nor, ...)
|
||||
#####################################################################################
|
||||
|
||||
## all "reliable" released translation models
|
||||
# TATOEBA_AVAILABLE_NMT := ${shell wget -qq -O - ${TATOEBA_RELEASED} | egrep '\s[2-9][0-9]\.' | cut -f1 | sort -u}
|
||||
|
||||
TATOEBA_RELIABLE_SRC_BLEU := ${shell wget -qq -O - ${TATOEBA_RELEASED} | grep -- '-${TRG} ' | \
|
||||
egrep '\s[2-9][0-9]\.' | cut -f1 | sort -u | cut -f1 -d-}
|
||||
|
||||
TATOEBA_RELIABLE_SRC_CHRF := ${shell wget -qq -O - ${TATOEBA_RELEASED} | grep -- '-${TRG} ' | \
|
||||
egrep '[a-z]\s0\.[4-9]' | cut -f1 | sort -u | cut -f1 -d-}
|
||||
|
||||
TATOEBA_RELIABLE_SRC = $(sort ${TATOEBA_RELIABLE_SRC_BLEU} ${TATOEBA_RELIABLE_SRC_CHRF})
|
||||
|
||||
|
||||
## TODO: is it OK to turn zho into cmn?
|
||||
## NOTE: also needs to fix the grep pattern in recipe for ${WIKI_DIR}/${SRC} !!!!
|
||||
TATOEBA_WIKILANGS := ${shell wget -qq -O - ${TATOEBA_GITRAW}/data/release/${TATOEBA_RELEASE}/wiki.langs.txt | \
|
||||
cut -f2 | sed 's/zho/cmn/' | sed 's/nor.*/nob/' | sort -u }
|
||||
|
||||
TATOEBA_TRANSLATABLE_WIKILANGS := ${filter ${TATOEBA_RELIABLE_SRC},${TATOEBA_WIKILANGS}}
|
||||
TATOEBA_TRANSLATABLE_WIKILANGS3 := ${sort ${shell iso639 -m -n ${TATOEBA_TRANSLATABLE_WIKILANGS}}}
|
||||
|
||||
print-wikilangs:
|
||||
@echo ${TATOEBA_RELIABLE_TRG}
|
||||
|
||||
# @echo ${TATOEBA_RELIABLE_SRC}
|
||||
# @echo ${TATOEBA_TRANSLATABLE_WIKILANGS}
|
||||
# @echo ${TATOEBA_TRANSLATABLE_WIKILANGS3}
|
||||
|
||||
|
||||
#####################################################################################
|
||||
#####################################################################################
|
||||
#####################################################################################
|
||||
|
||||
|
||||
### OBSOLETE??
|
||||
## languages of released wikis
|
||||
RELEASED_WIKIS := $(patsubst %.tar,%,${shell wget -qq -O - ${TATOEBA_GITRAW}/Wiki.md | \
|
||||
grep -o 'WikiShuffled/...\.tar' | cut -f2 -d'/'})
|
||||
|
||||
## reverse list
|
||||
RELEASED_WIKIS_REV = ${shell (for d in ${RELEASED_WIKIS}; do echo $$d; done) | tac}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
WIKI_DIR = ${PWD}/wiki
|
||||
LANGID = ${SRC}
|
||||
PART = aa
|
||||
OUTPUT_DIR = ${LANGPAIR}
|
||||
WIKI_TXT = ${WIKI_DIR}/${LANGID}/${WIKISOURCE}.${PART}.gz
|
||||
WIKI_SRC = ${OUTPUT_DIR}/${WIKISOURCE}.${PART}_${MODELNAME}.${LANGPAIR}.${SRC}.gz
|
||||
WIKI_PRE = ${OUTPUT_DIR}/${WIKISOURCE}.${PART}_${MODELNAME}.${LANGPAIR}.${SRC}.spm.gz
|
||||
WIKI_TRG = ${OUTPUT_DIR}/${WIKISOURCE}.${PART}_${MODELNAME}.${LANGPAIR}.${TRG}.gz
|
||||
|
||||
WIKI_LATEST_SRC = ${OUTPUT_DIR}/latest/${WIKISOURCE}.${PART}.${LANGPAIR}.${SRC}.gz
|
||||
WIKI_LATEST_TRG = ${OUTPUT_DIR}/latest/${WIKISOURCE}.${PART}.${LANGPAIR}.${TRG}.gz
|
||||
WIKI_LATEST_README = ${OUTPUT_DIR}/latest/README.md
|
||||
|
||||
## all parts of this wiki
|
||||
PARTS = ${sort ${patsubst ${WIKI_DIR}/${LANGID}/${WIKISOURCE}.%.gz,%,\
|
||||
${wildcard ${WIKI_DIR}/${LANGID}/${WIKISOURCE}.??.gz}}}
|
||||
|
||||
# WIKISOURCES = wiki wikibooks wikinews wikiquote wikisource wiktionary
|
||||
WIKISOURCES = ${sort $(patsubst %.txt.gz,%,$(notdir ${wildcard ${WIKI_DIR}/${LANGID}/*.txt.gz})) \
|
||||
$(patsubst %.${PART}.gz,%,$(notdir ${wildcard ${WIKI_DIR}/${LANGID}/*.${PART}.gz}))}
|
||||
|
||||
|
||||
## targets for all parts of the current wiki source
|
||||
|
||||
ALLWIKIPARTS_TXT = ${patsubst %,${WIKI_DIR}/${LANGID}/${WIKISOURCE}.%.gz,${PARTS}}
|
||||
ALLWIKIPARTS_SRC = ${patsubst %,${OUTPUT_DIR}/${WIKISOURCE}.%_${MODELNAME}.${LANGPAIR}.${SRC}.gz,${PARTS}}
|
||||
ALLWIKIPARTS_PRE = ${patsubst %,${OUTPUT_DIR}/${WIKISOURCE}.%_${MODELNAME}.${LANGPAIR}.${SRC}.spm.gz,${PARTS}}
|
||||
ALLWIKIPARTS_TRG = ${patsubst %,${OUTPUT_DIR}/${WIKISOURCE}.%_${MODELNAME}.${LANGPAIR}.${TRG}.gz,${PARTS}}
|
||||
|
||||
ALLWIKIPARTS_LATEST_SRC = ${patsubst %,${OUTPUT_DIR}/latest/${WIKISOURCE}.%.${LANGPAIR}.${SRC}.gz,${PARTS}}
|
||||
ALLWIKIPARTS_LATEST_TRG = ${patsubst %,${OUTPUT_DIR}/latest/${WIKISOURCE}.%.${LANGPAIR}.${TRG}.gz,${PARTS}}
|
||||
|
||||
|
||||
## all wiki sources for the selected part
|
||||
|
||||
ALLWIKIS_TXT = ${patsubst %,${WIKI_DIR}/${LANGID}/%.${PART}.gz,${WIKISOURCES}}
|
||||
ALLWIKIS_SRC = ${patsubst %,${OUTPUT_DIR}/%.${PART}_${MODELNAME}.${LANGPAIR}.${SRC}.gz,${WIKISOURCES}}
|
||||
ALLWIKIS_PRE = ${patsubst %,${OUTPUT_DIR}/%.${PART}_${MODELNAME}.${LANGPAIR}.${SRC}.spm.gz,${WIKISOURCES}}
|
||||
ALLWIKIS_TRG = ${patsubst %,${OUTPUT_DIR}/%.${PART}_${MODELNAME}.${LANGPAIR}.${TRG}.gz,${WIKISOURCES}}
|
||||
|
||||
|
||||
|
||||
## don't delete translated text if the process crashes
|
||||
.PRECIOUS: ${WIKI_TRG}
|
||||
|
||||
|
||||
ifdef LOCAL_SCRATCH
|
||||
TMPDIR = ${LOCAL_SCRATCH}
|
||||
endif
|
||||
|
||||
ifeq (${shell hostname --domain 2>/dev/null},bullx)
|
||||
LOAD_MODULES = module use -a /projappl/nlpl/software/modules/etc/ && \
|
||||
module load nlpl-udpipe nlpl-opus &&
|
||||
endif
|
||||
|
||||
|
||||
.PHONY: all
|
||||
all: translate
|
||||
|
||||
all-jobs: download
|
||||
${MAKE} prepare-allwikis
|
||||
${MAKE} translate-all-jobs
|
||||
|
||||
# all2eng:
|
||||
# for w in ${filter-out eng,${RELEASED_WIKIS}}; do \
|
||||
# make EMAIL= HPC_CORES=128 HPC_MEM=160g HPC_TIME=24:00 SRC=$$w TRG=eng all-jobs; \
|
||||
# done
|
||||
|
||||
|
||||
## run all-jobs for all translatable wiki languages into ${TRG}
## but do only the ones that we do not have already, i.e. skip every
## source language for which a directory <src>-${TRG} exists
## (the directory test has to use ${TRG} and not a hard-coded 'eng'
##  because the sub-make is started with TRG=${TRG} and the output
##  directory is ${SRC}-${TRG})

.PHONY: new2trg
new2trg:
	for s in ${TATOEBA_TRANSLATABLE_WIKILANGS}; do \
	  if [ ! -d $$s-${TRG} ]; then \
	    ${MAKE} EMAIL= HPC_CORES=128 HPC_MEM=160g HPC_TIME=24:00 SRC=$$s TRG=${TRG} all-jobs; \
	  fi; \
	done
|
||||
|
||||
all2eng:
|
||||
${MAKE} SRC=fin TRG=eng all2trg
|
||||
|
||||
all2trg:
|
||||
for s in ${TATOEBA_TRANSLATABLE_WIKILANGS}; do \
|
||||
${MAKE} EMAIL= HPC_CORES=128 HPC_MEM=160g HPC_TIME=24:00 SRC=$$s TRG=${TRG} all-jobs; \
|
||||
done
|
||||
|
||||
|
||||
## translate English to all reliable target languages
|
||||
eng2all:
|
||||
${MAKE} SRC=eng TRG=fin src2all
|
||||
|
||||
|
||||
## translate current source language to all reliable target languages
|
||||
src2all:
|
||||
for t in ${TATOEBA_RELIABLE_TRG}; do \
|
||||
if [ ! -e ${SRC}-$$t/latest/${WIKISOURCE}.${PART}.${SRC}-$$t.$$t.gz ]; then \
|
||||
${MAKE} EMAIL= HPC_CORES=128 HPC_MEM=160g HPC_TIME=24:00 SRC=${SRC} TRG=$$t prepare; \
|
||||
${MAKE} EMAIL= HPC_CORES=128 HPC_MEM=160g HPC_TIME=24:00 SRC=${SRC} TRG=$$t translate.${SUBMIT_PREFIX}; \
|
||||
fi \
|
||||
done
|
||||
|
||||
|
||||
|
||||
|
||||
RELEASED_BT_ALL := ${shell wget -qq -O - ${TATOEBA_RELEASED_BT}}
|
||||
RELEASED_BT := ${shell wget -qq -O - ${TATOEBA_RELEASED_BT} | grep '^${LANGPAIR}/'}
|
||||
|
||||
fetch-bt:
|
||||
for d in ${RELEASED_BT}; do \
|
||||
echo "fetch $$d"; \
|
||||
mkdir -p `dirname $$d`; \
|
||||
wget -qq -O $$d https://object.pouta.csc.fi/Tatoeba-MT-bt/$$d; \
|
||||
done
|
||||
|
||||
fetch-all-bt:
|
||||
for d in ${RELEASED_BT_ALL}; do \
|
||||
echo "fetch $$d"; \
|
||||
mkdir -p `dirname $$d`; \
|
||||
wget -qq -O $$d https://object.pouta.csc.fi/Tatoeba-MT-bt/$$d; \
|
||||
done
|
||||
|
||||
|
||||
#---------------------------------------------------------------
|
||||
# release data
|
||||
#---------------------------------------------------------------
|
||||
|
||||
release-all: upload-all
|
||||
${MAKE} released-data.txt released-data-size.txt
|
||||
|
||||
.PHONY: upload release
|
||||
release upload: ${WIKI_LATEST_README}
|
||||
swift upload ${BT_CONTAINER} --changed --skip-identical ${LANGPAIR}/latest
|
||||
${MAKE} released-data.txt
|
||||
swift post ${BT_CONTAINER} --read-acl ".r:*"
|
||||
|
||||
## run the 'upload' target for every sub-directory of the current
## directory with a name of the form xxx-yyy (taken as SRC-TRG)
##   ${@:-all=} strips the '-all' suffix from the name of this target
##   ${MAKE} (instead of a plain 'make') passes flags and the jobserver
##   on to the sub-make and uses the same make binary

.PHONY: upload-all
upload-all:
	for d in `find . -maxdepth 1 -type d -name '*-*' -printf "%f "`; do \
	  s=`echo $$d | cut -f1 -d'-'`; \
	  t=`echo $$d | cut -f2 -d'-'`; \
	  ${MAKE} SRC=$$s TRG=$$t ${@:-all=}; \
	done
|
||||
|
||||
released-data.txt: .
|
||||
swift list ${BT_CONTAINER} | grep -v README.md | grep -v '.txt' > $@
|
||||
swift upload ${BT_CONTAINER} $@
|
||||
|
||||
## list the sizes of the released data:
## keep all lines printed by 'make check-latest-all' that start with a
## digit, append the sum of their first column as the last line and
## upload the file to ${BT_CONTAINER}
## NOTE(review): check-latest-all is not defined in this part of the file
## (removed the stray 'cat' operand from the rm command)

released-data-size.txt: .
	${MAKE} check-latest-all | grep '^[0-9]' > $@
	cat $@ | awk '{ sum += $$1 } END { print sum }' > $@.tmp
	cat $@.tmp >> $@
	rm -f $@.tmp
	swift upload ${BT_CONTAINER} released-data-size.txt
|
||||
|
||||
# download released data
|
||||
|
||||
.PHONY: download
|
||||
download: ${WIKI_DIR}/${SRC}
|
||||
|
||||
|
||||
#---------------------------------------------------------------
|
||||
# store / fetch translations
|
||||
# (this is for storing work files and not for releasing data!)
|
||||
#---------------------------------------------------------------
|
||||
|
||||
.PHONY: store
|
||||
store:
|
||||
a-put -b ${BT_WORK_CONTAINER} --nc --follow-links --override ${LANGPAIR}
|
||||
|
||||
## run the 'store' target for every sub-directory of the current
## directory with a name of the form xxx-yyy (taken as SRC-TRG)
##   ${@:-all=} strips the '-all' suffix from the name of this target
##   ${MAKE} (instead of a plain 'make') passes flags and the jobserver
##   on to the sub-make and uses the same make binary

.PHONY: store-all
store-all:
	for d in `find . -maxdepth 1 -type d -name '*-*' -printf "%f "`; do \
	  s=`echo $$d | cut -f1 -d'-'`; \
	  t=`echo $$d | cut -f2 -d'-'`; \
	  ${MAKE} SRC=$$s TRG=$$t ${@:-all=}; \
	done
|
||||
|
||||
.PHONY: retrieve fetch
|
||||
retrieve fetch:
|
||||
cd ${WORK_DESTDIR} && a-get ${WORK_CONTAINER}/${LANGPAIR}.tar
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
.PHONY: prepare
|
||||
prepare: ${LANGPAIR}/${MODELNAME}/decoder.yml ${WIKI_TXT}
|
||||
|
||||
.PHONY: prepare-allwikis
|
||||
prepare-allwikis: ${LANGPAIR}/${MODELNAME}/decoder.yml ${ALLWIKIS_TXT}
|
||||
|
||||
.PHONY: translate
|
||||
translate: ${WIKI_LATEST_README} ${WIKI_LATEST_TRG}
|
||||
${MAKE} ${WIKI_LATEST_SRC}
|
||||
|
||||
## translate all parts
|
||||
.PHONY: translate-all-parts
|
||||
translate-all-parts: ${ALLWIKIPARTS_LATEST_TRG}
|
||||
${MAKE} ${ALLWIKIPARTS_LATEST_SRC}
|
||||
|
||||
## translate all wikis and all parts:
## loop over all wiki sources and run translate-all-parts for each
## (WIKISOURCE has to be passed on to the sub-make, otherwise every
##  iteration would run with the same default wiki source;
##  same pattern as in translate-all-jobs)

.PHONY: translate-all
translate-all:
	for s in ${WIKISOURCES}; do \
	  ${MAKE} WIKISOURCE=$$s translate-all-parts; \
	done
|
||||
|
||||
## create jobs for translating all parts
|
||||
## (only start the job if the file does not exist yet)
|
||||
.PHONY: translate-all-parts-jobs
|
||||
translate-all-parts-jobs: ${LANGPAIR}/${MODELNAME}/decoder.yml
|
||||
for p in ${PARTS}; do \
|
||||
if [ ! -e ${OUTPUT_DIR}/${WIKISOURCE}.$${p}_${MODELNAME}.${LANGPAIR}.${TRG}.gz ]; then \
|
||||
rm -f translate.${SUBMIT_PREFIX}; \
|
||||
${MAKE} PART=$$p translate.${SUBMIT_PREFIX}; \
|
||||
fi \
|
||||
done
|
||||
|
||||
## create jobs for translating all parts of all wikis
|
||||
.PHONY: translate-all-jobs
|
||||
translate-all-jobs: ${LANGPAIR}/${MODELNAME}/decoder.yml
|
||||
for s in ${WIKISOURCES}; do \
|
||||
${MAKE} WIKISOURCE=$$s translate-all-parts-jobs; \
|
||||
done
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
.PHONY: print-modelinfo
|
||||
print-modelinfo:
|
||||
@echo ${MODELNAME}
|
||||
@echo ${MODELZIP}
|
||||
@echo ${MODELINFO}
|
||||
@echo "multi-target model: ${MULTI_TARGET_MODEL}"
|
||||
@echo "target language label: ${TARGET_LANG_LABEL}"
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
## fetch the latest model
|
||||
## ---> TODO: should we fetch from ObjectStorage instead?
|
||||
|
||||
${LANGPAIR}/${MODELNAME}/decoder.yml:
|
||||
ifneq (${MODELZIP},)
|
||||
mkdir -p ${dir $@}
|
||||
wget -O ${dir $@}/model.zip ${MODELZIP}
|
||||
cd ${dir $@} && unzip model.zip
|
||||
rm -f ${dir $@}/model.zip
|
||||
mv ${dir $@}/preprocess.sh ${dir $@}/preprocess-old.sh
|
||||
sed 's#perl -C -pe.*$$#perl -C -pe "s/(?!\\n)\\p{C}/ /g;" |#' \
|
||||
< ${dir $@}/preprocess-old.sh > ${dir $@}/preprocess.sh
|
||||
chmod +x ${dir $@}/preprocess.sh
|
||||
endif
|
||||
|
||||
|
||||
## pre-process data
|
||||
|
||||
ifeq (${MULTI_TARGET_MODEL},1)
|
||||
PREPROCESS_ARGS = ${SRC} ${TRG} ${LANGPAIR}/${MODELNAME}/source.spm
|
||||
else
|
||||
PREPROCESS_ARGS = ${SRC} ${LANGPAIR}/${MODELNAME}/source.spm
|
||||
endif
|
||||
|
||||
|
||||
|
||||
|
||||
${WIKI_DIR}/${SRC}/%.${PART}.gz: | ${WIKI_DIR}/${SRC}/.done
|
||||
${GZCAT} ${@:.${PART}.gz=.txt.gz} |\
|
||||
split -l ${SPLIT_SIZE} - ${patsubst %${PART}.gz,%,$@}
|
||||
${GZIP} -f ${patsubst %${PART}.gz,%,$@}??
|
||||
rm -f ${@:.${PART}.gz=.txt.gz}
|
||||
|
||||
${WIKI_DIR}/${SRC}/%.txt.gz: ${WIKI_DIR}/${SRC}/.done
|
||||
echo "done!"
|
||||
|
||||
|
||||
## NEW: get proper released WIKI data and extract the languages
|
||||
## --> multiple languages can be included in one release (like nno in nor)
|
||||
## --> shuffle the data as well
|
||||
|
||||
# fetch
|
||||
${WIKI_DIR}/${SRC}/data:
|
||||
mkdir -p ${dir $@}
|
||||
wget -O $@.tar ${TATOEBA_STORAGE}/${shell iso639 -m -n ${SRC}}.tar
|
||||
tar -C ${dir $@} -xf $@.tar
|
||||
rm -f $@.tar
|
||||
|
||||
# de-duplicate and shuffle
|
||||
${WIKI_DIR}/${SRC}/${WIKISOURCE}.txt.gz:
|
||||
${MAKE} ${WIKI_DIR}/${SRC}/data
|
||||
for f in `find ${dir $@} -name '*.id.gz'`; do \
|
||||
t=`echo $$f | sed 's/\.id\.gz/.txt.gz/'`; \
|
||||
l=`echo ${SRC} | sed 's/cmn/zho/;s/nob/nor.*/'`; \
|
||||
paste <(${GZIP} -cd $$f) <(${GZIP} -cd $$t) |\
|
||||
grep "^$$l " | cut -f2 | grep . | \
|
||||
${UNIQ} | ${SHUFFLE} | ${GZIP} -c > ${dir $@}`basename $$t`; \
|
||||
done
|
||||
rm -fr ${WIKI_DIR}/${SRC}/data
|
||||
|
||||
# remove empty files
|
||||
${WIKI_DIR}/${SRC}/.done:
|
||||
mkdir -p ${dir $@}
|
||||
${MAKE} ${WIKI_DIR}/${SRC}/${WIKISOURCE}.txt.gz
|
||||
for f in `find ${dir $@} -name '*.txt.gz'`; do \
|
||||
if [ ! `${GZIP} -cd $$f | head | wc -l` -gt 0 ]; then \
|
||||
rm -f $$f; \
|
||||
fi \
|
||||
done
|
||||
touch $@
|
||||
|
||||
|
||||
|
||||
|
||||
## OLD: retrieve the old shuffled wiki release
|
||||
##
|
||||
|
||||
# ${WIKI_DIR}/${SRC}:
|
||||
# mkdir -p $@
|
||||
# wget -O $@.tar ${TATOEBA_WIKI_STORAGE}/${SRC}.tar
|
||||
# tar -C ${dir $@} -xf $@.tar
|
||||
# if [ -d ${WIKI_DIR}/data/${SRC} ]; then \
|
||||
# mv ${WIKI_DIR}/data/${SRC}/*.txt.gz $@/;\
|
||||
# rm -f ${WIKI_DIR}/data/${SRC}/*;\
|
||||
# rmdir ${WIKI_DIR}/data/${SRC};\
|
||||
# rmdir ${WIKI_DIR}/data;\
|
||||
# fi
|
||||
# if [ -e ${WIKI_DIR}/${SRC}/wiki.${SRC}.aa.gz ]; then \
|
||||
# for f in `ls ${WIKI_DIR}/${SRC}/*.${SRC}.*`; do \
|
||||
# mv $$f `echo $$f | sed 's/\.${SRC}\././'`; \
|
||||
# done \
|
||||
# fi
|
||||
# rm -f $@.tar
|
||||
|
||||
|
||||
|
||||
${OUTPUT_DIR}/%.${PART}_${MODELNAME}.${LANGPAIR}.${SRC}.spm.gz: ${WIKI_DIR}/${SRC}/%.${PART}.gz
|
||||
ifneq (${MODELZIP},)
|
||||
mkdir -p ${dir $@}
|
||||
${MAKE} ${LANGPAIR}/${MODELNAME}/decoder.yml
|
||||
${GZCAT} $< |\
|
||||
grep -v '[<>{}]' |\
|
||||
${LANGPAIR}/${MODELNAME}/preprocess.sh ${PREPROCESS_ARGS} |\
|
||||
perl -e 'while (<>){next if (split(/\s+/)>${MAX_LENGTH});print;}' |\
|
||||
gzip -f > $@
|
||||
endif
|
||||
|
||||
|
||||
|
||||
## merge SentencePiece segments in the source text
|
||||
## (Why? because we filter out some data from the original wiki text, see above)
|
||||
|
||||
${WIKI_SRC}: ${WIKI_PRE}
|
||||
ifneq (${MODELZIP},)
|
||||
mkdir -p ${dir $@}
|
||||
${GZCAT} $< |\
|
||||
sed 's/ //g;s/▁/ /g' | \
|
||||
sed 's/^ *//;s/ *$$//' |\
|
||||
sed 's/^>>[a-z]*<< //' |\
|
||||
gzip -c > $@
|
||||
endif
|
||||
|
||||
|
||||
|
||||
|
||||
## overwrite the file with the latest translations
|
||||
## --> this allows multiple translation iterations
|
||||
## without duplicating the data we want to use in MT training
|
||||
|
||||
${WIKI_LATEST_SRC}: ${WIKI_SRC}
|
||||
mkdir -p ${dir $@}
|
||||
cp $< $@
|
||||
|
||||
${WIKI_LATEST_TRG}: ${WIKI_TRG}
|
||||
mkdir -p ${dir $@}
|
||||
cp $< $@
|
||||
|
||||
${WIKI_LATEST_README}: ${LANGPAIR}/${MODELNAME}/README.md
|
||||
mkdir -p ${dir $@}
|
||||
cp $< $@
|
||||
|
||||
|
||||
## translate
|
||||
|
||||
%.${LANGPAIR}.${TRG}.gz: %.${LANGPAIR}.${SRC}.spm.gz
|
||||
ifneq (${MODELZIP},)
|
||||
mkdir -p ${dir $@}
|
||||
${MAKE} ${LANGPAIR}/${MODELNAME}/decoder.yml
|
||||
${LOAD_ENV} && cd ${LANGPAIR}/${MODELNAME} && ${MARIAN_DECODER} \
|
||||
-i ${PWD}/$< \
|
||||
-c decoder.yml \
|
||||
-d ${MARIAN_GPUS} \
|
||||
--quiet-translation \
|
||||
${MARIAN_DECODER_FLAGS} |\
|
||||
sed 's/ //g;s/▁/ /g' | sed 's/^ *//;s/ *$$//' |\
|
||||
gzip -c > ${PWD}/$@
|
||||
#ifneq (${LANGPAIR},)
|
||||
#ifneq (${MODELNAME},)
|
||||
# rm -fr ${LANGPAIR}/${MODELNAME}
|
||||
#endif
|
||||
#endif
|
||||
endif
|
||||
|
||||
|
||||
check-latest:
|
||||
@if [ -d ${LANGPAIR}/latest ]; then \
|
||||
for S in `ls ${LANGPAIR}/latest/*.${SRC}.gz`; do \
|
||||
T=`echo $$S | sed 's/.${SRC}.gz/.${TRG}.gz/'`; \
|
||||
a=`${GZCAT} $$S | wc -l`; \
|
||||
b=`${GZCAT} $$T | wc -l`; \
|
||||
if [ $$a != $$b ]; then \
|
||||
echo "$$a != $$b $$S $$T"; \
|
||||
else \
|
||||
echo "$$a $$S $$T"; \
|
||||
fi \
|
||||
done \
|
||||
fi
|
||||
|
||||
check-translated:
|
||||
@for S in `ls ${LANGPAIR}/*.${SRC}.gz`; do \
|
||||
T=`echo $$S | sed 's/.${SRC}.gz/.${TRG}.gz/'`; \
|
||||
a=`${GZCAT} $$S | wc -l`; \
|
||||
b=`${GZCAT} $$T | wc -l`; \
|
||||
if [ $$a != $$b ]; then \
|
||||
echo "$$a != $$b $$S $$T"; \
|
||||
else \
|
||||
echo "$$a $$S $$T"; \
|
||||
fi \
|
||||
done
|
||||
|
||||
## Print the language pair, then run the two line-count checks one after
## the other (translated files first, then the copies in latest/).
## The loop stops at the first sub-make that fails.
check-length:
	@echo "check $(LANGPAIR)"
	@for goal in check-translated check-latest; do \
	    $(MAKE) $$goal || exit $$?; \
	done
|
||||
|
||||
|
||||
## Run a remove-X / check-X goal for every language pair found here.
## Every sub-directory whose name contains a hyphen is treated as SRC-TRG;
## the goal passed to the sub-make is the current goal with its "-all"
## suffix stripped (e.g. check-length-all --> check-length).
## ${MAKE} (not a literal "make") is used so that command-line flags and
## the jobserver are handed down to the sub-make.
remove-%-all check-%-all:
	for d in `find . -maxdepth 1 -type d -name '*-*' -printf "%f "`; do \
	  s=`echo $$d | cut -f1 -d'-'`; \
	  t=`echo $$d | cut -f2 -d'-'`; \
	  ${MAKE} SRC=$$s TRG=$$t ${@:-all=}; \
	done
|
||||
|
||||
|
||||
|
||||
## Move incomplete translations out of the way: first the translated
## files, then (only if that succeeded) the copies in latest/.
remove-incomplete:
	$(MAKE) remove-incomplete-translated && \
	$(MAKE) remove-incomplete-latest
|
||||
|
||||
remove-incomplete-translated:
|
||||
@echo "check ${LANGPAIR}"
|
||||
@mkdir -p ${LANGPAIR}/incomplete
|
||||
@for S in `ls ${LANGPAIR}/*.${SRC}.gz`; do \
|
||||
T=`echo $$S | sed 's/.${SRC}.gz/.${TRG}.gz/'`; \
|
||||
a=`${GZCAT} $$S | wc -l`; \
|
||||
b=`${GZCAT} $$T | wc -l`; \
|
||||
if [ $$a != $$b ]; then \
|
||||
echo "$$a != $$b $$S $$T"; \
|
||||
mv $$S ${LANGPAIR}/incomplete/; \
|
||||
mv $$T ${LANGPAIR}/incomplete/; \
|
||||
fi \
|
||||
done
|
||||
|
||||
|
||||
remove-incomplete-latest:
|
||||
@echo "check ${LANGPAIR}"
|
||||
@mkdir -p ${LANGPAIR}/incomplete/latest
|
||||
@if [ -d ${LANGPAIR}/latest ]; then \
|
||||
for S in `ls ${LANGPAIR}/latest/*.${SRC}.gz`; do \
|
||||
T=`echo $$S | sed 's/.${SRC}.gz/.${TRG}.gz/'`; \
|
||||
a=`${GZCAT} $$S | wc -l`; \
|
||||
b=`${GZCAT} $$T | wc -l`; \
|
||||
if [ $$a != $$b ]; then \
|
||||
echo "$$a != $$b $$S $$T"; \
|
||||
mv $$S ${LANGPAIR}/incomplete/latest/; \
|
||||
mv $$T ${LANGPAIR}/incomplete/latest/; \
|
||||
fi \
|
||||
done \
|
||||
fi
|
||||
|
313
tatoeba/forward-translate/Makefile
Normal file
313
tatoeba/forward-translate/Makefile
Normal file
@ -0,0 +1,313 @@
|
||||
#
|
||||
# forward translation to be used for
|
||||
# knowledge distillation
|
||||
#
|
||||
# only works with sentencepiece models!
|
||||
#
|
||||
# TODO's
|
||||
#
|
||||
# - forward-translate monolingual data (re-use bt-data)
|
||||
# - reconstruction filtering (score translation in opposite direction)
|
||||
# (use weights? normalise-script from bergamot/students)
|
||||
# - other kind of data filtering / selection?
|
||||
# - create lexical shortlists (see bergamot)
|
||||
# - finetune alphas in intgemm8 models (see bergamot)
|
||||
# - benchmark distilled models
|
||||
#
|
||||
|
||||
PWD := ${shell pwd}
|
||||
REPOHOME := ${PWD}/../../
|
||||
|
||||
include ${REPOHOME}lib/env.mk
|
||||
include ${REPOHOME}lib/config.mk
|
||||
include ${REPOHOME}lib/slurm.mk
|
||||
|
||||
|
||||
SRC = fin
|
||||
TRG = eng
|
||||
|
||||
|
||||
## change decoder settings
|
||||
## TODO: do we need this?
|
||||
|
||||
MARIAN_BEAM_SIZE=1
|
||||
MARIAN_MINI_BATCH=100
|
||||
MARIAN_MAXI_BATCH=100
|
||||
MARIAN_MAX_LENGTH=200
|
||||
MARIAN_WORKSPACE=12000
|
||||
|
||||
|
||||
TATOEBA_VERSION ?= v2021-08-07
|
||||
TATOEBA_VERSION_NOHYPHEN ?= $(subst -,,${TATOEBA_VERSION})
|
||||
|
||||
TATOEBA_GITRAW = https://raw.githubusercontent.com/Helsinki-NLP/Tatoeba-Challenge/master
|
||||
TATOEBA_RELEASED = ${TATOEBA_GITRAW}/models/released-model-results-all.txt
|
||||
TATOEBA_RELEASED_BT = https://object.pouta.csc.fi/Tatoeba-MT-bt/released-data.txt
|
||||
TATOEBA_MODEL_STORAGE = https://object.pouta.csc.fi/Tatoeba-MT-models
|
||||
|
||||
## container for storing backtranslations
|
||||
BT_CONTAINER = Tatoeba-MT-bt
|
||||
BT_CWORK_ONTAINER = project-Tatoeba-MT-bt
|
||||
|
||||
## split size in nr-of-lines
|
||||
## default part to be selected = aa
|
||||
SPLIT_SIZE ?= 1000000
|
||||
|
||||
## MAX_LENGTH:    maximum input length (number of SentencePiece segments)
## MAX_SENTENCES: maximum number of sentences to be translated (top N lines)
|
||||
MAX_LENGTH ?= 200
|
||||
MAX_SENTENCES ?= ${SPLIT_SIZE}
|
||||
|
||||
SORTLANGS = $(sort ${SRC} ${TRG})
|
||||
LANGPAIR = ${SRC}-${TRG}
|
||||
SORTED_LANGPAIR = ${firstword ${SORTLANGS}}-${lastword ${SORTLANGS}}
|
||||
|
||||
PWD := $(shell pwd)
|
||||
|
||||
|
||||
|
||||
# MODELZIP = https://object.pouta.csc.fi/Tatoeba-Challenge/ang.tar
|
||||
MODELZIP := ${shell wget -qq -O - ${TATOEBA_RELEASED} | grep '^${LANGPAIR}' | head -1 | cut -f4}
|
||||
MODELINFO = ${patsubst ${TATOEBA_MODEL_STORAGE}/%.zip,${TATOEBA_GITRAW}/models/%.yml,${MODELZIP}}
|
||||
MODELNAME = ${patsubst %.zip,%,${notdir ${MODELZIP}}}
|
||||
|
||||
MULTI_TARGET_MODEL := ${shell wget -qq -O - ${MODELINFO} | grep 'use-target-labels' | wc -l}
|
||||
ifneq (${MULTI_TARGET_MODEL},0)
|
||||
TARGET_LANG_LABEL := ${shell wget -qq -O - ${MODELINFO} | grep -o '>>${TRG}.*<<'}
|
||||
endif
|
||||
|
||||
RELEASED_BITEXTS := $(patsubst %.tar,%,${shell wget -qq -O - ${TATOEBA_GITRAW}/Wiki.md | \
|
||||
grep -o 'WikiShuffled/...\.tar' | cut -f2 -d'/'})
|
||||
|
||||
RELEASED_BITEXTS_REV = ${shell (for d in ${RELEASED_BITEXTS}; do echo $$d; done) | tac}
|
||||
|
||||
|
||||
PART ?= aa
|
||||
OUTPUT_DIR ?= ${LANGPAIR}
|
||||
|
||||
BITEXT_DATADIR = ${PWD}/../work/data/simple
|
||||
MODEL_WORKDIR = ${PWD}/../work/${LANGPAIR}
|
||||
BITEXT_SRCRAW = ${BITEXT_DATADIR}/Tatoeba-train-${TATOEBA_VERSION}.${SORTED_LANGPAIR}.clean.${SRC}.gz
|
||||
BITEXT_SRCPRE = ${wildcard ${MODEL_WORKDIR}/train/opusTC${TATOEBA_VERSION_NOHYPHEN}.src.clean.spm*.gz}
|
||||
|
||||
BITEXT_BASE = ${OUTPUT_DIR}/Tatoeba-train.${MODELNAME}.${LANGPAIR}
|
||||
BITEXT_SRC = ${BITEXT_BASE}.${SRC}.${PART}.gz
|
||||
BITEXT_PRE = ${BITEXT_BASE}.${SRC}.spm.${PART}.gz
|
||||
BITEXT_TRG = ${BITEXT_BASE}.${TRG}.${PART}.gz
|
||||
|
||||
BITEXT_LATEST_SRC = ${OUTPUT_DIR}/latest/Tatoeba-train.${PART}.${LANGPAIR}.${SRC}.gz
|
||||
BITEXT_LATEST_TRG = ${OUTPUT_DIR}/latest/Tatoeba-train.${PART}.${LANGPAIR}.${TRG}.gz
|
||||
BITEXT_LATEST_README = ${OUTPUT_DIR}/latest/README.md
|
||||
|
||||
|
||||
## all parts of the bitext
|
||||
PARTS = $(subst .,,${suffix ${basename ${wildcard ${BITEXT_PRE:${PART}.gz=}??.gz}}})
|
||||
ALL_BITEXT_LATEST_SRC = ${patsubst %,${OUTPUT_DIR}/latest/Tatoeba-train.%.${LANGPAIR}.${SRC}.gz,${PARTS}}
|
||||
ALL_BITEXT_LATEST_TRG = ${patsubst %,${OUTPUT_DIR}/latest/Tatoeba-train.%.${LANGPAIR}.${TRG}.gz,${PARTS}}
|
||||
|
||||
|
||||
## don't delete translated text even if the process crashes
|
||||
.PRECIOUS: ${BITEXT_BASE}.${TRG}.%.gz
|
||||
|
||||
.PHONY: all
|
||||
all: translate
|
||||
|
||||
.PHONY: prepare
|
||||
prepare: ${LANGPAIR}/${MODELNAME}/decoder.yml ${BITEXT_PRE}
|
||||
|
||||
.PHONY: translate
|
||||
translate: ${BITEXT_LATEST_README} ${BITEXT_LATEST_TRG}
|
||||
${MAKE} ${BITEXT_LATEST_SRC}
|
||||
|
||||
## translate all parts
|
||||
.PHONY: translate-all-parts
|
||||
translate-all-parts: ${ALL_BITEXT_LATEST_TRG}
|
||||
${MAKE} source-all-parts
|
||||
|
||||
.PHONY: source-all-parts
|
||||
source-all-parts: ${ALL_BITEXT_LATEST_SRC}
|
||||
|
||||
|
||||
.PHONY: print-modelinfo
|
||||
print-modelinfo:
|
||||
@echo ${MODELNAME}
|
||||
@echo ${MODELZIP}
|
||||
@echo ${MODELINFO}
|
||||
@echo "multi-target model: ${MULTI_TARGET_MODEL}"
|
||||
@echo "target language label: ${TARGET_LANG_LABEL}"
|
||||
|
||||
## fetch the latest model
|
||||
|
||||
${LANGPAIR}/${MODELNAME}/decoder.yml:
|
||||
ifneq (${MODELZIP},)
|
||||
mkdir -p ${dir $@}
|
||||
wget -O ${dir $@}/model.zip ${MODELZIP}
|
||||
cd ${dir $@} && unzip model.zip
|
||||
rm -f ${dir $@}/model.zip
|
||||
mv ${dir $@}/preprocess.sh ${dir $@}/preprocess-old.sh
|
||||
sed 's#perl -C -pe.*$$#perl -C -pe "s/(?!\\n)\\p{C}/ /g;" |#' \
|
||||
< ${dir $@}/preprocess-old.sh > ${dir $@}/preprocess.sh
|
||||
chmod +x ${dir $@}/preprocess.sh
|
||||
endif
|
||||
|
||||
|
||||
## pre-process data
|
||||
|
||||
ifeq (${MULTI_TARGET_MODEL},1)
|
||||
PREPROCESS_ARGS = ${SRC} ${TRG} ${LANGPAIR}/${MODELNAME}/source.spm
|
||||
else
|
||||
PREPROCESS_ARGS = ${SRC} ${LANGPAIR}/${MODELNAME}/source.spm
|
||||
endif
|
||||
|
||||
|
||||
## Create the plain-text source side of the training bitext.
##
## - BITEXT_SRCPRE empty (no pre-segmented training file was found by the
##   wildcard): run the clean-data-tatoeba goal of the Makefile in the
##   parent directory
## - otherwise: merge the SentencePiece segments of the pre-segmented file
##   back into plain text
ifeq (${BITEXT_SRCPRE},)

${BITEXT_SRCRAW}:
	${MAKE} -C .. SRCLANGS=${SRC} TRGLANGS=${TRG} clean-data-tatoeba

else

## BITEXT_SRCPRE matches *.gz files, so the input has to be decompressed
## before sed can work on the text (${GZCAT} is used the same way by the
## other rules in this file). Only the first matching file ($<) is read.
${BITEXT_SRCRAW}: ${BITEXT_SRCPRE}
	mkdir -p ${dir $@}
	${GZCAT} $< | sed 's/ //g;s/▁/ /g' | sed 's/^ *//;s/ *$$//' | ${GZIP} -f > $@

endif
|
||||
|
||||
|
||||
${BITEXT_PRE}: ${BITEXT_SRCRAW}
|
||||
ifneq (${MODELZIP},)
|
||||
mkdir -p ${dir $@}
|
||||
${MAKE} ${LANGPAIR}/${MODELNAME}/decoder.yml
|
||||
${GZCAT} $< |\
|
||||
grep -v '[<>{}]' |\
|
||||
${LANGPAIR}/${MODELNAME}/preprocess.sh ${PREPROCESS_ARGS} |\
|
||||
perl -e 'while (<>){next if (split(/\s+/)>${MAX_LENGTH});print;}' |\
|
||||
split -l ${SPLIT_SIZE} - ${patsubst %${PART}.gz,%,$@}
|
||||
${GZIP} -f ${patsubst %${PART}.gz,%,$@}??
|
||||
endif
|
||||
|
||||
|
||||
## merge SentencePiece segments in the source text
|
||||
## (Why? because some lines of the original training text are filtered out during pre-processing, see above)
|
||||
|
||||
${BITEXT_BASE}.${SRC}.%.gz: ${BITEXT_BASE}.${SRC}.spm.%.gz
|
||||
if [ -e ${patsubst ${BITEXT_BASE}.${SRC}.%.gz,${BITEXT_BASE}.${TRG}.%.gz,$@} ]; then \
|
||||
mkdir -p ${dir $@}; \
|
||||
${GZCAT} $< |\
|
||||
sed 's/ //g;s/▁/ /g' | \
|
||||
sed 's/^ *//;s/ *$$//' |\
|
||||
sed 's/^>>[a-z]*<< //' |\
|
||||
gzip -c > $@; \
|
||||
fi
|
||||
|
||||
|
||||
## overwrite the file with the latest translations
|
||||
## --> this allows multiple translation iterations
|
||||
## without duplicating the data we want to use in MT training
|
||||
|
||||
${OUTPUT_DIR}/latest/Tatoeba-train.%.${LANGPAIR}.${SRC}.gz: ${BITEXT_BASE}.${SRC}.%.gz
|
||||
mkdir -p ${dir $@}
|
||||
cp $< $@
|
||||
|
||||
${OUTPUT_DIR}/latest/Tatoeba-train.%.${LANGPAIR}.${TRG}.gz: ${BITEXT_BASE}.${TRG}.%.gz
|
||||
mkdir -p ${dir $@}
|
||||
cp $< $@
|
||||
|
||||
${BITEXT_LATEST_README}: ${LANGPAIR}/${MODELNAME}/README.md
|
||||
mkdir -p ${dir $@}
|
||||
cp $< $@
|
||||
|
||||
|
||||
## translate
|
||||
|
||||
${BITEXT_BASE}.${TRG}.%.gz: ${BITEXT_BASE}.${SRC}.spm.%.gz
|
||||
ifneq (${MODELZIP},)
|
||||
mkdir -p ${dir $@}
|
||||
${MAKE} ${LANGPAIR}/${MODELNAME}/decoder.yml
|
||||
${LOAD_ENV} && cd ${LANGPAIR}/${MODELNAME} && \
|
||||
${MARIAN_DECODER} \
|
||||
-c decoder.yml \
|
||||
-i ${PWD}/$< \
|
||||
-d ${MARIAN_GPUS} \
|
||||
${MARIAN_DECODER_FLAGS} |\
|
||||
sed 's/ //g;s/▁/ /g' | sed 's/^ *//;s/ *$$//' |\
|
||||
gzip -c > ${PWD}/$@
|
||||
endif
|
||||
|
||||
|
||||
|
||||
check-latest:
|
||||
@if [ -d ${LANGPAIR}/latest ]; then \
|
||||
for S in `ls ${LANGPAIR}/latest/*.${SRC}.gz`; do \
|
||||
T=`echo $$S | sed 's/.${SRC}.gz/.${TRG}.gz/'`; \
|
||||
a=`${GZCAT} $$S | wc -l`; \
|
||||
b=`${GZCAT} $$T | wc -l`; \
|
||||
if [ $$a != $$b ]; then \
|
||||
echo "$$a != $$b $$S $$T"; \
|
||||
else \
|
||||
echo "$$a $$S $$T"; \
|
||||
fi \
|
||||
done \
|
||||
fi
|
||||
|
||||
## Compare the number of lines of every segmented source part
## (*.${SRC}.spm.<part>.gz, the input of the translate rule) with the
## number of lines of its translation (*.${TRG}.<part>.gz).
## Prints "<a> != <b> <source> <target>" for mismatches and
## "<a> <source> <target>" otherwise.
check-translated:
	@for S in `ls ${LANGPAIR}/*.${SRC}.spm.*.gz`; do \
	    T=`echo $$S | sed 's/\.${SRC}\.spm\.\([^.]*\)\.gz$$/.${TRG}.\1.gz/'`; \
	    a=`${GZCAT} $$S | wc -l`; \
	    b=`${GZCAT} $$T | wc -l`; \
	    if [ $$a != $$b ]; then \
	      echo "$$a != $$b	$$S	$$T"; \
	    else \
	      echo "$$a	$$S	$$T"; \
	    fi \
	done
|
||||
|
||||
check-length:
|
||||
@echo "check ${LANGPAIR}"
|
||||
@${MAKE} check-translated
|
||||
@${MAKE} check-latest
|
||||
|
||||
|
||||
## Run a remove-X / check-X goal for every language pair found here.
## Every sub-directory whose name contains a hyphen is treated as SRC-TRG;
## the goal passed to the sub-make is the current goal with its "-all"
## suffix stripped (e.g. check-length-all --> check-length).
## ${MAKE} (not a literal "make") is used so that command-line flags and
## the jobserver are handed down to the sub-make.
remove-%-all check-%-all:
	for d in `find . -maxdepth 1 -type d -name '*-*' -printf "%f "`; do \
	  s=`echo $$d | cut -f1 -d'-'`; \
	  t=`echo $$d | cut -f2 -d'-'`; \
	  ${MAKE} SRC=$$s TRG=$$t ${@:-all=}; \
	done
|
||||
|
||||
|
||||
|
||||
remove-incomplete:
|
||||
${MAKE} remove-incomplete-translated
|
||||
${MAKE} remove-incomplete-latest
|
||||
|
||||
## Move source/translation pairs with differing line counts into
## ${LANGPAIR}/incomplete/.
## The files compared are the de-segmented source parts
## (*.${SRC}.<part>.gz) and the translated parts (*.${TRG}.<part>.gz);
## <part> is the two-character suffix created by split (aa, ab, ...).
remove-incomplete-translated:
	@echo "check ${LANGPAIR}"
	@mkdir -p ${LANGPAIR}/incomplete
	@for S in `ls ${LANGPAIR}/*.${SRC}.??.gz`; do \
	    T=`echo $$S | sed 's/\.${SRC}\.\(..\)\.gz$$/.${TRG}.\1.gz/'`; \
	    a=`${GZCAT} $$S | wc -l`; \
	    b=`${GZCAT} $$T | wc -l`; \
	    if [ $$a != $$b ]; then \
	      echo "$$a != $$b	$$S	$$T"; \
	      mv $$S ${LANGPAIR}/incomplete/; \
	      mv $$T ${LANGPAIR}/incomplete/; \
	    fi \
	done
|
||||
|
||||
|
||||
remove-incomplete-latest:
|
||||
@echo "check ${LANGPAIR}"
|
||||
@mkdir -p ${LANGPAIR}/incomplete/latest
|
||||
@if [ -d ${LANGPAIR}/latest ]; then \
|
||||
for S in `ls ${LANGPAIR}/latest/*.${SRC}.gz`; do \
|
||||
T=`echo $$S | sed 's/.${SRC}.gz/.${TRG}.gz/'`; \
|
||||
a=`${GZCAT} $$S | wc -l`; \
|
||||
b=`${GZCAT} $$T | wc -l`; \
|
||||
if [ $$a != $$b ]; then \
|
||||
echo "$$a != $$b $$S $$T"; \
|
||||
mv $$S ${LANGPAIR}/incomplete/latest/; \
|
||||
mv $$T ${LANGPAIR}/incomplete/latest/; \
|
||||
fi \
|
||||
done \
|
||||
fi
|
||||
|
0
tatoeba/langids-train-only-v2020-07-28.txt
Normal file
0
tatoeba/langids-train-only-v2020-07-28.txt
Normal file
1314
tatoeba/langids-train-only-v2021-08-07.txt
Normal file
1314
tatoeba/langids-train-only-v2021-08-07.txt
Normal file
File diff suppressed because it is too large
Load Diff
302
tatoeba/pivoting/Makefile
Normal file
302
tatoeba/pivoting/Makefile
Normal file
@ -0,0 +1,302 @@
|
||||
#
|
||||
# translate PIVOT language into SRC language
|
||||
# to make a synthetic SRC-TRG corpus from another
|
||||
# PIVOT-TRG corpus
|
||||
|
||||
|
||||
PWD := ${shell pwd}
|
||||
REPOHOME := ${PWD}/../../
|
||||
|
||||
include ${REPOHOME}lib/env.mk
|
||||
include ${REPOHOME}lib/config.mk
|
||||
include ${REPOHOME}lib/slurm.mk
|
||||
|
||||
|
||||
SRC = swe
|
||||
TRG = fin
|
||||
PIVOT = eng
|
||||
|
||||
|
||||
## change decoder settings
|
||||
## TODO: do we need this?
|
||||
|
||||
MARIAN_BEAM_SIZE=1
|
||||
MARIAN_MINI_BATCH=100
|
||||
MARIAN_MAXI_BATCH=100
|
||||
MARIAN_MAX_LENGTH=200
|
||||
MARIAN_WORKSPACE=12000
|
||||
|
||||
|
||||
TATOEBA_VERSION ?= v2021-08-07
|
||||
TATOEBA_VERSION_NOHYPHEN ?= $(subst -,,${TATOEBA_VERSION})
|
||||
|
||||
TATOEBA_GITRAW = https://raw.githubusercontent.com/Helsinki-NLP/Tatoeba-Challenge/master
|
||||
TATOEBA_RELEASED = ${TATOEBA_GITRAW}/models/released-model-results-all.txt
|
||||
TATOEBA_RELEASED_BT = https://object.pouta.csc.fi/Tatoeba-MT-bt/released-data.txt
|
||||
TATOEBA_MODEL_STORAGE = https://object.pouta.csc.fi/Tatoeba-MT-models
|
||||
|
||||
## container for storing backtranslations
|
||||
BT_CONTAINER = Tatoeba-MT-bt
|
||||
BT_CWORK_ONTAINER = project-Tatoeba-MT-bt
|
||||
|
||||
## split size in nr-of-lines
|
||||
## default part to be selected = aa
|
||||
SPLIT_SIZE ?= 1000000
|
||||
|
||||
## MAX_LENGTH:    maximum input length (number of SentencePiece segments)
## MAX_SENTENCES: maximum number of sentences to be translated (top N lines)
|
||||
MAX_LENGTH ?= 200
|
||||
MAX_SENTENCES ?= ${SPLIT_SIZE}
|
||||
|
||||
TRANSLATE_LANGPAIR = ${PIVOT}-${SRC}
|
||||
ORIGINAL_LANGPAIR = ${PIVOT}-${TRG}
|
||||
NEW_LANGPAIR = ${SRC}-${TRG}
|
||||
|
||||
SORTLANGS = $(sort ${PIVOT} ${TRG})
|
||||
SORTED_LANGPAIR = ${firstword ${SORTLANGS}}-${lastword ${SORTLANGS}}
|
||||
|
||||
PART ?= aa
|
||||
OUTPUT_DIR ?= ${NEW_LANGPAIR}
|
||||
|
||||
|
||||
# MODELZIP = https://object.pouta.csc.fi/Tatoeba-Challenge/ang.tar
|
||||
MODELZIP := ${shell wget -qq -O - ${TATOEBA_RELEASED} | grep '^${TRANSLATE_LANGPAIR}' | head -1 | cut -f4}
|
||||
MODELINFO = ${patsubst ${TATOEBA_MODEL_STORAGE}/%.zip,${TATOEBA_GITRAW}/models/%.yml,${MODELZIP}}
|
||||
MODELNAME = ${patsubst %.zip,%,${notdir ${MODELZIP}}}
|
||||
MODELDIR = ${OUTPUT_DIR}/${TRANSLATE_LANGPAIR}/${MODELNAME}
|
||||
|
||||
MULTI_TARGET_MODEL := ${shell wget -qq -O - ${MODELINFO} | grep 'use-target-labels' | wc -l}
|
||||
ifneq (${MULTI_TARGET_MODEL},0)
|
||||
TARGET_LANG_LABEL := ${shell wget -qq -O - ${MODELINFO} | grep -o '>>${TRG}.*<<'}
|
||||
endif
|
||||
|
||||
|
||||
|
||||
BITEXT_DATADIR = ${PWD}/../work/data/simple
|
||||
MODEL_WORKDIR = ${PWD}/../work/${PIVOT}-${TRG}
|
||||
BITEXT_SRCRAW = ${BITEXT_DATADIR}/Tatoeba-train-${TATOEBA_VERSION}.${SORTED_LANGPAIR}.clean.${PIVOT}.gz
|
||||
BITEXT_SRCPRE = ${wildcard ${MODEL_WORKDIR}/train/opusTC${TATOEBA_VERSION_NOHYPHEN}.src.clean.spm*.gz}
|
||||
|
||||
BITEXT_BASE = ${OUTPUT_DIR}/Tatoeba-train.${MODELNAME}.${PIVOT}-${SRC}-${TRG}
|
||||
BITEXT_SRC = ${BITEXT_BASE}.${SRC}.${PART}.gz
|
||||
BITEXT_PRE = ${BITEXT_BASE}.${SRC}.spm.${PART}.gz
|
||||
BITEXT_TRG = ${BITEXT_BASE}.${TRG}.${PART}.gz
|
||||
|
||||
BITEXT_LATEST_SRC = ${OUTPUT_DIR}/latest/Tatoeba-train.${PIVOT}-${SRC}-${TRG}.${PART}.${SRC}.gz
|
||||
BITEXT_LATEST_TRG = ${OUTPUT_DIR}/latest/Tatoeba-train.${PIVOT}-${SRC}-${TRG}.${PART}.${TRG}.gz
|
||||
BITEXT_LATEST_README = ${OUTPUT_DIR}/latest/README.md
|
||||
|
||||
|
||||
## all parts of the bitext
|
||||
PARTS = $(subst .,,${suffix ${basename ${wildcard ${BITEXT_PRE:${PART}.gz=}??.gz}}})
|
||||
ALL_BITEXT_LATEST_SRC = ${patsubst %,${OUTPUT_DIR}/latest/Tatoeba-train.${PIVOT}-${SRC}-${TRG}.%.${SRC}.gz,${PARTS}}
|
||||
ALL_BITEXT_LATEST_TRG = ${patsubst %,${OUTPUT_DIR}/latest/Tatoeba-train.${PIVOT}-${SRC}-${TRG}.%.${TRG}.gz,${PARTS}}
|
||||
|
||||
|
||||
## don't delete translated text even if the process crashes
|
||||
.PRECIOUS: ${BITEXT_BASE}.${TRG}.%.gz
|
||||
|
||||
.PHONY: all
|
||||
all: translate
|
||||
|
||||
.PHONY: prepare
|
||||
prepare: ${MODELDIR}/decoder.yml ${BITEXT_PRE}
|
||||
|
||||
.PHONY: translate
|
||||
translate: ${BITEXT_LATEST_README} ${BITEXT_LATEST_TRG}
|
||||
${MAKE} ${BITEXT_LATEST_SRC}
|
||||
|
||||
## translate all parts
|
||||
.PHONY: translate-all-parts
|
||||
translate-all-parts: ${ALL_BITEXT_LATEST_TRG}
|
||||
${MAKE} source-all-parts
|
||||
|
||||
.PHONY: source-all-parts
|
||||
source-all-parts: ${ALL_BITEXT_LATEST_SRC}
|
||||
|
||||
|
||||
.PHONY: print-modelinfo
|
||||
print-modelinfo:
|
||||
@echo ${MODELNAME}
|
||||
@echo ${MODELZIP}
|
||||
@echo ${MODELINFO}
|
||||
@echo "multi-target model: ${MULTI_TARGET_MODEL}"
|
||||
@echo "target language label: ${TARGET_LANG_LABEL}"
|
||||
|
||||
## fetch the latest model
|
||||
|
||||
${MODELDIR}/decoder.yml:
|
||||
ifneq (${MODELZIP},)
|
||||
mkdir -p ${dir $@}
|
||||
wget -O ${dir $@}/model.zip ${MODELZIP}
|
||||
cd ${dir $@} && unzip model.zip
|
||||
rm -f ${dir $@}/model.zip
|
||||
mv ${dir $@}/preprocess.sh ${dir $@}/preprocess-old.sh
|
||||
sed 's#perl -C -pe.*$$#perl -C -pe "s/(?!\\n)\\p{C}/ /g;" |#' \
|
||||
< ${dir $@}/preprocess-old.sh > ${dir $@}/preprocess.sh
|
||||
chmod +x ${dir $@}/preprocess.sh
|
||||
endif
|
||||
|
||||
|
||||
## pre-process data
|
||||
|
||||
## Arguments handed to the model's preprocess.sh.
## The text that is pre-processed is the PIVOT side of the bitext
## (BITEXT_SRCRAW) and the model is the one for TRANSLATE_LANGPAIR
## (PIVOT-SRC), so the input language is PIVOT and, for models that use
## target-language labels, the label language is SRC.
## NOTE(review): assumes preprocess.sh takes
##   <input-lang> [<target-lang>] <spm-model>  -- confirm against the script
ifeq (${MULTI_TARGET_MODEL},1)
  PREPROCESS_ARGS = ${PIVOT} ${SRC} ${MODELDIR}/source.spm
else
  PREPROCESS_ARGS = ${PIVOT} ${MODELDIR}/source.spm
endif
|
||||
|
||||
|
||||
|
||||
## Create the plain-text PIVOT side of the training bitext.
##
## - BITEXT_SRCPRE empty (no pre-segmented training file was found by the
##   wildcard): run the clean-data goal of the Makefile in the parent
##   directory
## - otherwise: merge the SentencePiece segments of the pre-segmented file
##   back into plain text
ifeq (${BITEXT_SRCPRE},)

${BITEXT_SRCRAW}:
	${MAKE} -C .. SRCLANGS=${PIVOT} TRGLANGS=${TRG} clean-data

else

## BITEXT_SRCPRE matches *.gz files, so the input has to be decompressed
## before sed can work on the text (${GZCAT} is used the same way by the
## other rules in this file). Only the first matching file ($<) is read.
${BITEXT_SRCRAW}: ${BITEXT_SRCPRE}
	mkdir -p ${dir $@}
	${GZCAT} $< | sed 's/ //g;s/▁/ /g' | sed 's/^ *//;s/ *$$//' | ${GZIP} -f > $@

endif
|
||||
|
||||
|
||||
${BITEXT_PRE}: ${BITEXT_SRCRAW}
|
||||
ifneq (${MODELZIP},)
|
||||
mkdir -p ${dir $@}
|
||||
${MAKE} ${MODELDIR}/decoder.yml
|
||||
${GZCAT} $< |\
|
||||
grep -v '[<>{}]' |\
|
||||
${MODELDIR}/preprocess.sh ${PREPROCESS_ARGS} |\
|
||||
perl -e 'while (<>){next if (split(/\s+/)>${MAX_LENGTH});print;}' |\
|
||||
split -l ${SPLIT_SIZE} - ${patsubst %${PART}.gz,%,$@}
|
||||
${GZIP} -f ${patsubst %${PART}.gz,%,$@}??
|
||||
endif
|
||||
|
||||
|
||||
|
||||
## merge SentencePiece segments in the source text
|
||||
## (Why? because some lines of the original training text are filtered out during pre-processing, see above)
|
||||
|
||||
${BITEXT_BASE}.${SRC}.%.gz: ${BITEXT_BASE}.${SRC}.spm.%.gz
|
||||
if [ -e ${patsubst ${BITEXT_BASE}.${SRC}.%.gz,${BITEXT_BASE}.${TRG}.%.gz,$@} ]; then \
|
||||
mkdir -p ${dir $@}; \
|
||||
${GZCAT} $< |\
|
||||
sed 's/ //g;s/▁/ /g' | \
|
||||
sed 's/^ *//;s/ *$$//' |\
|
||||
sed 's/^>>[a-z]*<< //' |\
|
||||
gzip -c > $@; \
|
||||
fi
|
||||
|
||||
|
||||
## overwrite the file with the latest translations
|
||||
## --> this allows multiple translation iterations
|
||||
## without duplicating the data we want to use in MT training
|
||||
|
||||
${OUTPUT_DIR}/latest/Tatoeba-train.${PIVOT}-${SRC}-${TRG}.%.${SRC}.gz: ${BITEXT_BASE}.${SRC}.%.gz
|
||||
mkdir -p ${dir $@}
|
||||
cp $< $@
|
||||
|
||||
${OUTPUT_DIR}/latest/Tatoeba-train.${PIVOT}-${SRC}-${TRG}.%.${TRG}.gz: ${BITEXT_BASE}.${TRG}.%.gz
|
||||
mkdir -p ${dir $@}
|
||||
cp $< $@
|
||||
|
||||
${BITEXT_LATEST_README}: ${MODELDIR}/README.md
|
||||
mkdir -p ${dir $@}
|
||||
cp $< $@
|
||||
|
||||
|
||||
## translate
|
||||
|
||||
${BITEXT_BASE}.${TRG}.%.gz: ${BITEXT_BASE}.${SRC}.spm.%.gz
|
||||
ifneq (${MODELZIP},)
|
||||
mkdir -p ${dir $@}
|
||||
${MAKE} ${MODELDIR}/decoder.yml
|
||||
${LOAD_ENV} && cd ${MODELDIR} && \
|
||||
${MARIAN_DECODER} \
|
||||
-c decoder.yml \
|
||||
-i ${PWD}/$< \
|
||||
-d ${MARIAN_GPUS} \
|
||||
${MARIAN_DECODER_FLAGS} |\
|
||||
sed 's/ //g;s/▁/ /g' | sed 's/^ *//;s/ *$$//' |\
|
||||
gzip -c > ${PWD}/$@
|
||||
endif
|
||||
|
||||
|
||||
|
||||
check-latest:
|
||||
@if [ -d ${OUTPUT_DIR}/latest ]; then \
|
||||
for S in `ls ${OUTPUT_DIR}/latest/*.${SRC}.gz`; do \
|
||||
T=`echo $$S | sed 's/.${SRC}.gz/.${TRG}.gz/'`; \
|
||||
a=`${GZCAT} $$S | wc -l`; \
|
||||
b=`${GZCAT} $$T | wc -l`; \
|
||||
if [ $$a != $$b ]; then \
|
||||
echo "$$a != $$b $$S $$T"; \
|
||||
else \
|
||||
echo "$$a $$S $$T"; \
|
||||
fi \
|
||||
done \
|
||||
fi
|
||||
|
||||
## Compare the number of lines of every segmented input part
## (*.${SRC}.spm.<part>.gz, the input of the translate rule) with the
## number of lines of the file produced from it (*.${TRG}.<part>.gz).
## Prints "<a> != <b> <source> <target>" for mismatches and
## "<a> <source> <target>" otherwise.
check-translated:
	@for S in `ls ${OUTPUT_DIR}/*.${SRC}.spm.*.gz`; do \
	    T=`echo $$S | sed 's/\.${SRC}\.spm\.\([^.]*\)\.gz$$/.${TRG}.\1.gz/'`; \
	    a=`${GZCAT} $$S | wc -l`; \
	    b=`${GZCAT} $$T | wc -l`; \
	    if [ $$a != $$b ]; then \
	      echo "$$a != $$b	$$S	$$T"; \
	    else \
	      echo "$$a	$$S	$$T"; \
	    fi \
	done
|
||||
|
||||
check-length:
|
||||
@echo "check ${OUTPUT_DIR}"
|
||||
@${MAKE} check-translated
|
||||
@${MAKE} check-latest
|
||||
|
||||
|
||||
## Run a remove-X / check-X goal for every language pair found here.
## Every sub-directory whose name contains a hyphen is treated as SRC-TRG;
## the goal passed to the sub-make is the current goal with its "-all"
## suffix stripped (e.g. check-length-all --> check-length).
## ${MAKE} (not a literal "make") is used so that command-line flags and
## the jobserver are handed down to the sub-make.
remove-%-all check-%-all:
	for d in `find . -maxdepth 1 -type d -name '*-*' -printf "%f "`; do \
	  s=`echo $$d | cut -f1 -d'-'`; \
	  t=`echo $$d | cut -f2 -d'-'`; \
	  ${MAKE} SRC=$$s TRG=$$t ${@:-all=}; \
	done
|
||||
|
||||
|
||||
|
||||
remove-incomplete:
|
||||
${MAKE} remove-incomplete-translated
|
||||
${MAKE} remove-incomplete-latest
|
||||
|
||||
## Move file pairs with differing line counts into
## ${OUTPUT_DIR}/incomplete/.
## The files compared are the de-segmented parts (*.${SRC}.<part>.gz) and
## the corresponding *.${TRG}.<part>.gz files; <part> is the two-character
## suffix created by split (aa, ab, ...).
remove-incomplete-translated:
	@echo "check ${OUTPUT_DIR}"
	@mkdir -p ${OUTPUT_DIR}/incomplete
	@for S in `ls ${OUTPUT_DIR}/*.${SRC}.??.gz`; do \
	    T=`echo $$S | sed 's/\.${SRC}\.\(..\)\.gz$$/.${TRG}.\1.gz/'`; \
	    a=`${GZCAT} $$S | wc -l`; \
	    b=`${GZCAT} $$T | wc -l`; \
	    if [ $$a != $$b ]; then \
	      echo "$$a != $$b	$$S	$$T"; \
	      mv $$S ${OUTPUT_DIR}/incomplete/; \
	      mv $$T ${OUTPUT_DIR}/incomplete/; \
	    fi \
	done
|
||||
|
||||
|
||||
remove-incomplete-latest:
|
||||
@echo "check ${OUTPUT_DIR}"
|
||||
@mkdir -p ${OUTPUT_DIR}/incomplete/latest
|
||||
@if [ -d ${OUTPUT_DIR}/latest ]; then \
|
||||
for S in `ls ${OUTPUT_DIR}/latest/*.${SRC}.gz`; do \
|
||||
T=`echo $$S | sed 's/.${SRC}.gz/.${TRG}.gz/'`; \
|
||||
a=`${GZCAT} $$S | wc -l`; \
|
||||
b=`${GZCAT} $$T | wc -l`; \
|
||||
if [ $$a != $$b ]; then \
|
||||
echo "$$a != $$b $$S $$T"; \
|
||||
mv $$S ${OUTPUT_DIR}/incomplete/latest/; \
|
||||
mv $$T ${OUTPUT_DIR}/incomplete/latest/; \
|
||||
fi \
|
||||
done \
|
||||
fi
|
||||
|
Loading…
Reference in New Issue
Block a user