removed old makefiles

Joerg Tiedemann 2020-05-03 21:56:08 +03:00
parent 5404f515aa
commit 79cc3d66f0
10 changed files with 0 additions and 4100 deletions


@@ -1,403 +0,0 @@
# -*-makefile-*-
#
# model configurations
#
# SRCLANGS = da no sv
# TRGLANGS = fi
SRCLANGS = sv
TRGLANGS = fi
ifndef SRC
SRC := ${firstword ${SRCLANGS}}
endif
ifndef TRG
TRG := ${lastword ${TRGLANGS}}
endif
# sorted languages and langpair used to match resources in OPUS
SORTLANGS = $(sort ${SRC} ${TRG})
SPACE = $(empty) $(empty)
LANGPAIR = ${firstword ${SORTLANGS}}-${lastword ${SORTLANGS}}
LANGSRCSTR = ${subst ${SPACE},+,$(SRCLANGS)}
LANGTRGSTR = ${subst ${SPACE},+,$(TRGLANGS)}
LANGPAIRSTR = ${LANGSRCSTR}-${LANGTRGSTR}
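## Example (hypothetical values, for illustration only): with SRCLANGS = "da no sv"
## and TRGLANGS = "fi", the definitions above expand to
##   SORTLANGS   = da fi
##   LANGPAIR    = da-fi
##   LANGSRCSTR  = da+no+sv
##   LANGTRGSTR  = fi
##   LANGPAIRSTR = da+no+sv-fi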
## for monolingual things
ifndef LANGS
LANGS := ${SRCLANGS}
endif
ifndef LANGID
LANGID := ${firstword ${LANGS}}
endif
ifndef LANGSTR
LANGSTR = ${subst ${SPACE},+,$(LANGS)}
endif
## for same language pairs: add numeric extension
ifeq (${SRC},$(TRG))
SRCEXT = ${SRC}1
TRGEXT = ${SRC}2
else
SRCEXT = ${SRC}
TRGEXT = ${TRG}
endif
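## Example (illustration): for a monolingual setup with SRC = TRG = en the
## extensions become SRCEXT = en1 and TRGEXT = en2, so that the two sides of an
## en-en corpus can still be told apart; for SRC = sv, TRG = fi they remain
## SRCEXT = sv and TRGEXT = fi.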
## set additional argument options for opus_read (if it is used)
## e.g. OPUSREAD_ARGS = -a certainty -tr 0.3
OPUSREAD_ARGS =
## all of OPUS (NEW: don't require MOSES format)
# OPUSCORPORA = ${patsubst %/latest/moses/${LANGPAIR}.txt.zip,%,\
# ${patsubst ${OPUSHOME}/%,%,\
# ${shell ls ${OPUSHOME}/*/latest/moses/${LANGPAIR}.txt.zip}}}
OPUSCORPORA = ${patsubst %/latest/xml/${LANGPAIR}.xml.gz,%,\
${patsubst ${OPUSHOME}/%,%,\
${shell ls ${OPUSHOME}/*/latest/xml/${LANGPAIR}.xml.gz}}}
## monolingual data
OPUSMONOCORPORA = ${patsubst %/latest/mono/${LANGID}.txt.gz,%,\
${patsubst ${OPUSHOME}/%,%,\
${shell ls ${OPUSHOME}/*/latest/mono/${LANGID}.txt.gz}}}
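## Example (hypothetical paths, for illustration): if ${OPUSHOME} contains
## Tatoeba/latest/xml/fi-sv.xml.gz and JW300/latest/xml/fi-sv.xml.gz, the nested
## patsubst calls strip the leading ${OPUSHOME}/ and the trailing
## /latest/xml/${LANGPAIR}.xml.gz, leaving OPUSCORPORA = "Tatoeba JW300";
## OPUSMONOCORPORA is derived in the same way from */latest/mono/${LANGID}.txt.gz.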
ALL_LANG_PAIRS = ${shell ls ${WORKHOME} | grep -- '-' | grep -v old}
ALL_BILINGUAL_MODELS = ${shell echo '${ALL_LANG_PAIRS}' | tr ' ' "\n" | grep -v -- '\+'}
ALL_MULTILINGUAL_MODELS = ${shell echo '${ALL_LANG_PAIRS}' | tr ' ' "\n" | grep -- '\+'}
# ALL_BILINGUAL_MODELS = ${shell ls ${WORKHOME} | grep -- '-' | grep -v old | grep -v -- '\+'}
# ALL_MULTILINGUAL_MODELS = ${shell ls ${WORKHOME} | grep -- '-' | grep -v old | grep -- '\+'}
## size of dev data, test data and BPE merge operations
## NEW default size = 2500 (keep more for training for small languages)
DEVSIZE = 2500
TESTSIZE = 2500
## NEW: significantly reduce devminsize
## (= absolute minimum we need as devdata)
## NEW: define an alternative small size for DEV and TEST
## OLD DEVMINSIZE:
# DEVMINSIZE = 1000
DEVSMALLSIZE = 1000
TESTSMALLSIZE = 1000
DEVMINSIZE = 250
## size of heldout data for each sub-corpus
## (only if there are at least twice as many examples in the corpus)
HELDOUTSIZE = ${DEVSIZE}
##----------------------------------------------------------------------------
## train/dev/test data
##----------------------------------------------------------------------------
## dev/test data: default = Tatoeba; backoff to GlobalVoices, infopankki, JW300 or bible-uedin
## - check that data exist
## - check that there are at least 2 x DEVMINSIZE examples
## TODO: this does not work well for multilingual models!
ifneq ($(wildcard ${OPUSHOME}/Tatoeba/latest/moses/${LANGPAIR}.txt.zip),)
ifeq ($(shell if (( `head -1 ${OPUSHOME}/Tatoeba/latest/info/${LANGPAIR}.txt.info` \
> $$((${DEVMINSIZE} + ${DEVMINSIZE})) )); then echo "ok"; fi),ok)
DEVSET = Tatoeba
endif
endif
## backoff to GlobalVoices
ifndef DEVSET
ifneq ($(wildcard ${OPUSHOME}/GlobalVoices/latest/moses/${LANGPAIR}.txt.zip),)
ifeq ($(shell if (( `head -1 ${OPUSHOME}/GlobalVoices/latest/info/${LANGPAIR}.txt.info` \
> $$((${DEVMINSIZE} + ${DEVMINSIZE})) )); then echo "ok"; fi),ok)
DEVSET = GlobalVoices
endif
endif
endif
## backoff to infopankki
ifndef DEVSET
ifneq ($(wildcard ${OPUSHOME}/infopankki/latest/moses/${LANGPAIR}.txt.zip),)
ifeq ($(shell if (( `head -1 ${OPUSHOME}/infopankki/latest/info/${LANGPAIR}.txt.info` \
> $$((${DEVMINSIZE} + ${DEVMINSIZE})) )); then echo "ok"; fi),ok)
DEVSET = infopankki
endif
endif
endif
## backoff to JW300
ifndef DEVSET
ifneq ($(wildcard ${OPUSHOME}/JW300/latest/xml/${LANGPAIR}.xml.gz),)
ifeq ($(shell if (( `sed -n 2p ${OPUSHOME}/JW300/latest/info/${LANGPAIR}.info` \
> $$((${DEVMINSIZE} + ${DEVMINSIZE})) )); then echo "ok"; fi),ok)
DEVSET = JW300
endif
endif
endif
## otherwise: bible-uedin
ifndef DEVSET
DEVSET = bible-uedin
endif
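## Example (assumed info-file layout, for illustration): the first line of
## ${OPUSHOME}/<corpus>/latest/info/${LANGPAIR}.txt.info is read as the number
## of aligned sentence pairs, so with the default DEVMINSIZE = 250 the shell
## test above amounts to
##   if (( `head -1 .../Tatoeba/latest/info/fi-sv.txt.info` > 500 )); then echo "ok"; fi
## and a corpus is only accepted as DEVSET if it holds more than twice
## DEVMINSIZE sentence pairs.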
## increase dev/test sets for Tatoeba (very short sentences!)
ifeq (${DEVSET},Tatoeba)
DEVSIZE = 5000
TESTSIZE = 5000
endif
## in case we want to use some additional data sets
EXTRA_TRAINSET =
## TESTSET = DEVSET; TRAINSET = all OPUS corpora except WMT-News, MPC1, DEVSET and TESTSET
TESTSET = ${DEVSET}
TRAINSET = $(filter-out WMT-News MPC1 ${DEVSET} ${TESTSET},${OPUSCORPORA} ${EXTRA_TRAINSET})
TUNESET = OpenSubtitles
MONOSET = $(filter-out WMT-News MPC1 ${DEVSET} ${TESTSET},${OPUSMONOCORPORA} ${EXTRA_TRAINSET})
## 1 = use remaining data from dev/test data for training
USE_REST_DEVDATA = 1
##----------------------------------------------------------------------------
## pre-processing and vocabulary
##----------------------------------------------------------------------------
BPESIZE = 32000
SRCBPESIZE = ${BPESIZE}
TRGBPESIZE = ${BPESIZE}
ifndef VOCABSIZE
VOCABSIZE = $$((${SRCBPESIZE} + ${TRGBPESIZE} + 1000))
endif
## for document-level models
CONTEXT_SIZE = 100
## pre-processing type
# PRE = norm
PRE = simple
PRE_SRC = spm${SRCBPESIZE:000=}k
PRE_TRG = spm${TRGBPESIZE:000=}k
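## Example (illustration): the substitution reference ${SRCBPESIZE:000=}
## deletes a trailing "000", so with the default BPESIZE = 32000 both PRE_SRC
## and PRE_TRG expand to "spm32k"; with BPESIZE = 4000 they would become
## "spm4k".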
##-------------------------------------
## default name of the data set (and the model)
##-------------------------------------
ifndef DATASET
DATASET = opus
endif
ifndef BPEMODELNAME
BPEMODELNAME = opus
endif
##-------------------------------------
## OLD OLD OLD
## name of the data set (and the model)
## - single corpus = use that name
## - multiple corpora = opus
## add also vocab size to the name
##-------------------------------------
ifndef OLDDATASET
ifeq (${words ${TRAINSET}},1)
OLDDATASET = ${TRAINSET}
else
OLDDATASET = opus
endif
endif
## DATADIR = directory where the train/dev/test data are
## WORKDIR = directory used for training
DATADIR = ${WORKHOME}/data
WORKDIR = ${WORKHOME}/${LANGPAIRSTR}
SPMDIR = ${WORKHOME}/SentencePieceModels
## data sets
TRAIN_BASE = ${WORKDIR}/train/${DATASET}
TRAIN_SRC = ${TRAIN_BASE}.src
TRAIN_TRG = ${TRAIN_BASE}.trg
TRAIN_ALG = ${TRAIN_BASE}${TRAINSIZE}.${PRE_SRC}-${PRE_TRG}.src-trg.alg.gz
## training data in local space
LOCAL_TRAIN_SRC = ${TMPDIR}/${LANGPAIRSTR}/train/${DATASET}.src
LOCAL_TRAIN_TRG = ${TMPDIR}/${LANGPAIRSTR}/train/${DATASET}.trg
LOCAL_MONO_DATA = ${TMPDIR}/${LANGSTR}/train/${DATASET}.mono
TUNE_SRC = ${WORKDIR}/tune/${TUNESET}.src
TUNE_TRG = ${WORKDIR}/tune/${TUNESET}.trg
DEV_SRC = ${WORKDIR}/val/${DEVSET}.src
DEV_TRG = ${WORKDIR}/val/${DEVSET}.trg
TEST_SRC = ${WORKDIR}/test/${TESTSET}.src
TEST_TRG = ${WORKDIR}/test/${TESTSET}.trg
## heldout data directory (keep one set per data set)
HELDOUT_DIR = ${WORKDIR}/heldout
MODEL_SUBDIR =
MODEL = ${MODEL_SUBDIR}${DATASET}${TRAINSIZE}.${PRE_SRC}-${PRE_TRG}
MODELTYPE = transformer-align
NR = 1
MODEL_BASENAME = ${MODEL}.${MODELTYPE}.model${NR}
MODEL_VALIDLOG = ${MODEL}.${MODELTYPE}.valid${NR}.log
MODEL_TRAINLOG = ${MODEL}.${MODELTYPE}.train${NR}.log
MODEL_START = ${WORKDIR}/${MODEL_BASENAME}.npz
MODEL_FINAL = ${WORKDIR}/${MODEL_BASENAME}.npz.best-perplexity.npz
MODEL_VOCABTYPE = yml
MODEL_VOCAB = ${WORKDIR}/${MODEL}.vocab.${MODEL_VOCABTYPE}
MODEL_DECODER = ${MODEL_FINAL}.decoder.yml
## test set translation and scores
TEST_TRANSLATION = ${WORKDIR}/${TESTSET}.${MODEL}${NR}.${MODELTYPE}.${SRC}.${TRG}
TEST_EVALUATION = ${TEST_TRANSLATION}.eval
TEST_COMPARISON = ${TEST_TRANSLATION}.compare
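## Example (hypothetical values, for illustration; assumes MODEL_SUBDIR and
## TRAINSIZE are empty): with DATASET = opus, PRE_SRC = PRE_TRG = spm32k,
## MODELTYPE = transformer-align and NR = 1, the names above resolve to
##   MODEL            = opus.spm32k-spm32k
##   MODEL_BASENAME   = opus.spm32k-spm32k.transformer-align.model1
##   MODEL_FINAL      = ${WORKDIR}/opus.spm32k-spm32k.transformer-align.model1.npz.best-perplexity.npz
##   MODEL_VOCAB      = ${WORKDIR}/opus.spm32k-spm32k.vocab.yml
## and, with TESTSET = Tatoeba, SRC = sv and TRG = fi,
##   TEST_TRANSLATION = ${WORKDIR}/Tatoeba.opus.spm32k-spm32k1.transformer-align.sv.fi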
## parameters for running Marian NMT
MARIAN_GPUS = 0
MARIAN_EXTRA =
MARIAN_VALID_FREQ = 10000
MARIAN_SAVE_FREQ = ${MARIAN_VALID_FREQ}
MARIAN_DISP_FREQ = ${MARIAN_VALID_FREQ}
MARIAN_EARLY_STOPPING = 10
MARIAN_VALID_MINI_BATCH = 16
MARIAN_MAXI_BATCH = 500
MARIAN_DROPOUT = 0.1
MARIAN_MAX_LENGTH = 500
MARIAN_DECODER_GPU = -b 12 -n1 -d ${MARIAN_GPUS} --mini-batch 8 --maxi-batch 32 --maxi-batch-sort src \
--max-length ${MARIAN_MAX_LENGTH} --max-length-crop
MARIAN_DECODER_CPU = -b 12 -n1 --cpu-threads ${HPC_CORES} --mini-batch 8 --maxi-batch 32 --maxi-batch-sort src \
--max-length ${MARIAN_MAX_LENGTH} --max-length-crop
MARIAN_DECODER_FLAGS = ${MARIAN_DECODER_GPU}
## TODO: currently marianNMT crashes with workspace > 26000
ifeq (${GPU},p100)
MARIAN_WORKSPACE = 13000
else ifeq (${GPU},v100)
# MARIAN_WORKSPACE = 30000
# MARIAN_WORKSPACE = 26000
MARIAN_WORKSPACE = 24000
# MARIAN_WORKSPACE = 18000
# MARIAN_WORKSPACE = 16000
else
MARIAN_WORKSPACE = 10000
endif
ifeq (${shell nvidia-smi | grep failed | wc -l},1)
MARIAN = ${MARIANCPU}
MARIAN_DECODER_FLAGS = ${MARIAN_DECODER_CPU}
MARIAN_EXTRA = --cpu-threads ${HPC_CORES}
endif
ifneq ("$(wildcard ${TRAIN_WEIGHTS})","")
MARIAN_TRAIN_WEIGHTS = --data-weighting ${TRAIN_WEIGHTS}
endif
### training a model with Marian NMT
##
## NR allows training several models for proper ensembling
## (with a shared vocab)
##
## DANGER: if several models are started at the same time
## there is a race condition when creating the vocab!
ifdef NR
SEED=${NR}${NR}${NR}${NR}
else
SEED=1234
endif
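## Example (illustration): with NR = 1 the seed becomes 1111, with NR = 2 it
## becomes 2222, so every ensemble member trains with a distinct but
## reproducible seed; if NR is unset the fallback seed is 1234.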
## load model-specific configuration parameters
ifneq ($(wildcard ${WORKDIR}/config),)
include ${WORKDIR}/config
endif
## make some data size-specific configuration parameters
local-config: ${WORKDIR}/config
${WORKDIR}/config:
mkdir -p ${dir $@}
if [ -e ${TRAIN_SRC}.clean.${PRE_SRC}${TRAINSIZE}.gz ]; then \
s=`zcat ${TRAIN_SRC}.clean.${PRE_SRC}${TRAINSIZE}.gz | head -10000001 | wc -l`; \
else \
${MAKE} ${LOCAL_TRAIN_SRC}; \
s=`head -10000001 ${LOCAL_TRAIN_SRC} | wc -l`; \
fi; \
if [ $$s -gt 10000000 ]; then \
echo "# ${LANGPAIRSTR} bigger than 10 million" > $@; \
echo "GPUJOB_HPC_MEM = 8g" >> $@; \
echo "GPUJOB_SUBMIT = -multipu" >> $@; \
elif [ $$s -gt 1000000 ]; then \
echo "# ${LANGPAIRSTR} bigger than 1 million" > $@; \
echo "GPUJOB_HPC_MEM = 8g" >> $@; \
echo "GPUJOB_SUBMIT = " >> $@; \
echo "MARIAN_VALID_FREQ = 2500" >> $@; \
elif [ $$s -gt 500000 ]; then \
echo "# ${LANGPAIRSTR} bigger than 500k" > $@; \
echo "GPUJOB_HPC_MEM = 4g" >> $@; \
echo "GPUJOB_SUBMIT = " >> $@; \
echo "MARIAN_VALID_FREQ = 2500" >> $@; \
echo "MARIAN_WORKSPACE = 10000" >> $@; \
echo "BPESIZE = 12000" >> $@; \
elif [ $$s -gt 100000 ]; then \
echo "# ${LANGPAIRSTR} bigger than 100k" > $@; \
echo "GPUJOB_HPC_MEM = 4g" >> $@; \
echo "GPUJOB_SUBMIT = " >> $@; \
echo "MARIAN_VALID_FREQ = 1000" >> $@; \
echo "MARIAN_WORKSPACE = 5000" >> $@; \
echo "MARIAN_VALID_MINI_BATCH = 8" >> $@; \
echo "HELDOUTSIZE = 0" >> $@; \
echo "BPESIZE = 4000" >> $@; \
echo "DEVSIZE = 1000" >> $@; \
echo "TESTSIZE = 1000" >> $@; \
echo "DEVMINSIZE = 250" >> $@; \
elif [ $$s -gt 10000 ]; then \
echo "# ${LANGPAIRSTR} bigger than 10k" > $@; \
echo "GPUJOB_HPC_MEM = 4g" >> $@; \
echo "GPUJOB_SUBMIT = " >> $@; \
echo "MARIAN_VALID_FREQ = 1000" >> $@; \
echo "MARIAN_WORKSPACE = 3500" >> $@; \
echo "MARIAN_DROPOUT = 0.5" >> $@; \
echo "MARIAN_VALID_MINI_BATCH = 4" >> $@; \
echo "HELDOUTSIZE = 0" >> $@; \
echo "BPESIZE = 1000" >> $@; \
echo "DEVSIZE = 500" >> $@; \
echo "TESTSIZE = 1000" >> $@; \
echo "DEVMINSIZE = 100" >> $@; \
else \
echo "${LANGPAIRSTR} too small"; \
fi
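## Example (hypothetical corpus size, for illustration): for a language pair
## with roughly 600k clean training sentence pairs the recipe above would write
## a ${WORKDIR}/config similar to
##   # xx-yy bigger than 500k
##   GPUJOB_HPC_MEM = 4g
##   GPUJOB_SUBMIT =
##   MARIAN_VALID_FREQ = 2500
##   MARIAN_WORKSPACE = 10000
##   BPESIZE = 12000
## which the conditional include above picks up on subsequent make calls.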

File diff suppressed because it is too large


@@ -1,115 +0,0 @@
# -*-makefile-*-
# enable e-mail notification by setting EMAIL
WHOAMI = $(shell whoami)
ifeq ("$(WHOAMI)","tiedeman")
EMAIL = jorg.tiedemann@helsinki.fi
endif
# job-specific settings (overwrite if necessary)
# HPC_EXTRA: additional SBATCH commands
CPU_MODULES = gcc/6.2.0 mkl
GPU_MODULES = cuda-env/8 mkl
# GPU_MODULES = python-env/3.5.3-ml cuda-env/8 mkl
# GPU = k80
GPU = p100
NR_GPUS = 1
HPC_MEM = 8g
HPC_NODES = 1
HPC_CORES = 1
HPC_DISK = 500
HPC_QUEUE = serial
# HPC_MODULES = nlpl-opus python-env/3.4.1 efmaral moses
# HPC_MODULES = nlpl-opus moses cuda-env marian python-3.5.3-ml
HPC_MODULES = ${GPU_MODULES}
HPC_EXTRA =
WALLTIME = 72
DEVICE = cuda
LOADCPU = module load ${CPU_MODULES}
LOADGPU = module load ${GPU_MODULES}
MARIAN_WORKSPACE = 13000
ifeq (${shell hostname},dx6-ibs-p2)
APPLHOME = /opt/tools
WORKHOME = ${shell realpath ${PWD}/work}
OPUSHOME = tiedeman@taito.csc.fi:/proj/nlpl/data/OPUS/
MOSESHOME = ${APPLHOME}/mosesdecoder
MARIAN = ${APPLHOME}/marian/build
LOADMODS = echo "nothing to load"
MARIAN_WORKSPACE = 10000
else ifeq (${shell hostname},dx7-nkiel-4gpu)
APPLHOME = /opt/tools
WORKHOME = ${shell realpath ${PWD}/work}
OPUSHOME = tiedeman@taito.csc.fi:/proj/nlpl/data/OPUS/
MOSESHOME = ${APPLHOME}/mosesdecoder
MARIAN = ${APPLHOME}/marian/build
LOADMODS = echo "nothing to load"
MARIAN_WORKSPACE = 10000
else ifneq ($(wildcard /wrk/tiedeman/research/),)
DATAHOME = /proj/OPUS/WMT19/data/${LANGPAIR}
# APPLHOME = ${USERAPPL}/tools
APPLHOME = /proj/memad/tools
WORKHOME = /wrk/tiedeman/research/marian/${SRC}-${TRG}
OPUSHOME = /proj/nlpl/data/OPUS
MOSESHOME = /proj/nlpl/software/moses/4.0-65c75ff/moses
# MARIAN = /proj/nlpl/software/marian/1.2.0
# MARIAN = /appl/ling/marian
MARIAN = ${HOME}/appl_taito/tools/marian/build-gpu
MARIANCPU = ${HOME}/appl_taito/tools/marian/build-cpu
LOADMODS = ${LOADGPU}
else
# CSCPROJECT = project_2001194
CSCPROJECT = project_2000309
DATAHOME = ${HOME}/work/opentrans/data/${LANGPAIR}
WORKHOME = ${shell realpath ${PWD}/work}
APPLHOME = ${HOME}/projappl
OPUSHOME = /scratch/project_2000661/nlpl/data/OPUS
MOSESHOME = ${APPLHOME}/mosesdecoder
EFLOMAL_HOME = ${APPLHOME}/eflomal/
MARIAN = ${APPLHOME}/marian/build
MARIANCPU = ${APPLHOME}/marian/build
# GPU_MODULES = cuda intel-mkl
GPU = v100
GPU_MODULES = python-env
CPU_MODULES = python-env
LOADMODS = echo "nothing to load"
HPC_QUEUE = small
MARIAN_WORKSPACE = 30000
endif
ifdef LOCAL_SCRATCH
TMPDIR = ${LOCAL_SCRATCH}
endif
WORDALIGN = ${EFLOMAL_HOME}align.py
ATOOLS = ${FASTALIGN_HOME}atools
MULTEVALHOME = ${APPLHOME}/multeval
MOSESSCRIPTS = ${MOSESHOME}/scripts
TOKENIZER = ${MOSESSCRIPTS}/tokenizer
SNMTPATH = ${APPLHOME}/subword-nmt/subword_nmt
# sorted languages and langpair used to match resources in OPUS
SORTLANGS = $(sort ${SRC} ${TRG})
LANGPAIR = ${firstword ${SORTLANGS}}-${lastword ${SORTLANGS}}
## for same language pairs: add numeric extension
ifeq (${SRC},$(TRG))
SRCEXT = ${SRC}1
TRGEXT = ${SRC}2
else
SRCEXT = ${SRC}
TRGEXT = ${TRG}
endif


@@ -1,474 +0,0 @@
# -*-makefile-*-
#
# make distribution packages
# and upload them to cPouta ObjectStorage
#
MODELSHOME = ${WORKHOME}/models
DIST_PACKAGE = ${MODELSHOME}/${LANGPAIRSTR}/${DATASET}.zip
## minimum BLEU score for models to be accepted as distribution package
MIN_BLEU_SCORE = 20
.PHONY: dist
dist: ${DIST_PACKAGE}
.PHONY: scores
scores:
${MAKE} FIND_EVAL_FILES=1 ${WORKHOME}/eval/scores.txt
## get the best model from all kinds of alternative setups
## in the following sub directories (add prefix work-)
ALT_MODEL_BASE = work-
# ALT_MODEL_DIR = bpe-old bpe-memad bpe spm-noalign bpe-align spm
# ALT_MODEL_DIR = spm langid
ALT_MODEL_DIR = langid
best_dist_all:
for l in $(sort ${shell ls ${ALT_MODEL_BASE}* | grep -- '-' | grep -v old | grep -v work}); do \
if [ `find work*/$$l -name '*.npz' | wc -l` -gt 0 ]; then \
d=`find work-spm/$$l -name '*.best-perplexity.npz' -exec basename {} \; | cut -f1 -d.`; \
${MAKE} SRCLANGS="`echo $$l | cut -f1 -d'-' | sed 's/\\+/ /g'`" \
TRGLANGS="`echo $$l | cut -f2 -d'-' | sed 's/\\+/ /g'`" \
DATASET=$$d best_dist; \
fi \
done
# best_dist_all:
# for l in $(sort ${shell ls work* | grep -- '-' | grep -v old | grep -v work}); do \
# if [ `find work*/$$l -name '${DATASET}${TRAINSIZE}.*.npz' | wc -l` -gt 0 ]; then \
# ${MAKE} SRCLANGS="`echo $$l | cut -f1 -d'-' | sed 's/\\+/ /g'`" \
# TRGLANGS="`echo $$l | cut -f2 -d'-' | sed 's/\\+/ /g'`" best_dist; \
# fi \
# done
## find the best model according to test set scores
## and make a distribution package from that model
## (BLEU needs to be above MIN_BLEU_SCORE)
## NEW: don't trust models tested with GNOME test sets!
## OLD version of finding the best model
## --> this didn't properly look at different variants in the same folder
best_dist_old:
@m=0;\
s=''; \
echo "------------------------------------------------"; \
echo "search best model for ${LANGPAIRSTR}"; \
for d in ${ALT_MODEL_DIR}; do \
e=`ls work-$$d/${LANGPAIRSTR}/val/*.trg | xargs basename | sed 's/\.trg//'`; \
echo "evaldata = $$e"; \
if [ "$$e" != "GNOME" ]; then \
if ls work-$$d/${LANGPAIRSTR}/$$e*.eval 1> /dev/null 2>&1; then \
b=`grep 'BLEU+' work-$$d/${LANGPAIRSTR}/$$e*.eval | cut -f3 -d' '`; \
if (( $$(echo "$$m-$$b < 1" |bc -l) )); then \
echo "$$d ($$b) is better or not much worse than $$s ($$m)!"; \
m=$$b; \
s=$$d; \
else \
echo "$$d ($$b) is worse than $$s ($$m)!"; \
fi \
fi \
fi \
done; \
echo "------------------------------------------------"; \
if [ "$$s" != "" ]; then \
if (( $$(echo "$$m > ${MIN_BLEU_SCORE}" |bc -l) )); then \
${MAKE} MODELSHOME=${PWD}/models \
MODELS_URL=https://object.pouta.csc.fi/OPUS-MT-models dist-$$s; \
fi; \
fi
## new version of finding the best model
## --> look at different model variants in each work-dir
## --> take only the best one to publish
best_dist:
@m=0;\
s=''; \
echo "------------------------------------------------"; \
echo "search best model for ${LANGPAIRSTR}"; \
for d in ${ALT_MODEL_DIR}; do \
e=`ls work-$$d/${LANGPAIRSTR}/test/*.trg | tail -1 | xargs basename | sed 's/\.trg//'`; \
echo "evaldata = $$e"; \
if [ "$$e" != "GNOME" ]; then \
I=`find work-$$d/${LANGPAIRSTR}/ -maxdepth 1 -name "$$e.*.eval" -printf "%f\n"`; \
for i in $$I; do \
x=`echo $$i | cut -f3 -d. | cut -f1 -d-`; \
y=`echo $$i | cut -f3 -d. | cut -f2 -d- | sed 's/[0-9]$$//'`; \
z=`echo $$i | cut -f2 -d.`; \
v=`echo $$i | cut -f4 -d.`; \
b=`grep 'BLEU+' work-$$d/${LANGPAIRSTR}/$$e.$$z.$$x-$$y[0-9].$$v.*.eval | cut -f3 -d' '`; \
if (( $$(echo "$$m-$$b < 0" |bc -l) )); then \
echo "$$d/$$i ($$b) is better than $$s ($$m)!"; \
m=$$b; \
E=$$i; \
s=$$d; \
else \
echo "$$d/$$i ($$b) is worse than $$s ($$m)!"; \
fi \
done; \
fi \
done; \
echo "--------------- best = $$m ($$s/$$E) ---------------------------------"; \
if [ "$$s" != "" ]; then \
if (( $$(echo "$$m > ${MIN_BLEU_SCORE}" |bc -l) )); then \
x=`echo $$E | cut -f3 -d. | cut -f1 -d-`; \
y=`echo $$E | cut -f3 -d. | cut -f2 -d- | sed 's/[0-9]$$//'`; \
z=`echo $$E | cut -f2 -d.`; \
v=`echo $$E | cut -f4 -d.`; \
${MAKE} \
MODELSHOME=${PWD}/models \
PRE_SRC=$$x PRE_TRG=$$y \
DATASET=$$z \
MODELTYPE=$$v \
MODELS_URL=https://object.pouta.csc.fi/OPUS-MT-models dist-$$s; \
fi; \
fi
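## Example (hypothetical file name, for illustration): an evaluation file
## Tatoeba.opus.spm32k-spm32k1.transformer-align.sv.fi.eval is decomposed by the
## cut/sed pipeline above into z = opus (DATASET), x = spm32k (PRE_SRC),
## y = spm32k (PRE_TRG, trailing model number stripped) and
## v = transformer-align (MODELTYPE); these values are then passed to the dist
## target for the best-scoring variant.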
## make a package for distribution
## old: only accept models with a certain evaluation score:
# if [ `grep BLEU $(TEST_EVALUATION) | cut -f3 -d ' ' | cut -f1 -d '.'` -ge ${MIN_BLEU_SCORE} ]; then \
DATE = ${shell date +%F}
MODELS_URL = https://object.pouta.csc.fi/OPUS-MT-dev
SKIP_DIST_EVAL = 0
## determine pre-processing type
ifneq ("$(wildcard ${BPESRCMODEL})","")
PREPROCESS_TYPE = bpe
PREPROCESS_SRCMODEL = ${BPESRCMODEL}
PREPROCESS_TRGMODEL = ${BPETRGMODEL}
PREPROCESS_DESCRIPTION = normalization + tokenization + BPE
else
PREPROCESS_TYPE = spm
PREPROCESS_SRCMODEL = ${SPMSRCMODEL}
PREPROCESS_TRGMODEL = ${SPMTRGMODEL}
PREPROCESS_DESCRIPTION = normalization + SentencePiece
endif
ifneq (${words ${TRGLANGS}},1)
PREPROCESS_SCRIPT = preprocess-${PREPROCESS_TYPE}-multi-target.sh
else
PREPROCESS_SCRIPT = preprocess-${PREPROCESS_TYPE}.sh
endif
POSTPROCESS_SCRIPT = postprocess-${PREPROCESS_TYPE}.sh
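## Example (illustration): with more than one entry in TRGLANGS the multi-target
## preprocessing script is selected above; translation input then needs a
## sentence-initial token of the form >>id<< (id = valid target language ID),
## e.g. ">>fi<< How are you?" to translate into Finnish, as also noted in the
## README written by the dist recipe below.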
## make the distribution package including test evaluation files and README
${DIST_PACKAGE}: ${MODEL_FINAL}
ifneq (${SKIP_DIST_EVAL},1)
@${MAKE} $(TEST_EVALUATION)
@${MAKE} $(TEST_COMPARISON)
endif
@mkdir -p ${dir $@}
@touch ${WORKDIR}/source.tcmodel
@echo "# $(notdir ${@:.zip=})-${DATE}.zip" > ${WORKDIR}/README.md
@echo '' >> ${WORKDIR}/README.md
@echo "* dataset: ${DATASET}" >> ${WORKDIR}/README.md
@echo "* model: ${MODELTYPE}" >> ${WORKDIR}/README.md
@echo "* pre-processing: ${PREPROCESS_DESCRIPTION}" >> ${WORKDIR}/README.md
@cp ${PREPROCESS_SRCMODEL} ${WORKDIR}/source.${PREPROCESS_TYPE}
@cp ${PREPROCESS_TRGMODEL} ${WORKDIR}/target.${PREPROCESS_TYPE}
@cp ${PREPROCESS_SCRIPT} ${WORKDIR}/preprocess.sh
@cp ${POSTPROCESS_SCRIPT} ${WORKDIR}/postprocess.sh
@if [ ${words ${TRGLANGS}} -gt 1 ]; then \
echo '* a sentence initial language token is required in the form of `>>id<<` (id = valid target language ID)' \
>> ${WORKDIR}/README.md; \
fi
@echo "* download: [$(notdir ${@:.zip=})-${DATE}.zip](${MODELS_URL}/${LANGPAIRSTR}/$(notdir ${@:.zip=})-${DATE}.zip)" >> ${WORKDIR}/README.md
if [ -e $(TEST_EVALUATION) ]; then \
echo "* test set translations: [$(notdir ${@:.zip=})-${DATE}.test.txt](${MODELS_URL}/${LANGPAIRSTR}/$(notdir ${@:.zip=})-${DATE}.test.txt)" >> ${WORKDIR}/README.md; \
echo "* test set scores: [$(notdir ${@:.zip=})-${DATE}.eval.txt](${MODELS_URL}/${LANGPAIRSTR}/$(notdir ${@:.zip=})-${DATE}.eval.txt)" >> ${WORKDIR}/README.md; \
echo '' >> ${WORKDIR}/README.md; \
echo '## Benchmarks' >> ${WORKDIR}/README.md; \
echo '' >> ${WORKDIR}/README.md; \
cd ${WORKDIR}; \
grep -H BLEU *.${DATASET}.${PRE_SRC}-${PRE_TRG}${NR}.${MODELTYPE}.*.eval | \
tr '.' '/' | cut -f1,5,6 -d '/' | tr '/' "." > $@.1; \
grep BLEU *.${DATASET}.${PRE_SRC}-${PRE_TRG}${NR}.${MODELTYPE}.*.eval | cut -f3 -d ' ' > $@.2; \
grep chrF *.${DATASET}.${PRE_SRC}-${PRE_TRG}${NR}.${MODELTYPE}.*.eval | cut -f3 -d ' ' > $@.3; \
echo '| testset | BLEU | chr-F |' >> README.md; \
echo '|-----------------------|-------|-------|' >> README.md; \
paste $@.1 $@.2 $@.3 | sed "s/\t/ | /g;s/^/| /;s/$$/ |/" | sort | uniq >> README.md; \
rm -f $@.1 $@.2 $@.3; \
fi
@cat ${WORKDIR}/README.md >> ${dir $@}README.md
@echo '' >> ${dir $@}README.md
@cp models/LICENSE ${WORKDIR}/
@chmod +x ${WORKDIR}/preprocess.sh
@sed -e 's# - .*/\([^/]*\)$$# - \1#' \
-e 's/beam-size: [0-9]*$$/beam-size: 6/' \
-e 's/mini-batch: [0-9]*$$/mini-batch: 1/' \
-e 's/maxi-batch: [0-9]*$$/maxi-batch: 1/' \
-e 's/relative-paths: false/relative-paths: true/' \
< ${MODEL_DECODER} > ${WORKDIR}/decoder.yml
@cd ${WORKDIR} && zip ${notdir $@} \
README.md LICENSE \
${notdir ${MODEL_FINAL}} \
${notdir ${MODEL_VOCAB}} \
${notdir ${MODEL_VALIDLOG}} \
${notdir ${MODEL_TRAINLOG}} \
source.* target.* decoder.yml preprocess.sh postprocess.sh
@mkdir -p ${dir $@}
@mv -f ${WORKDIR}/${notdir $@} ${@:.zip=}-${DATE}.zip
if [ -e $(TEST_EVALUATION) ]; then \
cp $(TEST_EVALUATION) ${@:.zip=}-${DATE}.eval.txt; \
cp $(TEST_COMPARISON) ${@:.zip=}-${DATE}.test.txt; \
fi
@rm -f $@
@cd ${dir $@} && ln -s $(notdir ${@:.zip=})-${DATE}.zip ${notdir $@}
@rm -f ${WORKDIR}/decoder.yml ${WORKDIR}/source.* ${WORKDIR}/target.*
@rm -f ${WORKDIR}/preprocess.sh ${WORKDIR}/postprocess.sh
## do this only if the flag is set
## --> avoid expensive wildcard searches each time make is called
ifeq (${FIND_EVAL_FILES},1)
EVALSCORES = ${patsubst ${WORKHOME}/%.eval,${WORKHOME}/eval/%.eval.txt,${wildcard ${WORKHOME}/*/*.eval}}
EVALTRANSL = ${patsubst ${WORKHOME}/%.compare,${WORKHOME}/eval/%.test.txt,${wildcard ${WORKHOME}/*/*.compare}}
endif
## upload to Object Storage
## Don't forget to run this before uploading!
# source project_2000661-openrc.sh
#
# - make upload ......... released models = all sub-dirs in models/
# - make upload-models .. trained models in current WORKHOME to OPUS-MT-dev
# - make upload-scores .. score file with benchmark results to OPUS-MT-eval
# - make upload-eval .... benchmark tests from models in WORKHOME
# - make upload-images .. images of VMs that run OPUS-MT
upload:
find models/ -type l | tar -cf models-links.tar -T -
find models/ -type l -delete
cd models && swift upload OPUS-MT-models --changed --skip-identical *
tar -xf models-links.tar
rm -f models-links.tar
swift post OPUS-MT-models --read-acl ".r:*"
swift list OPUS-MT-models > index.txt
swift upload OPUS-MT-models index.txt
rm -f index.txt
upload-models:
find ${WORKHOME}/models -type l | tar -cf dev-models-links.tar -T -
find ${WORKHOME}/models -type l -delete
cd ${WORKHOME} && swift upload OPUS-MT-dev --changed --skip-identical models
tar -xf dev-models-links.tar
rm -f dev-models-links.tar
swift post OPUS-MT-dev --read-acl ".r:*"
swift list OPUS-MT-dev > index.txt
swift upload OPUS-MT-dev index.txt
rm -f index.txt
upload-scores: scores
cd ${WORKHOME} && swift upload OPUS-MT-eval --changed --skip-identical eval/scores.txt
swift post OPUS-MT-eval --read-acl ".r:*"
upload-eval: scores
cd ${WORKHOME} && swift upload OPUS-MT-eval --changed --skip-identical eval
swift post OPUS-MT-eval --read-acl ".r:*"
upload-images:
cd ${WORKHOME} && swift upload OPUS-MT --changed --skip-identical \
--use-slo --segment-size 5G opusMT-images
swift post OPUS-MT-images --read-acl ".r:*"
## this is for the multeval scores
# ${WORKHOME}/eval/scores.txt: ${EVALSCORES}
# cd ${WORKHOME} && \
# grep base */*eval | cut -f1,2- -d '/' | cut -f1,6- -d '.' | \
# sed 's/-/ /' | sed 's/\// /' | sed 's/ ([^)]*)//g' |\
# sed 's/.eval:baseline//' | sed "s/ */\t/g" | sort > $@
${WORKHOME}/eval/scores.txt: ${EVALSCORES} ${EVALTRANSL}
cd ${WORKHOME} && grep BLEU */*k${NR}.*eval | cut -f1 -d '/' | tr '-' "\t" > $@.1
cd ${WORKHOME} && grep BLEU */*k${NR}.*eval | tr '.' '/' | cut -f2,6,7 -d '/' | tr '/' "." > $@.2
cd ${WORKHOME} && grep BLEU */*k${NR}.*eval | cut -f3 -d ' ' > $@.3
cd ${WORKHOME} && grep chrF */*k${NR}.*eval | cut -f3 -d ' ' > $@.4
paste $@.1 $@.2 $@.3 $@.4 > $@
rm -f $@.1 $@.2 $@.3 $@.4
${EVALSCORES}: # ${WORKHOME}/eval/%.eval.txt: ${WORKHOME}/models/%.eval
mkdir -p ${dir $@}
cp ${patsubst ${WORKHOME}/eval/%.eval.txt,${WORKHOME}/%.eval,$@} $@
# cp $< $@
${EVALTRANSL}: # ${WORKHOME}/eval/%.test.txt: ${WORKHOME}/models/%.compare
mkdir -p ${dir $@}
cp ${patsubst ${WORKHOME}/eval/%.test.txt,${WORKHOME}/%.compare,$@} $@
# cp $< $@
# ## dangerous area ....
# delete-eval:
# swift delete OPUS-MT eval
######################################################################
## handle old models in previous work directories
## obsolete now?
######################################################################
##-----------------------------------
## make packages from trained models
## check old-models as well!
TRAINED_NEW_MODELS = ${patsubst ${WORKHOME}/%/,%,${dir ${wildcard ${WORKHOME}/*/*.best-perplexity.npz}}}
# TRAINED_OLD_MODELS = ${patsubst ${WORKHOME}/old-models/%/,%,${dir ${wildcard ${WORKHOME}/old-models/*/*.best-perplexity.npz}}}
TRAINED_OLD_MODELS = ${patsubst ${WORKHOME}/old-models/%/,%,${dir ${wildcard ${WORKHOME}/old-models/??-??/*.best-perplexity.npz}}}
TRAINED_OLD_ONLY_MODELS = ${filter-out ${TRAINED_NEW_MODELS},${TRAINED_OLD_MODELS}}
TRAINED_NEW_ONLY_MODELS = ${filter-out ${TRAINED_OLD_MODELS},${TRAINED_NEW_MODELS}}
TRAINED_DOUBLE_MODELS = ${filter ${TRAINED_NEW_MODELS},${TRAINED_OLD_MODELS}}
## make packages of all new models
## unless there are better models in old-models
new-models-dist:
@echo "nr of extra models: ${words ${TRAINED_NEW_ONLY_MODELS}}"
for l in ${TRAINED_NEW_ONLY_MODELS}; do \
${MAKE} SRCLANGS="`echo $$l | cut -f1 -d'-' | sed 's/\\+/ /g'`" \
TRGLANGS="`echo $$l | cut -f2 -d'-' | sed 's/\\+/ /g'`" dist; \
done
@echo "trained double ${words ${TRAINED_DOUBLE_MODELS}}"
for l in ${TRAINED_DOUBLE_MODELS}; do \
n=`grep 'new best' work/$$l/*.valid1.log | tail -1 | cut -f12 -d ' '`; \
o=`grep 'new best' work/old-models/$$l/*.valid1.log | tail -1 | cut -f12 -d ' '`; \
if (( $$(echo "$$n < $$o" |bc -l) )); then \
${MAKE} SRCLANGS="`echo $$l | cut -f1 -d'-' | sed 's/\\+/ /g'`" \
TRGLANGS="`echo $$l | cut -f2 -d'-' | sed 's/\\+/ /g'`" dist; \
fi \
done
## fix decoder path in old-models (to run evaluations)
fix-decoder-path:
for l in ${wildcard ${WORKHOME}/old-models/*/*.best-perplexity.npz.decoder.yml}; do \
sed --in-place=.backup 's#/\(..-..\)/opus#/old-models/\1/opus#' $$l; \
sed --in-place=.backup2 's#/old-models/old-models/#/old-models/#' $$l; \
sed --in-place=.backup2 's#/old-models/old-models/#/old-models/#' $$l; \
done
## make packages of all old models from old-models
## unless there are better models in work (new models)
old-models-dist:
@echo "nr of extra models: ${words ${TRAINED_OLD_ONLY_MODELS}}"
for l in ${TRAINED_OLD_ONLY_MODELS}; do \
${MAKE} SRCLANGS="`echo $$l | cut -f1 -d'-' | sed 's/\\+/ /g'`" \
TRGLANGS="`echo $$l | cut -f2 -d'-' | sed 's/\\+/ /g'`" \
WORKHOME=${WORKHOME}/old-models \
MODELSHOME=${WORKHOME}/models dist; \
done
@echo "trained double ${words ${TRAINED_DOUBLE_MODELS}}"
for l in ${TRAINED_DOUBLE_MODELS}; do \
n=`grep 'new best' work/$$l/*.valid1.log | tail -1 | cut -f12 -d ' '`; \
o=`grep 'new best' work/old-models/$$l/*.valid1.log | tail -1 | cut -f12 -d ' '`; \
if (( $$(echo "$$o < $$n" |bc -l) )); then \
${MAKE} SRCLANGS="`echo $$l | cut -f1 -d'-' | sed 's/\\+/ /g'`" \
TRGLANGS="`echo $$l | cut -f2 -d'-' | sed 's/\\+/ /g'`" \
WORKHOME=${WORKHOME}/old-models \
MODELSHOME=${WORKHOME}/models dist; \
else \
echo "$$l: new better than old"; \
fi \
done
## old models had slightly different naming conventions
LASTSRC = ${lastword ${SRCLANGS}}
LASTTRG = ${lastword ${TRGLANGS}}
MODEL_OLD = ${MODEL_SUBDIR}${DATASET}${TRAINSIZE}.${PRE_SRC}-${PRE_TRG}.${LASTSRC}${LASTTRG}
MODEL_OLD_BASENAME = ${MODEL_OLD}.${MODELTYPE}.model${NR}
MODEL_OLD_FINAL = ${WORKDIR}/${MODEL_OLD_BASENAME}.npz.best-perplexity.npz
MODEL_OLD_VOCAB = ${WORKDIR}/${MODEL_OLD}.vocab.${MODEL_VOCABTYPE}
MODEL_OLD_DECODER = ${MODEL_OLD_FINAL}.decoder.yml
MODEL_TRANSLATE = ${WORKDIR}/${TESTSET}.${MODEL}${NR}.${MODELTYPE}.${SRC}.${TRG}
MODEL_OLD_TRANSLATE = ${WORKDIR}/${TESTSET}.${MODEL_OLD}${NR}.${MODELTYPE}.${SRC}.${TRG}
MODEL_OLD_VALIDLOG = ${MODEL_OLD}.${MODELTYPE}.valid${NR}.log
MODEL_OLD_TRAINLOG = ${MODEL_OLD}.${MODELTYPE}.train${NR}.log
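## Example (hypothetical values, for illustration): with SRCLANGS = sv,
## TRGLANGS = fi, DATASET = opus and PRE_SRC = PRE_TRG = spm32k, the old scheme
## gives MODEL_OLD = opus.spm32k-spm32k.svfi whereas the current MODEL is just
## opus.spm32k-spm32k; link-old-models below symlinks the old files to the new
## names so that evaluation and packaging can reuse them.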
link-old-models:
if [ ! -e ${MODEL_FINAL} ]; then \
if [ -e ${MODEL_OLD_FINAL} ]; then \
ln -s ${MODEL_OLD_FINAL} ${MODEL_FINAL}; \
ln -s ${MODEL_OLD_VOCAB} ${MODEL_VOCAB}; \
ln -s ${MODEL_OLD_DECODER} ${MODEL_DECODER}; \
fi \
fi
if [ ! -e ${MODEL_TRANSLATE} ]; then \
if [ -e ${MODEL_OLD_TRANSLATE} ]; then \
ln -s ${MODEL_OLD_TRANSLATE} ${MODEL_TRANSLATE}; \
fi \
fi
if [ ! -e ${WORKDIR}/${MODEL_VALIDLOG} ]; then \
if [ -e ${WORKDIR}/${MODEL_OLD_VALIDLOG} ]; then \
ln -s ${WORKDIR}/${MODEL_OLD_VALIDLOG} ${WORKDIR}/${MODEL_VALIDLOG}; \
ln -s ${WORKDIR}/${MODEL_OLD_TRAINLOG} ${WORKDIR}/${MODEL_TRAINLOG}; \
fi \
fi
rm -f ${MODEL_TRANSLATE}.eval
rm -f ${MODEL_TRANSLATE}.compare
ifneq (${DATASET},${OLDDATASET})
TRAINFILES = ${wildcard ${WORKDIR}/train/*${OLDDATASET}*.*}
MODELFILES = ${wildcard ${WORKDIR}/*${OLDDATASET}*.*}
DECODERFILES = ${wildcard ${WORKDIR}/*${OLDDATASET}*.decoder.yml}
endif
## fix model names from old style
## where models trained on a single corpus got the name
## of that corpus
## Now: always use 'opus' as the name of the default dataset
fix-model-names:
ifneq (${DATASET},${OLDDATASET})
for f in ${DECODERFILES}; do \
perl -i.bak -pe 's/${OLDDATASET}/${DATASET}/' $$f; \
done
for f in ${TRAINFILES}; do \
mv -f $$f `echo $$f | sed 's/${OLDDATASET}/${DATASET}/'`; \
ln -s `echo $$f | sed 's/${OLDDATASET}/${DATASET}/'` $$f; \
done
for f in ${MODELFILES}; do \
mv -f $$f `echo $$f | sed 's/${OLDDATASET}/${DATASET}/'`; \
done
endif


@@ -1,103 +0,0 @@
# -*-makefile-*-
DOCLEVEL_BENCHMARK_DATA = https://zenodo.org/record/3525366/files/doclevel-MT-benchmark-discomt2019.zip
## continue document-level training with a new context size
ifndef NEW_CONTEXT
NEW_CONTEXT = $$(($(CONTEXT_SIZE) + $(CONTEXT_SIZE)))
endif
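## Example (illustration): with the default CONTEXT_SIZE = 100 the shell
## arithmetic above evaluates to 200, i.e. continued document-level training
## doubles the context window unless NEW_CONTEXT is set explicitly, e.g.
##   make NEW_CONTEXT=300 continue-doctrain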
continue-doctrain:
mkdir -p ${WORKDIR}/${MODEL}
cp ${MODEL_VOCAB} ${WORKDIR}/${MODEL}/$(subst .doc${CONTEXT_SIZE},.doc${NEW_CONTEXT},${notdir ${MODEL_VOCAB}})
cp ${MODEL_FINAL} ${WORKDIR}/${MODEL}/$(subst .doc${CONTEXT_SIZE},.doc${NEW_CONTEXT},$(notdir ${MODEL_BASENAME})).npz
${MAKE} MODEL_SUBDIR=${MODEL}/ CONTEXT_SIZE=$(NEW_CONTEXT) train-doc
## continue training with a new dataset
ifndef NEW_DATASET
NEW_DATASET = OpenSubtitles
endif
continue-datatrain:
mkdir -p ${WORKDIR}/${MODEL}
cp ${MODEL_VOCAB} ${WORKDIR}/${MODEL}/$(patsubst ${DATASET}%,${NEW_DATASET}%,${notdir ${MODEL_VOCAB}})
cp ${MODEL_FINAL} ${WORKDIR}/${MODEL}/$(patsubst ${DATASET}%,${NEW_DATASET}%,${MODEL_BASENAME}).npz
if [ -e ${BPESRCMODEL} ]; then \
cp ${BPESRCMODEL} $(patsubst ${WORKDIR}/train/${DATASET}%,${WORKDIR}/train/${NEW_DATASET}%,${BPESRCMODEL}); \
cp ${BPETRGMODEL} $(patsubst ${WORKDIR}/train/${DATASET}%,${WORKDIR}/train/${NEW_DATASET}%,${BPETRGMODEL}); \
fi
if [ -e ${SPMSRCMODEL} ]; then \
cp ${SPMSRCMODEL} $(patsubst ${WORKDIR}/train/${DATASET}%,${WORKDIR}/train/${NEW_DATASET}%,${SPMSRCMODEL}); \
cp ${SPMTRGMODEL} $(patsubst ${WORKDIR}/train/${DATASET}%,${WORKDIR}/train/${NEW_DATASET}%,${SPMTRGMODEL}); \
fi
${MAKE} MODEL_SUBDIR=${MODEL}/ DATASET=$(NEW_DATASET) train
# MARIAN_EXTRA="${MARIAN_EXTRA} --no-restore-corpus"
## use the doclevel benchmark data sets
%-ost:
${MAKE} ost-datasets
${MAKE} SRCLANGS=en TRGLANGS=de \
TRAINSET=ost-train \
DEVSET=ost-dev \
TESTSET=ost-test \
DEVSIZE=100000 TESTSIZE=100000 HELDOUTSIZE=0 \
${@:-ost=}
ost-datasets: ${DATADIR}/${PRE}/ost-train.de-en.clean.de.gz \
${DATADIR}/${PRE}/ost-train.de-en.clean.en.gz \
${DATADIR}/${PRE}/ost-dev.de-en.clean.de.gz \
${DATADIR}/${PRE}/ost-dev.de-en.clean.en.gz \
${DATADIR}/${PRE}/ost-test.de-en.clean.de.gz \
${DATADIR}/${PRE}/ost-test.de-en.clean.en.gz
.INTERMEDIATE: ${WORKHOME}/doclevel-MT-benchmark
## download the doc-level data set
${WORKHOME}/doclevel-MT-benchmark:
wget -O $@.zip ${DOCLEVEL_BENCHMARK_DATA}?download=1
unzip -d ${dir $@} $@.zip
rm -f $@.zip
${DATADIR}/${PRE}/ost-train.de-en.clean.de.gz: ${WORKHOME}/doclevel-MT-benchmark
mkdir -p ${dir $@}
$(TOKENIZER)/detokenizer.perl -l de < $</train/ost.tok.de | gzip -c > $@
${DATADIR}/${PRE}/ost-train.de-en.clean.en.gz: ${WORKHOME}/doclevel-MT-benchmark
mkdir -p ${dir $@}
$(TOKENIZER)/detokenizer.perl -l en < $</train/ost.tok.en | gzip -c > $@
${DATADIR}/${PRE}/ost-dev.de-en.clean.de.gz: ${WORKHOME}/doclevel-MT-benchmark
mkdir -p ${dir $@}
$(TOKENIZER)/detokenizer.perl -l de < $</dev/ost.tok.de | gzip -c > $@
${DATADIR}/${PRE}/ost-dev.de-en.clean.en.gz: ${WORKHOME}/doclevel-MT-benchmark
mkdir -p ${dir $@}
$(TOKENIZER)/detokenizer.perl -l en < $</dev/ost.tok.en | gzip -c > $@
${DATADIR}/${PRE}/ost-test.de-en.clean.de.gz: ${WORKHOME}/doclevel-MT-benchmark
mkdir -p ${dir $@}
$(TOKENIZER)/detokenizer.perl -l de < $</test/ost.tok.de | gzip -c > $@
${DATADIR}/${PRE}/ost-test.de-en.clean.en.gz: ${WORKHOME}/doclevel-MT-benchmark
mkdir -p ${dir $@}
$(TOKENIZER)/detokenizer.perl -l en < $</test/ost.tok.en | gzip -c > $@


@@ -1,140 +0,0 @@
# -*-makefile-*-
#
# settings of the environment
# - essential tools and their paths
# - system-specific settings
#
## modules to be loaded in sbatch scripts
CPU_MODULES = gcc/6.2.0 mkl
GPU_MODULES = cuda-env/8 mkl
# GPU_MODULES = python-env/3.5.3-ml cuda-env/8 mkl
# job-specific settings (overwrite if necessary)
# HPC_EXTRA: additional SBATCH commands
NR_GPUS = 1
HPC_NODES = 1
HPC_DISK = 500
HPC_QUEUE = serial
HPC_GPUQUEUE = gpu
# HPC_MODULES = nlpl-opus python-env/3.4.1 efmaral moses
# HPC_MODULES = nlpl-opus moses cuda-env marian python-3.5.3-ml
HPC_MODULES = ${GPU_MODULES}
HPC_EXTRA =
MEM = 4g
THREADS = 1
WALLTIME = 72
## set variables with HPC prefix
ifndef HPC_TIME
HPC_TIME = ${WALLTIME}:00
endif
ifndef HPC_CORES
HPC_CORES = ${THREADS}
endif
ifndef HPC_MEM
HPC_MEM = ${MEM}
endif
# GPU = k80
GPU = p100
DEVICE = cuda
LOADCPU = module load ${CPU_MODULES}
LOADGPU = module load ${GPU_MODULES}
ifeq (${shell hostname},dx6-ibs-p2)
APPLHOME = /opt/tools
WORKHOME = ${shell realpath ${PWD}/work-langid}
OPUSHOME = tiedeman@taito.csc.fi:/proj/nlpl/data/OPUS/
MOSESHOME = ${APPLHOME}/mosesdecoder
MARIAN = ${APPLHOME}/marian/build
LOADMODS = echo "nothing to load"
else ifeq (${shell hostname},dx7-nkiel-4gpu)
APPLHOME = /opt/tools
WORKHOME = ${shell realpath ${PWD}/work-langid}
OPUSHOME = tiedeman@taito.csc.fi:/proj/nlpl/data/OPUS/
MOSESHOME = ${APPLHOME}/mosesdecoder
MARIAN = ${APPLHOME}/marian/build
LOADMODS = echo "nothing to load"
else ifneq ($(wildcard /wrk/tiedeman/research),)
DATAHOME = /proj/OPUS/WMT19/data/${LANGPAIR}
# APPLHOME = ${USERAPPL}/tools
APPLHOME = /proj/memad/tools
WORKHOME = /wrk/tiedeman/research/Opus-MT/work-langid
OPUSHOME = /proj/nlpl/data/OPUS
MOSESHOME = /proj/nlpl/software/moses/4.0-65c75ff/moses
# MARIAN = /proj/nlpl/software/marian/1.2.0
# MARIAN = /appl/ling/marian
MARIAN = ${HOME}/appl_taito/tools/marian/build-gpu
MARIANCPU = ${HOME}/appl_taito/tools/marian/build-cpu
LOADMODS = ${LOADGPU}
else
# CSCPROJECT = project_2001194
# CSCPROJECT = project_2000309
CSCPROJECT = project_2002688
DATAHOME = ${HOME}/work/opentrans/data/${LANGPAIR}
WORKHOME = ${shell realpath ${PWD}/work-langid}
APPLHOME = ${HOME}/projappl
# OPUSHOME = /scratch/project_2000661/nlpl/data/OPUS
OPUSHOME = /projappl/nlpl/data/OPUS
MOSESHOME = ${APPLHOME}/mosesdecoder
EFLOMAL_HOME = ${APPLHOME}/eflomal/
# MARIAN = ${APPLHOME}/marian/build
# MARIANCPU = ${APPLHOME}/marian/build
# MARIAN = ${APPLHOME}/marian-dev/build-spm
# MARIANCPU = ${APPLHOME}/marian-dev/build-cpu
# MARIANSPM = ${APPLHOME}/marian-dev/build-spm
MARIAN = ${APPLHOME}/marian-dev/build-new
MARIANCPU = ${APPLHOME}/marian-dev/build-new
MARIANSPM = ${APPLHOME}/marian-dev/build-new
# GPU_MODULES = cuda intel-mkl
GPU = v100
GPU_MODULES = python-env
# gcc/8.3.0 boost/1.68.0-mpi intel-mkl
CPU_MODULES = python-env
LOADMODS = echo "nothing to load"
HPC_QUEUE = small
endif
ifdef LOCAL_SCRATCH
TMPDIR = ${LOCAL_SCRATCH}
endif
## other tools and their locations
WORDALIGN = ${EFLOMAL_HOME}align.py
ATOOLS = ${FASTALIGN_HOME}atools
MULTEVALHOME = ${APPLHOME}/multeval
MOSESSCRIPTS = ${MOSESHOME}/scripts
TOKENIZER = ${MOSESSCRIPTS}/tokenizer
SNMTPATH = ${APPLHOME}/subword-nmt/subword_nmt
## SentencePiece
SPM_HOME = ${MARIANSPM}
# SORT = sort -T ${TMPDIR} -S 50% --parallel=${THREADS}
SORT = sort -T ${TMPDIR} --parallel=${THREADS}
SHUFFLE = ${shell which terashuf 2>/dev/null}
ifeq (${SHUFFLE},)
SHUFFLE = ${SORT} --random-sort
endif


@@ -1,263 +0,0 @@
# -*-makefile-*-
#
# generic implicit targets that make our life a bit easier
## extension -all: run something over all language pairs, e.g.
## make wordalign-all
## this goes sequentially over all language pairs
## for the parallelizable version of this: look at %-all-parallel
%-all:
for l in ${ALL_LANG_PAIRS}; do \
${MAKE} SRCLANGS="`echo $$l | cut -f1 -d'-' | sed 's/\\+/ /g'`" \
TRGLANGS="`echo $$l | cut -f2 -d'-' | sed 's/\\+/ /g'`" ${@:-all=}; \
done
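## Example (illustration): `make eval-all` runs one sequential ${MAKE} call per
## entry in ALL_LANG_PAIRS; for a language-pair directory "da+sv-fi" under
## ${WORKHOME} this becomes
##   make SRCLANGS="da sv" TRGLANGS="fi" eval
## (the sed call turns the '+' separators back into spaces).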
# run something over all language pairs that have trained models
## - make eval-allmodels
## - make dist-allmodels
%-allmodels:
for l in ${ALL_LANG_PAIRS}; do \
m=`find ${WORKHOME}/$$l -maxdepth 1 -name '*.best-perplexity.npz' -printf "%f\n"`; \
for i in $$m; do \
s=`echo $$l | cut -f1 -d'-' | sed 's/\\+/ /g'`; \
t=`echo $$l | cut -f2 -d'-' | sed 's/\\+/ /g'`; \
d=`echo $$i | cut -f1 -d.`; \
x=`echo $$i | cut -f2 -d. | cut -f1 -d-`; \
y=`echo $$i | cut -f2 -d. | cut -f2 -d-`; \
v=`echo $$i | cut -f3 -d.`; \
echo "model = $$i"; \
echo "dataset = $$d"; \
echo "src-lang = $$s"; \
echo "trg-lang = $$t"; \
echo "pre-src = $$x"; \
echo "pre-trg = $$y"; \
echo "type = $$v"; \
${MAKE} \
SRCLANGS="$$s" TRGLANGS="$$t" \
DATASET=$$d \
PRE_SRC=$$x PRE_TRG=$$y \
MODELTYPE=$$v ${@:-allmodels=}; \
done \
done
## OLD: doesn't work for different model variants
##
# %-allmodels:
# for l in ${ALL_LANG_PAIRS}; do \
# if [ `find ${WORKHOME}/$$l -name '*.${PRE_SRC}-${PRE_TRG}.*.best-perplexity.npz' | wc -l` -gt 0 ]; then \
# ${MAKE} SRCLANGS="`echo $$l | cut -f1 -d'-' | sed 's/\\+/ /g'`" \
# TRGLANGS="`echo $$l | cut -f2 -d'-' | sed 's/\\+/ /g'`" ${@:-allmodels=}; \
# fi \
# done
listallmodels:
@m=`find ${WORKDIR} -maxdepth 1 -name '*.best-perplexity.npz' -printf "%f\n"`; \
for i in $$m; do \
d=`echo $$i | cut -f1 -d.`; \
s=`echo $$i | cut -f2 -d. | cut -f1 -d-`; \
t=`echo $$i | cut -f2 -d. | cut -f2 -d-`; \
v=`echo $$i | cut -f3 -d.`; \
echo "model = $$i"; \
echo "dataset = $$d"; \
echo "pre-src = $$s"; \
echo "pre-trg = $$t"; \
echo "type = $$v"; \
done
## only bilingual models
%-allbilingual:
for l in ${ALL_BILINGUAL_MODELS}; do \
if [ `find ${WORKHOME}/$$l -name '*.${PRE_SRC}-${PRE_TRG}.*.best-perplexity.npz' | wc -l` -gt 0 ]; then \
${MAKE} SRCLANGS="`echo $$l | cut -f1 -d'-' | sed 's/\\+/ /g'`" \
TRGLANGS="`echo $$l | cut -f2 -d'-' | sed 's/\\+/ /g'`" ${@:-allbilingual=}; \
fi \
done
## only multilingual models
%-allmultilingual:
for l in ${ALL_MULTILINGUAL_MODELS}; do \
if [ `find ${WORKHOME}/$$l -name '*.${PRE_SRC}-${PRE_TRG}.*.best-perplexity.npz' | wc -l` -gt 0 ]; then \
${MAKE} SRCLANGS="`echo $$l | cut -f1 -d'-' | sed 's/\\+/ /g'`" \
TRGLANGS="`echo $$l | cut -f2 -d'-' | sed 's/\\+/ /g'`" ${@:-allmultilingual=}; \
fi \
done
## run something over all language pairs but make it possible to do it in parallel, for example
## - make dist-all-parallel
%-all-parallel:
${MAKE} $(subst -all-parallel,,${patsubst %,$@__%-run-for-langpair,${ALL_LANG_PAIRS}})
## run a command that includes the langpair, for example
## make wordalign__en-da+sv-run-for-langpair ...... runs wordalign with SRCLANGS="en" TRGLANGS="da sv"
## What is this good for?
## ---> can run many lang-pairs in parallel instead of having a for loop that runs sequentially
%-run-for-langpair:
${MAKE} SRCLANGS='$(subst +, ,$(firstword $(subst -, ,${lastword ${subst __, ,${@:-run-for-langpair=}}})))' \
TRGLANGS='$(subst +, ,$(lastword $(subst -, ,${lastword ${subst __, ,${@:-run-for-langpair=}}})))' \
${shell echo $@ | sed 's/__.*$$//'}
## right-to-left model
%-RL:
${MAKE} MODEL=${MODEL}-RL \
MARIAN_EXTRA="${MARIAN_EXTRA} --right-left" \
${@:-RL=}
## include all backtranslation data as well in training
## start from the pre-trained opus model if it exists
BT_MODEL = ${MODEL_SUBDIR}opus+bt${TRAINSIZE}.${PRE_SRC}-${PRE_TRG}
BT_MODEL_BASE = ${BT_MODEL}.${MODELTYPE}.model${NR}
BT_MODEL_START = ${WORKDIR}/${BT_MODEL_BASE}.npz
BT_MODEL_VOCAB = ${WORKDIR}/${BT_MODEL}.vocab.${MODEL_VOCABTYPE}
# %-add-backtranslations:
%-bt:
ifneq (${wildcard ${MODEL_FINAL}},)
ifeq (${wildcard ${BT_MODEL_START}},)
cp ${MODEL_FINAL} ${BT_MODEL_START}
cp ${MODEL_VOCAB} ${BT_MODEL_VOCAB}
endif
endif
rm -f ${WORKHOME}/${LANGPAIRSTR}/train.submit
${MAKE} DATASET=${DATASET}+bt USE_BACKTRANS=1 \
MARIAN_EARLY_STOPPING=15 \
${@:-bt=}
# CLEAN_TRAIN_SRC="${CLEAN_TRAIN_SRC} ${BACKTRANS_SRC}" \
# CLEAN_TRAIN_TRG="${CLEAN_TRAIN_TRG} ${BACKTRANS_TRG}" \
## run a multigpu job (2 or 4 GPUs)
%-multigpu %-gpu0123:
${MAKE} NR_GPUS=4 MARIAN_GPUS='0 1 2 3' $(subst -gpu0123,,${@:-multigpu=})
%-twogpu %-gpu01:
${MAKE} NR_GPUS=2 MARIAN_GPUS='0 1' $(subst -gpu01,,${@:-twogpu=})
%-gpu23:
${MAKE} NR_GPUS=2 MARIAN_GPUS='2 3' ${@:-gpu23=}
## run on CPUs (translate-cpu, eval-cpu, translate-ensemble-cpu, ...)
%-cpu:
${MAKE} MARIAN=${MARIANCPU} \
LOADMODS='${LOADCPU}' \
MARIAN_DECODER_FLAGS="${MARIAN_DECODER_CPU}" \
${@:-cpu=}
## document level models
%-doc:
${MAKE} WORKHOME=${shell realpath ${PWD}/work-spm} \
PRE=norm \
PRE_SRC=spm${SRCBPESIZE:000=}k.doc${CONTEXT_SIZE} \
PRE_TRG=spm${TRGBPESIZE:000=}k.doc${CONTEXT_SIZE} \
${@:-doc=}
## sentence-piece models
%-spm:
${MAKE} WORKHOME=${shell realpath ${PWD}/work-spm} \
PRE=norm \
PRE_SRC=spm${SRCBPESIZE:000=}k \
PRE_TRG=spm${TRGBPESIZE:000=}k \
${@:-spm=}
## sentence-piece models with space-separated strings
%-nospace:
${MAKE} WORKHOME=${shell realpath ${PWD}/work-nospace} \
PRE=simple \
SPMEXTRA=--split_by_whitespace=false \
${@:-nospace=}
## with SPM models trained on monolingual data
%-monospm: ${SPMSRCMONO} ${SPMTRGMONO}
${MAKE} WORKHOME=${shell realpath ${PWD}/work-monospm} \
SPMSRCMODEL=${SPMSRCMONO} \
SPMTRGMODEL=${SPMTRGMONO} \
PRE_SRC=spm${SRCBPESIZE:000=}k \
PRE_TRG=spm${TRGBPESIZE:000=}k \
${@:-monospm=}
%-spm-noalign:
${MAKE} WORKHOME=${shell realpath ${PWD}/work-spm-noalign} \
MODELTYPE=transformer \
PRE=norm \
PRE_SRC=spm${SRCBPESIZE:000=}k \
PRE_TRG=spm${TRGBPESIZE:000=}k \
${@:-spm-noalign=}
## sentence-piece models with langid-filtering (new default)
%-langid:
${MAKE} WORKHOME=${shell realpath ${PWD}/work-langid} \
PRE=simple \
${@:-langid=}
## sentence-piece models with langid-filtering but without guided alignment
%-langid-noalign:
${MAKE} WORKHOME=${shell realpath ${PWD}/work-langid-noalign} \
MODELTYPE=transformer \
PRE=simple \
${@:-langid-noalign=}
## BPE models
%-bpe:
${MAKE} WORKHOME=${shell realpath ${PWD}/work-bpe} \
PRE=tok \
MODELTYPE=transformer \
PRE_SRC=bpe${SRCBPESIZE:000=}k \
PRE_TRG=bpe${TRGBPESIZE:000=}k \
${@:-bpe=}
%-bpe-align:
${MAKE} WORKHOME=${shell realpath ${PWD}/work-bpe-align} \
PRE=tok \
PRE_SRC=bpe${SRCBPESIZE:000=}k \
PRE_TRG=bpe${TRGBPESIZE:000=}k \
${@:-bpe-align=}
%-bpe-memad:
${MAKE} WORKHOME=${shell realpath ${PWD}/work-bpe-memad} \
PRE=tok \
MODELTYPE=transformer \
PRE_SRC=bpe${SRCBPESIZE:000=}k \
PRE_TRG=bpe${TRGBPESIZE:000=}k \
${@:-bpe-memad=}
%-bpe-old:
${MAKE} WORKHOME=${shell realpath ${PWD}/work-bpe-old} \
PRE=tok \
MODELTYPE=transformer \
PRE_SRC=bpe${SRCBPESIZE:000=}k \
PRE_TRG=bpe${TRGBPESIZE:000=}k \
${@:-bpe-old=}
## for the inbuilt sentence-piece segmentation:
# PRE_SRC=txt PRE_TRG=txt
# MARIAN=${MARIAN}-spm
# MODEL_VOCABTYPE=spm


@@ -1,227 +0,0 @@
# -*-makefile-*-
## evaluation tool
## fails on puhti
easse:
git clone https://github.com/feralvam/easse.git
cd $@ && pip install --user .
## do we need this?
text-simplification-evaluation:
git clone git@github.com:facebookresearch/text-simplification-evaluation.git
cd text-simplification-evaluation && \
pip install -e . --user && \
pip install --user -r requirements.txt
#---------------------------------------------------------------------
# simplification test set
#---------------------------------------------------------------------
simplification:
git clone https://github.com/cocoxu/simplification.git
testsets/en-en/simplification.en1.gz: simplification
mkdir -p ${dir $@}
cut -f2 simplification/data/turkcorpus/truecased/test.8turkers.organized.tsv |\
${TOKENIZER}/detokenizer.perl -l en | \
gzip -c > $@
testsets/en-en/simplification.en2.gz: simplification
mkdir -p ${dir $@}
cut -f3 simplification/data/turkcorpus/truecased/test.8turkers.organized.tsv |\
${TOKENIZER}/detokenizer.perl -l en | \
gzip -c > $@
simplify-testset: testsets/en-en/simplification.en1.gz testsets/en-en/simplification.en2.gz
#---------------------------------------------------------------------
# document-level data
#---------------------------------------------------------------------
simplewiki-docdata: ${DATADIR}/${PRE}/simplewiki_v2_doc${MARIAN_MAX_LENGTH}-test.en-en.en1.raw \
${DATADIR}/${PRE}/simplewiki_v2_doc${MARIAN_MAX_LENGTH}-test.en-en.en2.raw \
${DATADIR}/${PRE}/simplewiki_v2_doc${MARIAN_MAX_LENGTH}-dev.en-en.en1.raw \
${DATADIR}/${PRE}/simplewiki_v2_doc${MARIAN_MAX_LENGTH}-dev.en-en.en2.raw \
${DATADIR}/${PRE}/simplewiki_v2_doc${MARIAN_MAX_LENGTH}-train.en-en.en1.raw \
${DATADIR}/${PRE}/simplewiki_v2_doc${MARIAN_MAX_LENGTH}-train.en-en.en2.raw
${DATADIR}/simplify/simplewiki_v2_doc${MARIAN_MAX_LENGTH}.en-en.en1.raw: ${HOME}/work/SimplifyRussian/data/simplification_datasets/simplewiki_docs.csv
mkdir -p ${dir $@}
tail -n +2 $< | cut -f2 | sed 's/^"//;s/ "$$//' > $@.en1
tail -n +2 $< | cut -f3 | sed 's/^"//;s/ "$$//' > $@.en2
$(MOSESSCRIPTS)/training/clean-corpus-n.perl $@ en1 en2 $@.clean 0 ${MARIAN_MAX_LENGTH}
${TOKENIZER}/detokenizer.perl -l en < $@.clean.en1 > $@
${TOKENIZER}/detokenizer.perl -l en < $@.clean.en2 > $(@:.en1.raw=.en2.raw)
rm -f $@.en1 $@.en2 $@.clean.en1 $@.clean.en2
${DATADIR}/simplify/simplewiki_v2_doc${MARIAN_MAX_LENGTH}.en-en.en2.raw: ${DATADIR}/simplify/simplewiki_v2_doc${MARIAN_MAX_LENGTH}.en-en.en1.raw
@echo "done!"
${DATADIR}/${PRE}/simplewiki_v2_doc${MARIAN_MAX_LENGTH}-test.en-en.en1.raw: ${DATADIR}/simplify/simplewiki_v2_doc${MARIAN_MAX_LENGTH}.en-en.en1.raw
head -1000 $< > $@
${DATADIR}/${PRE}/simplewiki_v2_doc${MARIAN_MAX_LENGTH}-test.en-en.en2.raw: ${DATADIR}/simplify/simplewiki_v2_doc${MARIAN_MAX_LENGTH}.en-en.en2.raw
head -1000 $< > $@
${DATADIR}/${PRE}/simplewiki_v2_doc${MARIAN_MAX_LENGTH}-dev.en-en.en1.raw: ${DATADIR}/simplify/simplewiki_v2_doc${MARIAN_MAX_LENGTH}.en-en.en1.raw
head -2000 $< | tail -1000 > $@
${DATADIR}/${PRE}/simplewiki_v2_doc${MARIAN_MAX_LENGTH}-dev.en-en.en2.raw: ${DATADIR}/simplify/simplewiki_v2_doc${MARIAN_MAX_LENGTH}.en-en.en2.raw
head -2000 $< | tail -1000 > $@
${DATADIR}/${PRE}/simplewiki_v2_doc${MARIAN_MAX_LENGTH}-train.en-en.en1.raw: ${DATADIR}/simplify/simplewiki_v2_doc${MARIAN_MAX_LENGTH}.en-en.en1.raw
tail -n +2001 $< > $@
${DATADIR}/${PRE}/simplewiki_v2_doc${MARIAN_MAX_LENGTH}-train.en-en.en2.raw: ${DATADIR}/simplify/simplewiki_v2_doc${MARIAN_MAX_LENGTH}.en-en.en2.raw
tail -n +2001 $< > $@
#---------------------------------------------------------------------
# data from https://cs.pomona.edu/~dkauchak/simplification/
#---------------------------------------------------------------------
SIMPLEWIKI_DATA1_URL = https://cs.pomona.edu/~dkauchak/simplification/data.v1/
SIMPLEWIKI_DATA2_URL = https://cs.pomona.edu/~dkauchak/simplification/data.v2/
SIMPLEWIKI_DATA1 = data.v1.split
SIMPLEWIKI_DATA2_SENT = sentence-aligned.v2
SIMPLEWIKI_DATA2_DOC = document-aligned.v2
# v1 - standard split
${WORKHOME}/simplewiki/${SIMPLEWIKI_DATA1}:
mkdir -p ${dir $@}
wget -O $@.tar.gz ${SIMPLEWIKI_DATA1_URL}/${SIMPLEWIKI_DATA1}.tar.gz
tar -C ${dir $@} -xzf $@.tar.gz
rm -f $@.tar.gz
${TOKENIZER}/detokenizer.perl -l en < $@/normal.training.txt > ${DATADIR}/${PRE}/simplewiki_v1-training.en-en.en1.raw
${TOKENIZER}/detokenizer.perl -l en < $@/simple.training.txt > ${DATADIR}/${PRE}/simplewiki_v1-training.en-en.en2.raw
${TOKENIZER}/detokenizer.perl -l en < $@/normal.tuning.txt > ${DATADIR}/${PRE}/simplewiki_v1-tuning.en-en.en1.raw
${TOKENIZER}/detokenizer.perl -l en < $@/simple.tuning.txt > ${DATADIR}/${PRE}/simplewiki_v1-tuning.en-en.en2.raw
${TOKENIZER}/detokenizer.perl -l en < $@/normal.testing.txt > ${DATADIR}/${PRE}/simplewiki_v1-testing.en-en.en1.raw
${TOKENIZER}/detokenizer.perl -l en < $@/simple.testing.txt > ${DATADIR}/${PRE}/simplewiki_v1-testing.en-en.en2.raw
## v2 - sentence aligned - my split
${WORKHOME}/simplewiki/${SIMPLEWIKI_DATA2_SENT}:
mkdir -p ${dir $@}
wget -O $@.tar.gz ${SIMPLEWIKI_DATA2_URL}/${SIMPLEWIKI_DATA2_SENT}.tar.gz
tar -C ${dir $@} -xzf $@.tar.gz
rm -f $@.tar.gz
cut -f3 $@/normal.aligned | tail -n +10001 |\
${TOKENIZER}/detokenizer.perl -l en > ${DATADIR}/${PRE}/simplewiki_v2_sent-training.en-en.en1.raw
cut -f3 $@/simple.aligned | tail -n +10001 |\
${TOKENIZER}/detokenizer.perl -l en > ${DATADIR}/${PRE}/simplewiki_v2_sent-training.en-en.en2.raw
cut -f3 $@/normal.aligned | head -10000 | tail -5000 |\
${TOKENIZER}/detokenizer.perl -l en > ${DATADIR}/${PRE}/simplewiki_v2_sent-tuning.en-en.en1.raw
cut -f3 $@/simple.aligned | head -10000 | tail -5000 |\
${TOKENIZER}/detokenizer.perl -l en > ${DATADIR}/${PRE}/simplewiki_v2_sent-tuning.en-en.en2.raw
cut -f3 $@/normal.aligned | head -5000 |\
${TOKENIZER}/detokenizer.perl -l en > ${DATADIR}/${PRE}/simplewiki_v2_sent-testing.en-en.en1.raw
cut -f3 $@/simple.aligned | head -5000 |\
${TOKENIZER}/detokenizer.perl -l en > ${DATADIR}/${PRE}/simplewiki_v2_sent-testing.en-en.en2.raw
simplewiki-v1-english-prepare: ${WORKHOME}/simplewiki/${SIMPLEWIKI_DATA1}
## train a simplification model from simplewiki for English
%-simplewiki-v1-english: ${WORKHOME}/simplewiki/${SIMPLEWIKI_DATA1}
rm -f ${WORKDIR}/*.submit
${MAKE} DATASET=simplewiki_v1 \
BPEMODELNAME=simplewiki_v1 \
TRAINSET=simplewiki_v1-training \
DEVSET=simplewiki_v1-tuning \
TESTSET=simplewiki_v1-testing \
HELDOUTSIZE=0 \
SRCLANGS=en TRGLANGS=en \
${@:-simplewiki-v1-english=}
%-simplewiki-v2sent-english: ${WORKHOME}/simplewiki/${SIMPLEWIKI_DATA2_SENT}
rm -f ${WORKDIR}/*.submit
${MAKE} DATASET=simplewiki_v2_sent \
BPEMODELNAME=simplewiki_v2_sent \
TRAINSET=simplewiki_v2_sent-training \
DEVSET=simplewiki_v2_sent-tuning \
TESTSET=simplewiki_v2_sent-testing \
HELDOUTSIZE=0 \
SRCLANGS=en TRGLANGS=en \
${@:-simplewiki-v2sent-english=}
%-simplewiki-v2doc-english: simplewiki-docdata
rm -f ${WORKDIR}/*.submit
${MAKE} DATASET=simplewiki_v2_doc \
BPEMODELNAME=simplewiki_v2_doc${MARIAN_MAX_LENGTH} \
TRAINSET=simplewiki_v2_doc${MARIAN_MAX_LENGTH}-train \
DEVSET=simplewiki_v2_doc${MARIAN_MAX_LENGTH}-dev \
TESTSET=simplewiki_v2_doc${MARIAN_MAX_LENGTH}-test \
HELDOUTSIZE=0 MAX_NR_TOKENS=${MARIAN_MAX_LENGTH} \
SRCLANGS=en TRGLANGS=en \
MARIAN_VALID_FREQ=1000 \
MARIAN_WORKSPACE=5000 \
MARIAN_MAX_LENGTH=500 \
HPC_MEM=12g \
${@:-simplewiki-v2doc-english=}
# MARIAN_EXTRA="--max-length-crop" \
%-simplewiki-v2sent+doc-english: ${WORKHOME}/simplewiki/${SIMPLEWIKI_DATA2_SENT} simplewiki-docdata
rm -f ${WORKDIR}/*.submit
${MAKE} DATASET=simplewiki_v2_sent+doc${MARIAN_MAX_LENGTH} \
BPEMODELNAME=simplewiki_v2-sent+doc${MARIAN_MAX_LENGTH} \
TRAINSET="simplewiki_v2_doc${MARIAN_MAX_LENGTH}-train simplewiki_v2_sent-training" \
DEVSET=simplewiki_v2_doc${MARIAN_MAX_LENGTH}-dev \
TESTSET=simplewiki_v2_doc${MARIAN_MAX_LENGTH}-test \
HELDOUTSIZE=0 MAX_NR_TOKENS=${MARIAN_MAX_LENGTH} \
SRCLANGS=en TRGLANGS=en \
MARIAN_VALID_FREQ=1000 \
MARIAN_WORKSPACE=5000 \
HPC_MEM=16g \
${@:-simplewiki-v2sent+doc-english=}
#---------------------------------------------------------------------
# data from https://github.com/XingxingZhang/dress
#---------------------------------------------------------------------
SIMPLEWIKI_LARGE_URL = https://github.com/louismartin/dress-data/raw/master/data-simplification.tar.bz2
SIMPLEWIKI_LARGE = data-simplification/wikilarge
${WORKHOME}/simplewiki/${SIMPLEWIKI_LARGE}:
mkdir -p ${dir $@}
wget -O $@.tar.bz2 ${SIMPLEWIKI_LARGE_URL}
tar -C ${dir $@} -xf $@.tar.bz2
rm -f $@.tar.bz2
${TOKENIZER}/detokenizer.perl -l en < $@/wiki.full.aner.train.src > ${DATADIR}/${PRE}/simplewiki_large-train.en-en.en1.raw
${TOKENIZER}/detokenizer.perl -l en < $@/wiki.full.aner.train.dst > ${DATADIR}/${PRE}/simplewiki_large-train.en-en.en2.raw
${TOKENIZER}/detokenizer.perl -l en < $@/wiki.full.aner.valid.src > ${DATADIR}/${PRE}/simplewiki_large-tune.en-en.en1.raw
${TOKENIZER}/detokenizer.perl -l en < $@/wiki.full.aner.valid.dst > ${DATADIR}/${PRE}/simplewiki_large-tune.en-en.en2.raw
${TOKENIZER}/detokenizer.perl -l en < $@/wiki.full.aner.test.src > ${DATADIR}/${PRE}/simplewiki_large-test.en-en.en1.raw
${TOKENIZER}/detokenizer.perl -l en < $@/wiki.full.aner.test.dst > ${DATADIR}/${PRE}/simplewiki_large-test.en-en.en2.raw
simplelarge-english-prepare: ${WORKHOME}/simplewiki/${SIMPLEWIKI_LARGE}
%-simplewikilarge-english: ${WORKHOME}/simplewiki/${SIMPLEWIKI_LARGE}
rm -f ${WORKDIR}/*.submit
${MAKE} DATASET=simplewiki_large \
BPEMODELNAME=simplewiki_large \
TRAINSET=simplewiki_large-train \
DEVSET=simplewiki_large-tune \
TESTSET=simplewiki_large-test \
HELDOUTSIZE=0 \
SRCLANGS=en TRGLANGS=en \
${@:-simplewikilarge-english=}


@@ -1,99 +0,0 @@
# -*-makefile-*-
# enable e-mail notification by setting EMAIL
WHOAMI = $(shell whoami)
ifeq ("$(WHOAMI)","tiedeman")
EMAIL = jorg.tiedemann@helsinki.fi
endif
##---------------------------------------------
## submit jobs
##---------------------------------------------
## submit job to gpu queue
%.submit:
mkdir -p ${WORKDIR}
echo '#!/bin/bash -l' > $@
echo '#SBATCH -J "${LANGSTR}-${DATASET}-${@:.submit=}"' >>$@
echo '#SBATCH -o ${LANGSTR}-${DATASET}-${@:.submit=}.out.%j' >> $@
echo '#SBATCH -e ${LANGSTR}-${DATASET}-${@:.submit=}.err.%j' >> $@
echo '#SBATCH --mem=${HPC_MEM}' >> $@
echo '#SBATCH --exclude=r18g08' >> $@
ifdef EMAIL
echo '#SBATCH --mail-type=END' >> $@
echo '#SBATCH --mail-user=${EMAIL}' >> $@
endif
echo '#SBATCH -n 1' >> $@
echo '#SBATCH -N 1' >> $@
echo '#SBATCH -p ${HPC_GPUQUEUE}' >> $@
ifeq (${shell hostname --domain},bullx)
echo '#SBATCH --account=${CSCPROJECT}' >> $@
echo '#SBATCH --gres=gpu:${GPU}:${NR_GPUS},nvme:${HPC_DISK}' >> $@
else
echo '#SBATCH --gres=gpu:${GPU}:${NR_GPUS}' >> $@
endif
echo '#SBATCH -t ${HPC_TIME}:00' >> $@
echo 'module use -a /proj/nlpl/modules' >> $@
for m in ${GPU_MODULES}; do \
echo "module load $$m" >> $@; \
done
echo 'module list' >> $@
echo 'cd $${SLURM_SUBMIT_DIR:-.}' >> $@
echo 'pwd' >> $@
echo 'echo "Starting at `date`"' >> $@
echo 'srun ${MAKE} ${MAKEARGS} ${@:.submit=}' >> $@
echo 'echo "Finishing at `date`"' >> $@
sbatch $@
mkdir -p ${WORKDIR}
mv $@ ${WORKDIR}/$@
# echo 'srun ${MAKE} NR=${NR} MODELTYPE=${MODELTYPE} DATASET=${DATASET} SRC=${SRC} TRG=${TRG} PRE_SRC=${PRE_SRC} PRE_TRG=${PRE_TRG} ${MAKEARGS} ${@:.submit=}' >> $@
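## Example (hypothetical values, for illustration): `make train.submit` with
## SRCLANGS=sv, DATASET=opus, HPC_MEM=8g, HPC_GPUQUEUE=gpu, GPU=v100, NR_GPUS=1
## and WALLTIME=72 writes and submits a batch script along the lines of
##   #!/bin/bash -l
##   #SBATCH -J "sv-opus-train"
##   #SBATCH --mem=8g
##   #SBATCH -n 1
##   #SBATCH -N 1
##   #SBATCH -p gpu
##   #SBATCH --gres=gpu:v100:1
##   #SBATCH -t 72:00:00
##   ...
##   srun make train
## and then moves the script into ${WORKDIR}.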
## submit job to cpu queue
%.submitcpu:
mkdir -p ${WORKDIR}
echo '#!/bin/bash -l' > $@
echo '#SBATCH -J "${@:.submitcpu=}"' >>$@
echo '#SBATCH -o ${@:.submitcpu=}.out.%j' >> $@
echo '#SBATCH -e ${@:.submitcpu=}.err.%j' >> $@
echo '#SBATCH --mem=${HPC_MEM}' >> $@
ifdef EMAIL
echo '#SBATCH --mail-type=END' >> $@
echo '#SBATCH --mail-user=${EMAIL}' >> $@
endif
ifeq (${shell hostname --domain},bullx)
echo '#SBATCH --account=${CSCPROJECT}' >> $@
echo '#SBATCH --gres=nvme:${HPC_DISK}' >> $@
# echo '#SBATCH --exclude=r05c49' >> $@
# echo '#SBATCH --exclude=r07c51' >> $@
# echo '#SBATCH --exclude=r06c50' >> $@
endif
echo '#SBATCH -n ${HPC_CORES}' >> $@
echo '#SBATCH -N ${HPC_NODES}' >> $@
echo '#SBATCH -p ${HPC_QUEUE}' >> $@
echo '#SBATCH -t ${HPC_TIME}:00' >> $@
echo '${HPC_EXTRA}' >> $@
echo 'module use -a /proj/nlpl/modules' >> $@
for m in ${CPU_MODULES}; do \
echo "module load $$m" >> $@; \
done
echo 'module list' >> $@
echo 'cd $${SLURM_SUBMIT_DIR:-.}' >> $@
echo 'pwd' >> $@
echo 'echo "Starting at `date`"' >> $@
echo '${MAKE} -j ${HPC_CORES} ${MAKEARGS} ${@:.submitcpu=}' >> $@
echo 'echo "Finishing at `date`"' >> $@
sbatch $@
mkdir -p ${WORKDIR}
mv $@ ${WORKDIR}/$@
# echo '${MAKE} -j ${HPC_CORES} DATASET=${DATASET} SRC=${SRC} TRG=${TRG} PRE_SRC=${PRE_SRC} PRE_TRG=${PRE_TRG} ${MAKEARGS} ${@:.submitcpu=}' >> $@
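## illustrative usage for the CPU queue, mirroring the GPU case above:
##   make HPC_CORES=2 HPC_MEM=12g bilingual-dynamic.submitcpu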

View File

@ -1,981 +0,0 @@
# -*-makefile-*-
#
# pre-defined tasks that we might want to run
#
include Makefile.simplify
MEMAD_LANGS = de en fi fr nl sv
# GERMANIC = en de nl fy af da fo is no nb nn sv
GERMANIC = de nl fy af da fo is no nb nn sv
WESTGERMANIC = de nl af fy
SCANDINAVIAN = da fo is no nb nn sv
ROMANCE = ca es fr gl it la oc pt_br pt ro
FINNO_UGRIC = fi et hu
PIVOT = en
ifndef LANGS
LANGS = ${MEMAD_LANGS}
endif
unidirectional:
${MAKE} data
${MAKE} WALLTIME=72 HPC_MEM=4g HPC_CORES=1 train.submit-multigpu
bilingual:
${MAKE} data
${MAKE} WALLTIME=72 HPC_MEM=4g HPC_CORES=1 train.submit-multigpu
${MAKE} reverse-data
${MAKE} WALLTIME=72 HPC_MEM=4g HPC_CORES=1 SRCLANGS="${TRGLANGS}" TRGLANGS="${SRCLANGS}" \
train.submit-multigpu
bilingual-big:
${MAKE} data
${MAKE} WALLTIME=72 HPC_MEM=8g HPC_CORES=1 train.submit-multigpu
${MAKE} reverse-data
${MAKE} WALLTIME=72 HPC_MEM=8g HPC_CORES=1 SRCLANGS="${TRGLANGS}" TRGLANGS="${SRCLANGS}" \
train.submit-multigpu
bilingual-medium:
${MAKE} data
${MAKE} WALLTIME=72 HPC_MEM=4g HPC_CORES=1 \
MARIAN_VALID_FREQ=5000 MARIAN_WORKSPACE=10000 train.submit
${MAKE} reverse-data
${MAKE} WALLTIME=72 HPC_MEM=4g HPC_CORES=1 SRCLANGS="${TRGLANGS}" TRGLANGS="${SRCLANGS}" \
MARIAN_VALID_FREQ=5000 MARIAN_WORKSPACE=10000 train.submit
bilingual-small:
${MAKE} data
${MAKE} WALLTIME=72 HPC_MEM=4g HPC_CORES=1 \
MARIAN_WORKSPACE=5000 MARIAN_VALID_FREQ=2500 train.submit
${MAKE} reverse-data
${MAKE} WALLTIME=72 HPC_MEM=4g HPC_CORES=1 SRCLANGS="${TRGLANGS}" TRGLANGS="${SRCLANGS}" \
MARIAN_WORKSPACE=5000 MARIAN_VALID_FREQ=2500 train.submit
multilingual:
${MAKE} SRCLANGS="${LANGS}" TRGLANGS="${LANGS}" data
${MAKE} SRCLANGS="${LANGS}" TRGLANGS="${LANGS}" \
WALLTIME=72 HPC_CORES=1 HPC_MEM=4g train.submit-multigpu
multilingual-big:
${MAKE} SRCLANGS="${LANGS}" TRGLANGS="${LANGS}" data
${MAKE} SRCLANGS="${LANGS}" TRGLANGS="${LANGS}" \
WALLTIME=72 HPC_CORES=1 HPC_MEM=8g train.submit-multigpu
multilingual-medium:
${MAKE} SRCLANGS="${LANGS}" TRGLANGS="${LANGS}" data
${MAKE} SRCLANGS="${LANGS}" TRGLANGS="${LANGS}" \
MARIAN_VALID_FREQ=5000 MARIAN_WORKSPACE=10000 \
WALLTIME=72 HPC_CORES=1 HPC_MEM=4g train.submit-multigpu
all2pivot:
for l in ${filter-out ${PIVOT},${LANGS}}; do \
${MAKE} SRCLANGS="$$l" TRGLANGS="${PIVOT}" data; \
${MAKE} SRCLANGS="$$l" TRGLANGS="${PIVOT}" HPC_CORES=1 HPC_MEM=4g train.submit-multigpu; \
${MAKE} SRCLANGS="$$l" TRGLANGS="${PIVOT}" reverse-data; \
${MAKE} SRCLANGS="${PIVOT}" TRGLANGS="$$l" HPC_CORES=1 HPC_MEM=4g train.submit-multigpu; \
done
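## illustrative usage (a sketch; PIVOT defaults to en above):
##   make LANGS="de fr nl" PIVOT=en all2pivot
## trains one model per listed language into and out of the pivot language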
## submit train jobs with settings that depend on the size of the training data
## --> change WORKSPACE, MEM, nr of GPUs, validation frequency, stopping criterion
train-dynamic:
if [ ! -e "${WORKHOME}/${LANGPAIRSTR}/train.submit" ]; then \
${MAKE} data; \
s=`zcat ${TRAIN_SRC}.clean.${PRE_SRC}${TRAINSIZE}.gz | head -10000001 | wc -l`; \
if [ $$s -gt 10000000 ]; then \
echo "${LANGPAIRSTR} bigger than 10 million"; \
${MAKE} HPC_CORES=1 HPC_MEM=8g train.submit-multigpu; \
elif [ $$s -gt 1000000 ]; then \
echo "${LANGPAIRSTR} bigger than 1 million"; \
${MAKE} \
MARIAN_VALID_FREQ=2500 \
HPC_CORES=1 HPC_MEM=8g train.submit; \
elif [ $$s -gt 100000 ]; then \
echo "${LANGPAIRSTR} bigger than 100k"; \
${MAKE} \
MARIAN_VALID_FREQ=1000 \
MARIAN_WORKSPACE=5000 \
MARIAN_VALID_MINI_BATCH=8 \
HPC_CORES=1 HPC_MEM=4g train.submit; \
elif [ $$s -gt 10000 ]; then \
echo "${LANGPAIRSTR} bigger than 10k"; \
${MAKE} \
MARIAN_WORKSPACE=3500 \
MARIAN_VALID_MINI_BATCH=4 \
MARIAN_DROPOUT=0.5 \
MARIAN_VALID_FREQ=1000 \
HPC_CORES=1 HPC_MEM=4g train.submit; \
else \
echo "${LANGPAIRSTR} too small"; \
fi \
fi
# MARIAN_EARLY_STOPPING=5 \
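## illustrative usage:
##   make SRCLANGS=fi TRGLANGS=sv train-dynamic
## the chosen workspace, memory and validation frequency depend on the size of the training data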
bilingual-dynamic: train-dynamic
if [ "${SRCLANGS}" != "${TRGLANGS}" ]; then \
${MAKE} reverse-data; \
${MAKE} TRGLANGS="${SRCLANGS}" SRCLANGS="${TRGLANGS}" train-dynamic; \
fi
#-------------------------------------------------------------------
# OPUS-MT
#-------------------------------------------------------------------
# iso639 = aa ab ae af ak am an ar as av ay az ba be bg bh bi bm bn bo br bs ca ce ch cn co cr cs cu cv cy da de dv dz ee el en eo es et eu fa ff fi fj fo fr fy ga gd gl gn gr gu gv ha hb he hi ho hr ht hu hy hz ia id ie ig ik io is it iu ja jp jv ka kg ki kj kk kl km kn ko kr ks ku kv kw ky la lb lg li ln lo lt lu lv me mg mh mi mk ml mn mo mr ms mt my na nb nd ne ng nl nn no nr nv ny oc oj om or os pa pi pl po ps pt qu rm rn ro ru rw ry sa sc sd se sg sh si sk sl sm sn so sq sr ss st su sv sw ta tc te tg th ti tk tl tn to tr ts tt tw ty ua ug uk ur uz ve vi vo wa wo xh yi yo za zh zu
# NO_MEMAD = ${filter-out fi sv de fr nl,${iso639}}
#"de_AT de_CH de_DE de"
#"en_AU en_CA en_GB en_NZ en_US en_ZA en"
#"it_IT if"
#"es_AR es_CL es_CO es_CR es_DO es_EC es_ES es_GT es_HN es_MX es_NI es_PA es_PE es_PR es_SV es_UY es_VE es"
#"eu_ES eu"
#"hi_IN hi"
#"fr_BE fr_CA fr_FR fr"
#"fa_AF fa_IR fa"
#"ar_SY ar_TN ar"
#"bn_IN bn"
#da_DK
#bg_BG
#nb_NO
#nl_BE nl_NL
#tr_TR
### ze_en - English subtitles in Chinese movies
OPUSLANGS = fi sv fr es de ar he "cmn cn yue ze_zh zh_cn zh_CN zh_HK zh_tw zh_TW zh_yue zhs zht zh" "pt_br pt_BR pt_PT pt" aa ab ace ach acm acu ada ady aeb aed ae afb afh af agr aha aii ain ajg aka ake akl ak aln alt alz amh ami amu am ang an aoc aoz apc ara arc arh arn arq ary arz ase asf ast as ati atj avk av awa aym ay azb "az_IR az" bal bam ban bar bas ba bbc bbj bci bcl bem ber "be_tarask be" bfi bg bho bhw bh bin bi bjn bm bn bnt bo bpy brx br bsn bs btg bts btx bua bug bum bvl bvy bxr byn byv bzj bzs cab cac cak cat cay ca "cbk_zam cbk" cce cdo ceb ce chf chj chk cho chq chr chw chy ch cjk cjp cjy ckb ckt cku cmo cnh cni cop co "crh_latn crh" crp crs cr csb cse csf csg csl csn csr cs cto ctu cuk cu cv cycl cyo cy daf da dga dhv dik din diq dje djk dng dop dsb dtp dty dua dv dws dyu dz ecs ee efi egl el eml enm eo esn et eu ewo ext fan fat fa fcs ff fil fj fkv fon foo fo frm frp frr fse fsl fuc ful fur fuv fy gaa gag gan ga gbi gbm gcf gcr gd gil glk gl gn gom gor gos got grc gr gsg gsm gss gsw guc gug gum gur guw gu gv gxx gym hai hak hau haw ha haz hb hch hds hif hi hil him hmn hne hnj hoc ho hrx hr hsb hsh hsn ht hup hus hu hyw hy hz ia iba ibg ibo id ie ig ike ik ilo inh inl ins io iro ise ish iso is it iu izh jak jam jap ja jbo jdt jiv jmx jp jsl jv kaa kab kac kam kar kau ka kbd kbh kbp kea kek kg kha kik kin ki kjh kj kk kl kmb kmr km kn koi kok kon koo ko kpv kqn krc kri krl kr ksh kss ksw ks kum ku kvk kv kwn kwy kw kxi ky kzj lad lam la lbe lb ldn lez lfn lg lij lin liv li lkt lld lmo ln lou lo loz lrc lsp ltg lt lua lue lun luo lus luy lu lv lzh lzz mad mai mam map_bms mau max maz mco mcp mdf men me mfe mfs mgm mgr mg mhr mh mic min miq mi mk mlg ml mnc mni mnw mn moh mos mo mrj mrq mr "ms_MY ms" mt mus mvv mwl mww mxv myv my mzn mzy nah nan nap na nba "nb_NO nb nn_NO nn nog no_nb no" nch nci ncj ncs ncx ndc "nds_nl nds" nd new ne ngl ngt ngu ng nhg nhk nhn nia nij niu nlv nl nnh non nov npi nqo nrm nr nso nst nv nya nyk nyn nyu ny nzi oar oc ojb oj oke olo om orm orv or osx os ota ote otk pag pam pan pap pau pa pbb pcd pck pcm pdc pdt pes pfl pid pih pis pi plt pl pms pmy pnb pnt pon pot po ppk ppl prg prl prs pso psp psr ps pys quc que qug qus quw quy qu quz qvi qvz qya rap rar rcf rif rmn rms rmy rm rnd rn rom ro rsl rue run rup ru rw ry sah sat sa sbs scn sco sc sd seh se sfs sfw sgn sgs sg shi shn shs shy sh sid simple si sjn sk sl sma sml sm sna sn som son sop sot so sqk sq "sr_ME sr srp" srm srn ssp ss stq st sux su svk swa swc swg swh sw sxn syr szl "ta_LK ta" tcf tcy tc tdt tdx tet te "tg_TJ tg" thv th tig tir tiv ti tkl tk tlh tll "tl_PH tl" tly tmh tmp tmw tn tob tog toh toi toj toki top to tpi tpw trv tr tsc tss ts tsz ttj tt tum tvl tw tyv ty tzh tzl tzo udm ug uk umb urh "ur_PK ur" usp uz vec vep ve "vi_VN vi" vls vmw vo vro vsl wae wal war wa wba wes wls wlv wol wo wuu xal xho xh xmf xpe yao yap yaq ybb yi yor yo yua zab zai zam za zdj zea zib zlm zne zpa zpg zsl zsm "zul zu" zza
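## note: the quoted groups above keep language variants together as one value in the loops below, e.g. (illustrative):
##   make SRCLANGS="pt_br pt_BR pt_PT pt" TRGLANGS=en bilingual-dynamic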
allopus2pivot:
for l in ${filter-out ${PIVOT},${OPUSLANGS}}; do \
${MAKE} WALLTIME=72 SRCLANGS="$$l" bilingual-dynamic; \
done
## this looks dangerous ....
allopus:
for s in ${OPUSLANGS}; do \
for t in ${OPUSLANGS}; do \
if [ ! -e "${WORKHOME}/$$s-$$t/train.submit" ]; then \
echo "${MAKE} WALLTIME=72 SRCLANGS=\"$$s\" SRCLANGS=\"$$t\" bilingual-dynamic"; \
${MAKE} WALLTIME=72 SRCLANGS="$$s" TRGLANGS="$$t" bilingual-dynamic; \
fi \
done \
done
all2en:
${MAKE} PIVOT=en allopus2pivot
#-------------------------------------------------------------------
# wikimedia tasks
#-------------------------------------------------------------------
as-en:
${MAKE} data-as-en
${MAKE} train-dynamic-as-en
${MAKE} reverse-data-as-en
${MAKE} train-dynamic-en-as
# ENAS_BPE = 4000
ENAS_BPE = 1000
%-as-en:
${MAKE} HELDOUTSIZE=0 DEVSIZE=1000 TESTSIZE=1000 DEVMINSIZE=100 BPESIZE=${ENAS_BPE} \
SRCLANGS="as" TRGLANGS="en" \
${@:-as-en=}
%-en-as:
${MAKE} HELDOUTSIZE=0 DEVSIZE=1000 TESTSIZE=1000 DEVMINSIZE=100 BPESIZE=${ENAS_BPE} \
SRCLANGS="en" TRGLANGS="as" \
${@:-en-as=}
#-------------------------------------------------------------------
# important secondary langs in Finland
#-------------------------------------------------------------------
fi-so:
${MAKE} data-fi-so
${MAKE} train-dynamic-fi-so
${MAKE} reverse-data-fi-so
${MAKE} train-dynamic-so-fi
%-fi-so:
${MAKE} HELDOUTSIZE=0 BPESIZE=4000 DEVSIZE=1000 TESTSIZE=1000 DEVMINSIZE=100 \
SRCLANGS=fi TRGLANGS=so data \
${@:-fi-so=}
%-so-fi:
${MAKE} HELDOUTSIZE=0 BPESIZE=4000 DEVSIZE=1000 TESTSIZE=1000 DEVMINSIZE=100 \
SRCLANGS=so TRGLANGS=fi data \
${@:-so-fi=}
fi-xx:
for l in ru et ar so ku fa sq vi th tr es pl; do \
${MAKE} WALLTIME=72 SRCLANGS="$$l" TRGLANGS=fi \
HPC_MEM=12g HPC_CORES=2 bilingual-dynamic.submitcpu; \
done
en-xx:
for l in so ku fa sq vi th; do \
${MAKE} WALLTIME=72 SRCLANGS="$$l" TRGLANGS=en \
HPC_MEM=12g HPC_CORES=2 bilingual-dynamic.submitcpu; \
done
fi-zh:
${MAKE} SRCLANGS=fi \
TRGLANGS="cmn cn yue ze_zh zh_cn zh_CN zh_HK zh_tw zh_TW zh_yue zhs zht zh" \
HPC_MEM=12g HPC_CORES=2 \
train-dynamic.submitcpu
${MAKE} TRGLANGS=fi \
SRCLANGS="cmn cn yue ze_zh zh_cn zh_CN zh_HK zh_tw zh_TW zh_yue zhs zht zh" \
HPC_MEM=12g HPC_CORES=2 \
train-dynamic.submitcpu
#-------------------------------------------------------------------
# experiments and models from the fiskmö project
#-------------------------------------------------------------------
## run things with individual data sets only
%-fiskmo:
${MAKE} TRAINSET=fiskmo ${@:-fiskmo=}
%-opensubtitles:
${MAKE} TRAINSET=OpenSubtitles ${@:-opensubtitles=}
%-finlex:
${MAKE} TRAINSET=Finlex ${@:-finlex=}
## make some tests with crawled fiskmo data
FISKMO-DATASETS = crawl-v2-2M \
crawl-v2-clean \
yle-rss-v2-100K \
yle-rss-v2-clean \
fiskmo-crawl-articles-v1 \
fiskmo-crawl-articles-v1-0.5 \
fiskmo-crawl-articles-v1-0.8 \
yle-2011-2018-articles-v1-0.8
## make fiskmo-fisv-data
## make fiskmo-fisv-train.submit
## make fiskmo-fisv-eval
## make fiskmo-fisv-eval-testsets
##
## make fiskmo-fisv-reverse-data
## make fiskmo-svfi-train.submit
## make fiskmo-fisv-eval
## make fiskmo-fisv-eval-testsets
fiskmo-missing:
for d in crawl-v2-2M crawl-v2-clean; do \
rm -f ${WORKHOME}/fi-sv/*.submit; \
${MAKE} SRCLANGS=fi TRGLANGS=sv DATASET=$$d TRAINSET=finnish-swedish-$$d \
MODELTYPE=transformer train-dynamic; \
done
fiskmo-svfi-missing:
for d in crawl-v2-clean; do \
rm -f ${WORKHOME}/sv-fi/*.submit; \
${MAKE} SRCLANGS=sv TRGLANGS=fi DATASET=$$d TRAINSET=finnish-swedish-$$d \
train-dynamic; \
done
fiskmo-fisv-%:
for d in ${FISKMO-DATASETS}; do \
rm -f ${WORKHOME}/fi-sv/*.submit; \
${MAKE} SRCLANGS=fi TRGLANGS=sv DATASET=$$d TRAINSET=finnish-swedish-$$d \
${patsubst fiskmo-fisv-%,%,$@}; \
done
rm -f ${WORKHOME}/fi-sv/*.submit
${MAKE} DATASET=fiskmo-crawl-all SRCLANGS=fi TRGLANGS=sv \
TRAINSET="finnish-swedish-crawl-v2-2M yle-rss-v2-100K fiskmo-crawl-articles-v1 yle-2011-2018-articles-v1-0.8" \
${patsubst fiskmo-fisv-%,%,$@}
rm -f ${WORKHOME}/fi-sv/*.submit
${MAKE} DATASET=fiskmo-crawl-clean SRCLANGS=fi TRGLANGS=sv \
TRAINSET="finnish-swedish-crawl-v2-clean yle-rss-v2-clean fiskmo-crawl-articles-v1-0.8" \
${patsubst fiskmo-fisv-%,%,$@}
fiskmo-svfi-%:
for d in ${FISKMO-DATASETS}; do \
rm -f ${WORKHOME}/sv-fi/*.submit; \
${MAKE} SRCLANGS=sv TRGLANGS=fi DATASET=$$d TRAINSET=finnish-swedish-$$d \
${patsubst fiskmo-svfi-%,%,$@}; \
done
rm -f ${WORKHOME}/sv-fi/*.submit
${MAKE} DATASET=fiskmo-crawl-all SRCLANGS=sv TRGLANGS=fi \
TRAINSET="finnish-swedish-crawl-v2-2M yle-rss-v2-100K fiskmo-crawl-articles-v1 yle-2011-2018-articles-v1-0.8" \
${patsubst fiskmo-svfi-%,%,$@}
rm -f ${WORKHOME}/sv-fi/*.submit
${MAKE} DATASET=fiskmo-crawl-clean SRCLANGS=sv TRGLANGS=fi \
TRAINSET="finnish-swedish-crawl-v2-clean yle-rss-v2-clean fiskmo-crawl-articles-v1-0.8" \
${patsubst fiskmo-svfi-%,%,$@}
#-------------------------------------------------------------------
# add THL backtranslation data (and also all other backtranslations)
#-------------------------------------------------------------------
%-thl:
rm -f ${WORKHOME}/${LANGPAIRSTR}/train.submit
${MAKE} BACKTRANS_SRC="${BACKTRANS_SRC} ${wildcard backtranslate/thl/${TRG}-${SRC}/latest/*.${SRCEXT}.gz}" \
BACKTRANS_TRG="${BACKTRANS_TRG} ${wildcard backtranslate/thl/${TRG}-${SRC}/latest/*.${TRGEXT}.gz}" \
DATASET=${DATASET}+bt+thl USE_BACKTRANS=1 \
MARIAN_EARLY_STOPPING=10 \
${@:-thl=}
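## illustrative usage (hypothetical, assuming back-translations exist under backtranslate/thl/):
##   make train-thl
## adds the matching back-translated data to the training set and retrains with USE_BACKTRANS=1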
#-------------------------------------------------------------------
# models for Celtic languages
#-------------------------------------------------------------------
## only OPUS data
# CELTIC_BPESIZE = 12000
CELTIC_BPESIZE = 4000
%-celtic-english-opus:
${MAKE} HELDOUTSIZE=0 BPESIZE=${CELTIC_BPESIZE} SRCLANGS="ga cy br gd kw gv" TRGLANGS=en ${@:-celtic-english-opus=}
%-english-celtic-opus:
${MAKE} HELDOUTSIZE=0 BPESIZE=${CELTIC_BPESIZE} TRGLANGS="ga cy br gd kw gv" SRCLANGS=en TRG=ga SRC=en ${@:-english-celtic-opus=}
# more data for cy-en
%-celtic-english: ${DATADIR}/${PRE}/dic.cy-en.clean.cy.gz
${MAKE} DATASET=opus+techiaith HELDOUTSIZE=0 BPESIZE=${CELTIC_BPESIZE} \
EXTRA_TRAINSET="CofnodYCynulliad Deddfwriaeth Meddalwedd rhestr_geiriau dic" \
SRCLANGS="ga cy br gd kw gv" TRGLANGS=en \
${@:-celtic-english=}
%-english-celtic: ${DATADIR}/${PRE}/dic.cy-en.clean.cy.gz
${MAKE} DATASET=opus+techiaith HELDOUTSIZE=0 BPESIZE=${CELTIC_BPESIZE} \
EXTRA_TRAINSET="CofnodYCynulliad Deddfwriaeth Meddalwedd rhestr_geiriau dic" \
TRGLANGS="ga cy br gd kw gv" SRCLANGS=en TRG=ga SRC=en \
${@:-english-celtic=}
## extra data from http://techiaith.cymru
# http://techiaith.cymru/corpws/Moses/CofnodYCynulliad/CofnodYCynulliad.tar.gz
# http://techiaith.cymru/corpws/Moses/Deddfwriaeth/Deddfwriaeth.tar.gz
# http://techiaith.cymru/corpws/Moses/Meddalwedd/Meddalwedd.tar.gz
# http://techiaith.cymru/alinio/rhestr_geiriau.tsv
# http://techiaith.cymru/alinio/hunalign/cy-en.dic
.PHONY: welsh-data
welsh-data: ${DATADIR}/${PRE}/dic.cy-en.clean.cy.gz
${DATADIR}/${PRE}/dic.cy-en.clean.cy.gz:
for c in CofnodYCynulliad Deddfwriaeth Meddalwedd; do \
wget http://techiaith.cymru/corpws/Moses/$$c/$$c.tar.gz; \
tar -xzf $$c.tar.gz; \
$(TOKENIZER)/detokenizer.perl -l cy < $$c.cy |\
$(MOSESSCRIPTS)/recaser/detruecase.perl | gzip -c > ${DATADIR}/${PRE}/$$c.cy-en.clean.cy.gz; \
$(TOKENIZER)/detokenizer.perl -l en < $$c.en |\
$(MOSESSCRIPTS)/recaser/detruecase.perl | gzip -c > ${DATADIR}/${PRE}/$$c.cy-en.clean.en.gz; \
rm -f $$c.tar.gz; \
done
wget http://techiaith.cymru/alinio/rhestr_geiriau.tsv
tail -n +16 rhestr_geiriau.tsv | cut -f1 | gzip -c > ${DATADIR}/${PRE}/rhestr_geiriau.cy-en.clean.en.gz
tail -n +16 rhestr_geiriau.tsv | cut -f2 | gzip -c > ${DATADIR}/${PRE}/rhestr_geiriau.cy-en.clean.cy.gz
rm -f rhestr_geiriau.tsv
wget http://techiaith.cymru/alinio/hunalign/cy-en.dic
cut -f1 -d '@' < cy-en.dic | sed 's/ *$$//' | gzip -c > ${DATADIR}/${PRE}/dic.cy-en.clean.en.gz
cut -f2 -d '@' < cy-en.dic | sed 's/^ *//' | gzip -c > ${DATADIR}/${PRE}/dic.cy-en.clean.cy.gz
${DATADIR}/${PRE}/%.cy-en.clean.cy.gz:
wget http://techiaith.cymru/corpws/Moses/$(patsubst %.cy-en.clean.cy.gz,%.tar.gz,${notdir $@})
tar -xzf $(patsubst %.cy-en.clean.cy.gz,%.tar.gz,${notdir $@})
$(TOKENIZER)/detokenizer.perl -l cy < $(patsubst %.cy-en.clean.cy.gz,%.cy,${notdir $@}) |\
$(MOSESSCRIPTS)/recaser/detruecase.perl | gzip -c > $@
$(TOKENIZER)/detokenizer.perl -l en < $(patsubst %.cy-en.clean.cy.gz,%.en,${notdir $@}) |\
$(MOSESSCRIPTS)/recaser/detruecase.perl | gzip -c > ${@:.cy.gz=.en.gz}
rm -f $(patsubst %.cy-en.clean.cy.gz,%.tar.gz,${notdir $@})
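## illustrative usage: fetch the extra Welsh resources before building the augmented Celtic models
##   make welsh-data
##   make data-celtic-english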
#-------------------------------------------------------------------
# multilingual model for Sami languages
#-------------------------------------------------------------------
# sami-data: fetch-sami-tmx convert-sami-tmx move-sami-data convert-sami-gloss
sami-data: fetch-sami-tmx convert-sami-tmx merge-sami-data convert-sami-gloss
${MAKE} data-sami
sami-train: train-dynamic-sami
sami-eval: eval-sami
sami-dist: dist-sami
GIELLATEKNO_HOME = https://victorio.uit.no/biggies/trunk
GIELLATEKNO_TM_HOME = ${GIELLATEKNO_HOME}/mt/omegat
GIELLATEKNO_SAMI_TM = fin-smn/tm/finsmn.tmx \
fin-sme/tm/finsme.tmx \
fin-sms/tm/finsms.tmx \
sme-smn/tm/smesmn.tmx \
sme-smj/tm/smesmj.tmx \
sme-nob/tm/smenob.tmx \
sme-sma/tm/smesma.tmx \
nob-smj/tm/nobsmj.tmx \
nob-sme/tm/nobsme-2012.tmx \
nob-sme/tm/nobsme-admin.tmx \
nob-sme/tm/nobsme-bible.tmx \
nob-sme/tm/nobsme-facta.tmx \
nob-sme/tm/nobsme-laws.tmx \
nob-sme/tm/nobsme-science.tmx \
nob-sma/tm/nobsma.tmx \
sma-nob/tm/smanob.tmx
## glossaries
convert-sami-gloss:
wget ${GIELLATEKNO_TM_HOME}/fin-smn/glossary/finsmn.utf8
cut -f1 finsmn.utf8 | gzip -c > ${DATADIR}/${PRE}/glossary.fi-smn.clean.fi.gz
cut -f2 finsmn.utf8 | gzip -c > ${DATADIR}/${PRE}/glossary.fi-smn.clean.smn.gz
rm -f finsmn.utf8
wget ${GIELLATEKNO_TM_HOME}/fin-sme/glossary/finsme.utf8
cut -f1 finsme.utf8 | gzip -c > ${DATADIR}/${PRE}/glossary.fi-se.clean.fi.gz
cut -f2 finsme.utf8 | gzip -c > ${DATADIR}/${PRE}/glossary.fi-se.clean.se.gz
rm -f finsme.utf8
wget ${GIELLATEKNO_TM_HOME}/fin-sms/glossary/finsms.utf8
cut -f1 finsms.utf8 | gzip -c > ${DATADIR}/${PRE}/glossary.fi-sms.clean.fi.gz
cut -f2 finsms.utf8 | gzip -c > ${DATADIR}/${PRE}/glossary.fi-sms.clean.sms.gz
rm -f finsms.utf8
wget ${GIELLATEKNO_TM_HOME}/sme-smn/glossary/smesmn.utf8
cut -f1 smesmn.utf8 | gzip -c > ${DATADIR}/${PRE}/glossary.se-smn.clean.se.gz
cut -f2 smesmn.utf8 | gzip -c > ${DATADIR}/${PRE}/glossary.se-smn.clean.smn.gz
rm -f smesmn.utf8
wget ${GIELLATEKNO_TM_HOME}/sme-smj/glossary/glossary.utf8
cut -f1 glossary.utf8 | gzip -c > ${DATADIR}/${PRE}/glossary.se-smj.clean.se.gz
cut -f2 glossary.utf8 | gzip -c > ${DATADIR}/${PRE}/glossary.se-smj.clean.smj.gz
rm -f glossary.utf8
wget ${GIELLATEKNO_TM_HOME}/sme-nob/glossary/smenob.utf8
wget ${GIELLATEKNO_TM_HOME}/sme-nob/glossary/termwiki.utf8
cut -f1 smenob.utf8 > ${DATADIR}/${PRE}/glossary.nb-se.clean.se
cut -f2 smenob.utf8 > ${DATADIR}/${PRE}/glossary.nb-se.clean.nb
cut -f1 termwiki.utf8 >> ${DATADIR}/${PRE}/glossary.nb-se.clean.se
cut -f2 termwiki.utf8 >> ${DATADIR}/${PRE}/glossary.nb-se.clean.nb
gzip -f ${DATADIR}/${PRE}/glossary.nb-se.clean.se
gzip -f ${DATADIR}/${PRE}/glossary.nb-se.clean.nb
rm -f smenob.utf8 termwiki.utf8
wget ${GIELLATEKNO_TM_HOME}/sme-sma/glossary/glossary.utf8
cut -f1 glossary.utf8 | gzip -c > ${DATADIR}/${PRE}/glossary.se-sma.clean.se.gz
cut -f2 glossary.utf8 | gzip -c > ${DATADIR}/${PRE}/glossary.se-sma.clean.sma.gz
rm -f glossary.utf8
wget ${GIELLATEKNO_TM_HOME}/nob-smj/glossary/nobsmj.utf8
cut -f1 nobsmj.utf8 | gzip -c > ${DATADIR}/${PRE}/glossary.nb-smj.clean.nb.gz
cut -f2 nobsmj.utf8 | gzip -c > ${DATADIR}/${PRE}/glossary.nb-smj.clean.smj.gz
rm -f nobsmj.utf8
wget ${GIELLATEKNO_TM_HOME}/nob-sme/glossary/nobsme.utf8
cut -f1 nobsme.utf8 | gzip -c > ${DATADIR}/${PRE}/glossary.nb-se.clean.nb.gz
cut -f2 nobsme.utf8 | gzip -c > ${DATADIR}/${PRE}/glossary.nb-se.clean.se.gz
rm -f nobsme.utf8
wget ${GIELLATEKNO_TM_HOME}/nob-sma/glossary/nobsma.utf8
wget ${GIELLATEKNO_TM_HOME}/sma-nob/glossary/termwiki.utf8
cut -f1 nobsma.utf8 > ${DATADIR}/${PRE}/glossary.nb-sma.clean.nb
cut -f2 nobsma.utf8 > ${DATADIR}/${PRE}/glossary.nb-sma.clean.sma
cut -f1 termwiki.utf8 >> ${DATADIR}/${PRE}/glossary.nb-sma.clean.sma
cut -f2 termwiki.utf8 >> ${DATADIR}/${PRE}/glossary.nb-sma.clean.nb
gzip -f ${DATADIR}/${PRE}/glossary.nb-sma.clean.sma
gzip -f ${DATADIR}/${PRE}/glossary.nb-sma.clean.nb
rm -f nobsma.utf8 termwiki.utf8
fetch-sami-tmx: ${GIELLATEKNO_SAMI_TM}
convert-sami-tmx:
for t in ${GIELLATEKNO_SAMI_TM}; do \
mkdir -p ${DATADIR}/sami; \
tmx2moses -r -o ${DATADIR}/sami/`echo -n $$t | xargs basename | sed 's/.tmx//'` $$t; \
done
## OLD: individual file names
move-sami-data:
for f in `ls ${DATADIR}/sami`; do \
gzip -c < ${DATADIR}/sami/$$f \
> ${DATADIR}/${PRE}/`echo -n $$f | sed 's/\.\([^.]*\)$$/.clean.\1.gz/'`; \
done
## NEW: merge all giellatekno TMs into one corpus
merge-sami-data:
for s in fi nb se sma smj smn; do \
for t in fi nb se sma smj smn; do \
if [ `ls ${DATADIR}/sami/*.$$s-$$t.$$s 2>/dev/null | wc -l` -gt 0 ]; then \
p=`echo "$$s $$t" | tr ' ' "\n" | sort | tr "\n" '-' | sed 's/\-$$//'`; \
cat ${DATADIR}/sami/*.$$s-$$t.$$s >> ${DATADIR}/${PRE}/giella.$$p.clean.$$s; \
cat ${DATADIR}/sami/*.$$s-$$t.$$t >> ${DATADIR}/${PRE}/giella.$$p.clean.$$t; \
fi \
done \
done
for s in fi nb sma smj smn; do \
if [ `ls ${DATADIR}/sami/*.$$s-sme.$$s 2>/dev/null | wc -l` -gt 0 ]; then \
p=`echo "$$s se" | tr ' ' "\n" | sort | tr "\n" '-' | sed 's/\-$$//'`; \
cat ${DATADIR}/sami/*.$$s-sme.$$s >> ${DATADIR}/${PRE}/giella.$$p.clean.$$s; \
cat ${DATADIR}/sami/*.$$s-sme.sme >> ${DATADIR}/${PRE}/giella.$$p.clean.se; \
fi \
done
for t in fi nb sma smj smn; do \
if [ `ls ${DATADIR}/sami/*.sme-$$t.sme 2>/dev/null | wc -l` -gt 0 ]; then \
p=`echo "$$t se" | tr ' ' "\n" | sort | tr "\n" '-' | sed 's/\-$$//'`; \
cat ${DATADIR}/sami/*.sme-$$t.sme >> ${DATADIR}/${PRE}/giella.$$p.clean.se; \
cat ${DATADIR}/sami/*.sme-$$t.$$t >> ${DATADIR}/${PRE}/giella.$$p.clean.$$t; \
fi \
done
gzip -f ${DATADIR}/${PRE}/giella.*-*.clean.?? ${DATADIR}/${PRE}/giella.*-*.clean.???
${GIELLATEKNO_SAMI_TM}:
mkdir -p ${dir $@}
wget -O $@ ${GIELLATEKNO_TM_HOME}/$@
## name of the sami data sets
# SAMI_EXTRA = ${patsubst %.tmx,%,${notdir ${GIELLATEKNO_SAMI_TM}}} glossary
%-sami:
${MAKE} DATASET=${DATASET}+giella \
HELDOUTSIZE=0 \
BPESIZE=4000 \
DEVSET=giella \
TESTSET=giella \
DEVMINSIZE=100 \
EXTRA_TRAINSET="glossary" \
SRCLANGS="se sma smj smn sms vep et fi kv krl nb no nn ru sv en" \
TRGLANGS="se sma smj smn sms vep et fi kv krl nb no nn ru sv en" \
SKIP_LANGPAIRS="en-en|en-et|en-fi|en-nb|en-no|en-nn|en-ru|en-sv|et-et|et-fi|et-nb|et-no|et-nn|et-ru|et-sv|fi-fi|fi-nb|fi-no|fi-nn|fi-ru|fi-sv|nb-nb|nb-no|nb-nn|nb-ru|nb-sv|no-no|no-nn|no-ru|no-sv|nn-nn|nn-ru|nn-sv|ru-ru|ru-sv|sv-sv" \
${@:-sami=}
%-sami-xx:
${MAKE} DATASET=${DATASET}+giella \
HELDOUTSIZE=0 \
BPESIZE=4000 \
DEVSET=giella \
TESTSET=giella \
DEVMINSIZE=100 \
EXTRA_TRAINSET="glossary" \
SRCLANGS="se sma smj smn sms" \
TRGLANGS="fi nb no nn ru sv en" \
${@:-sami-xx=}
%-xx-sami:
${MAKE} DATASET=${DATASET}+giella \
HELDOUTSIZE=0 \
BPESIZE=4000 \
DEVSET=giella \
TESTSET=giella \
DEVMINSIZE=100 \
EXTRA_TRAINSET="glossary" \
TRGLANGS="se sma smj smn sms" \
SRCLANGS="fi nb no nn ru sv en" \
${@:-xx-sami=}
test-skip:
for s in ${SRCLANGS}; do \
for t in ${TRGLANGS}; do \
if [ `echo "$$s-$$t $$t-$$s" | egrep '${SKIP_LANGPAIRS}' | wc -l` -gt 0 ]; then \
echo "skip $$s-$$t"; \
fi \
done \
done
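## illustrative usage: dry-run check of which pairs the SKIP_LANGPAIRS pattern would exclude
##   make SRCLANGS="en fi" TRGLANGS="en fi" SKIP_LANGPAIRS="en-en|fi-fi" test-skip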
#-------------------------------------------------------------------
# Romance-languages - English models
#-------------------------------------------------------------------
LANGS_FR_VARIANTS = fr_BE fr_CA fr_FR
LANGS_ES_VARIANTS = es_AR es_CL es_CO es_CR es_DO es_EC es_ES es_GT es_HN es_MX es_NI es_PA es_PE es_PR es_SV es_UY es_VE
LANGS_PT_VARIANTS = pt_br pt_BR pt_PT
LANGS_ROMANCE = fr ${LANGS_FR_VARIANTS} wa frp oc ca rm lld fur lij lmo es ${LANGS_ES_VARIANTS} pt ${LANGS_PT_VARIANTS} gl lad an mwl it it_IT co nap scn vec sc ro la
%-romance-english:
${MAKE} HPC_DISK=1000 HELDOUTSIZE=0 MODELTYPE=transformer SRCLANGS="${LANGS_ROMANCE}" TRGLANGS=en \
${@:-romance-english=}
%-english-romance:
${MAKE} HPC_DISK=1000 HELDOUTSIZE=0 MODELTYPE=transformer TRGLANGS="${LANGS_ROMANCE}" SRCLANGS=en \
${@:-english-romance=}
#-------------------------------------------------------------------
# other language groups
#-------------------------------------------------------------------
germanic:
${MAKE} LANGS="${GERMANIC}" HPC_DISK=1500 multilingual
scandinavian:
${MAKE} LANGS="${SCANDINAVIAN}" multilingual-medium
#-------------------------------------------------------------------
# some other interesting language pairs
#-------------------------------------------------------------------
fiet:
${MAKE} SRCLANGS=fi TRGLANGS=et bilingual-medium
icelandic:
${MAKE} SRCLANGS=is TRGLANGS=en bilingual
${MAKE} SRCLANGS=is TRGLANGS="da no nn nb sv" bilingual
${MAKE} SRCLANGS=is TRGLANGS=fi bilingual
## include yandex data in training
enru-yandex:
${MAKE} DATASET=opus+yandex MODELTYPE=transformer SRCLANGS=en TRGLANGS=ru EXTRA_TRAINSET=yandex data
${MAKE} DATASET=opus+yandex MODELTYPE=transformer SRCLANGS=en TRGLANGS=ru EXTRA_TRAINSET=yandex reverse-data
${MAKE} DATASET=opus+yandex MODELTYPE=transformer SRCLANGS=en TRGLANGS=ru EXTRA_TRAINSET=yandex \
WALLTIME=72 HPC_CORES=1 HPC_MEM=8g MARIAN_WORKSPACE=12000 MARIAN_EARLY_STOPPING=15 train.submit-multigpu
${MAKE} DATASET=opus+yandex MODELTYPE=transformer SRCLANGS=ru TRGLANGS=en EXTRA_TRAINSET=yandex \
WALLTIME=72 HPC_CORES=1 HPC_MEM=4g MARIAN_EARLY_STOPPING=15 train.submit-multigpu
enru-yandex-bt:
${MAKE} DATASET=opus+yandex MODELTYPE=transformer SRCLANGS=en TRGLANGS=ru EXTRA_TRAINSET=yandex data-bt
${MAKE} DATASET=opus+yandex MODELTYPE=transformer SRCLANGS=en TRGLANGS=ru EXTRA_TRAINSET=yandex \
WALLTIME=72 HPC_CORES=1 HPC_MEM=12g MARIAN_WORKSPACE=12000 MARIAN_EARLY_STOPPING=15 train-bt.submit-multigpu
enit:
${MAKE} SRCLANGS=en TRGLANGS=it traindata-spm
${MAKE} SRCLANGS=en TRGLANGS=it devdata-spm
${MAKE} SRCLANGS=en TRGLANGS=it wordalign-spm
${MAKE} SRCLANGS=en TRGLANGS=it WALLTIME=72 HPC_MEM=4g HPC_CORES=1 train-spm.submit-multigpu
#-------------------------------------------------------------------
# document-level MT (concatenation approach)
#-------------------------------------------------------------------
doclevel:
${MAKE} ost-datasets
${MAKE} traindata-doc-ost
${MAKE} devdata-doc-ost
${MAKE} wordalign-doc-ost
${MAKE} CONTEXT_SIZE=${CONTEXT_SIZE} MODELTYPE=${MODELTYPE} \
HPC_CORES=1 WALLTIME=72 HPC_MEM=4g train-doc-ost.submit
#-------------------------------------------------------------------
# models for the MeMAD project
#-------------------------------------------------------------------
memad2en:
${MAKE} LANGS="${MEMAD_LANGS}" PIVOT=en all2pivot
memad-fiensv:
${MAKE} SRCLANGS=sv TRGLANGS=fi traindata-spm
${MAKE} SRCLANGS=sv TRGLANGS=fi devdata-spm
${MAKE} SRCLANGS=sv TRGLANGS=fi wordalign-spm
${MAKE} SRCLANGS=sv TRGLANGS=fi WALLTIME=72 HPC_MEM=4g HPC_CORES=1 train-spm.submit-multigpu
${MAKE} SRCLANGS=sv TRGLANGS=fi reverse-data-spm
${MAKE} SRCLANGS=fi TRGLANGS=sv WALLTIME=72 HPC_MEM=4g HPC_CORES=1 train-spm.submit-multigpu
${MAKE} SRCLANGS=en TRGLANGS=fi traindata-spm
${MAKE} SRCLANGS=en TRGLANGS=fi devdata-spm
${MAKE} SRCLANGS=en TRGLANGS=fi wordalign-spm
${MAKE} SRCLANGS=en TRGLANGS=fi WALLTIME=72 HPC_MEM=4g HPC_CORES=1 train-spm.submit-multigpu
${MAKE} SRCLANGS=en TRGLANGS=fi reverse-data-spm
${MAKE} SRCLANGS=fi TRGLANGS=en WALLTIME=72 HPC_MEM=4g HPC_CORES=1 train-spm.submit-multigpu
memad250-fiensv:
${MAKE} CONTEXT_SIZE=250 memad-fiensv_doc
memad-fiensv_doc:
${MAKE} SRCLANGS=sv TRGLANGS=fi traindata-doc
${MAKE} SRCLANGS=sv TRGLANGS=fi devdata-doc
${MAKE} SRCLANGS=sv TRGLANGS=fi WALLTIME=72 HPC_MEM=8g MARIAN_WORKSPACE=20000 HPC_CORES=1 train-doc.submit-multigpu
${MAKE} SRCLANGS=sv TRGLANGS=fi reverse-data-doc
${MAKE} SRCLANGS=fi TRGLANGS=sv WALLTIME=72 HPC_MEM=8g MARIAN_WORKSPACE=20000 HPC_CORES=1 train-doc.submit-multigpu
${MAKE} SRCLANGS=en TRGLANGS=fi traindata-doc
${MAKE} SRCLANGS=en TRGLANGS=fi devdata-doc
${MAKE} SRCLANGS=en TRGLANGS=fi WALLTIME=72 HPC_MEM=4g HPC_CORES=1 train-doc.submit-multigpu
${MAKE} SRCLANGS=en TRGLANGS=fi reverse-data-doc
${MAKE} SRCLANGS=fi TRGLANGS=en WALLTIME=72 HPC_MEM=4g HPC_CORES=1 train-doc.submit-multigpu
memad-fiensv_more:
${MAKE} SRCLANGS=sv TRGLANGS=fi traindata-doc
${MAKE} SRCLANGS=sv TRGLANGS=fi devdata-doc
${MAKE} SRCLANGS=sv TRGLANGS=fi WALLTIME=72 HPC_MEM=8g MARIAN_WORKSPACE=20000 HPC_CORES=1 train-doc.submit-multigpu
${MAKE} SRCLANGS=sv TRGLANGS=fi reverse-data-doc
${MAKE} SRCLANGS=fi TRGLANGS=sv WALLTIME=72 HPC_MEM=8g MARIAN_WORKSPACE=20000 HPC_CORES=1 train-doc.submit-multigpu
${MAKE} CONTEXT_SIZE=500 memad-fiensv_doc
memad:
for s in fi en sv de fr nl; do \
for t in en fi sv de fr nl; do \
if [ "$$s" != "$$t" ]; then \
if ! grep -q 'stalled ${MARIAN_EARLY_STOPPING} times' ${WORKHOME}/$$s-$$t/${DATASET}.*.valid${NR}.log; then\
${MAKE} SRCLANGS=$$s TRGLANGS=$$t bilingual-dynamic; \
fi \
fi \
done \
done
# ${MAKE} SRCLANGS=$$s TRGLANGS=$$t data; \
# ${MAKE} SRCLANGS=$$s TRGLANGS=$$t HPC_CORES=1 HPC_MEM=4g train.submit-multigpu; \
fiensv_bpe:
${MAKE} SRCLANGS=fi TRGLANGS=sv traindata-bpe
${MAKE} SRCLANGS=fi TRGLANGS=sv devdata-bpe
${MAKE} SRCLANGS=fi TRGLANGS=sv wordalign-bpe
${MAKE} SRCLANGS=fi TRGLANGS=sv WALLTIME=72 HPC_MEM=4g HPC_CORES=1 train-bpe.submit-multigpu
${MAKE} SRCLANGS=fi TRGLANGS=en traindata-bpe
${MAKE} SRCLANGS=fi TRGLANGS=en devdata-bpe
${MAKE} SRCLANGS=fi TRGLANGS=en wordalign-bpe
${MAKE} SRCLANGS=fi TRGLANGS=en WALLTIME=72 HPC_MEM=4g HPC_CORES=1 train-bpe.submit-multigpu
fiensv_spm:
${MAKE} SRCLANGS=fi TRGLANGS=sv traindata-spm
${MAKE} SRCLANGS=fi TRGLANGS=sv devdata-spm
${MAKE} SRCLANGS=fi TRGLANGS=sv wordalign-spm
${MAKE} SRCLANGS=fi TRGLANGS=sv WALLTIME=72 HPC_MEM=4g HPC_CORES=1 train-spm.submit-multigpu
${MAKE} SRCLANGS=fi TRGLANGS=en traindata-spm
${MAKE} SRCLANGS=fi TRGLANGS=en devdata-spm
${MAKE} SRCLANGS=fi TRGLANGS=en wordalign-spm
${MAKE} SRCLANGS=fi TRGLANGS=en WALLTIME=72 HPC_MEM=4g HPC_CORES=1 train-spm.submit-multigpu
fifr_spm:
${MAKE} SRCLANGS=fr TRGLANGS=fi traindata-spm
${MAKE} SRCLANGS=fr TRGLANGS=fi devdata-spm
${MAKE} SRCLANGS=fr TRGLANGS=fi wordalign-spm
${MAKE} SRCLANGS=fr TRGLANGS=fi WALLTIME=72 HPC_MEM=4g HPC_CORES=1 train-spm.submit-multigpu
${MAKE} SRCLANGS=fr TRGLANGS=fi reverse-data-spm
${MAKE} SRCLANGS=fi TRGLANGS=fr WALLTIME=72 HPC_MEM=4g HPC_CORES=1 train-spm.submit-multigpu
fifr_doc:
${MAKE} SRCLANGS=fr TRGLANGS=fi traindata-doc
${MAKE} SRCLANGS=fr TRGLANGS=fi devdata-doc
${MAKE} SRCLANGS=fr TRGLANGS=fi WALLTIME=72 HPC_MEM=8g MARIAN_WORKSPACE=20000 HPC_CORES=1 train-doc.submit-multigpu
${MAKE} SRCLANGS=fr TRGLANGS=fi reverse-data-doc
${MAKE} SRCLANGS=fi TRGLANGS=fr WALLTIME=72 HPC_MEM=8g MARIAN_WORKSPACE=20000 HPC_CORES=1 train-doc.submit-multigpu
fide_spm:
${MAKE} SRCLANGS=de TRGLANGS=fi traindata-spm
${MAKE} SRCLANGS=de TRGLANGS=fi devdata-spm
${MAKE} SRCLANGS=de TRGLANGS=fi wordalign-spm
${MAKE} SRCLANGS=de TRGLANGS=fi WALLTIME=72 HPC_MEM=4g HPC_CORES=1 train-spm.submit-multigpu
${MAKE} SRCLANGS=de TRGLANGS=fi reverse-data-spm
${MAKE} SRCLANGS=fi TRGLANGS=de WALLTIME=72 HPC_MEM=4g HPC_CORES=1 train-spm.submit-multigpu
memad_spm:
for s in fi en sv de fr nl; do \
for t in en fi sv de fr nl; do \
if [ "$$s" != "$$t" ]; then \
if ! grep -q 'stalled ${MARIAN_EARLY_STOPPING} times' ${WORKHOME}/$$s-$$t/*.valid${NR}.log; then\
${MAKE} SRCLANGS=$$s TRGLANGS=$$t traindata-spm; \
${MAKE} SRCLANGS=$$s TRGLANGS=$$t devdata-spm; \
${MAKE} SRCLANGS=$$s TRGLANGS=$$t wordalign-spm; \
${MAKE} SRCLANGS=$$s TRGLANGS=$$t HPC_CORES=1 HPC_MEM=4g train-spm.submit-multigpu; \
fi \
fi \
done \
done
memad_doc:
for s in fi en sv; do \
for t in en fi sv; do \
if [ "$$s" != "$$t" ]; then \
if ! grep -q 'stalled ${MARIAN_EARLY_STOPPING} times' ${WORKHOME}/$$s-$$t/*.valid${NR}.log; then\
${MAKE} SRCLANGS=$$s TRGLANGS=$$t traindata-doc; \
${MAKE} SRCLANGS=$$s TRGLANGS=$$t devdata-doc; \
${MAKE} SRCLANGS=$$s TRGLANGS=$$t HPC_CORES=1 HPC_MEM=4g MODELTYPE=transformer train-doc.submit-multigpu; \
fi \
fi \
done \
done
memad_docalign:
for s in fi en sv; do \
for t in en fi sv; do \
if [ "$$s" != "$$t" ]; then \
if ! grep -q 'stalled ${MARIAN_EARLY_STOPPING} times' ${WORKHOME}/$$s-$$t/*.valid${NR}.log; then\
${MAKE} SRCLANGS=$$s TRGLANGS=$$t traindata-doc; \
${MAKE} SRCLANGS=$$s TRGLANGS=$$t devdata-doc; \
${MAKE} SRCLANGS=$$s TRGLANGS=$$t HPC_CORES=1 HPC_MEM=4g train-doc.submit-multigpu; \
fi \
fi \
done \
done
enfisv:
${MAKE} SRCLANGS="en fi sv" TRGLANGS="en fi sv" traindata devdata wordalign
${MAKE} SRCLANGS="en fi sv" TRGLANGS="en fi sv" HPC_MEM=4g WALLTIME=72 HPC_CORES=1 train.submit-multigpu
en-fiet:
${MAKE} SRCLANGS="en" TRGLANGS="et fi" traindata devdata
${MAKE} SRCLANGS="en" TRGLANGS="et fi" HPC_MEM=4g HPC_CORES=1 train.submit-multigpu
${MAKE} TRGLANGS="en" SRCLANGS="et fi" traindata devdata
${MAKE} TRGLANGS="en" SRCLANGS="et fi" HPC_MEM=4g HPC_CORES=1 train.submit-multigpu
memad-multi:
for s in "${SCANDINAVIAN}" "en fr" "et hu fi" "${WESTGERMANIC}" "ca es fr ga it la oc pt_br pt"; do \
${MAKE} SRCLANGS="$$s" TRGLANGS="$$s" traindata devdata; \
${MAKE} SRCLANGS="$$s" TRGLANGS="$$s" HPC_MEM=4g HPC_CORES=1 train.submit-multigpu; \
done
for s in "${SCANDINAVIAN}" "en fr" "et hu fi" "${WESTGERMANIC}" "ca es fr ga it la oc pt_br pt"; do \
for t in "${SCANDINAVIAN}" "en fr" "et hu fi" "${WESTGERMANIC}" "ca es fr ga it la oc pt_br pt"; do \
if [ "$$s" != "$$t" ]; then \
${MAKE} SRCLANGS="$$s" TRGLANGS="$$t" traindata devdata; \
${MAKE} SRCLANGS="$$s" TRGLANGS="$$t" HPC_MEM=4g HPC_CORES=1 train.submit-multigpu; \
fi \
done \
done
memad-multi2:
for s in "en fr" "et hu fi" "${WESTGERMANIC}" "ca es fr ga it la oc pt_br pt"; do \
for t in "${SCANDINAVIAN}" "en fr" "et hu fi" "${WESTGERMANIC}" "ca es fr ga it la oc pt_br pt"; do \
if [ "$$s" != "$$t" ]; then \
${MAKE} SRCLANGS="$$s" TRGLANGS="$$t" traindata devdata; \
${MAKE} SRCLANGS="$$s" TRGLANGS="$$t" HPC_MEM=4g HPC_CORES=1 train.submit-multigpu; \
fi \
done \
done
memad-multi3:
for s in "${SCANDINAVIAN}" "${WESTGERMANIC}" "ca es fr ga it la oc pt_br pt"; do \
${MAKE} SRCLANGS="$$s" TRGLANGS="en" traindata devdata; \
${MAKE} SRCLANGS="$$s" TRGLANGS="en" HPC_MEM=4g HPC_CORES=1 train.submit-multigpu; \
${MAKE} SRCLANGS="en" TRGLANGS="$$s" traindata devdata; \
${MAKE} SRCLANGS="en" TRGLANGS="$$s" HPC_MEM=4g HPC_CORES=1 train.submit-multigpu; \
done
${MAKE} SRCLANGS="en" TRGLANGS="fr" traindata devdata
${MAKE} SRCLANGS="en" TRGLANGS="fr" HPC_MEM=4g HPC_CORES=1 train.submit-multigpu
${MAKE} SRCLANGS="fr" TRGLANGS="en" traindata devdata
${MAKE} SRCLANGS="fr" TRGLANGS="en" HPC_MEM=4g HPC_CORES=1 train.submit-multigpu
memad-fi:
for l in en sv de fr; do \
${MAKE} SRCLANGS=$$l TRGLANGS=fi traindata devdata; \
${MAKE} SRCLANGS=$$l TRGLANGS=fi HPC_MEM=4g HPC_CORES=1 train.submit-multigpu; \
${MAKE} TRGLANGS=$$l SRCLANGS=fi traindata devdata; \
${MAKE} TRGLANGS=$$l SRCLANGS=fi HPC_MEM=4g HPC_CORES=1 train.submit-multigpu; \
done
nordic:
${MAKE} SRCLANGS="${SCANDINAVIAN}" TRGLANGS="${FINNO_UGRIC}" traindata
${MAKE} SRCLANGS="${SCANDINAVIAN}" TRGLANGS="${FINNO_UGRIC}" HPC_MEM=4g HPC_CORES=1 train.submit-multigpu
${MAKE} TRGLANGS="${SCANDINAVIAN}" SRCLANGS="${FINNO_UGRIC}" traindata
${MAKE} TRGLANGS="${SCANDINAVIAN}" SRCLANGS="${FINNO_UGRIC}" HPC_MEM=4g HPC_CORES=1 train.submit-multigpu
romance:
${MAKE} SRCLANGS="${ROMANCE}" TRGLANGS="${FINNO_UGRIC}" traindata
${MAKE} SRCLANGS="${ROMANCE}" TRGLANGS="${FINNO_UGRIC}" HPC_MEM=4g HPC_CORES=1 train.submit-multigpu
${MAKE} TRGLANGS="${ROMANCE}" SRCLANGS="${FINNO_UGRIC}" traindata
${MAKE} TRGLANGS="${ROMANCE}" SRCLANGS="${FINNO_UGRIC}" HPC_MEM=4g HPC_CORES=1 train.submit-multigpu
westgermanic:
${MAKE} SRCLANGS="${WESTGERMANIC}" TRGLANGS="${FINNO_UGRIC}" traindata
${MAKE} SRCLANGS="${WESTGERMANIC}" TRGLANGS="${FINNO_UGRIC}" HPC_MEM=4g HPC_CORES=1 train.submit-multigpu
${MAKE} TRGLANGS="${WESTGERMANIC}" SRCLANGS="${FINNO_UGRIC}" traindata
${MAKE} TRGLANGS="${WESTGERMANIC}" SRCLANGS="${FINNO_UGRIC}" HPC_MEM=4g HPC_CORES=1 train.submit-multigpu
germanic-romance:
${MAKE} SRCLANGS="${ROMANCE}" \
TRGLANGS="${GERMANIC}" traindata
${MAKE} HPC_MEM=4g HPC_CORES=1 SRCLANGS="${ROMANCE}" \
TRGLANGS="${GERMANIC}" train.submit-multigpu
${MAKE} TRGLANGS="${ROMANCE}" \
SRCLANGS="${GERMANIC}" traindata devdata
${MAKE} HPC_MEM=4g HPC_CORES=1 TRGLANGS="${ROMANCE}" \
SRCLANGS="${GERMANIC}" train.submit-multigpu
## fix a problem with missing links from reverse-data
## --> this caused models trained with bt data to use less data
## --> need to restart those!
fix-missing-val:
for f in `find work/ -type l -name '*.src.shuffled.gz' | grep -v old | sed 's/.src.shuffled.gz//'`; do \
if [ ! -e $$f.src.notused.gz ]; then \
echo "missing $$f.src.notused.gz!"; \
s=`echo $$f | cut -f2 -d'/' | cut -f1 -d'-'`; \
t=`echo $$f | cut -f2 -d'/' | cut -f2 -d'-'`; \
d=`echo $$f | cut -f4 -d'/'`; \
if [ -e work/$$t-$$s/val/$$d.trg.notused.gz ]; then \
echo "linking ${PWD}/work/$$t-$$s/val/$$d.trg.notused.gz"; \
ln -s ${PWD}/work/$$t-$$s/val/$$d.trg.notused.gz $$f.src.notused.gz; \
ln -s ${PWD}/work/$$t-$$s/val/$$d.src.notused.gz $$f.trg.notused.gz; \
if [ `ls work/$$s-$$t/opus*+bt*valid1.log 2>/dev/null | wc -l` -gt 0 ]; then \
echo "opus+bt model exists! move it away!"; \
mkdir work/$$s-$$t/old-bt-model; \
mv work/$$s-$$t/*+bt* work/$$s-$$t/old-bt-model/; \
fi; \
fi; \
fi \
done