Merge branch 'master' of github.com:Helsinki-NLP/OPUS-MT-train

This commit is contained in:
Joerg Tiedemann 2021-10-05 14:44:04 +03:00
commit 378eff0710
31 changed files with 1627 additions and 649 deletions

View File

@ -326,14 +326,16 @@ train-and-eval-job:
#------------------------------------------------------------------------
.PHONY: data
data: ${TRAIN_SRC}.clean.${PRE_SRC}.gz ${TRAIN_TRG}.clean.${PRE_TRG}.gz \
${DEV_SRC}.${PRE_SRC} ${DEV_TRG}.${PRE_TRG}
data: ${TRAIN_SRC}.clean.${PRE_SRC}.gz ${TRAIN_TRG}.clean.${PRE_TRG}.gz
${MAKE} ${DEV_SRC}.${PRE_SRC} ${DEV_TRG}.${PRE_TRG}
${MAKE} ${TEST_SRC}.${PRE_SRC} ${TEST_TRG}
${MAKE} ${MODEL_SRCVOCAB} ${MODEL_TRGVOCAB}
ifeq (${MODELTYPE},transformer-align)
ifeq ($(filter align,${subst -, ,${MODELTYPE}}),align)
${MAKE} ${TRAIN_ALG}
endif
# ifeq (${MODELTYPE},transformer-align)
traindata: ${TRAIN_SRC}.clean.${PRE_SRC}.gz ${TRAIN_TRG}.clean.${PRE_TRG}.gz
testdata: ${TEST_SRC}.${PRE_SRC} ${TEST_TRG}
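## standalone sketch (hypothetical MODELTYPE value) of how the filter/subst test above
## matches every model type that contains an "-align" component:
MODELTYPE = transformer-big-align
ifeq ($(filter align,$(subst -, ,${MODELTYPE})),align)
$(info ${MODELTYPE} needs word alignments)
endif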

View File

@ -5,7 +5,7 @@ This package includes scripts for training NMT models using MarianNMT and OPUS d
## Pre-trained models
The subdirectory [models](https://github.com/Helsinki-NLP/Opus-MT-train/tree/master/models) contains information about pre-trained models that can be downloaded from this project. They are distributed under a [CC-BY 4.0 license](https://creativecommons.org/licenses/by/4.0/).
The subdirectory [models](https://github.com/Helsinki-NLP/Opus-MT-train/tree/master/models) contains information about pre-trained models that can be downloaded from this project. They are distributed under a [CC-BY 4.0 license](https://creativecommons.org/licenses/by/4.0/). [More pre-trained models](https://github.com/Helsinki-NLP/Tatoeba-Challenge/blob/master/results/tatoeba-results-all.md) trained with the [OPUS-MT training pipeline](https://github.com/Helsinki-NLP/OPUS-MT-train/blob/master/doc/TatoebaChallenge.md) are available from the [Tatoeba translation challenge](https://github.com/Helsinki-NLP/Tatoeba-Challenge), also under a [CC-BY 4.0 license](https://creativecommons.org/licenses/by/4.0/).
## Quickstart

View File

@ -241,7 +241,7 @@ src2all:
# RELEASED_BT_ALL := ${shell wget -qq -O - ${TATOEBA_RELEASED_BT}}
RELEASED_BT_ALL := ${shell wget -qq -O - ${TATOEBA_RELEASED_BT}}
RELEASED_BT := ${shell wget -qq -O - ${TATOEBA_RELEASED_BT} | grep '^${LANGPAIR}/'}
fetch-bt:
@ -251,6 +251,13 @@ fetch-bt:
wget -qq -O $$d https://object.pouta.csc.fi/Tatoeba-MT-bt/$$d; \
done
fetch-all-bt:
for d in ${RELEASED_BT_ALL}; do \
echo "fetch $$d"; \
mkdir -p `dirname $$d`; \
wget -qq -O $$d https://object.pouta.csc.fi/Tatoeba-MT-bt/$$d; \
done
#---------------------------------------------------------------
# release data

View File

@ -112,8 +112,10 @@ MAX_OVER_SAMPLING ?= 50
# sorted languages and langpair used to match resources in OPUS
SORTLANGS = $(sort ${SRC} ${TRG})
SORTSRC = ${firstword ${SORTLANGS}}
SORTTRG = ${lastword ${SORTLANGS}}
LANGPAIR = ${SORTSRC}-${SORTTRG}
SPACE = $(empty) $(empty)
LANGPAIR = ${firstword ${SORTLANGS}}-${lastword ${SORTLANGS}}
LANGSRCSTR = ${subst ${SPACE},+,$(SRCLANGS)}
LANGTRGSTR = ${subst ${SPACE},+,$(TRGLANGS)}
LANGPAIRSTR = ${LANGSRCSTR}-${LANGTRGSTR}
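## small self-contained sketch (example language lists) of the SPACE trick used above
## to join the language lists with '+':
empty =
SPACE = $(empty) $(empty)
SRCLANGS = fi et se
LANGSRCSTR = ${subst ${SPACE},+,${SRCLANGS}}
$(info ${LANGSRCSTR})   # prints fi+et+se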
@ -128,11 +130,15 @@ LANGSTR ?= ${subst ${SPACE},+,$(LANGS)}
## for same language pairs: add numeric extension
## (this is necessary to keep source and target files separate)
ifeq (${SRC},$(TRG))
SRCEXT = ${SRC}1
TRGEXT = ${SRC}2
SRCEXT = ${SRC}1
TRGEXT = ${SRC}2
SORTSRCEXT = ${SORTSRC}1
SORTTRGEXT = ${SORTSRC}2
else
SRCEXT = ${SRC}
TRGEXT = ${TRG}
SRCEXT = ${SRC}
TRGEXT = ${TRG}
SORTSRCEXT = ${SORTSRC}
SORTTRGEXT = ${SORTTRG}
endif
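## tiny standalone fragment (assuming SRC = TRG = de) showing the effect of the extensions:
SRC = de
TRG = de
ifeq (${SRC},${TRG})
SRCEXT = ${SRC}1
TRGEXT = ${SRC}2
endif
$(info ${SRCEXT} ${TRGEXT})   # prints "de1 de2", keeping e.g. train.de1.gz and train.de2.gz apart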
## set a flag to use target language labels
@ -315,7 +321,24 @@ PRE_TRG = ${SUBWORDS}${TRGBPESIZE:000=}k
## default name of the data set (and the model)
##-------------------------------------
DATASET ?= opus
TRAINSET_NAME ?= opus
DATASET ?= ${TRAINSET_NAME}
## dev and test data come from one specific data set
## if we have a bilingual model
ifeq (${words ${SRCLANGS}},1)
ifeq (${words ${TRGLANGS}},1)
DEVSET_NAME ?= ${DEVSET}
TESTSET_NAME ?= ${TESTSET}
endif
endif
## otherwise we give them a generic name
DEVSET_NAME ?= opus-dev
TESTSET_NAME ?= opus-test
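## standalone sketch (hypothetical language lists and DEVSET value) of how the ?= defaults resolve:
SRCLANGS = de fr
TRGLANGS = en
DEVSET = Tatoeba
ifeq (${words ${SRCLANGS}},1)
ifeq (${words ${TRGLANGS}},1)
DEVSET_NAME ?= ${DEVSET}
endif
endif
DEVSET_NAME ?= opus-dev
$(info ${DEVSET_NAME})   # prints opus-dev here; with a single source language it would print Tatoeba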
## DATADIR = directory where the train/dev/test data are
## WORKDIR = directory used for training
@ -336,20 +359,6 @@ LOCAL_TRAIN_SRC = ${TMPDIR}/${LANGPAIRSTR}/train/${DATASET}.src
LOCAL_TRAIN_TRG = ${TMPDIR}/${LANGPAIRSTR}/train/${DATASET}.trg
LOCAL_MONO_DATA = ${TMPDIR}/${LANGSTR}/train/${DATASET}.mono
## dev and test data come from one specific data set
## if we have a bilingual model
ifeq (${words ${SRCLANGS}},1)
ifeq (${words ${TRGLANGS}},1)
DEVSET_NAME ?= ${DEVSET}
TESTSET_NAME ?= ${TESTSET}
endif
endif
## otherwise we give them a generic name
DEVSET_NAME ?= opus-dev
TESTSET_NAME ?= opus-test
DEV_SRC ?= ${WORKDIR}/val/${DEVSET_NAME}.src
DEV_TRG ?= ${WORKDIR}/val/${DEVSET_NAME}.trg
@ -359,10 +368,15 @@ TEST_TRG ?= ${WORKDIR}/test/${TESTSET_NAME}.trg
MODEL_SUBDIR =
MODEL = ${MODEL_SUBDIR}${DATASET}${TRAINSIZE}.${PRE_SRC}-${PRE_TRG}
MODELTYPE = transformer-align
# MODELTYPE = transformer
NR = 1
MODEL = ${MODEL_SUBDIR}${DATASET}${TRAINSIZE}.${PRE_SRC}-${PRE_TRG}
MODELTYPES = transformer \
transformer-big \
transformer-align \
transformer-big-align \
transformer-small-align \
transformer-tiny-align
MODELTYPE = transformer-align
NR = 1
MODEL_BASENAME = ${MODEL}.${MODELTYPE}.model${NR}
MODEL_VALIDLOG = ${MODEL}.${MODELTYPE}.valid${NR}.log
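## sketch (assumed example values) of how the model file names above are assembled:
MODEL_SUBDIR =
DATASET = opus
TRAINSIZE =
PRE_SRC = spm32k
PRE_TRG = spm32k
MODELTYPE = transformer-align
NR = 1
MODEL = ${MODEL_SUBDIR}${DATASET}${TRAINSIZE}.${PRE_SRC}-${PRE_TRG}
MODEL_BASENAME = ${MODEL}.${MODELTYPE}.model${NR}
$(info ${MODEL_BASENAME})   # prints opus.spm32k-spm32k.transformer-align.model1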
@ -423,16 +437,20 @@ TEST_COMPARISON = ${TEST_TRANSLATION}.compare
## parameters for running Marian NMT
MARIAN_GPUS = 0
MARIAN_GPUS ?= 0
MARIAN_EXTRA =
MARIAN_VALID_FREQ = 10000
MARIAN_SAVE_FREQ = ${MARIAN_VALID_FREQ}
MARIAN_DISP_FREQ = ${MARIAN_VALID_FREQ}
MARIAN_EARLY_STOPPING = 10
MARIAN_VALID_MINI_BATCH = 16
MARIAN_MAXI_BATCH = 500
MARIAN_DROPOUT = 0.1
MARIAN_MAX_LENGTH = 500
MARIAN_VALID_FREQ ?= 10000
MARIAN_SAVE_FREQ ?= ${MARIAN_VALID_FREQ}
MARIAN_DISP_FREQ ?= ${MARIAN_VALID_FREQ}
MARIAN_EARLY_STOPPING ?= 10
MARIAN_VALID_MINI_BATCH ?= 16
MARIAN_MAXI_BATCH ?= 500
MARIAN_DROPOUT ?= 0.1
MARIAN_MAX_LENGTH ?= 500
MARIAN_ENC_DEPTH ?= 6
MARIAN_DEC_DEPTH ?= 6
MARIAN_ATT_HEADS ?= 8
MARIAN_DIM_EMB ?= 512
MARIAN_DECODER_GPU = -b 12 -n1 -d ${MARIAN_GPUS} \
--mini-batch 8 --maxi-batch 32 --maxi-batch-sort src \
@ -440,7 +458,7 @@ MARIAN_DECODER_GPU = -b 12 -n1 -d ${MARIAN_GPUS} \
MARIAN_DECODER_CPU = -b 12 -n1 --cpu-threads ${HPC_CORES} \
--mini-batch 8 --maxi-batch 32 --maxi-batch-sort src \
--max-length ${MARIAN_MAX_LENGTH} --max-length-crop
MARIAN_DECODER_FLAGS = ${MARIAN_DECODER_GPU}
MARIAN_DECODER_FLAGS = ${MARIAN_DECODER_GPU}
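## the Marian settings above now use ?=, so they can be overridden per run without
## editing the makefile, e.g. (assumes the usual train goal):
make MARIAN_VALID_FREQ=5000 MARIAN_EARLY_STOPPING=15 MARIAN_GPUS="0 1 2 3" train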
## TODO: currently marianNMT crashes with workspace > 26000

View File

@ -55,7 +55,8 @@ endif
## - use only the latest backtranslations
## if such a subdir exists
BACKTRANS_HOME = backtranslate
BACKTRANS_HOME = backtranslate
FORWARDTRANS_HOME = ${BACKTRANS_HOME}
ifneq (${wildcard ${BACKTRANS_HOME}/${TRG}-${SRC}/latest},)
BACKTRANS_DIR = ${BACKTRANS_HOME}/${TRG}-${SRC}/latest
@ -63,6 +64,13 @@ else
BACKTRANS_DIR = ${BACKTRANS_HOME}/${TRG}-${SRC}
endif
ifneq (${wildcard ${BACKTRANS_HOME}/${SRC}-${TRG}/latest},)
FORWARDTRANS_DIR = ${FORWARDTRANS_HOME}/${SRC}-${TRG}/latest
else
FORWARDTRANS_DIR = ${FORWARDTRANS_HOME}/${SRC}-${TRG}
endif
## TODO: make it possible to select only parts of the BT data
## ---> use TRAINDATA_SIZE to take max the same amount of all shuffled BT data
@ -71,6 +79,11 @@ ifeq (${USE_BACKTRANS},1)
BACKTRANS_TRG = ${patsubst %.${SRCEXT}.gz,%.${TRGEXT}.gz,${BACKTRANS_SRC}}
endif
ifeq (${USE_FORWARDTRANS},1)
FORWARDTRANS_SRC = ${sort ${wildcard ${FORWARDTRANS_DIR}/*.${SRCEXT}.gz}}
FORWARDTRANS_TRG = ${patsubst %.${SRCEXT}.gz,%.${TRGEXT}.gz,${FORWARDTRANS_SRC}}
endif
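## small standalone sketch (hypothetical file names, SRCEXT=en, TRGEXT=de) of how the
## wildcard hits are paired with their target side via patsubst:
SRCEXT = en
TRGEXT = de
FORWARDTRANS_SRC = ft/wiki.aa.${SRCEXT}.gz ft/wiki.ab.${SRCEXT}.gz
FORWARDTRANS_TRG = ${patsubst %.${SRCEXT}.gz,%.${TRGEXT}.gz,${FORWARDTRANS_SRC}}
$(info ${FORWARDTRANS_TRG})   # prints ft/wiki.aa.de.gz ft/wiki.ab.de.gz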
ifeq (${USE_PIVOTING},1)
PIVOTING_SRC = ${sort ${wildcard pivoting/${SRC}-${TRG}/latest/*.${SRCEXT}.gz} \
${wildcard pivoting/${TRG}-${SRC}/latest/*.${SRCEXT}.gz}}
@ -83,7 +96,7 @@ endif
##-------------------------------------------------------------
CLEAN_TRAIN_SRC = ${patsubst %,${DATADIR}/${PRE}/%.${LANGPAIR}.clean.${SRCEXT}.gz,${TRAINSET}} \
${BACKTRANS_SRC} ${PIVOTING_SRC}
${BACKTRANS_SRC} ${FORWARDTRANS_SRC} ${PIVOTING_SRC}
CLEAN_TRAIN_TRG = ${patsubst %.${SRCEXT}.gz,%.${TRGEXT}.gz,${CLEAN_TRAIN_SRC}}
CLEAN_DEV_SRC = ${patsubst %,${DATADIR}/${PRE}/%.${LANGPAIR}.clean.${SRCEXT}.gz,${DEVSET}}
@ -409,7 +422,7 @@ ifdef SHUFFLE_DATA
endif
######################################
# FIT_DATA_SIZE is set?
# --> fit data to speciic size
# --> fit data to specific size
# --> under/over sampling!
######################################
@echo -n "* ${SRC}-${TRG}: total size = " >> ${dir ${LOCAL_TRAIN_SRC}}README.md
@ -523,17 +536,17 @@ endif
@echo "" >> ${dir ${DEV_SRC}}/README.md
@echo -n "* devset-selected: top " >> ${dir ${DEV_SRC}}/README.md
@wc -l < ${DEV_SRC} | tr "\n" ' ' >> ${dir ${DEV_SRC}}/README.md
@echo " lines of ${notdir $@}.shuffled!" >> ${dir ${DEV_SRC}}/README.md
@echo " lines of ${notdir $@}.shuffled" >> ${dir ${DEV_SRC}}/README.md
ifeq (${DEVSET},${TESTSET})
@echo -n "* testset-selected: next " >> ${dir ${DEV_SRC}}/README.md
@wc -l < ${TEST_SRC} | tr "\n" ' ' >> ${dir ${DEV_SRC}}/README.md
@echo " lines of ${notdir $@}.shuffled!" >> ${dir ${DEV_SRC}}/README.md
@echo " lines of ${notdir $@}.shuffled " >> ${dir ${DEV_SRC}}/README.md
@echo "* devset-unused: added to traindata" >> ${dir ${DEV_SRC}}/README.md
@echo "# Test data" > ${dir ${TEST_SRC}}/README.md
@echo "" >> ${dir ${TEST_SRC}}/README.md
@echo -n "testset-selected: next " >> ${dir ${TEST_SRC}}/README.md
@wc -l < ${TEST_SRC} | tr "\n" ' ' >> ${dir ${TEST_SRC}}/README.md
@echo " lines of ../val/${notdir $@}.shuffled!" >> ${dir ${TEST_SRC}}/README.md
@echo " lines of ../val/${notdir $@}.shuffled" >> ${dir ${TEST_SRC}}/README.md
endif

View File

@ -34,6 +34,8 @@ get-model-distro = ${shell echo ${wildcard ${1}/${2}/*.zip} | tr ' ' "\n" | LAN
find-model:
@echo ${call get-model-dist,${LANGPAIRSTR}}
@ -42,7 +44,12 @@ find-model:
MIN_BLEU_SCORE = 20
.PHONY: dist local-dist global-dist release
dist: ${DIST_PACKAGE}
## create a symbolic link to the latest model
## and make the package
dist:
${MAKE} link-latest-model
${MAKE} ${DIST_PACKAGE}
## local distribution in workhome, no restrictions about BLEU
local-dist:
@ -206,9 +213,9 @@ RAWTRGLANGS = ${sort ${basename ${basename ${subst _,.,${subst -,.,${TRGLANGS}}}
## advantage: list all labels that are valid in the model
## disadvantage: can be misleading because we may have labels that are not trained
##
LANGUAGELABELS = ${shell grep '">>.*<<"' ${MODEL_SRCVOCAB} | cut -f1 -d: | sed 's/"//g'}
LANGUAGELABELSRAW = ${shell echo "${LANGUAGELABELS}" | sed 's/>>//g;s/<<//g'}
LANGUAGELABELS = ${shell grep '">>.*<<"' ${MODEL_SRCVOCAB} | cut -f1 -d: | sed 's/"//g'}
LANGUAGELABELSRAW = ${shell echo "${LANGUAGELABELS}" | sed 's/>>//g;s/<<//g'}
LANGUAGELABELSUSED = $(filter ${TRGLANGS},${LANGUAGELABELSRAW})
model-yml: ${MODEL_YML}
@ -249,24 +256,15 @@ ${MODEL_YML}: ${MODEL_FINAL}
@echo "dataset-name: $(DATASET)" >> $@
@echo "modeltype: $(MODELTYPE)" >> $@
@echo "vocabulary:" >> $@
@echo " source: ${notdir ${MODEL_SRCVOCAB}}" >> $@
@echo " target: ${notdir ${MODEL_TRGVOCAB}}" >> $@
@echo " source: ${notdir ${MODEL_SRCVOCAB}}" >> $@
@echo " target: ${notdir ${MODEL_TRGVOCAB}}" >> $@
@echo "pre-processing: ${PREPROCESS_DESCRIPTION}" >> $@
@echo "subwords:" >> $@
@echo " source: ${PRE_SRC}" >> $@
@echo " target: ${PRE_TRG}" >> $@
@echo " source: ${PRE_SRC}" >> $@
@echo " target: ${PRE_TRG}" >> $@
@echo "subword-models:" >> $@
@echo " source: source.${SUBWORD_TYPE}" >> $@
@echo " target: target.${SUBWORD_TYPE}" >> $@
ifdef USE_TARGET_LABELS
@echo "use-target-labels:" >> $@
@for t in ${LANGUAGELABELSRAW}; do \
echo " - \">>$$t<<\"" >> $@; \
done
# @for t in ${TRGLANGS}; do \
# echo " - '>>$$t<<'" >> $@; \
# done
endif
@echo " source: source.${SUBWORD_TYPE}" >> $@
@echo " target: target.${SUBWORD_TYPE}" >> $@
@echo "source-languages:" >> $@
@for s in ${RAWSRCLANGS}; do\
echo " - $$s" >> $@; \
@ -275,17 +273,26 @@ endif
@for t in ${RAWTRGLANGS}; do\
echo " - $$t" >> $@; \
done
ifdef USE_TARGET_LABELS
@echo "use-target-labels:" >> $@
@for t in ${LANGUAGELABELSUSED}; do \
echo " - \">>$$t<<\"" >> $@; \
done
endif
ifneq ("$(wildcard ${WORKDIR}/train/README.md)","")
@echo "training-data:" >> $@
@tr "\n" "~" < ${WORKDIR}/train/README.md |\
tr "#" "\n" | grep '^ ${DATASET}~' | \
tail -1 | tr "~" "\n" | grep '^\* ' | \
grep -v ': *$$' | grep -v ' 0$$' | \
grep -v 'unused dev/test' | \
grep -v 'total size' | sed 's/^\* / /' >> $@
endif
ifneq ("$(wildcard ${WORKDIR}/val/README.md)","")
@echo "validation-data:" >> $@
grep '^\* ' ${WORKDIR}/val/README.md | \
sed 's/total size of shuffled dev data:/total-size-shuffled:/' | \
sed 's/devset =/devset-selected:/' | \
grep -v ' 0$$' | \
sed 's/^\* / /' >> $@
endif
@ -390,7 +397,12 @@ endif
link-latest-model:
if [ `ls ${patsubst %.zip,%-*,${DIST_PACKAGE}} 2>/dev/null | wc -l` -gt 0 ]; then \
cd ${dir ${DIST_PACKAGE}}; \
ln -s `ls -t ${patsubst %.zip,%-*.zip,$(notdir ${DIST_PACKAGE})} | head -1` \
${notdir ${DIST_PACKAGE}}; \
fi
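## the same idea as a plain shell sketch (placeholder directory and file names):
## symlink the newest dated package under a stable name
cd models/en-bcl && ln -sf "`ls -t opus-*.zip | head -1`" opus.zip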
${DIST_PACKAGE}: ${MODEL_FINAL}
@ -505,182 +517,6 @@ endif
##### ------------------------------------
##### OLD release recipe: all in one
##### ------------------------------------
# ${DIST_PACKAGE}: ${MODEL_FINAL}
# ifneq (${SKIP_DIST_EVAL},1)
# @${MAKE} $(TEST_EVALUATION)
# @${MAKE} $(TEST_COMPARISON)
# endif
# @mkdir -p ${dir $@}
# @touch ${WORKDIR}/source.tcmodel
# @cp ${PREPROCESS_SRCMODEL} ${WORKDIR}/source.${SUBWORD_TYPE}
# @cp ${PREPROCESS_TRGMODEL} ${WORKDIR}/target.${SUBWORD_TYPE}
# @cp ${PREPROCESS_SCRIPT} ${WORKDIR}/preprocess.sh
# @cp ${POSTPROCESS_SCRIPT} ${WORKDIR}/postprocess.sh
# ##-----------------------------
# ## create YML file
# ##-----------------------------
# @echo "release: ${LANGPAIRSTR}/$(notdir ${@:.zip=})-${DATE}.zip" > ${@:.zip=}-${DATE}.yml
# @echo "release-date: $(DATE)" >> ${@:.zip=}-${DATE}.yml
# @echo "dataset-name: $(DATASET)" >> ${@:.zip=}-${DATE}.yml
# @echo "modeltype: $(MODELTYPE)" >> ${@:.zip=}-${DATE}.yml
# @echo "pre-processing: ${PREPROCESS_DESCRIPTION}" >> ${@:.zip=}-${DATE}.yml
# @echo "subwords:" >> ${@:.zip=}-${DATE}.yml
# @echo " - source: ${PRE_SRC}" >> ${@:.zip=}-${DATE}.yml
# @echo " - target: ${PRE_TRG}" >> ${@:.zip=}-${DATE}.yml
# @echo "subword-models:" >> ${@:.zip=}-${DATE}.yml
# @echo " - source: source.${SUBWORD_TYPE}" >> ${@:.zip=}-${DATE}.yml
# @echo " - target: target.${SUBWORD_TYPE}" >> ${@:.zip=}-${DATE}.yml
# ifdef USE_TARGET_LABELS
# @echo "use-target-labels:" >> ${@:.zip=}-${DATE}.yml
# @for t in ${TRGLANGS}; do \
# echo " - >>$$t<<" >> ${@:.zip=}-${DATE}.yml; \
# done
# endif
# @echo "source-languages:" >> ${@:.zip=}-${DATE}.yml
# @for s in ${RAWSRCLANGS}; do\
# echo " - $$s" >> ${@:.zip=}-${DATE}.yml; \
# done
# @echo "target-languages:" >> ${@:.zip=}-${DATE}.yml
# @for t in ${RAWTRGLANGS}; do\
# echo " - $$t" >> ${@:.zip=}-${DATE}.yml; \
# done
# ifneq ("$(wildcard ${WORKDIR}/train/README.md)","")
# @echo "training-data:" >> ${@:.zip=}-${DATE}.yml
# @tr "\n" "~" < ${WORKDIR}/train/README.md |\
# tr "#" "\n" | grep '^ ${DATASET}~' | \
# tail -1 | tr "~" "\n" | grep '^\* ' | \
# grep -v ': *$$' | grep -v ' 0$$' | \
# grep -v 'total size' | sed 's/^\* / - /' >> ${@:.zip=}-${DATE}.yml
# endif
# ifneq ("$(wildcard ${WORKDIR}/val/README.md)","")
# @echo "validation-data:" >> ${@:.zip=}-${DATE}.yml
# grep '^\* ' ${WORKDIR}/val/README.md | \
# grep -v ' 0$$' | \
# sed 's/^\* / - /' >> ${@:.zip=}-${DATE}.yml
# endif
# ##-----------------------------
# ## create README-file
# ##-----------------------------
# @echo "# $(notdir ${@:.zip=})-${DATE}.zip" > ${WORKDIR}/README.md
# @echo '' >> ${WORKDIR}/README.md
# @echo "* dataset: ${DATASET}" >> ${WORKDIR}/README.md
# @echo "* model: ${MODELTYPE}" >> ${WORKDIR}/README.md
# @echo "* source language(s): ${RAWSRCLANGS}" >> ${WORKDIR}/README.md
# @echo "* target language(s): ${RAWTRGLANGS}" >> ${WORKDIR}/README.md
# @echo "* model: ${MODELTYPE}" >> ${WORKDIR}/README.md
# @echo "* pre-processing: ${PREPROCESS_DESCRIPTION}" >> ${WORKDIR}/README.md
# ifdef USE_TARGET_LABELS
# echo '* a sentence initial language token is required in the form of `>>id<<` (id = valid target language ID)' >> ${WORKDIR}/README.md
# @echo "* valid language labels: ${LANGUAGELABELS}" >> ${WORKDIR}/README.md
# endif
# @echo "* download: [$(notdir ${@:.zip=})-${DATE}.zip](${MODELS_URL}/${LANGPAIRSTR}/$(notdir ${@:.zip=})-${DATE}.zip)" >> ${WORKDIR}/README.md
# ifneq (${SKIP_DATA_DETAILS},1)
# ifneq ("$(wildcard ${WORKDIR}/train/README.md)","")
# @echo -n "## Training data: " >> ${WORKDIR}/README.md
# @tr "\n" "~" < ${WORKDIR}/train/README.md |\
# tr "#" "\n" | grep '${DATASET}' | \
# tail -1 | tr "~" "\n" >> ${WORKDIR}/README.md
# @echo '' >> ${WORKDIR}/README.md
# endif
# ifneq ("$(wildcard ${WORKDIR}/val/README.md)","")
# @echo -n "#" >> ${WORKDIR}/README.md
# @cat ${WORKDIR}/val/README.md >> ${WORKDIR}/README.md
# @echo '' >> ${WORKDIR}/README.md
# endif
# endif
# ##-----------------------------
# ## add benchmark results
# ##-----------------------------
# ifneq ("$(wildcard ${TEST_EVALUATION})","")
# @echo "* test set translations: [$(notdir ${@:.zip=})-${DATE}.test.txt](${MODELS_URL}/${LANGPAIRSTR}/$(notdir ${@:.zip=})-${DATE}.test.txt)" >> ${WORKDIR}/README.md
# @echo "* test set scores: [$(notdir ${@:.zip=})-${DATE}.eval.txt](${MODELS_URL}/${LANGPAIRSTR}/$(notdir ${@:.zip=})-${DATE}.eval.txt)" >> ${WORKDIR}/README.md
# @echo '' >> ${WORKDIR}/README.md
# @echo '## Benchmarks' >> ${WORKDIR}/README.md
# @echo '' >> ${WORKDIR}/README.md
# ## grep and normalise test set names
# ## ugly perl script that does some transformation of language codes
# @grep -H BLEU ${WORKDIR}/*.${DATASET}.${PRE_SRC}-${PRE_TRG}${NR}.${MODELTYPE}.*.eval | \
# sed 's#^${WORKDIR}/\(.*\)\.${DATASET}.${PRE_SRC}-${PRE_TRG}${NR}.${MODELTYPE}\.\(.*\)\.eval:.*$$#\1.\2#' | \
# perl -pe 'if (/\.([^\.]+)\.([^\.\s]+)$$/){$$s=$$1;$$t=$$2;s/[\-\.]$$s?\-?$$t\.$$s\.$$t?$$/.$$s.$$t/;s/\.$$s\.$$t$$/.$$s-$$t/}' > $@.1
# @grep BLEU ${WORKDIR}/*.${DATASET}.${PRE_SRC}-${PRE_TRG}${NR}.${MODELTYPE}.*.eval | \
# cut -f3 -d ' ' > $@.2
# @grep chrF ${WORKDIR}/*.${DATASET}.${PRE_SRC}-${PRE_TRG}${NR}.${MODELTYPE}.*.eval | \
# cut -f3 -d ' ' > $@.3
# @ls ${WORKDIR}/*.${DATASET}.${PRE_SRC}-${PRE_TRG}${NR}.${MODELTYPE}.*.eval | \
# sed 's/\.eval//' | xargs wc -l | grep -v total | sed 's/^ *//' | cut -f1 -d' ' > $@.4
# @grep BLEU ${WORKDIR}/*.${DATASET}.${PRE_SRC}-${PRE_TRG}${NR}.${MODELTYPE}.*.eval | \
# cut -f16 -d ' ' | sed 's/)//' > $@.5
# @grep BLEU ${WORKDIR}/*.${DATASET}.${PRE_SRC}-${PRE_TRG}${NR}.${MODELTYPE}.*.eval | \
# cut -f7 -d ' ' > $@.6
# @paste -d '/' $@.4 $@.5 > $@.7
# @echo '| testset | BLEU | chr-F | #sent | #words | BP |' >> ${WORKDIR}/README.md
# @echo '|---------|-------|-------|-------|--------|----|' >> ${WORKDIR}/README.md
# @paste $@.1 $@.2 $@.3 $@.4 $@.5 $@.6 | \
# sed "s/\t/ | /g;s/^/| /;s/$$/ |/" | \
# sort | uniq >> ${WORKDIR}/README.md
# @echo "test-data:" >> ${@:.zip=}-${DATE}.yml
# @paste -d' ' $@.1 $@.7 | sed 's/ /: /;s/^/ - /;' >> ${@:.zip=}-${DATE}.yml
# @echo "BLEU-scores:" >> ${@:.zip=}-${DATE}.yml
# @paste -d' ' $@.1 $@.2 | sed 's/ /: /;s/^/ - /' >> ${@:.zip=}-${DATE}.yml
# @echo "chr-F-scores:" >> ${@:.zip=}-${DATE}.yml
# @paste -d' ' $@.1 $@.3 | sed 's/ /: /;s/^/ - /' >> ${@:.zip=}-${DATE}.yml
# @rm -f $@.1 $@.2 $@.3 $@.4 $@.5 $@.6 $@.7 $@.testsize $@.testset
# endif
# ##-----------------------------
# ## create the package
# ##-----------------------------
# @cat ${WORKDIR}/README.md >> ${dir $@}README.md
# @echo '' >> ${dir $@}README.md
# @cp models/LICENSE ${WORKDIR}/
# @chmod +x ${WORKDIR}/preprocess.sh
# @sed -e 's# - .*/\([^/]*\)$$# - \1#' \
# -e 's/beam-size: [0-9]*$$/beam-size: 6/' \
# -e 's/mini-batch: [0-9]*$$/mini-batch: 1/' \
# -e 's/maxi-batch: [0-9]*$$/maxi-batch: 1/' \
# -e 's/relative-paths: false/relative-paths: true/' \
# < ${MODEL_DECODER} > ${WORKDIR}/decoder.yml
# cd ${WORKDIR} && zip ${notdir $@} \
# README.md LICENSE \
# ${notdir ${MODEL_FINAL}} \
# ${notdir ${MODEL_SRCVOCAB}} \
# ${notdir ${MODEL_TRGVOCAB}} \
# ${notdir ${MODEL_VALIDLOG}} \
# ${notdir ${MODEL_TRAINLOG}} \
# source.* target.* decoder.yml \
# preprocess.sh postprocess.sh
# ifneq ("$(wildcard ${WORKDIR}/${MODELCONFIG})","")
# @cd ${WORKDIR} && zip -u ${notdir $@} ${MODELCONFIG}
# endif
# ##-----------------------------
# ## move files to release dir and cleanup
# ##-----------------------------
# @mkdir -p ${dir $@}
# @mv -f ${WORKDIR}/${notdir $@} ${@:.zip=}-${DATE}.zip
# @cd ${dir $@} && zip -u ${notdir ${@:.zip=}-${DATE}.zip} ${notdir ${@:.zip=}-${DATE}.yml}
# ifneq ("$(wildcard ${TEST_EVALUATION})","")
# @cp $(TEST_EVALUATION) ${@:.zip=}-${DATE}.eval.txt
# @cp $(TEST_COMPARISON) ${@:.zip=}-${DATE}.test.txt
# endif
# @rm -f $@
# @cd ${dir $@} && ln -s $(notdir ${@:.zip=})-${DATE}.zip ${notdir $@}
# @rm -f ${WORKDIR}/decoder.yml ${WORKDIR}/source.* ${WORKDIR}/target.*
# @rm -f ${WORKDIR}/preprocess.sh ${WORKDIR}/postprocess.sh
## do this only if the flag is set
## --> avoid expensive wildcard searches each time make is called
@ -695,6 +531,7 @@ endif
# source project_2000661-openrc.sh
#
# - make upload ......... released models = all sub-dirs in models/
# - make upload-model ... upload model for current language pair
# - make upload-models .. trained models in current WORKHOME to OPUS-MT-dev
# - make upload-scores .. score file with benchmark results to OPUS-MT-eval
# - make upload-eval .... benchmark tests from models in WORKHOME
@ -712,6 +549,17 @@ upload:
swift upload ${MODEL_CONTAINER} index.txt
rm -f index.txt
.PHONY: upload-model
upload-model:
find ${RELEASEDIR}/ -type l | tar -cf models-links.tar -T -
find ${RELEASEDIR}/ -type l -delete
cd ${RELEASEDIR} && swift upload ${MODEL_CONTAINER} --changed --skip-identical ${LANGPAIRSTR}
tar -xf models-links.tar
rm -f models-links.tar
swift post ${MODEL_CONTAINER} --read-acl ".r:*"
swift list ${MODEL_CONTAINER} > index.txt
swift upload ${MODEL_CONTAINER} index.txt
rm -f index.txt
.PHONY: upload-models
upload-models:
@ -968,7 +816,7 @@ dist-remove-no-date-dist:
dist-remove-old-yml:
swift list Tatoeba-MT-models > index.txt
for d in `grep old-yml index.txt`; do \
for d in `grep yml-old index.txt`; do \
swift delete Tatoeba-MT-models $$d; \
done
@ -993,3 +841,21 @@ dist-fix-preprocess:
rm -f $$d; \
done )
## fix yet another error in YAML files
# YMLFILES = ${wildcard models-tatoeba/eng-*/*-2021-04-10.yml}
# OLDYMLFILES = ${patsubst %.yml,%.yml-old,${YMLFILES}}
# ${OLDYMLFILES}: %.yml-old: %.yml
# mv $< $@
# sed -e 's/devset =/devset-selected:/' \
# -e 's/testset =/testset-selected:/' \
# -e 's/total size of shuffled dev data:/total-size-shuffled:/' < $@ |\
# grep -v 'unused dev/test' > $<
# touch $@
# fix-yml-files: ${OLDYMLFILES}

View File

@ -68,8 +68,8 @@ WORKHOME = ${PWD}/work
ifeq (${shell hostname -d 2>/dev/null},mahti.csc.fi)
# CSCPROJECT = project_2002688
CSCPROJECT = project_2003093
CSCPROJECT = project_2002688
# CSCPROJECT = project_2003093
# CSCPROJECT = project_2002982
WORKHOME = ${shell realpath ${PWD}/work}
APPLHOME = /projappl/project_2003093/
@ -121,7 +121,8 @@ else ifneq ($(wildcard /wrk/tiedeman/research),)
LOADGPU = module load ${GPU_MODULES}
LOADMODS = ${LOADGPU}
else ifeq (${shell hostname --domain 2>/dev/null},bullx)
CSCPROJECT = project_2002688
# CSCPROJECT = project_2002688
CSCPROJECT = project_2000309
# CSCPROJECT = project_2002982
WORKHOME = ${shell realpath ${PWD}/work}
APPLHOME = /projappl/project_2001194
@ -182,7 +183,6 @@ MARIAN_VOCAB = ${MARIAN_HOME}marian-vocab
TOKENIZER = ${MOSESSCRIPTS}/tokenizer
## BPE
SUBWORD_BPE ?= ${shell which subword-nmt 2>/dev/null || echo ${TOOLSDIR}/subword-nmt/subword_nmt/subword_nmt.py}
SUBWORD_HOME ?= ${dir ${SUBWORD_BPE}}
@ -285,7 +285,7 @@ ${TOOLSDIR}/jq/jq:
${TOOLSDIR}/marian-dev/build/marian: ${PROTOC}
mkdir -p ${dir $@}
cd ${dir $@} && cmake -DUSE_SENTENCEPIECE=on ${MARIAN_BUILD_OPTIONS} ..
${MAKE} -C ${dir $@} -j
${MAKE} -C ${dir $@} -j8
${TOOLSDIR}/protobuf/bin/protoc:
cd tools && git clone https://github.com/protocolbuffers/protobuf.git

View File

@ -266,8 +266,64 @@ endif
MARIAN_EARLY_STOPPING=${BT_MARIAN_EARLY_STOPPING} \
${@:-bt=}
# CLEAN_TRAIN_SRC="${CLEAN_TRAIN_SRC} ${BACKTRANS_SRC}" \
# CLEAN_TRAIN_TRG="${CLEAN_TRAIN_TRG} ${BACKTRANS_TRG}" \
## add forward translations
FT_MODEL = ${MODEL_SUBDIR}${DATASET}+ft${TRAINSIZE}.${PRE_SRC}-${PRE_TRG}
FT_MODEL_BASE = ${FT_MODEL}.${MODELTYPE}.model${NR}
FT_MODEL_START = ${WORKDIR}/${FT_MODEL_BASE}.npz
FT_MODEL_VOCAB = ${WORKDIR}/${FT_MODEL}.vocab.yml
FT_MARIAN_EARLY_STOPPING = 15
%-ft:
ifneq (${wildcard ${MODEL_FINAL}},)
ifeq (${wildcard ${FT_MODEL_START}},)
cp ${MODEL_FINAL} ${FT_MODEL_START}
cp ${MODEL_VOCAB} ${FT_MODEL_VOCAB}
endif
endif
rm -f ${WORKHOME}/${LANGPAIRSTR}/train.submit
${MAKE} DATASET=${DATASET}+ft \
USE_FORWARDTRANS=1 \
CONTINUE_EXISTING=1 \
MODELCONFIG=config-ft.mk \
MARIAN_EARLY_STOPPING=${FT_MARIAN_EARLY_STOPPING} \
${@:-ft=}
## train on back-translations only
%-btonly:
rm -f ${WORKHOME}/${LANGPAIRSTR}/train.submit
${MAKE} DATASET=${DATASET}+btonly \
USE_BACKTRANS=1 \
MODELCONFIG=config-bt.mk \
TRAINSET= TATOEBA_TRAINSET= \
${@:-btonly=}
## train on forward-translations only
%-ftonly:
rm -f ${WORKHOME}/${LANGPAIRSTR}/train.submit
${MAKE} DATASET=${DATASET}+ftonly \
USE_FORWARDTRANS=1 \
MODELCONFIG=config-ft.mk \
TRAINSET= TATOEBA_TRAINSET= \
${@:-ftonly=}
## only forward translated training data
## (for knowledge distillation)
## TODO: better have a separate set for those data sets
## to make it possible to combine with other forward translations
%-ft-train-only:
rm -f ${WORKHOME}/${LANGPAIRSTR}/train.submit
${MAKE} DATASET=${DATASET}+ft-train-only \
USE_FORWARDTRANS=1 \
FORWARDTRANS_HOME=${PWD}/ft-tatoeba \
MODELCONFIG=config-ft-train.mk \
TRAINSET= TATOEBA_TRAINSET= \
${@:-ft-train-only=}
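## usage sketch for the pattern rules above (hypothetical language pair; the matched
## stem, e.g. "train", is what gets re-invoked; assumes the usual train goal):
make SRCLANGS=en TRGLANGS=bcl train-ft        # fine-tune the existing model with forward translations
make SRCLANGS=en TRGLANGS=bcl train-btonly    # train on back-translations only
make SRCLANGS=en TRGLANGS=bcl train-ftonly    # train on forward translations only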

View File

@ -13,12 +13,18 @@
## - should we increase the length filter when cleaning later? How much?
## - should we apply some other cleanup scripts here to get rid of some messy stuff?
## should we remove zero-width spaces?
## perl -CIOE -pe 's/[\x{2060}\x{200B}\x{feff}]//g'
%.clean.${SRCEXT}.gz: %.${SRCEXT}.${PRE} %.${TRGEXT}.${PRE}
cat ${word 1,$^} |\
perl -CS -pe 'tr[\x{9}\x{A}\x{D}\x{20}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}][]cd;' |\
perl -CIOE -pe 's/[\x{2060}\x{200B}\x{feff}]//g' |\
perl -CS -pe 's/\&\s*\#\s*160\s*\;/ /g' > $@.1
cat ${word 2,$^} |\
perl -CS -pe 'tr[\x{9}\x{A}\x{D}\x{20}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}][]cd;' |\
perl -CIOE -pe 's/[\x{2060}\x{200B}\x{feff}]//g' |\
perl -CS -pe 's/\&\s*\#\s*160\s*\;/ /g' > $@.2
paste $@.1 $@.2 |\
scripts/filter/bitext-match-lang.py -s ${SRC} -t ${TRG} > $@.bitext
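## quick standalone check of the zero-width-character filter added above
## (the printf string embeds U+200B in UTF-8):
printf 'A\xe2\x80\x8bB\n' | perl -CIOE -pe 's/[\x{2060}\x{200B}\x{feff}]//g'   # prints AB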

View File

@ -50,6 +50,12 @@ fi-zh:
train-dynamic.submitcpu
# Tatoeba: more than 100 test sentences:
# ain dan deu eng enm epo est fkv fra heb hun ita jpn kor kur lat lit nld nor pol por rus spa swe tur zho
#-------------------------------------------------------------------
# add THL backtranslation data (and also all other backtranslations)
#-------------------------------------------------------------------

File diff suppressed because it is too large

View File

@ -9,6 +9,11 @@ as-en:
${MAKE} reverse-data-as-en
${MAKE} train-dynamic-en-as
BCL_DEVSIZE = 1000
BCL_TESTSIZE = 1000
en-bcl:
${MAKE} SRCLANGS="en" TRGLANGS="bcl" DEVSET=wikimedia all-job
@ -29,25 +34,31 @@ bcl-en-nt:
DEVSET=wikimedia all-job
%-en-bcl:
${MAKE} SRCLANGS="en" TRGLANGS="bcl" DEVSET=wikimedia ${@:-en-bcl=}
${MAKE} SRCLANGS="en" TRGLANGS="bcl" \
DEVSET=wikimedia \
DEVSIZE=${BCL_DEVSIZE} TESTSIZE=${BCL_TESTSIZE} \
USE_REST_DEVDATA=1 ${@:-en-bcl=}
%-bcl-en:
${MAKE} SRCLANGS="en" TRGLANGS="bcl" DEVSET=wikimedia ${@:-bcl-en=}
${MAKE} SRCLANGS="en" TRGLANGS="bcl" \
DEVSET=wikimedia \
DEVSIZE=${BCL_DEVSIZE} TESTSIZE=${BCL_TESTSIZE} \
USE_REST_DEVDATA=1 ${@:-bcl-en=}
%-en-bcl-nt:
${MAKE} SRCLANGS="en" TRGLANGS="bcl" \
DATASET=${DATASET}+nt \
EXTRA_TRAINSET="new-testament" \
DEVSET=wikimedia \
DEVSET=wikimedia DEVSIZE=${BCL_DEVSIZE} TESTSIZE=${BCL_TESTSIZE} USE_REST_DEVDATA=1 \
${@:-en-bcl-nt=}
%-bcl-en-nt:
${MAKE} SRCLANGS="bcl" TRGLANGS="en" \
DATASET=${DATASET}+nt \
EXTRA_TRAINSET="new-testament" \
DEVSET=wikimedia \
DEVSET=wikimedia DEVSIZE=${BCL_DEVSIZE} TESTSIZE=${BCL_TESTSIZE} USE_REST_DEVDATA=1 \
${@:-bcl-en-nt=}
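## usage sketch (assumes the usual train goal): the pattern rules above expand e.g.
make train-en-bcl   # -> make SRCLANGS=en TRGLANGS=bcl DEVSET=wikimedia DEVSIZE=1000 TESTSIZE=1000 USE_REST_DEVDATA=1 train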
@ -72,34 +83,228 @@ ENBCL_BPE = 1000
#-----------------------------------------------------------------------------
# start jobs for all languages where we have back-translations into English
#-----------------------------------------------------------------------------
## languages for which we have back-translated wiki data into English
WIKI_BT2ENG = abk ady afr amh ang ara arg asm ast awa aze bak bam bar bel ben bod bre bul cat ceb ces cha che chr chv cor cos crh csb cym dan deu dsb ell epo est eus ewe ext fao fas fij fin fra frr fry ful gla gle glg glv got grn guj hat hau haw hbs heb hif hin hsb hun hye ibo ido iku ile ilo ina isl ita jam jav jbo jpn kab kal kan kat kaz khm kin kir kom kor ksh kur lad lao lat lav lfn lij lin lit lmo ltz lug mah mai mal mar mdf mkd mlg mlt mnw mon mri msa mwl mya myv nau nav nds nep nld nor nov nya oci ori oss pag pan pap pdc pms pol por pus que roh rom ron rue run rus sag sah san scn sco sin slv sme smo sna snd som spa sqi stq sun swa swe tah tam tat tel tet tgk tgl tha tir ton tpi tso tuk tur tyv udm uig ukr urd uzb vec vie vol war wln wol xal xho yid yor zho zul bos_Latn cmn_Hans cmn_Hant hrv ind nno nob srp_Cyrl srp_Latn
WIKI_BT2ENG = abk ady afr amh ang ara arg asm ast awa aze bak bam bar bel ben bod bre bul cat ceb ces cha che chr chv cor cos crh csb cym dan deu dsb ell epo est eus ewe ext fao fas fij fin fra frr fry ful gla gle glg glv got grn guj hat hau haw hbs heb hif hin hsb hun hye ibo ido iku ile ilo ina isl ita jam jav jbo jpn kab kal kan kat kaz khm kin kir kom kor ksh kur lad lao lat lav lfn lij lin lit lmo ltz lug mah mai mal mar mdf mkd mlg mlt mnw mon mri msa mwl mya myv nau nav nds nep nld nor nov nya oci ori oss pag pan pap pdc pms pol por pus que roh rom ron rue run rus sag sah san scn sco sin slv sme smo sna snd som spa sqi stq sun swa swe tah tam tat tel tet tgk tgl tha tir ton tpi tso tuk tur tyv udm uig ukr urd uzb vec vie vol war wln wol xal xho yid yor zho zul
## start jobs for all languages where we have back-translations
## start jobs for all languages where we have back-translations into English
wiki-eng2all-with-bt:
for l in ${WIKI_BT2ENG}; do \
if [ -d work-tatoeba/$$l-eng ]; then \
if [ `cat work-tatoeba/$$l-eng/opus-langlabels.src | tr " " "\n" | grep . | wc -l` -eq 1 ]; then \
echo "fetch back-translations for $$l-eng"; \
${MAKE} -C bt-tatoeba SRC=$$l TRG=eng fetch-bt; \
echo "start training eng-$$l with backtranslation data"; \
${MAKE} HPC_MEM=32g HPC_CORES=4 tatoeba-eng2$$l-train-bt.submitcpu; \
echo "fetch $$l wiki backtranslations"; \
${MAKE} -C bt-tatoeba SRC=$$l TRG=eng fetch-bt; \
done
for l in ${sort ${shell iso639 -m -n ${WIKI_BT2ENG}}}; do \
echo "start training eng-$$l with backtranslation data"; \
${MAKE} EMAIL= HPC_MEM=32g HPC_CORES=4 tatoeba-eng2$$l-train-bt.submitcpu; \
done
# for l in ${WIKI_BT2ENG}; do \
# if [ -d work-tatoeba/$$l-eng ]; then \
# if [ `cat work-tatoeba/$$l-eng/opus-langlabels.src | tr " " "\n" | grep . | wc -l` -eq 1 ]; then \
# echo "fetch back-translations for $$l-eng"; \
# ${MAKE} -C bt-tatoeba SRC=$$l TRG=eng fetch-bt; \
# echo "start training eng-$$l with backtranslation data"; \
# ${MAKE} EMAIL= HPC_MEM=32g HPC_CORES=4 tatoeba-eng2$$l-train-bt.submitcpu; \
# fi \
# fi \
# done
wiki-eng2all-with-bt-continue:
for l in ${WIKI_BT2ENG}; do \
if [ -d work-tatoeba/eng-$$l ]; then \
if [ ! `find work-tatoeba/eng-$$l -name 'opus+bt.*model1.done' | wc -l` -gt 0 ]; then \
echo "continue training eng-$$l with backtranslation data"; \
${MAKE} EMAIL= tatoeba-eng2$$l-train-bt; \
fi \
fi \
done
WIKI_BT2ENG_PARENTS = ${sort ${shell langgroup -p ${WIKI_BT2ENG}}}
wiki-eng2all-with-bt-eval:
for l in ${WIKI_BT2ENG}; do \
if [ -d work-tatoeba/eng-$$l ]; then \
if [ `find work-tatoeba/eng-$$l -name 'opus+bt*best-perplexity.npz' | wc -l` -gt 0 ]; then \
${MAKE} EMAIL= WALLTIME=4 tatoeba-eng2$$l-evalall-bt.submit; \
fi \
fi \
done
wiki-eng2allgroups-with-bt:
for l in $(filter-out roa,${WIKI_BT2ENG_PARENTS}); do \
# if [ `find work-tatoeba/eng-$$l -name 'opus+bt.*model1.done' | wc -l` -gt 0 ]; then \
wiki-eng2all-with-bt-dist:
for l in ${WIKI_BT2ENG}; do \
if [ -d work-tatoeba/eng-$$l ]; then \
echo "mv work-tatoeba/eng-$$l work-tatoeba-old"; \
mv work-tatoeba/eng-$$l work-tatoeba-old; \
fi; \
echo "start training eng-$$l with backtranslation data"; \
${MAKE} HPC_MEM=32g HPC_CORES=4 tatoeba-eng2$$l-train-bt-1m.submitcpu; \
if [ `find work-tatoeba/eng-$$l -name 'opus+bt*best-perplexity.npz' | wc -l` -gt 0 ]; then \
echo "continue training eng-$$l with backtranslation data"; \
${MAKE} EMAIL= tatoeba-eng2$$l-dist-bt; \
fi \
fi \
done
#-----------------------------------------------------------------------------
# models for translating English into language groups with backtranslations
# (does not fetch back-translations - they need to be available in bt-tatoeba!)
#-----------------------------------------------------------------------------
WIKI_BT2ENG_PARENTS = ${sort ${shell iso639 -m -n ${WIKI_BT2ENG} | xargs langgroup -p}}
wiki-eng2allgroups-with-bt:
for l in ${WIKI_BT2ENG_PARENTS}; do \
echo "start training eng-$$l with backtranslation data"; \
${MAKE} EMAIL= HPC_MEM=32g HPC_CORES=4 tatoeba-eng2$$l-train-bt-1m.submitcpu; \
done
wiki-eng2allgroups-with-bt-continue:
for l in ${WIKI_BT2ENG_PARENTS}; do \
if [ -d work-tatoeba/eng-$$l ]; then \
if [ ! `find work-tatoeba/eng-$$l -name 'opus1m+bt.*model1.done' | wc -l` -gt 0 ]; then \
echo "continue training eng-$$l with backtranslation data"; \
${MAKE} EMAIL= tatoeba-eng2$$l-train-bt-1m; \
fi \
fi \
done
wiki-eng2allgroups-with-bt-eval:
for l in ${WIKI_BT2ENG_PARENTS}; do \
if [ -d work-tatoeba/eng-$$l ]; then \
if [ `find work-tatoeba/eng-$$l -name 'opus1m+bt*best-perplexity.npz' | wc -l` -gt 0 ]; then \
${MAKE} EMAIL= WALLTIME=8 tatoeba-eng2$$l-evalall-bt-1m.submit; \
fi \
fi \
done
# if [ `find work-tatoeba/eng-$$l -name 'opus1m+bt.*model1.done' | wc -l` -gt 0 ]; then \
wiki-eng2allgroups-with-bt-dist:
for l in ${WIKI_BT2ENG_PARENTS}; do \
if [ -d work-tatoeba/eng-$$l ]; then \
if [ `find work-tatoeba/eng-$$l -name 'opus1m+bt*best-perplexity.npz' | wc -l` -gt 0 ]; then \
echo "continue training eng-$$l with backtranslation data"; \
${MAKE} EMAIL= tatoeba-eng2$$l-dist-bt-1m; \
fi \
fi \
done
#-----------------------------------------------------------------------------
# start jobs for all languages where we have back-translations from English
#-----------------------------------------------------------------------------
## languages for which we have back-translated wiki data from English
WIKI_ENG2BT = afr ara aze bel ben bos_Latn bre bul cat ceb ces cmn_Hans cmn_Hant cym dan deu ell epo est eus fao fin fra fry gle glg heb hin hrv hun hye ido ilo ina ind isl ita lav lit ltz mal mar mkd mlt msa nds nld nno nob pol por ron run rus spa sqi srp_Cyrl srp_Latn swa swe tam tgl tha tur ukr urd uzb_Latn vie war zho zsm_Latn
wiki-all2eng-with-bt:
for l in ${WIKI_ENG2BT}; do \
echo "fetch $$l wiki backtranslations"; \
${MAKE} -C bt-tatoeba TRG=$$l SRC=eng fetch-bt; \
done
for l in ${sort ${shell iso639 -m -n ${WIKI_ENG2BT}}}; do \
if [ ! `find work-tatoeba/$$l-eng -name 'opus+bt.*model1.done' | wc -l` -gt 0 ]; then \
if [ `find work-tatoeba/$$l-eng -name 'opus+bt*best-perplexity.npz' | wc -l` -gt 0 ]; then \
echo "continue training $$l-eng with backtranslation data"; \
${MAKE} EMAIL= tatoeba-$${l}2eng-train-bt; \
else \
echo "start training $$l-eng with backtranslation data"; \
${MAKE} EMAIL= HPC_MEM=32g HPC_CORES=4 tatoeba-$${l}2eng-train-bt.submitcpu; \
fi \
fi \
done
wiki-all2eng-with-bt-continue:
for l in ${WIKI_ENG2BT}; do \
if [ -d work-tatoeba/eng-$$l ]; then \
if [ ! `find work-tatoeba/$$l-eng -name 'opus+bt.*model1.done' | wc -l` -gt 0 ]; then \
echo "continue training $$l-eng with backtranslation data"; \
${MAKE} EMAIL= tatoeba-$${l}2eng-train-bt; \
fi \
fi \
done
wiki-all2eng-with-bt-eval:
for l in ${WIKI_ENG2BT}; do \
if [ -d work-tatoeba/$$l-eng ]; then \
if [ `find work-tatoeba/$$l-eng -name 'opus+bt*best-perplexity.npz' | wc -l` -gt 0 ]; then \
${MAKE} EMAIL= WALLTIME=4 tatoeba-$${l}2eng-evalall-bt.submit; \
fi \
fi \
done
# if [ `find work-tatoeba/$$l-eng -name 'opus+bt.*model1.done' | wc -l` -gt 0 ]; then \
wiki-all2eng-with-bt-dist:
for l in ${WIKI_ENG2BT}; do \
if [ -d work-tatoeba/$$l-eng ]; then \
if [ `find work-tatoeba/$$l-eng -name 'opus+bt*best-perplexity.npz' | wc -l` -gt 0 ]; then \
echo "continue training $$l-eng with backtranslation data"; \
${MAKE} EMAIL= tatoeba-$${l}2eng-dist-bt; \
fi \
fi \
done
WIKI_ENG2BT_PARENTS = ${sort ${shell iso639 -m -n ${WIKI_ENG2BT} | xargs langgroup -p}}
wiki-allgroups2eng-with-bt:
for l in ${WIKI_ENG2BT_PARENTS}; do \
if [ ! `find work-tatoeba/$$l-eng -name 'opus1m+bt.*model1.done' | wc -l` -gt 0 ]; then \
if [ `find work-tatoeba/$$l-eng -name 'opus1m+bt*best-perplexity.npz' | wc -l` -gt 0 ]; then \
echo "continue training $$l-eng with backtranslation data"; \
${MAKE} EMAIL= tatoeba-$${l}2eng-train-bt-1m; \
else \
echo "start training $$l-eng with backtranslation data"; \
${MAKE} EMAIL= HPC_MEM=32g HPC_CORES=4 tatoeba-$${l}2eng-train-bt-1m.submitcpu; \
fi \
fi \
done
wiki-allgroups2eng-with-bt-continue:
for l in ${WIKI_ENG2BT_PARENTS}; do \
if [ -d work-tatoeba/eng-$$l ]; then \
if [ ! `find work-tatoeba/$$l-eng -name 'opus1m+bt.*model1.done' | wc -l` -gt 0 ]; then \
echo "continue training $$l-eng with backtranslation data"; \
${MAKE} EMAIL= tatoeba-$${l}2eng-train-bt-1m; \
fi \
fi \
done
wiki-allgroups2eng-with-bt-eval:
for l in ${WIKI_ENG2BT_PARENTS}; do \
if [ -d work-tatoeba/$$l-eng ]; then \
if [ `find work-tatoeba/$$l-eng -name 'opus1m+bt*best-perplexity.npz' | wc -l` -gt 0 ]; then \
${MAKE} EMAIL= WALLTIME=4 tatoeba-$${l}2eng-evalall-bt-1m.submit; \
fi \
fi \
done
# if [ `find work-tatoeba/$$l-eng -name 'opus1m+bt.*model1.done' | wc -l` -gt 0 ]; then \
wiki-allgroups2eng-with-bt-dist:
for l in ${WIKI_ENG2BT_PARENTS}; do \
if [ -d work-tatoeba/$$l-eng ]; then \
if [ `find work-tatoeba/$$l-eng -name 'opus1m+bt*best-perplexity.npz' | wc -l` -gt 0 ]; then \
echo "continue training $$l-eng with backtranslation data"; \
${MAKE} EMAIL= tatoeba-$${l}2eng-dist-bt-1m; \
fi \
fi \
done

View File

@ -13,7 +13,9 @@ ifeq (${SUBWORDS},spm)
${MODEL_VOCAB}: ${SPMSRCMODEL} ${SPMTRGMODEL}
ifneq (${MODEL_LATEST_VOCAB},)
ifneq (${MODEL_LATEST_VOCAB},${MODEL_VOCAB})
cp ${MODEL_LATEST_VOCAB} ${MODEL_VOCAB}
endif
else
cut -f1 < ${word 1,$^}.vocab > ${@:.vocab.yml=.src.vocab}
cut -f1 < ${word 2,$^}.vocab > ${@:.vocab.yml=.trg.vocab}
@ -39,7 +41,9 @@ ${MODEL_VOCAB}: ${TRAIN_SRC}.clean.${PRE_SRC}${TRAINSIZE}.gz \
${TRAIN_TRG}.clean.${PRE_TRG}${TRAINSIZE}.gz
ifeq ($(wildcard ${MODEL_SRCVOCAB} ${MODEL_TRGVOCAB}),)
ifneq (${MODEL_LATEST_VOCAB},)
ifneq (${MODEL_LATEST_VOCAB},${MODEL_VOCAB})
cp ${MODEL_LATEST_VOCAB} ${MODEL_VOCAB}
endif
else
mkdir -p ${dir $@}
${LOADMODS} && ${ZCAT} $^ | ${MARIAN_VOCAB} --max-size ${VOCABSIZE} > $@
@ -88,8 +92,7 @@ endif
## possible model variants
MARIAN_MODELS_DONE = ${WORKDIR}/${MODEL}.transformer.model${NR}.done \
${WORKDIR}/${MODEL}.transformer-align.model${NR}.done
MARIAN_MODELS_DONE = ${patsubst %,${WORKDIR}/${MODEL}.%.model${NR}.done,${MODELTYPES}}
MARIAN_TRAIN_PREREQS = ${TRAIN_SRC}.clean.${PRE_SRC}${TRAINSIZE}.gz \
${TRAIN_TRG}.clean.${PRE_TRG}${TRAINSIZE}.gz
@ -138,6 +141,40 @@ ifeq (${MODELTYPE},transformer-align)
MARIAN_EXTRA += --guided-alignment ${TRAIN_ALG}
endif
ifeq (${MODELTYPE},transformer-small-align)
MARIAN_ENC_DEPTH = 6
MARIAN_DEC_DEPTH = 2
MARIAN_ATT_HEADS = 8
MARIAN_DIM_EMB = 512
MARIAN_TRAIN_PREREQS += ${TRAIN_ALG}
MARIAN_EXTRA += --guided-alignment ${TRAIN_ALG} --transformer-decoder-autoreg rnn --dec-cell ssru
endif
ifeq (${MODELTYPE},transformer-tiny-align)
MARIAN_ENC_DEPTH = 3
MARIAN_DEC_DEPTH = 2
MARIAN_ATT_HEADS = 8
MARIAN_DIM_EMB = 256
MARIAN_TRAIN_PREREQS += ${TRAIN_ALG}
MARIAN_EXTRA += --guided-alignment ${TRAIN_ALG} --transformer-decoder-autoreg rnn --dec-cell ssru
endif
ifeq (${MODELTYPE},transformer-big-align)
MARIAN_ENC_DEPTH = 12
MARIAN_ATT_HEADS = 16
MARIAN_DIM_EMB = 1024
MARIAN_TRAIN_PREREQS += ${TRAIN_ALG}
MARIAN_EXTRA += --guided-alignment ${TRAIN_ALG}
GPUJOB_HPC_MEM = 16g
endif
ifeq (${MODELTYPE},transformer-big)
MARIAN_ENC_DEPTH = 12
MARIAN_ATT_HEADS = 16
MARIAN_DIM_EMB = 1024
GPUJOB_HPC_MEM = 16g
endif
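## usage sketch: selecting a model type is enough to switch architecture and alignment
## settings, e.g. (hypothetical language pair; assumes the usual train goal):
make SRCLANGS=en TRGLANGS=bcl MODELTYPE=transformer-tiny-align train   # 3/2 layers, dim 256, SSRU decoder, guided alignment
make SRCLANGS=en TRGLANGS=bcl MODELTYPE=transformer-big train          # 12 encoder layers, dim 1024, 16 attention heads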
## finally: recipe for training transformer model
@ -151,19 +188,22 @@ ${MARIAN_MODELS_DONE}: ${MARIAN_TRAIN_PREREQS}
ifeq (${wildcard ${MODEL_START}},)
ifneq (${MODEL_LATEST},)
ifneq (${MODEL_LATEST_VOCAB},)
ifneq (${MODEL_LATEST_VOCAB},${MODEL_VOCAB})
cp ${MODEL_LATEST_VOCAB} ${MODEL_VOCAB}
endif
ifneq (${MODEL_LATEST},${MODEL_START})
cp ${MODEL_LATEST} ${MODEL_START}
endif
endif
endif
endif
##--------------------------------------------------------------------
${MAKE} ${MODEL_SRCVOCAB} ${MODEL_TRGVOCAB}
${LOADMODS} && ${MARIAN_TRAIN} ${MARIAN_EXTRA} \
${MARIAN_STOP_CRITERIA} \
--model $(@:.done=.npz) \
--type transformer \
--train-sets ${word 1,$^} ${word 2,$^} ${MARIAN_TRAIN_WEIGHTS} \
--max-length 500 \
--max-length ${MARIAN_MAX_LENGTH} \
--vocabs ${MODEL_SRCVOCAB} ${MODEL_TRGVOCAB} \
--mini-batch-fit \
-w ${MARIAN_WORKSPACE} \
@ -171,8 +211,11 @@ endif
--save-freq ${MARIAN_SAVE_FREQ} \
--disp-freq ${MARIAN_DISP_FREQ} \
--log $(@:.model${NR}.done=.train${NR}.log) \
--enc-depth 6 --dec-depth 6 \
--transformer-heads 8 \
--type transformer \
--enc-depth ${MARIAN_ENC_DEPTH} \
--dec-depth ${MARIAN_DEC_DEPTH} \
--dim-emb ${MARIAN_DIM_EMB} \
--transformer-heads ${MARIAN_ATT_HEADS} \
--transformer-postprocess-emb d \
--transformer-postprocess dan \
--transformer-dropout ${MARIAN_DROPOUT} \

View File

@ -1,3 +1,8 @@
#
# opus-2020-01-20.zip
@ -15,6 +20,11 @@
| JW300.bcl.en | 56.8 | 0.705 |
# opus-2020-02-11.zip
* dataset: opus
@ -31,6 +41,11 @@
| JW300.bcl.en | 56.1 | 0.697 |
# opus+bt-2020-05-23.zip
* dataset: opus+bt
@ -58,6 +73,11 @@
| JW300.bcl.en | 57.6 | 0.712 |
# opus+nt-2021-03-29.zip
* dataset: opus+nt
@ -92,3 +112,190 @@
|---------|-------|-------|-------|--------|----|
| wikimedia.bcl-en | 10.4 | 0.320 | 525 | 27109 | 0.477 |
# opus+nt+bt-2021-04-01.zip
* dataset: opus+nt+bt
* model: transformer-align
* source language(s): bcl
* target language(s): en
* model: transformer-align
* pre-processing: normalization + SentencePiece (spm32k,spm32k)
* download: [opus+nt+bt-2021-04-01.zip](https://object.pouta.csc.fi/OPUS-MT-models/bcl-en/opus+nt+bt-2021-04-01.zip)
## Training data: opus+nt+bt
* bcl-en: JW300 (470468) new-testament (11623) wiki.aa.en-bcl (969821) wikinews.aa.en-bcl (357946)
* bcl-en: total size = 1809858
* total size (opus+nt+bt): 1809767
## Validation data
* bcl-en: wikimedia, 1153
* total-size-shuffled: 775
* devset-selected: top 250 lines of wikimedia.src.shuffled!
* testset-selected: next 525 lines of wikimedia.src.shuffled!
* devset-unused: added to traindata
* test set translations: [opus+nt+bt-2021-04-01.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/bcl-en/opus+nt+bt-2021-04-01.test.txt)
* test set scores: [opus+nt+bt-2021-04-01.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/bcl-en/opus+nt+bt-2021-04-01.eval.txt)
## Benchmarks
| testset | BLEU | chr-F | #sent | #words | BP |
|---------|-------|-------|-------|--------|----|
| wikimedia.bcl-en | 28.2 | 0.498 | 525 | 27109 | 0.799 |
# opus+nt+bt+bt-2021-04-03.zip
* dataset: opus+nt+bt+bt
* model: transformer-align
* source language(s): bcl
* target language(s): en
* model: transformer-align
* pre-processing: normalization + SentencePiece (spm32k,spm32k)
* download: [opus+nt+bt+bt-2021-04-03.zip](https://object.pouta.csc.fi/OPUS-MT-models/bcl-en/opus+nt+bt+bt-2021-04-03.zip)
## Training data: opus+nt+bt+bt
* bcl-en: JW300 (470468) new-testament (11623) wiki.aa.en-bcl (969821) wikibooks.aa.en-bcl (985129) wikinews.aa.en-bcl (357946) wikiquote.aa.en-bcl (987266) wikisource.aa.en-bcl (948077)
* bcl-en: total size = 4730330
* total size (opus+nt+bt+bt): 4730231
## Validation data
* bcl-en: wikimedia, 1153
* total-size-shuffled: 775
* devset-selected: top 250 lines of wikimedia.src.shuffled!
* testset-selected: next 525 lines of wikimedia.src.shuffled!
* devset-unused: added to traindata
* test set translations: [opus+nt+bt+bt-2021-04-03.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/bcl-en/opus+nt+bt+bt-2021-04-03.test.txt)
* test set scores: [opus+nt+bt+bt-2021-04-03.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/bcl-en/opus+nt+bt+bt-2021-04-03.eval.txt)
## Benchmarks
| testset | BLEU | chr-F | #sent | #words | BP |
|---------|-------|-------|-------|--------|----|
| wikimedia.bcl-en | 16.2 | 0.461 | 525 | 27109 | 1.000 |
# opus+nt+bt+bt+bt-2021-04-05.zip
* dataset: opus+nt+bt+bt+bt
* model: transformer-align
* source language(s): bcl
* target language(s): en
* model: transformer-align
* pre-processing: normalization + SentencePiece (spm32k,spm32k)
* download: [opus+nt+bt+bt+bt-2021-04-05.zip](https://object.pouta.csc.fi/OPUS-MT-models/bcl-en/opus+nt+bt+bt+bt-2021-04-05.zip)
## Training data: opus+nt+bt+bt+bt
* bcl-en: JW300 (470468) new-testament (11623) wiki.aa.en-bcl (969821) wikibooks.aa.en-bcl (985129) wikinews.aa.en-bcl (357946) wikiquote.aa.en-bcl (987266) wikisource.aa.en-bcl (948077)
* bcl-en: total size = 4730330
* total size (opus+nt+bt+bt+bt): 4730224
## Validation data
* bcl-en: wikimedia, 1153
* total-size-shuffled: 775
* devset-selected: top 250 lines of wikimedia.src.shuffled!
* testset-selected: next 525 lines of wikimedia.src.shuffled!
* devset-unused: added to traindata
* test set translations: [opus+nt+bt+bt+bt-2021-04-05.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/bcl-en/opus+nt+bt+bt+bt-2021-04-05.test.txt)
* test set scores: [opus+nt+bt+bt+bt-2021-04-05.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/bcl-en/opus+nt+bt+bt+bt-2021-04-05.eval.txt)
## Benchmarks
| testset | BLEU | chr-F | #sent | #words | BP |
|---------|-------|-------|-------|--------|----|
| wikimedia.bcl-en | 24.2 | 0.497 | 525 | 27109 | 1.000 |
# opus+nt+bt-2021-04-09.zip
* dataset: opus+nt+bt
* model: transformer-align
* source language(s): bcl
* target language(s): en
* model: transformer-align
* pre-processing: normalization + SentencePiece (spm32k,spm32k)
* download: [opus+nt+bt-2021-04-09.zip](https://object.pouta.csc.fi/OPUS-MT-models/bcl-en/opus+nt+bt-2021-04-09.zip)
## Training data: opus+nt+bt
* bcl-en: JW300 (470468) new-testament (11623) wiki.aa.en-bcl (969821) wikibooks.aa.en-bcl (985129) wikinews.aa.en-bcl (357946) wikiquote.aa.en-bcl (987266) wikisource.aa.en-bcl (948077)
* bcl-en: total size = 4730330
* unused dev/test data is added to training data
* total size (opus+nt+bt): 4731419
## Validation data
* bcl-en: wikimedia, 2767
* total-size-shuffled: 1966
* devset-selected: top 250 lines of wikimedia.src.shuffled!
* testset-selected: next 500 lines of wikimedia.src.shuffled!
* devset-unused: added to traindata
* test set translations: [opus+nt+bt-2021-04-09.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/bcl-en/opus+nt+bt-2021-04-09.test.txt)
* test set scores: [opus+nt+bt-2021-04-09.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/bcl-en/opus+nt+bt-2021-04-09.eval.txt)
## Benchmarks
| testset | BLEU | chr-F | #sent | #words | BP |
|---------|-------|-------|-------|--------|----|
| wikimedia.bcl-en | 33.5 | 0.562 | 500 | 28621 | 0.868 |
# opus+nt+bt-2021-04-12.zip
* dataset: opus+nt+bt
* model: transformer-align
* source language(s): bcl
* target language(s): en
* model: transformer-align
* pre-processing: normalization + SentencePiece (spm32k,spm32k)
* download: [opus+nt+bt-2021-04-12.zip](https://object.pouta.csc.fi/OPUS-MT-models/bcl-en/opus+nt+bt-2021-04-12.zip)
## Training data: opus+nt+bt
* bcl-en: JW300 (470468) new-testament (11623) wiki.aa.en-bcl (969821) wikibooks.aa.en-bcl (985129) wikinews.aa.en-bcl (357946) wikiquote.aa.en-bcl (987266) wikisource.aa.en-bcl (948077)
* bcl-en: total size = 4730330
* unused dev/test data is added to training data
* total size (opus+nt+bt): 4732437
## Validation data
* bcl-en: wikimedia, 5033
* total-size-shuffled: 4207
* devset-selected: top 1000 lines of wikimedia.src.shuffled!
* testset-selected: next 1000 lines of wikimedia.src.shuffled!
* devset-unused: added to traindata
* test set translations: [opus+nt+bt-2021-04-12.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/bcl-en/opus+nt+bt-2021-04-12.test.txt)
* test set scores: [opus+nt+bt-2021-04-12.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/bcl-en/opus+nt+bt-2021-04-12.eval.txt)
## Benchmarks
| testset | BLEU | chr-F | #sent | #words | BP |
|---------|-------|-------|-------|--------|----|
| wikimedia.bcl-en | 31.5 | 0.523 | 1000 | 31520 | 0.836 |

View File

@ -1,86 +1,261 @@
# wikimedia-2020-01-17.zip
# opus+nt+bt-2021-03-30.zip
* dataset: wikimedia
* model: transformer-align
* pre-processing: normalization + SentencePiece
* download: [wikimedia-2020-01-17.zip](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/wikimedia-2020-01-17.zip)
* test set translations: [wikimedia-2020-01-17.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/wikimedia-2020-01-17.test.txt)
* test set scores: [wikimedia-2020-01-17.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/wikimedia-2020-01-17.eval.txt)
## Benchmarks
| testset | BLEU | chr-F |
|-----------------------|-------|-------|
| JW300.en.bcl | 55.3 | 0.729 |
# opus-2020-01-20.zip
* dataset: opus
* model: transformer-align
* pre-processing: normalization + SentencePiece
* download: [opus-2020-01-20.zip](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus-2020-01-20.zip)
* test set translations: [opus-2020-01-20.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus-2020-01-20.test.txt)
* test set scores: [opus-2020-01-20.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus-2020-01-20.eval.txt)
## Benchmarks
| testset | BLEU | chr-F |
|-----------------------|-------|-------|
| JW300.en.bcl | 55.3 | 0.729 |
# opus-2020-02-11.zip
* dataset: opus
* model: transformer-align
* pre-processing: normalization + SentencePiece
* download: [opus-2020-02-11.zip](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus-2020-02-11.zip)
* test set translations: [opus-2020-02-11.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus-2020-02-11.test.txt)
* test set scores: [opus-2020-02-11.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus-2020-02-11.eval.txt)
## Benchmarks
| testset | BLEU | chr-F |
|-----------------------|-------|-------|
| JW300.en.bcl | 53.8 | 0.719 |
# opus+bt-2020-02-26.zip
* dataset: opus+bt
* model: transformer-align
* pre-processing: normalization + SentencePiece
* download: [opus+bt-2020-02-26.zip](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus+bt-2020-02-26.zip)
* test set translations: [opus+bt-2020-02-26.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus+bt-2020-02-26.test.txt)
* test set scores: [opus+bt-2020-02-26.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus+bt-2020-02-26.eval.txt)
## Benchmarks
| testset | BLEU | chr-F |
|-----------------------|-------|-------|
| JW300.en.bcl | 54.3 | 0.722 |
# opus+bt-2020-05-23.zip
* dataset: opus+bt
* dataset: opus+nt+bt
* model: transformer-align
* source language(s): en
* target language(s): bcl
* model: transformer-align
* pre-processing: normalization + SentencePiece (spm4k,spm4k)
* download: [opus+bt-2020-05-23.zip](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus+bt-2020-05-23.zip)
* test set translations: [opus+bt-2020-05-23.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus+bt-2020-05-23.test.txt)
* test set scores: [opus+bt-2020-05-23.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus+bt-2020-05-23.eval.txt)
* pre-processing: normalization + SentencePiece (spm12k,spm32k)
* download: [opus+nt+bt-2021-03-30.zip](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus+nt+bt-2021-03-30.zip)
## Training data: opus+nt+bt
## Training data: opus+bt
* en-bcl: JW300 (470468) new-testament (11623) wiki.aa (43432)
* en-bcl: total size = 525523
* total size (opus+nt+bt): 525475
* en-bcl: wikimedia (1106)
* en-bcl: total size = 1106
* unused dev/test data is added to training data
* total size (opus+bt): 458304
## Validation data
* bcl-en: wikimedia, 1153
* total-size-shuffled: 775
* devset-selected: top 250 lines of wikimedia.src.shuffled!
* testset-selected: next 525 lines of wikimedia.src.shuffled!
* devset-unused: added to traindata
* test set translations: [opus+nt+bt-2021-03-30.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus+nt+bt-2021-03-30.test.txt)
* test set scores: [opus+nt+bt-2021-03-30.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus+nt+bt-2021-03-30.eval.txt)
## Benchmarks
| testset | BLEU | chr-F |
|-----------------------|-------|-------|
| JW300.en.bcl | 55.7 | 0.736 |
| testset | BLEU | chr-F | #sent | #words | BP |
|---------|-------|-------|-------|--------|----|
| wikimedia.en-bcl | 17.3 | 0.426 | 525 | 28399 | 0.840 |
# opus+nt+bt+bt-2021-04-01.zip
* dataset: opus+nt+bt+bt
* model: transformer-align
* source language(s): en
* target language(s): bcl
* model: transformer-align
* pre-processing: normalization + SentencePiece (spm12k,spm32k)
* download: [opus+nt+bt+bt-2021-04-01.zip](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus+nt+bt+bt-2021-04-01.zip)
## Training data: opus+nt+bt+bt
* en-bcl: JW300 (470468) new-testament (11623) wiki.aa (45474)
* en-bcl: total size = 527565
* total size (opus+nt+bt+bt): 527524
## Validation data
* bcl-en: wikimedia, 1153
* total-size-shuffled: 775
* devset-selected: top 250 lines of wikimedia.src.shuffled!
* testset-selected: next 525 lines of wikimedia.src.shuffled!
* devset-unused: added to traindata
* test set translations: [opus+nt+bt+bt-2021-04-01.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus+nt+bt+bt-2021-04-01.test.txt)
* test set scores: [opus+nt+bt+bt-2021-04-01.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus+nt+bt+bt-2021-04-01.eval.txt)
## Benchmarks
| testset | BLEU | chr-F | #sent | #words | BP |
|---------|-------|-------|-------|--------|----|
| wikimedia.en-bcl | 21.6 | 0.476 | 525 | 28399 | 0.789 |
# opus+nt+bt+bt+bt-2021-04-03.zip
* dataset: opus+nt+bt+bt+bt
* model: transformer-align
* source language(s): en
* target language(s): bcl
* model: transformer-align
* pre-processing: normalization + SentencePiece (spm12k,spm32k)
* download: [opus+nt+bt+bt+bt-2021-04-03.zip](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus+nt+bt+bt+bt-2021-04-03.zip)
## Training data: opus+nt+bt+bt+bt
* en-bcl: JW300 (470468) new-testament (11623) wiki.aa (45474)
* en-bcl: total size = 527565
* total size (opus+nt+bt+bt+bt): 527496
## Validation data
* bcl-en: wikimedia, 1153
* total-size-shuffled: 775
* devset-selected: top 250 lines of wikimedia.src.shuffled!
* testset-selected: next 525 lines of wikimedia.src.shuffled!
* devset-unused: added to traindata
* test set translations: [opus+nt+bt+bt+bt-2021-04-03.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus+nt+bt+bt+bt-2021-04-03.test.txt)
* test set scores: [opus+nt+bt+bt+bt-2021-04-03.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus+nt+bt+bt+bt-2021-04-03.eval.txt)
## Benchmarks
| testset | BLEU | chr-F | #sent | #words | BP |
|---------|-------|-------|-------|--------|----|
| wikimedia.en-bcl | 22.7 | 0.482 | 525 | 28399 | 0.895 |
# opus2+nt+bt+bt+bt-2021-04-03.zip
* dataset: opus2+nt+bt+bt+bt
* model: transformer-align
* source language(s): en
* target language(s): bcl
* model: transformer-align
* pre-processing: normalization + SentencePiece (spm12k,spm32k)
* download: [opus2+nt+bt+bt+bt-2021-04-03.zip](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus2+nt+bt+bt+bt-2021-04-03.zip)
## Training data: opus2+nt+bt+bt+bt
* en-bcl: JW300 (470468) new-testament (11623) wiki.aa (45474) wiki.aa_opus+nt+bt-2021-04-01 (45474)
* en-bcl: total size = 573039
* total size (opus2+nt+bt+bt+bt): 572969
## Validation data
* bcl-en: wikimedia, 1153
* total-size-shuffled: 775
* devset-selected: top 250 lines of wikimedia.src.shuffled!
* testset-selected: next 525 lines of wikimedia.src.shuffled!
* devset-unused: added to traindata
* test set translations: [opus2+nt+bt+bt+bt-2021-04-03.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus2+nt+bt+bt+bt-2021-04-03.test.txt)
* test set scores: [opus2+nt+bt+bt+bt-2021-04-03.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus2+nt+bt+bt+bt-2021-04-03.eval.txt)
## Benchmarks
| testset | BLEU | chr-F | #sent | #words | BP |
|---------|-------|-------|-------|--------|----|
| wikimedia.en-bcl | 23.9 | 0.497 | 525 | 28399 | 0.820 |
# opus+nt+bt+bt+bt+bt-2021-04-06.zip
* dataset: opus+nt+bt+bt+bt+bt
* model: transformer-align
* source language(s): en
* target language(s): bcl
* pre-processing: normalization + SentencePiece (spm12k,spm32k)
* download: [opus+nt+bt+bt+bt+bt-2021-04-06.zip](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus+nt+bt+bt+bt+bt-2021-04-06.zip)
## Training data: opus+nt+bt+bt+bt+bt
* en-bcl: JW300 (470468) new-testament (11623) wiki.aa (45474) wiki.aa_opus+nt+bt+bt-2021-04-03 (45474) wiki.aa_opus+nt+bt-2021-04-01 (45474)
* en-bcl: total size = 618513
* total size (opus+nt+bt+bt+bt+bt): 618427
## Validation data
* bcl-en: wikimedia, 1153
* total-size-shuffled: 775
* devset-selected: top 250 lines of wikimedia.src.shuffled!
* testset-selected: next 525 lines of wikimedia.src.shuffled!
* devset-unused: added to traindata
* test set translations: [opus+nt+bt+bt+bt+bt-2021-04-06.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus+nt+bt+bt+bt+bt-2021-04-06.test.txt)
* test set scores: [opus+nt+bt+bt+bt+bt-2021-04-06.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus+nt+bt+bt+bt+bt-2021-04-06.eval.txt)
## Benchmarks
| testset | BLEU | chr-F | #sent | #words | BP |
|---------|-------|-------|-------|--------|----|
| wikimedia.en-bcl | 24.4 | 0.498 | 525 | 28399 | 0.805 |
# opus+nt+bt+bt-2021-04-10.zip
* dataset: opus+nt+bt+bt
* model: transformer-align
* source language(s): en
* target language(s): bcl
* pre-processing: normalization + SentencePiece (spm12k,spm32k)
* download: [opus+nt+bt+bt-2021-04-10.zip](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus+nt+bt+bt-2021-04-10.zip)
## Training data: opus+nt+bt+bt
* en-bcl: JW300 (470468) new-testament (11623) wiki.aa (45494) wiki.aa_opus+nt+bt+bt+bt-2021-04-05 (45474) wiki.aa_opus+nt+bt+bt-2021-04-03 (45474) wiki.aa_opus+nt+bt-2021-04-01 (45474)
* en-bcl: total size = 664007
* unused dev/test data is added to training data
* total size (opus+nt+bt+bt): 665111
## Validation data
* bcl-en: wikimedia, 2767
* total-size-shuffled: 1966
* devset-selected: top 250 lines of wikimedia.src.shuffled!
* testset-selected: next 500 lines of wikimedia.src.shuffled!
* devset-unused: added to traindata
* test set translations: [opus+nt+bt+bt-2021-04-10.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus+nt+bt+bt-2021-04-10.test.txt)
* test set scores: [opus+nt+bt+bt-2021-04-10.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus+nt+bt+bt-2021-04-10.eval.txt)
## Benchmarks
| testset | BLEU | chr-F | #sent | #words | BP |
|---------|-------|-------|-------|--------|----|
| wikimedia.en-bcl | 30.7 | 0.572 | 500 | 29131 | 0.921 |
# opus+nt+bt-2021-04-11.zip
* dataset: opus+nt+bt
* model: transformer-align
* source language(s): en
* target language(s): bcl
* pre-processing: normalization + SentencePiece (spm12k,spm32k)
* download: [opus+nt+bt-2021-04-11.zip](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus+nt+bt-2021-04-11.zip)
## Training data: opus+nt+bt
* en-bcl: JW300 (470468) new-testament (11623) wiki.aa (45494) wiki.aa_opus+nt+bt+bt+bt-2021-04-05 (45474) wiki.aa_opus+nt+bt+bt-2021-04-03 (45474) wiki.aa_opus+nt+bt-2021-04-01 (45474)
* en-bcl: total size = 664007
* unused dev/test data is added to training data
* total size (opus+nt+bt): 666118
## Validation data
* bcl-en: wikimedia, 5033
* total-size-shuffled: 4207
* devset-selected: top 1000 lines of wikimedia.src.shuffled!
* testset-selected: next 1000 lines of wikimedia.src.shuffled!
* devset-unused: added to traindata
* test set translations: [opus+nt+bt-2021-04-11.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus+nt+bt-2021-04-11.test.txt)
* test set scores: [opus+nt+bt-2021-04-11.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus+nt+bt-2021-04-11.eval.txt)
## Benchmarks
| testset | BLEU | chr-F | #sent | #words | BP |
|---------|-------|-------|-------|--------|----|
| wikimedia.en-bcl | 31.9 | 0.585 | 1000 | 27681 | 1.000 |
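As a usage sketch for the released archives listed above: the zip files follow the usual OPUS-MT package layout, so something along the following lines should work. The local directory name, the `preprocess.sh` call and its argument order are assumptions based on that layout, not taken from this page.

```bash
# hypothetical sketch -- archive-internal file names and preprocess.sh arguments are assumed
wget https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus+nt+bt-2021-04-11.zip
unzip -d en-bcl_model opus+nt+bt-2021-04-11.zip
cd en-bcl_model
echo "This is a test." \
  | ./preprocess.sh en source.spm \
  | marian-decoder -c decoder.yml
```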

scripts/evaluate/check-overlap.pl Executable file
View File

@@ -0,0 +1,124 @@
#!/usr/bin/env perl
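#
# check how much of a (big) training bitext overlaps with one or more
# dev/test bitexts
#
# USAGE: check-overlap.pl [-a] [-l] [-d] [-v] train.src train.trg dev.src dev.trg [test.src test.trg ...]
#
#   -a|--alpha       compare alphabetic characters only
#   -l|--lower-case  lowercase before comparing
#   -d|--decode-spm  undo SentencePiece segmentation before comparing
#   -v|--verbose     print overlapping sentence pairs to STDERR
#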
use utf8;
use strict;
use open qw/:std :utf8/;
use Getopt::Long;
my $AlphaOnly = 0;
my $LowerCase = 0;
my $DecodeSpm = 0;
my $verbose = 0;
GetOptions(
"alpha|a" => \$AlphaOnly,
"lower-case|l" => \$LowerCase,
"decode-spm|d" => \$DecodeSpm,
"verbose|v" => \$verbose );
my $BigSrcFile = shift(@ARGV);
my $BigTrgFile = shift(@ARGV);
my %SrcSents = ();
my %TrgSents = ();
my %SentPairs = ();
while (@ARGV){
my $SrcFile = shift(@ARGV);
my $TrgFile = shift(@ARGV);
read_pairs($SrcFile,$TrgFile);
}
my $S = open_file($BigSrcFile);
my $T = open_file($BigTrgFile);
my $total = 0;
my ($SrcExists,$TrgExists,$PairExists) = (0,0,0);
my %SrcUniqueExists = ();
my %TrgUniqueExists = ();
my %PairUniqueExists = ();
while (<$S>){
my $trg = <$T>;
&normalise($_);
&normalise($trg);
$total++;
if (exists $SrcSents{$_}){
$SrcExists++;
$SrcUniqueExists{$_}++;
}
if (exists $TrgSents{$trg}){
$TrgExists++;
$TrgUniqueExists{$trg}++;
}
if (exists $SentPairs{"$_\t$trg"}){
$PairExists++;
chomp;
unless (exists $PairUniqueExists{"$_\t$trg"}){
print STDERR "exists: $_\t$trg\n" if ($verbose);
$PairUniqueExists{"$_\t$trg"}++;
}
}
}
my $TotalSmall = scalar keys %SentPairs;
if ($total){
printf "source sentences from train found in devtest\t%d\t%5.2f\%\n",$SrcExists,100*$SrcExists/$total;
printf "target sentences from train found in devtest\t%d\t%5.2f\%\n",$TrgExists,100*$TrgExists/$total;
printf " sentence pairs from train found in devtest\t%d\t%5.2f\%\n",$PairExists,100*$PairExists/$total;
print "total size of training data\t",$total,"\n";
}
if ($TotalSmall){
my $SrcExistsSmall = scalar keys %SrcUniqueExists;
my $TrgExistsSmall = scalar keys %TrgUniqueExists;
my $PairExistsSmall = scalar keys %PairUniqueExists;
printf "source sentences from devtest found in train\t%d\t%5.2f\%\n",$SrcExistsSmall,100*$SrcExistsSmall/$TotalSmall;
printf "target sentences from devtest found in train\t%d\t%5.2f\%\n",$TrgExistsSmall,100*$TrgExistsSmall/$TotalSmall;
printf " sentence pairs from devtest found in train\t%d\t%5.2f\%\n",$PairExistsSmall,100*$PairExistsSmall/$TotalSmall;
print "total size of devtest data\t",$TotalSmall,"\n";
}
sub read_pairs{
my ($SrcFile,$TrgFile) = @_;
my $S = open_file($SrcFile);
my $T = open_file($TrgFile);
while (<$S>){
my $trg = <$T>;
&normalise($_);
&normalise($trg);
$SrcSents{$_} = 1;
$TrgSents{$trg} = 1;
$SentPairs{"$_\t$trg"} = 1;
}
close $S;
close $T;
}
sub open_file{
    my $file = shift;
    my $handle;
    if ($file=~/\.gz$/){
        # "or die" instead of "|| die": "||" would bind to the command string
        # and the error message would never trigger
        open $handle,"gzip -cd <$file |" or die "cannot open $file\n";
        return $handle;
    }
    open $handle,"<$file" or die "cannot open $file\n";
    return $handle;
}
sub normalise{
    # normalise the argument in place (elements of @_ are aliases)
    $_[0]=~s/\P{IsAlpha}//gs if ($AlphaOnly);
    $_[0] = lc($_[0]) if ($LowerCase);
    if ($DecodeSpm){
        # if SentencePiece markers are present, compare the space-free character sequence
        if ($_[0]=~s/▁/ /g){
            $_[0]=~s/ //g;
        }
    }
}
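A possible invocation (file names are purely illustrative): the script takes the training bitext first, followed by any number of dev/test source/target file pairs, and gzipped input is handled transparently.

```bash
# hypothetical paths -- adjust to the actual work directory layout
perl scripts/evaluate/check-overlap.pl -l -a \
    work/en-bcl/train/opus.src.clean.gz work/en-bcl/train/opus.trg.clean.gz \
    work/en-bcl/val/wikimedia.src  work/en-bcl/val/wikimedia.trg \
    work/en-bcl/test/wikimedia.src work/en-bcl/test/wikimedia.trg
```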

View File

@@ -0,0 +1,31 @@
#!/usr/bin/env perl
#
# compare base models with their back-translation (+bt) variants in a
# tab-separated result list (model URL expected in the fourth field) and
# print the score difference for each pair
#
use strict;

my %basemodel = ();
my %btmodel = ();
while (<>){
chomp;
s/https:\/\/object\.pouta\.csc\.fi\/Tatoeba\-MT\-models\///;
my @fields = split(/\t/);
if ($fields[3]=~/^(.*)\+bt-....-..-..\.zip/){
unless (exists $btmodel{"$fields[0]\t$1"}){
$btmodel{"$fields[0]\t$1"} = $_;
}
}
elsif ($fields[3]=~/^(.*)-....-..-..\.zip/){
unless (exists $basemodel{"$fields[0]\t$1"}){
$basemodel{"$fields[0]\t$1"} = $_;
}
}
}
foreach (sort keys %btmodel){
if (exists $basemodel{$_} and $btmodel{$_}){
print "base\t", $basemodel{$_},"\n";
print "base+bt\t", $btmodel{$_},"\n";
my @base = split(/\t/,$basemodel{$_});
my @bt = split(/\t/,$btmodel{$_});
$bt[1] = sprintf("%5.3f",$bt[1] - $base[1]);
$bt[2] = sprintf("%5.2f",$bt[2] - $base[2]);
print "diff\t", join("\t",@bt),"\n\n";
}
}
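The expected input format is not shown in this commit; judging from the regular expressions above, each line is a tab-separated record with the model URL in the fourth field. A toy invocation under that assumption (the script path is hypothetical, since the diff does not show the file name):

```bash
# assumed field order: <testset> <chrF> <BLEU> <model-URL>
{
  printf 'tatoeba-test\t0.612\t44.3\thttps://object.pouta.csc.fi/Tatoeba-MT-models/deu-eng/opus-2021-02-22.zip\n'
  printf 'tatoeba-test\t0.624\t45.7\thttps://object.pouta.csc.fi/Tatoeba-MT-models/deu-eng/opus+bt-2021-04-13.zip\n'
} | perl scripts/compare-bt-models.pl   # prints base, base+bt and their score difference
```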

scripts/filter/filter-korean.sh Executable file
View File

@@ -0,0 +1,54 @@
#!/usr/bin/bash
#
# extra filtering for Korean data:
# strip invisible characters (word joiner, zero-width space, BOM);
# an older, slower check that keeps only Hangul-script lines is left below but disabled
#
# USAGE: filter-korean.sh srclangid trglangid [test] < tab-separated-bitext > filtered-bitext
#        (passing "test" as the third argument leaves test sets untouched)
#
# temp files (only needed by the disabled script check at the bottom)
tmpsrc=`mktemp`
tmptrg=`mktemp`
tmplang=`mktemp`
column=0
if [ "$1" == "kor" ] || [ "$1" == "ko" ]; then
column=1
elif [ "$2" == "kor" ] || [ "$2" == "ko" ]; then
column=2
fi
## don't touch test sets
if [ "$3" == "test" ]; then
column=0
fi
if [ $column -gt 0 ]; then
echo "... filter Korean bitexts" >&2
perl -CIOE -pe 's/[\x{2060}\x{200B}\x{feff}]//g'
else
cat
fi
## OLD: check script
## this is slow ....
# if [ $column -gt 0 ]; then
# echo "... filter Korean bitexts ($tmplang $tmpsrc $tmptrg)" >&2
# perl -CIOE -pe 's/[\x{2060}\x{200B}\x{feff}]//g' |
# tee >(cut -f1 > $tmpsrc) >(cut -f2 > $tmptrg) |
# cut -f$column |
# perl -CIOE -pe 'use utf8;s/\p{P}//g;s/[^\S\n]//g;s/▁//g;s/[0-9]//g' |
# langscript -a > $tmplang
# paste $tmplang $tmpsrc $tmptrg |
# grep $'Hang ([0-9]*)\s*\t' |
# cut -f2,3
# rm -f $tmplang $tmpsrc $tmptrg
# else
# cat
# fi
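A possible way to apply the filter to a gzipped Korean-English training bitext (the file names below are illustrative only; the script reads the tab-separated bitext on stdin):

```bash
paste <(gzip -cd train.kor.gz) <(gzip -cd train.eng.gz) \
  | scripts/filter/filter-korean.sh kor eng \
  > train.kor-eng.filtered.tsv
```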

View File

@@ -93,6 +93,7 @@ else
-e 's/】/\]/g' \
-e 's/％/\%/g' |
perl -C -pe 's/(?!\n)\p{C}/ /g;' |
perl -CIOE -pe 's/[\x{2060}\x{200B}\x{feff}]//g' |\
sed 's/  */ /g;s/^ *//g;s/ *$//g' |
${SPMENCODE} --model $3 |
sed "s/^/>>$2<< /"

View File

@@ -53,6 +53,7 @@ sed -e 's/，/,/g' \
-e 's/】/\]/g' \
-e 's/％/\%/g' |
perl -C -pe 's/(?!\n)\p{C}/ /g;' |
perl -CIOE -pe 's/[\x{2060}\x{200B}\x{feff}]//g' |\
sed 's/  */ /g;s/^ *//g;s/ *$//g' |
${SPMENCODE} --model $2
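Both hunks add the same cleanup step to the pre-processing pipelines: a Perl one-liner that deletes word joiner (U+2060), zero-width space (U+200B) and byte-order mark (U+FEFF) characters before SentencePiece encoding. A quick standalone check of that one-liner (the input string is made up):

```bash
# U+200B (zero-width space) is encoded as the UTF-8 bytes e2 80 8b
printf 'zero\xe2\x80\x8bwidth\n' | perl -CIOE -pe 's/[\x{2060}\x{200B}\x{feff}]//g'
# expected output: zerowidth
```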

View File

@@ -0,0 +1 @@
newsdev2019-enkk.kaz.gz

View File

@@ -0,0 +1 @@
newstest2019-enkk.kaz.gz

testsets/eng-kaz_Cyrl Symbolic link
View File

@@ -0,0 +1 @@
eng-kaz

View File

@@ -0,0 +1 @@
../de-fi/goethe-institute-test1.de.gz

View File

@@ -0,0 +1 @@
../de-fi/goethe-institute-test1.fi.gz

View File

@@ -0,0 +1 @@
../de-fi/goethe-institute-test2.de.gz

View File

@@ -0,0 +1 @@
../de-fi/goethe-institute-test2.fi.gz

View File

@@ -0,0 +1 @@
../deu-fin/goethe-institute-test1.deu.gz

View File

@@ -0,0 +1 @@
../deu-fin/goethe-institute-test1.fin.gz

View File

@@ -0,0 +1 @@
../deu-fin/goethe-institute-test2.deu.gz

View File

@@ -0,0 +1 @@
../deu-fin/goethe-institute-test2.fin.gz