added recipes for tatoeba models other than English

Joerg Tiedemann 2021-05-04 08:49:16 +03:00
parent cde8f0d0af
commit 200662863e
13 changed files with 971 additions and 308 deletions


@ -523,17 +523,17 @@ endif
@echo "" >> ${dir ${DEV_SRC}}/README.md
@echo -n "* devset-selected: top " >> ${dir ${DEV_SRC}}/README.md
@wc -l < ${DEV_SRC} | tr "\n" ' ' >> ${dir ${DEV_SRC}}/README.md
@echo " lines of ${notdir $@}.shuffled!" >> ${dir ${DEV_SRC}}/README.md
@echo " lines of ${notdir $@}.shuffled" >> ${dir ${DEV_SRC}}/README.md
ifeq (${DEVSET},${TESTSET})
@echo -n "* testset-selected: next " >> ${dir ${DEV_SRC}}/README.md
@wc -l < ${TEST_SRC} | tr "\n" ' ' >> ${dir ${DEV_SRC}}/README.md
@echo " lines of ${notdir $@}.shuffled!" >> ${dir ${DEV_SRC}}/README.md
@echo " lines of ${notdir $@}.shuffled " >> ${dir ${DEV_SRC}}/README.md
@echo "* devset-unused: added to traindata" >> ${dir ${DEV_SRC}}/README.md
@echo "# Test data" > ${dir ${TEST_SRC}}/README.md
@echo "" >> ${dir ${TEST_SRC}}/README.md
@echo -n "testset-selected: next " >> ${dir ${TEST_SRC}}/README.md
@wc -l < ${TEST_SRC} | tr "\n" ' ' >> ${dir ${TEST_SRC}}/README.md
@echo " lines of ../val/${notdir $@}.shuffled!" >> ${dir ${TEST_SRC}}/README.md
@echo " lines of ../val/${notdir $@}.shuffled" >> ${dir ${TEST_SRC}}/README.md
endif


@ -34,6 +34,8 @@ get-model-distro = ${shell echo ${wildcard ${1}/${2}/*.zip} | tr ' ' "\n" | LAN
find-model:
@echo ${call get-model-dist,${LANGPAIRSTR}}
@ -42,7 +44,12 @@ find-model:
MIN_BLEU_SCORE = 20
.PHONY: dist local-dist global-dist release
dist: ${DIST_PACKAGE}
## create a symbolic link to the latest model
## and make the package
dist:
${MAKE} link-latest-model
${MAKE} ${DIST_PACKAGE}
## local distribution in workhome, no restrictions about BLEU
local-dist:
@ -206,9 +213,9 @@ RAWTRGLANGS = ${sort ${basename ${basename ${subst _,.,${subst -,.,${TRGLANGS}}}
## advantage: list all labels that are valid in the model
## disadvantage: can be misleading because we may have labels that are not trained
##
LANGUAGELABELS = ${shell grep '">>.*<<"' ${MODEL_SRCVOCAB} | cut -f1 -d: | sed 's/"//g'}
LANGUAGELABELSRAW = ${shell echo "${LANGUAGELABELS}" | sed 's/>>//g;s/<<//g'}
LANGUAGELABELS = ${shell grep '">>.*<<"' ${MODEL_SRCVOCAB} | cut -f1 -d: | sed 's/"//g'}
LANGUAGELABELSRAW = ${shell echo "${LANGUAGELABELS}" | sed 's/>>//g;s/<<//g'}
LANGUAGELABELSUSED = $(filter ${TRGLANGS},${LANGUAGELABELSRAW})
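##
## example of what the pipeline above extracts (vocab file name and entries are
## illustrative, not taken from a specific model): a Marian yml vocab may contain
## entries such as
##   ">>fin<<": 7
##   ">>swe<<": 8
## from which the grep/cut/sed pipeline yields
##   LANGUAGELABELS    = >>fin<< >>swe<<
##   LANGUAGELABELSRAW = fin swe
##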
model-yml: ${MODEL_YML}
@ -249,24 +256,15 @@ ${MODEL_YML}: ${MODEL_FINAL}
@echo "dataset-name: $(DATASET)" >> $@
@echo "modeltype: $(MODELTYPE)" >> $@
@echo "vocabulary:" >> $@
@echo " source: ${notdir ${MODEL_SRCVOCAB}}" >> $@
@echo " target: ${notdir ${MODEL_TRGVOCAB}}" >> $@
@echo " source: ${notdir ${MODEL_SRCVOCAB}}" >> $@
@echo " target: ${notdir ${MODEL_TRGVOCAB}}" >> $@
@echo "pre-processing: ${PREPROCESS_DESCRIPTION}" >> $@
@echo "subwords:" >> $@
@echo " source: ${PRE_SRC}" >> $@
@echo " target: ${PRE_TRG}" >> $@
@echo " source: ${PRE_SRC}" >> $@
@echo " target: ${PRE_TRG}" >> $@
@echo "subword-models:" >> $@
@echo " source: source.${SUBWORD_TYPE}" >> $@
@echo " target: target.${SUBWORD_TYPE}" >> $@
ifdef USE_TARGET_LABELS
@echo "use-target-labels:" >> $@
@for t in ${LANGUAGELABELSRAW}; do \
echo " - \">>$$t<<\"" >> $@; \
done
# @for t in ${TRGLANGS}; do \
# echo " - '>>$$t<<'" >> $@; \
# done
endif
@echo " source: source.${SUBWORD_TYPE}" >> $@
@echo " target: target.${SUBWORD_TYPE}" >> $@
@echo "source-languages:" >> $@
@for s in ${RAWSRCLANGS}; do\
echo " - $$s" >> $@; \
@ -275,17 +273,26 @@ endif
@for t in ${RAWTRGLANGS}; do\
echo " - $$t" >> $@; \
done
ifdef USE_TARGET_LABELS
@echo "use-target-labels:" >> $@
@for t in ${LANGUAGELABELSUSED}; do \
echo " - \">>$$t<<\"" >> $@; \
done
endif
ifneq ("$(wildcard ${WORKDIR}/train/README.md)","")
@echo "training-data:" >> $@
@tr "\n" "~" < ${WORKDIR}/train/README.md |\
tr "#" "\n" | grep '^ ${DATASET}~' | \
tail -1 | tr "~" "\n" | grep '^\* ' | \
grep -v ': *$$' | grep -v ' 0$$' | \
grep -v 'unused dev/test' | \
grep -v 'total size' | sed 's/^\* / /' >> $@
endif
ifneq ("$(wildcard ${WORKDIR}/val/README.md)","")
@echo "validation-data:" >> $@
grep '^\* ' ${WORKDIR}/val/README.md | \
sed 's/total size of shuffled dev data:/total-size-shuffled:/' | \
sed 's/devset =/devset-selected:/' | \
grep -v ' 0$$' | \
sed 's/^\* / /' >> $@
endif
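##
## example of the README-to-YAML rewrite above (values are illustrative):
##   "* total size of shuffled dev data: 775"        becomes  "total-size-shuffled: 775"
##   "* devset = top 250 lines of xxx.src.shuffled"  becomes  "devset-selected: top 250 lines of xxx.src.shuffled"
## the leading "* " is replaced by indentation so that the lines nest under
## the validation-data: (or training-data:) key
##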
@ -390,7 +397,12 @@ endif
link-latest-model:
if [ `ls ${patsubst %.zip,%-*,${DIST_PACKAGE}} 2>/dev/null | wc -l` -gt 0 ]; then \
cd ${dir ${DIST_PACKAGE}}; \
ln -s `ls -t ${patsubst %.zip,%-*.zip,$(notdir ${DIST_PACKAGE})} | head -1` \
${notdir ${DIST_PACKAGE}}; \
fi
${DIST_PACKAGE}: ${MODEL_FINAL}
@ -505,182 +517,6 @@ endif
##### ------------------------------------
##### OLD release recipe: all in one
##### ------------------------------------
# ${DIST_PACKAGE}: ${MODEL_FINAL}
# ifneq (${SKIP_DIST_EVAL},1)
# @${MAKE} $(TEST_EVALUATION)
# @${MAKE} $(TEST_COMPARISON)
# endif
# @mkdir -p ${dir $@}
# @touch ${WORKDIR}/source.tcmodel
# @cp ${PREPROCESS_SRCMODEL} ${WORKDIR}/source.${SUBWORD_TYPE}
# @cp ${PREPROCESS_TRGMODEL} ${WORKDIR}/target.${SUBWORD_TYPE}
# @cp ${PREPROCESS_SCRIPT} ${WORKDIR}/preprocess.sh
# @cp ${POSTPROCESS_SCRIPT} ${WORKDIR}/postprocess.sh
# ##-----------------------------
# ## create YML file
# ##-----------------------------
# @echo "release: ${LANGPAIRSTR}/$(notdir ${@:.zip=})-${DATE}.zip" > ${@:.zip=}-${DATE}.yml
# @echo "release-date: $(DATE)" >> ${@:.zip=}-${DATE}.yml
# @echo "dataset-name: $(DATASET)" >> ${@:.zip=}-${DATE}.yml
# @echo "modeltype: $(MODELTYPE)" >> ${@:.zip=}-${DATE}.yml
# @echo "pre-processing: ${PREPROCESS_DESCRIPTION}" >> ${@:.zip=}-${DATE}.yml
# @echo "subwords:" >> ${@:.zip=}-${DATE}.yml
# @echo " - source: ${PRE_SRC}" >> ${@:.zip=}-${DATE}.yml
# @echo " - target: ${PRE_TRG}" >> ${@:.zip=}-${DATE}.yml
# @echo "subword-models:" >> ${@:.zip=}-${DATE}.yml
# @echo " - source: source.${SUBWORD_TYPE}" >> ${@:.zip=}-${DATE}.yml
# @echo " - target: target.${SUBWORD_TYPE}" >> ${@:.zip=}-${DATE}.yml
# ifdef USE_TARGET_LABELS
# @echo "use-target-labels:" >> ${@:.zip=}-${DATE}.yml
# @for t in ${TRGLANGS}; do \
# echo " - >>$$t<<" >> ${@:.zip=}-${DATE}.yml; \
# done
# endif
# @echo "source-languages:" >> ${@:.zip=}-${DATE}.yml
# @for s in ${RAWSRCLANGS}; do\
# echo " - $$s" >> ${@:.zip=}-${DATE}.yml; \
# done
# @echo "target-languages:" >> ${@:.zip=}-${DATE}.yml
# @for t in ${RAWTRGLANGS}; do\
# echo " - $$t" >> ${@:.zip=}-${DATE}.yml; \
# done
# ifneq ("$(wildcard ${WORKDIR}/train/README.md)","")
# @echo "training-data:" >> ${@:.zip=}-${DATE}.yml
# @tr "\n" "~" < ${WORKDIR}/train/README.md |\
# tr "#" "\n" | grep '^ ${DATASET}~' | \
# tail -1 | tr "~" "\n" | grep '^\* ' | \
# grep -v ': *$$' | grep -v ' 0$$' | \
# grep -v 'total size' | sed 's/^\* / - /' >> ${@:.zip=}-${DATE}.yml
# endif
# ifneq ("$(wildcard ${WORKDIR}/val/README.md)","")
# @echo "validation-data:" >> ${@:.zip=}-${DATE}.yml
# grep '^\* ' ${WORKDIR}/val/README.md | \
# grep -v ' 0$$' | \
# sed 's/^\* / - /' >> ${@:.zip=}-${DATE}.yml
# endif
# ##-----------------------------
# ## create README-file
# ##-----------------------------
# @echo "# $(notdir ${@:.zip=})-${DATE}.zip" > ${WORKDIR}/README.md
# @echo '' >> ${WORKDIR}/README.md
# @echo "* dataset: ${DATASET}" >> ${WORKDIR}/README.md
# @echo "* model: ${MODELTYPE}" >> ${WORKDIR}/README.md
# @echo "* source language(s): ${RAWSRCLANGS}" >> ${WORKDIR}/README.md
# @echo "* target language(s): ${RAWTRGLANGS}" >> ${WORKDIR}/README.md
# @echo "* model: ${MODELTYPE}" >> ${WORKDIR}/README.md
# @echo "* pre-processing: ${PREPROCESS_DESCRIPTION}" >> ${WORKDIR}/README.md
# ifdef USE_TARGET_LABELS
# echo '* a sentence initial language token is required in the form of `>>id<<` (id = valid target language ID)' >> ${WORKDIR}/README.md
# @echo "* valid language labels: ${LANGUAGELABELS}" >> ${WORKDIR}/README.md
# endif
# @echo "* download: [$(notdir ${@:.zip=})-${DATE}.zip](${MODELS_URL}/${LANGPAIRSTR}/$(notdir ${@:.zip=})-${DATE}.zip)" >> ${WORKDIR}/README.md
# ifneq (${SKIP_DATA_DETAILS},1)
# ifneq ("$(wildcard ${WORKDIR}/train/README.md)","")
# @echo -n "## Training data: " >> ${WORKDIR}/README.md
# @tr "\n" "~" < ${WORKDIR}/train/README.md |\
# tr "#" "\n" | grep '${DATASET}' | \
# tail -1 | tr "~" "\n" >> ${WORKDIR}/README.md
# @echo '' >> ${WORKDIR}/README.md
# endif
# ifneq ("$(wildcard ${WORKDIR}/val/README.md)","")
# @echo -n "#" >> ${WORKDIR}/README.md
# @cat ${WORKDIR}/val/README.md >> ${WORKDIR}/README.md
# @echo '' >> ${WORKDIR}/README.md
# endif
# endif
# ##-----------------------------
# ## add benchmark results
# ##-----------------------------
# ifneq ("$(wildcard ${TEST_EVALUATION})","")
# @echo "* test set translations: [$(notdir ${@:.zip=})-${DATE}.test.txt](${MODELS_URL}/${LANGPAIRSTR}/$(notdir ${@:.zip=})-${DATE}.test.txt)" >> ${WORKDIR}/README.md
# @echo "* test set scores: [$(notdir ${@:.zip=})-${DATE}.eval.txt](${MODELS_URL}/${LANGPAIRSTR}/$(notdir ${@:.zip=})-${DATE}.eval.txt)" >> ${WORKDIR}/README.md
# @echo '' >> ${WORKDIR}/README.md
# @echo '## Benchmarks' >> ${WORKDIR}/README.md
# @echo '' >> ${WORKDIR}/README.md
# ## grep and normalise test set names
# ## ugly perl script that does some tansformation of language codes
# @grep -H BLEU ${WORKDIR}/*.${DATASET}.${PRE_SRC}-${PRE_TRG}${NR}.${MODELTYPE}.*.eval | \
# sed 's#^${WORKDIR}/\(.*\)\.${DATASET}.${PRE_SRC}-${PRE_TRG}${NR}.${MODELTYPE}\.\(.*\)\.eval:.*$$#\1.\2#' | \
# perl -pe 'if (/\.([^\.]+)\.([^\.\s]+)$$/){$$s=$$1;$$t=$$2;s/[\-\.]$$s?\-?$$t\.$$s\.$$t?$$/.$$s.$$t/;s/\.$$s\.$$t$$/.$$s-$$t/}' > $@.1
# @grep BLEU ${WORKDIR}/*.${DATASET}.${PRE_SRC}-${PRE_TRG}${NR}.${MODELTYPE}.*.eval | \
# cut -f3 -d ' ' > $@.2
# @grep chrF ${WORKDIR}/*.${DATASET}.${PRE_SRC}-${PRE_TRG}${NR}.${MODELTYPE}.*.eval | \
# cut -f3 -d ' ' > $@.3
# @ls ${WORKDIR}/*.${DATASET}.${PRE_SRC}-${PRE_TRG}${NR}.${MODELTYPE}.*.eval | \
# sed 's/\.eval//' | xargs wc -l | grep -v total | sed 's/^ *//' | cut -f1 -d' ' > $@.4
# @grep BLEU ${WORKDIR}/*.${DATASET}.${PRE_SRC}-${PRE_TRG}${NR}.${MODELTYPE}.*.eval | \
# cut -f16 -d ' ' | sed 's/)//' > $@.5
# @grep BLEU ${WORKDIR}/*.${DATASET}.${PRE_SRC}-${PRE_TRG}${NR}.${MODELTYPE}.*.eval | \
# cut -f7 -d ' ' > $@.6
# @paste -d '/' $@.4 $@.5 > $@.7
# @echo '| testset | BLEU | chr-F | #sent | #words | BP |' >> ${WORKDIR}/README.md
# @echo '|---------|-------|-------|-------|--------|----|' >> ${WORKDIR}/README.md
# @paste $@.1 $@.2 $@.3 $@.4 $@.5 $@.6 | \
# sed "s/\t/ | /g;s/^/| /;s/$$/ |/" | \
# sort | uniq >> ${WORKDIR}/README.md
# @echo "test-data:" >> ${@:.zip=}-${DATE}.yml
# @paste -d' ' $@.1 $@.7 | sed 's/ /: /;s/^/ - /;' >> ${@:.zip=}-${DATE}.yml
# @echo "BLEU-scores:" >> ${@:.zip=}-${DATE}.yml
# @paste -d' ' $@.1 $@.2 | sed 's/ /: /;s/^/ - /' >> ${@:.zip=}-${DATE}.yml
# @echo "chr-F-scores:" >> ${@:.zip=}-${DATE}.yml
# @paste -d' ' $@.1 $@.3 | sed 's/ /: /;s/^/ - /' >> ${@:.zip=}-${DATE}.yml
# @rm -f $@.1 $@.2 $@.3 $@.4 $@.5 $@.6 $@.7 $@.testsize $@.testset
# endif
# ##-----------------------------
# ## create the package
# ##-----------------------------
# @cat ${WORKDIR}/README.md >> ${dir $@}README.md
# @echo '' >> ${dir $@}README.md
# @cp models/LICENSE ${WORKDIR}/
# @chmod +x ${WORKDIR}/preprocess.sh
# @sed -e 's# - .*/\([^/]*\)$$# - \1#' \
# -e 's/beam-size: [0-9]*$$/beam-size: 6/' \
# -e 's/mini-batch: [0-9]*$$/mini-batch: 1/' \
# -e 's/maxi-batch: [0-9]*$$/maxi-batch: 1/' \
# -e 's/relative-paths: false/relative-paths: true/' \
# < ${MODEL_DECODER} > ${WORKDIR}/decoder.yml
# cd ${WORKDIR} && zip ${notdir $@} \
# README.md LICENSE \
# ${notdir ${MODEL_FINAL}} \
# ${notdir ${MODEL_SRCVOCAB}} \
# ${notdir ${MODEL_TRGVOCAB}} \
# ${notdir ${MODEL_VALIDLOG}} \
# ${notdir ${MODEL_TRAINLOG}} \
# source.* target.* decoder.yml \
# preprocess.sh postprocess.sh
# ifneq ("$(wildcard ${WORKDIR}/${MODELCONFIG})","")
# @cd ${WORKDIR} && zip -u ${notdir $@} ${MODELCONFIG}
# endif
# ##-----------------------------
# ## move files to release dir and cleanup
# ##-----------------------------
# @mkdir -p ${dir $@}
# @mv -f ${WORKDIR}/${notdir $@} ${@:.zip=}-${DATE}.zip
# @cd ${dir $@} && zip -u ${notdir ${@:.zip=}-${DATE}.zip} ${notdir ${@:.zip=}-${DATE}.yml}
# ifneq ("$(wildcard ${TEST_EVALUATION})","")
# @cp $(TEST_EVALUATION) ${@:.zip=}-${DATE}.eval.txt
# @cp $(TEST_COMPARISON) ${@:.zip=}-${DATE}.test.txt
# endif
# @rm -f $@
# @cd ${dir $@} && ln -s $(notdir ${@:.zip=})-${DATE}.zip ${notdir $@}
# @rm -f ${WORKDIR}/decoder.yml ${WORKDIR}/source.* ${WORKDIR}/target.*
# @rm -f ${WORKDIR}/preprocess.sh ${WORKDIR}/postprocess.sh
## do this only if the flag is set
## --> avoid expensive wildcard searches each time make is called
@ -695,6 +531,7 @@ endif
# source project_2000661-openrc.sh
#
# - make upload ......... released models = all sub-dirs in models/
# - make upload-model ... upload model for current language pair
# - make upload-models .. trained models in current WORKHOME to OPUS-MT-dev
# - make upload-scores .. score file with benchmark results to OPUS-MT-eval
# - make upload-eval .... benchmark tests from models in WORKHOME
@ -712,6 +549,17 @@ upload:
swift upload ${MODEL_CONTAINER} index.txt
rm -f index.txt
.PHONY: upload-model
upload-model:
find ${RELEASEDIR}/ -type l | tar -cf models-links.tar -T -
find ${RELEASEDIR}/ -type l -delete
cd ${RELEASEDIR} && swift upload ${MODEL_CONTAINER} --changed --skip-identical ${LANGPAIRSTR}
tar -xf models-links.tar
rm -f models-links.tar
swift post ${MODEL_CONTAINER} --read-acl ".r:*"
swift list ${MODEL_CONTAINER} > index.txt
swift upload ${MODEL_CONTAINER} index.txt
rm -f index.txt
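## note: the symbolic links in the release directory are saved to a tar archive
## and deleted before the swift upload, then restored afterwards, presumably so
## that the undated "latest" links created by link-latest-model are not uploaded
## as duplicate copies of the model packages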
.PHONY: upload-models
upload-models:
@ -968,7 +816,7 @@ dist-remove-no-date-dist:
dist-remove-old-yml:
swift list Tatoeba-MT-models > index.txt
for d in `grep old-yml index.txt`; do \
for d in `grep yml-old index.txt`; do \
swift delete Tatoeba-MT-models $$d; \
done
@ -993,3 +841,21 @@ dist-fix-preprocess:
rm -f $$d; \
done )
## fix yet another error in YAML files
# YMLFILES = ${wildcard models-tatoeba/eng-*/*-2021-04-10.yml}
# OLDYMLFILES = ${patsubst %.yml,%.yml-old,${YMLFILES}}
# ${OLDYMLFILES}: %.yml-old: %.yml
# mv $< $@
# sed -e 's/devset =/devset-selected:/' \
# -e 's/testset =/testset-selected:/' \
# -e 's/total size of shuffled dev data:/total-size-shuffled:/' < $@ |\
# grep -v 'unused dev/test' > $<
# touch $@
# fix-yml-files: ${OLDYMLFILES}


@ -68,8 +68,8 @@ WORKHOME = ${PWD}/work
ifeq (${shell hostname -d 2>/dev/null},mahti.csc.fi)
# CSCPROJECT = project_2002688
CSCPROJECT = project_2003093
CSCPROJECT = project_2002688
# CSCPROJECT = project_2003093
# CSCPROJECT = project_2002982
WORKHOME = ${shell realpath ${PWD}/work}
APPLHOME = /projappl/project_2003093/
@ -182,7 +182,6 @@ MARIAN_VOCAB = ${MARIAN_HOME}marian-vocab
TOKENIZER = ${MOSESSCRIPTS}/tokenizer
## BPE
SUBWORD_BPE ?= ${shell which subword-nmt 2>/dev/null || echo ${TOOLSDIR}/subword-nmt/subword_nmt/subword_nmt.py}
SUBWORD_HOME ?= ${dir ${SUBWORD_BPE}}


@ -270,6 +270,18 @@ endif
# CLEAN_TRAIN_TRG="${CLEAN_TRAIN_TRG} ${BACKTRANS_TRG}" \
## train on back-translations only
%-btonly:
rm -f ${WORKHOME}/${LANGPAIRSTR}/train.submit
${MAKE} DATASET=${DATASET}+btonly \
USE_BACKTRANS=1 \
CONTINUE_EXISTING=1 \
MODELCONFIG=config-bt.mk \
TRAINSET= TATOEBA_TRAINSET= \
${@:-btonly=}
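##
## example (hypothetical target stem): "make tatoeba-eng2fin-train-btonly" would
## re-run "tatoeba-eng2fin-train" with DATASET=${DATASET}+btonly, USE_BACKTRANS=1,
## CONTINUE_EXISTING=1 and empty TRAINSET/TATOEBA_TRAINSET, i.e. it fine-tunes the
## existing model on back-translations alone
##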
PIVOT_MODEL = ${MODEL_SUBDIR}${DATASET}+pivot${TRAINSIZE}.${PRE_SRC}-${PRE_TRG}
PIVOT_MODEL_BASE = ${PIVOT_MODEL}.${MODELTYPE}.model${NR}


@ -13,12 +13,18 @@
## - should we increase the length filter when cleaning later? How much?
## - should we apply some other cleanup scripts here to get rid of some messy stuff?
## should we remove zero-width spaces?
## perl -CIOE -pe 's/[\x{2060}\x{200B}\x{feff}]//g'
%.clean.${SRCEXT}.gz: %.${SRCEXT}.${PRE} %.${TRGEXT}.${PRE}
cat ${word 1,$^} |\
perl -CS -pe 'tr[\x{9}\x{A}\x{D}\x{20}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}][]cd;' |\
perl -CIOE -pe 's/[\x{2060}\x{200B}\x{feff}]//g' |\
perl -CS -pe 's/\&\s*\#\s*160\s*\;/ /g' > $@.1
cat ${word 2,$^} |\
perl -CS -pe 'tr[\x{9}\x{A}\x{D}\x{20}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}][]cd;' |\
perl -CIOE -pe 's/[\x{2060}\x{200B}\x{feff}]//g' |\
perl -CS -pe 's/\&\s*\#\s*160\s*\;/ /g' > $@.2
paste $@.1 $@.2 |\
scripts/filter/bitext-match-lang.py -s ${SRC} -t ${TRG} > $@.bitext


@ -106,6 +106,24 @@ TATOEBA_WORK ?= ${PWD}/work-tatoeba
TATOEBA_DATA ?= ${TATOEBA_WORK}/data/${PRE}
TATOEBA_MONO ?= ${TATOEBA_WORK}/data/mono
## data count files (file basename)
TATOEBA_DATA_COUNT_BASE = ${TATOEBA_RAWGIT}/data/release/${TATOEBA_VERSION}/released-bitexts
## all released language pairs with test sets > 200 test pairs
## also extract all source languages that are available for a given target language
## and vice versa
TATOEBA_RELEASED_DATA = $(shell wget -qq -O - ${TATOEBA_DATA_COUNT_BASE}-min200.txt | cut -f1)
TATOEBA_AVAILABLE_TRG = ${sort ${filter-out ${SRC},${subst -, ,${filter %-${SRC} ${SRC}-%,${TATOEBA_RELEASED_DATA}}}}}
TATOEBA_AVAILABLE_SRC = ${sort ${filter-out ${TRG},${subst -, ,${filter %-${TRG} ${TRG}-%,${TATOEBA_RELEASED_DATA}}}}}
## extract language pairs for a specific subset
TATOEBA_SUBSET = lower
TATOEBA_RELEASED_SUBSET = $(shell wget -qq -O - ${TATOEBA_DATA_COUNT_BASE}-${TATOEBA_SUBSET}.txt | cut -f1)
TATOEBA_AVAILABLE_SUBSET_TRG = ${sort ${filter-out ${SRC},${subst -, ,${filter %-${SRC} ${SRC}-%,${TATOEBA_RELEASED_SUBSET}}}}}
TATOEBA_AVAILABLE_SUBSET_SRC = ${sort ${filter-out ${TRG},${subst -, ,${filter %-${TRG} ${TRG}-%,${TATOEBA_RELEASED_SUBSET}}}}}
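##
## example (hypothetical values): with SRC=deu and TATOEBA_SUBSET=lower,
## TATOEBA_RELEASED_SUBSET holds the language pairs from column 1 of
## released-bitexts-lower.txt and TATOEBA_AVAILABLE_SUBSET_TRG expands to all
## languages paired with deu in that list (TATOEBA_AVAILABLE_SUBSET_SRC works
## the same way for a fixed TRG)
##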
WIKILANGS ?= ${notdir ${wildcard backtranslate/wiki-iso639-3/*}}
WIKIMACROLANGS ?= $(sort ${shell ${GET_ISO_CODE} ${WIKILANGS}})
@ -119,7 +137,7 @@ TATOEBA_TESTSET_NAME = Tatoeba-test
TATOEBA_RELEASEDIR = ${PWD}/models-tatoeba
TATOEBA_MODELSHOME = ${PWD}/models-tatoeba
TATOEBA_BTHOME = ${PWD}/bt-tatoeba
TATOEBA_MIN_BLEU = 10
## file with the source and target languages in the current model
@ -163,7 +181,7 @@ TATOEBA_PARAMS := TRAINSET=${TATOEBA_TRAINSET} \
MODEL_CONTAINER=${TATOEBA_MODEL_CONTAINER} \
ALT_MODEL_DIR=tatoeba \
SKIP_DATA_DETAILS=1 \
MIN_BLEU_SCORE=10
MIN_BLEU_SCORE=${TATOEBA_MIN_BLEU}
@ -340,6 +358,92 @@ tatoeba-refresh-finished:
done
###########################################################################################
# start combinations with a specific source/target language
###########################################################################################
#
# make SRC=deu tatoeba-src2all-reasonable
# make SRC=deu tatoeba-src2all-small
#
# make TRG=deu tatoeba-all2trg-reasonable
# make TRG=deu tatoeba-all2trg-small
#
tatoeba-src2all:
for l in ${TATOEBA_AVAILABLE_SUBSET_TRG}; do \
${MAKE} tatoeba-${SRC}2$$l-train; \
done
tatoeba-src2langgroup:
for l in ${sort ${shell langgroup -p -n ${TATOEBA_AVAILABLE_SUBSET_TRG} 2>/dev/null}}; do \
${MAKE} tatoeba-${SRC}2$$l-train-1m; \
done
tatoeba-all2trg:
for l in ${TATOEBA_AVAILABLE_SUBSET_SRC}; do \
${MAKE} tatoeba-$${l}2${TRG}-train; \
done
tatoeba-all2trg-print:
for l in ${TATOEBA_AVAILABLE_SUBSET_SRC}; do \
echo "${MAKE} tatoeba-$${l}2${TRG}-train"; \
done
tatoeba-langgroup2trg:
for l in ${sort ${shell langgroup -p -n ${TATOEBA_AVAILABLE_SUBSET_SRC} 2>/dev/null}}; do \
${MAKE} tatoeba-$${l}2${TRG}-train-1m; \
done
## all subsets
tatoeba-src2all-subsets:
${MAKE} TATOEBA_SUBSET=lowest tatoeba-src2all
${MAKE} TATOEBA_SUBSET=lower tatoeba-src2all
${MAKE} TATOEBA_SUBSET=medium tatoeba-src2all
${MAKE} TATOEBA_SUBSET=higher tatoeba-src2all
${MAKE} TATOEBA_SUBSET=highest tatoeba-src2all
tatoeba-all2trg-subsets:
${MAKE} TATOEBA_SUBSET=lowest tatoeba-all2trg
${MAKE} TATOEBA_SUBSET=lower tatoeba-all2trg
${MAKE} TATOEBA_SUBSET=medium tatoeba-all2trg
${MAKE} TATOEBA_SUBSET=higher tatoeba-all2trg
${MAKE} TATOEBA_SUBSET=highest tatoeba-all2trg
## reasonable size (all except lower and lowest)
tatoeba-src2all-reasonable:
${MAKE} TATOEBA_SUBSET=medium tatoeba-src2all
${MAKE} TATOEBA_SUBSET=higher tatoeba-src2all
${MAKE} TATOEBA_SUBSET=highest tatoeba-src2all
tatoeba-all2trg-reasonable:
${MAKE} TATOEBA_SUBSET=medium tatoeba-all2trg
${MAKE} TATOEBA_SUBSET=higher tatoeba-all2trg
${MAKE} TATOEBA_SUBSET=highest tatoeba-all2trg
## backoff to multilingual models and language groups
## lower / lowest resource languages and zero-shot
tatoeba-src2all-small:
${MAKE} TATOEBA_SUBSET=lower tatoeba-src2langgroup
${MAKE} TATOEBA_SUBSET=lowest tatoeba-src2langgroup
${MAKE} TATOEBA_SUBSET=zero-shot tatoeba-src2langgroup
tatoeba-all2trg-small:
${MAKE} TATOEBA_SUBSET=lower tatoeba-langgroup2trg
${MAKE} TATOEBA_SUBSET=lowest tatoeba-langgroup2trg
${MAKE} TATOEBA_SUBSET=zero-shot tatoeba-langgroup2trg
###########################################################################################
# models for backtranslation
@ -1732,11 +1836,12 @@ ${TATOEBA_MONO}/%.labels:
paste ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.id \
${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${SRCEXT} \
${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${TRGEXT} |\
grep -P "$$s\t$$t\t" > ${dir $@}Tatoeba-$$d.$$s-$$t; \
grep -P "$$s\t$$t\t" | cut -f3,4 |\
scripts/filter/filter-korean.sh ${SRC} ${TRG} $$d > ${dir $@}Tatoeba-$$d.$$s-$$t; \
if [ -s ${dir $@}Tatoeba-$$d.$$s-$$t ]; then \
echo "........ make ${dir $@}Tatoeba-$$d.$$s-$$t.clean.*.gz"; \
cut -f3 ${dir $@}Tatoeba-$$d.$$s-$$t | ${GZIP} -c > ${dir $@}Tatoeba-$$d.$$s-$$t.clean.$$s.gz; \
cut -f4 ${dir $@}Tatoeba-$$d.$$s-$$t | ${GZIP} -c > ${dir $@}Tatoeba-$$d.$$s-$$t.clean.$$t.gz; \
cut -f1 ${dir $@}Tatoeba-$$d.$$s-$$t | ${GZIP} -c > ${dir $@}Tatoeba-$$d.$$s-$$t.clean.$$s.gz; \
cut -f2 ${dir $@}Tatoeba-$$d.$$s-$$t | ${GZIP} -c > ${dir $@}Tatoeba-$$d.$$s-$$t.clean.$$t.gz; \
fi; \
rm -f ${dir $@}Tatoeba-$$d.$$s-$$t; \
fi \
@ -1748,11 +1853,12 @@ ${TATOEBA_MONO}/%.labels:
paste ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.id \
${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${TRGEXT} \
${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${SRCEXT} |\
grep -P "$$s\t$$t\t" > ${dir $@}Tatoeba-$$d.$$t-$$s; \
grep -P "$$s\t$$t\t" | cut -f3,4 |\
scripts/filter/filter-korean.sh ${TRG} ${SRC} $$d > ${dir $@}Tatoeba-$$d.$$t-$$s; \
if [ -s ${dir $@}Tatoeba-$$d.$$t-$$s ]; then \
echo "........ make ${dir $@}Tatoeba-$$d.$$t-$$s.clean.*.gz"; \
cut -f3 ${dir $@}Tatoeba-$$d.$$t-$$s | ${GZIP} -c > ${dir $@}Tatoeba-$$d.$$t-$$s.clean.$$t.gz; \
cut -f4 ${dir $@}Tatoeba-$$d.$$t-$$s | ${GZIP} -c > ${dir $@}Tatoeba-$$d.$$t-$$s.clean.$$s.gz; \
cut -f1 ${dir $@}Tatoeba-$$d.$$t-$$s | ${GZIP} -c > ${dir $@}Tatoeba-$$d.$$t-$$s.clean.$$t.gz; \
cut -f2 ${dir $@}Tatoeba-$$d.$$t-$$s | ${GZIP} -c > ${dir $@}Tatoeba-$$d.$$t-$$s.clean.$$s.gz; \
fi; \
rm -f ${dir $@}Tatoeba-$$d.$$t-$$s; \
fi \


@ -9,6 +9,11 @@ as-en:
${MAKE} reverse-data-as-en
${MAKE} train-dynamic-en-as
BCL_DEVSIZE = 1000
BCL_TESTSIZE = 1000
en-bcl:
${MAKE} SRCLANGS="en" TRGLANGS="bcl" DEVSET=wikimedia all-job
@ -29,25 +34,31 @@ bcl-en-nt:
DEVSET=wikimedia all-job
%-en-bcl:
${MAKE} SRCLANGS="en" TRGLANGS="bcl" DEVSET=wikimedia ${@:-en-bcl=}
${MAKE} SRCLANGS="en" TRGLANGS="bcl" \
DEVSET=wikimedia \
DEVSIZE=${BCL_DEVSIZE} TESTSIZE=${BCL_TESTSIZE} \
USE_REST_DEVDATA=1 ${@:-en-bcl=}
%-bcl-en:
${MAKE} SRCLANGS="en" TRGLANGS="bcl" DEVSET=wikimedia ${@:-bcl-en=}
${MAKE} SRCLANGS="en" TRGLANGS="bcl" \
DEVSET=wikimedia \
DEVSIZE=${BCL_DEVSIZE} TESTSIZE=${BCL_TESTSIZE} \
USE_REST_DEVDATA=1 ${@:-bcl-en=}
%-en-bcl-nt:
${MAKE} SRCLANGS="en" TRGLANGS="bcl" \
DATASET=${DATASET}+nt \
EXTRA_TRAINSET="new-testament" \
DEVSET=wikimedia \
DEVSET=wikimedia DEVSIZE=${BCL_DEVSIZE} TESTSIZE=${BCL_TESTSIZE} USE_REST_DEVDATA=1 \
${@:-en-bcl-nt=}
%-bcl-en-nt:
${MAKE} SRCLANGS="bcl" TRGLANGS="en" \
DATASET=${DATASET}+nt \
EXTRA_TRAINSET="new-testament" \
DEVSET=wikimedia \
DEVSET=wikimedia DEVSIZE=${BCL_DEVSIZE} TESTSIZE=${BCL_TESTSIZE} USE_REST_DEVDATA=1 \
${@:-bcl-en-nt=}
@ -72,34 +83,228 @@ ENBCL_BPE = 1000
#-----------------------------------------------------------------------------
# start jobs for all languages where we have back-translations into English
#-----------------------------------------------------------------------------
## languages for which we have back-translated wiki data into English
WIKI_BT2ENG = abk ady afr amh ang ara arg asm ast awa aze bak bam bar bel ben bod bre bul cat ceb ces cha che chr chv cor cos crh csb cym dan deu dsb ell epo est eus ewe ext fao fas fij fin fra frr fry ful gla gle glg glv got grn guj hat hau haw hbs heb hif hin hsb hun hye ibo ido iku ile ilo ina isl ita jam jav jbo jpn kab kal kan kat kaz khm kin kir kom kor ksh kur lad lao lat lav lfn lij lin lit lmo ltz lug mah mai mal mar mdf mkd mlg mlt mnw mon mri msa mwl mya myv nau nav nds nep nld nor nov nya oci ori oss pag pan pap pdc pms pol por pus que roh rom ron rue run rus sag sah san scn sco sin slv sme smo sna snd som spa sqi stq sun swa swe tah tam tat tel tet tgk tgl tha tir ton tpi tso tuk tur tyv udm uig ukr urd uzb vec vie vol war wln wol xal xho yid yor zho zul bos_Latn cmn_Hans cmn_Hant hrv ind nno nob srp_Cyrl srp_Latn
WIKI_BT2ENG = abk ady afr amh ang ara arg asm ast awa aze bak bam bar bel ben bod bre bul cat ceb ces cha che chr chv cor cos crh csb cym dan deu dsb ell epo est eus ewe ext fao fas fij fin fra frr fry ful gla gle glg glv got grn guj hat hau haw hbs heb hif hin hsb hun hye ibo ido iku ile ilo ina isl ita jam jav jbo jpn kab kal kan kat kaz khm kin kir kom kor ksh kur lad lao lat lav lfn lij lin lit lmo ltz lug mah mai mal mar mdf mkd mlg mlt mnw mon mri msa mwl mya myv nau nav nds nep nld nor nov nya oci ori oss pag pan pap pdc pms pol por pus que roh rom ron rue run rus sag sah san scn sco sin slv sme smo sna snd som spa sqi stq sun swa swe tah tam tat tel tet tgk tgl tha tir ton tpi tso tuk tur tyv udm uig ukr urd uzb vec vie vol war wln wol xal xho yid yor zho zul
## start jobs for all languages where we have back-translations
## start jobs for all languages where we have back-translations into English
wiki-eng2all-with-bt:
for l in ${WIKI_BT2ENG}; do \
if [ -d work-tatoeba/$$l-eng ]; then \
if [ `cat work-tatoeba/$$l-eng/opus-langlabels.src | tr " " "\n" | grep . | wc -l` -eq 1 ]; then \
echo "fetch back-translations for $$l-eng"; \
${MAKE} -C bt-tatoeba SRC=$$l TRG=eng fetch-bt; \
echo "start training eng-$$l with backtranslation data"; \
${MAKE} HPC_MEM=32g HPC_CORES=4 tatoeba-eng2$$l-train-bt.submitcpu; \
echo "fetch $$l wiki backtranslations"; \
${MAKE} -C bt-tatoeba SRC=$$l TRG=eng fetch-bt; \
done
for l in ${sort ${shell iso639 -m -n ${WIKI_BT2ENG}}}; do \
echo "start training eng-$$l with backtranslation data"; \
${MAKE} EMAIL= HPC_MEM=32g HPC_CORES=4 tatoeba-eng2$$l-train-bt.submitcpu; \
done
# for l in ${WIKI_BT2ENG}; do \
# if [ -d work-tatoeba/$$l-eng ]; then \
# if [ `cat work-tatoeba/$$l-eng/opus-langlabels.src | tr " " "\n" | grep . | wc -l` -eq 1 ]; then \
# echo "fetch back-translations for $$l-eng"; \
# ${MAKE} -C bt-tatoeba SRC=$$l TRG=eng fetch-bt; \
# echo "start training eng-$$l with backtranslation data"; \
# ${MAKE} EMAIL= HPC_MEM=32g HPC_CORES=4 tatoeba-eng2$$l-train-bt.submitcpu; \
# fi \
# fi \
# done
wiki-eng2all-with-bt-continue:
for l in ${WIKI_BT2ENG}; do \
if [ -d work-tatoeba/eng-$$l ]; then \
if [ ! `find work-tatoeba/eng-$$l -name 'opus+bt.*model1.done' | wc -l` -gt 0 ]; then \
echo "continue training eng-$$l with backtranslation data"; \
${MAKE} EMAIL= tatoeba-eng2$$l-train-bt; \
fi \
fi \
done
WIKI_BT2ENG_PARENTS = ${sort ${shell langgroup -p ${WIKI_BT2ENG}}}
wiki-eng2all-with-bt-eval:
for l in ${WIKI_BT2ENG}; do \
if [ -d work-tatoeba/eng-$$l ]; then \
if [ `find work-tatoeba/eng-$$l -name 'opus+bt*best-perplexity.npz' | wc -l` -gt 0 ]; then \
${MAKE} EMAIL= WALLTIME=4 tatoeba-eng2$$l-evalall-bt.submit; \
fi \
fi \
done
wiki-eng2allgroups-with-bt:
for l in $(filter-out roa,${WIKI_BT2ENG_PARENTS}); do \
# if [ `find work-tatoeba/eng-$$l -name 'opus+bt.*model1.done' | wc -l` -gt 0 ]; then \
wiki-eng2all-with-bt-dist:
for l in ${WIKI_BT2ENG}; do \
if [ -d work-tatoeba/eng-$$l ]; then \
echo "mv work-tatoeba/eng-$$l work-tatoeba-old"; \
mv work-tatoeba/eng-$$l work-tatoeba-old; \
fi; \
echo "start training eng-$$l with backtranslation data"; \
${MAKE} HPC_MEM=32g HPC_CORES=4 tatoeba-eng2$$l-train-bt-1m.submitcpu; \
if [ `find work-tatoeba/eng-$$l -name 'opus+bt*best-perplexity.npz' | wc -l` -gt 0 ]; then \
echo "continue training eng-$$l with backtranslation data"; \
${MAKE} EMAIL= tatoeba-eng2$$l-dist-bt; \
fi \
fi \
done
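##
## typical sequence for these targets (example invocations; the same pattern is
## used for the language-group and X-to-English variants further down):
##
##   make wiki-eng2all-with-bt            # fetch BT data and submit training jobs
##   make wiki-eng2all-with-bt-continue   # resubmit models without a *model1.done file
##   make wiki-eng2all-with-bt-eval       # evaluate models with a best-perplexity checkpoint
##   make wiki-eng2all-with-bt-dist       # package evaluated models for release
##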
#-----------------------------------------------------------------------------
# models for translating English into language groups with backtranslations
# (does not fetch back-translations - they need to be available in bt-tatoeba!)
#-----------------------------------------------------------------------------
WIKI_BT2ENG_PARENTS = ${sort ${shell iso639 -m -n ${WIKI_BT2ENG} | xargs langgroup -p}}
wiki-eng2allgroups-with-bt:
for l in ${WIKI_BT2ENG_PARENTS}; do \
echo "start training eng-$$l with backtranslation data"; \
${MAKE} EMAIL= HPC_MEM=32g HPC_CORES=4 tatoeba-eng2$$l-train-bt-1m.submitcpu; \
done
wiki-eng2allgroups-with-bt-continue:
for l in ${WIKI_BT2ENG_PARENTS}; do \
if [ -d work-tatoeba/eng-$$l ]; then \
if [ ! `find work-tatoeba/eng-$$l -name 'opus1m+bt.*model1.done' | wc -l` -gt 0 ]; then \
echo "continue training eng-$$l with backtranslation data"; \
${MAKE} EMAIL= tatoeba-eng2$$l-train-bt-1m; \
fi \
fi \
done
wiki-eng2allgroups-with-bt-eval:
for l in ${WIKI_BT2ENG_PARENTS}; do \
if [ -d work-tatoeba/eng-$$l ]; then \
if [ `find work-tatoeba/eng-$$l -name 'opus1m+bt*best-perplexity.npz' | wc -l` -gt 0 ]; then \
${MAKE} EMAIL= WALLTIME=8 tatoeba-eng2$$l-evalall-bt-1m.submit; \
fi \
fi \
done
# if [ `find work-tatoeba/eng-$$l -name 'opus1m+bt.*model1.done' | wc -l` -gt 0 ]; then \
wiki-eng2allgroups-with-bt-dist:
for l in ${WIKI_BT2ENG_PARENTS}; do \
if [ -d work-tatoeba/eng-$$l ]; then \
if [ `find work-tatoeba/eng-$$l -name 'opus1m+bt*best-perplexity.npz' | wc -l` -gt 0 ]; then \
echo "continue training eng-$$l with backtranslation data"; \
${MAKE} EMAIL= tatoeba-eng2$$l-dist-bt-1m; \
fi \
fi \
done
#-----------------------------------------------------------------------------
# start jobs for all languages where we have back-translations from English
#-----------------------------------------------------------------------------
## languages for which we have back-translated wiki data from English
WIKI_ENG2BT = afr ara aze bel ben bos_Latn bre bul cat ceb ces cmn_Hans cmn_Hant cym dan deu ell epo est eus fao fin fra fry gle glg heb hin hrv hun hye ido ilo ina ind isl ita lav lit ltz mal mar mkd mlt msa nds nld nno nob pol por ron run rus spa sqi srp_Cyrl srp_Latn swa swe tam tgl tha tur ukr urd uzb_Latn vie war zho zsm_Latn
wiki-all2eng-with-bt:
for l in ${WIKI_ENG2BT}; do \
echo "fetch $$l wiki backtranslations"; \
${MAKE} -C bt-tatoeba TRG=$$l SRC=eng fetch-bt; \
done
for l in ${sort ${shell iso639 -m -n ${WIKI_ENG2BT}}}; do \
if [ ! `find work-tatoeba/$$l-eng -name 'opus+bt.*model1.done' | wc -l` -gt 0 ]; then \
if [ `find work-tatoeba/$$l-eng -name 'opus+bt*best-perplexity.npz' | wc -l` -gt 0 ]; then \
echo "continue training $$l-eng with backtranslation data"; \
${MAKE} EMAIL= tatoeba-$${l}2eng-train-bt; \
else \
echo "start training $$l-eng with backtranslation data"; \
${MAKE} EMAIL= HPC_MEM=32g HPC_CORES=4 tatoeba-$${l}2eng-train-bt.submitcpu; \
fi \
fi \
done
wiki-all2eng-with-bt-continue:
for l in ${WIKI_ENG2BT}; do \
if [ -d work-tatoeba/$$l-eng ]; then \
if [ ! `find work-tatoeba/$$l-eng -name 'opus+bt.*model1.done' | wc -l` -gt 0 ]; then \
echo "continue training $$l-eng with backtranslation data"; \
${MAKE} EMAIL= tatoeba-$${l}2eng-train-bt; \
fi \
fi \
done
wiki-all2eng-with-bt-eval:
for l in ${WIKI_ENG2BT}; do \
if [ -d work-tatoeba/$$l-eng ]; then \
if [ `find work-tatoeba/$$l-eng -name 'opus+bt*best-perplexity.npz' | wc -l` -gt 0 ]; then \
${MAKE} EMAIL= WALLTIME=4 tatoeba-$${l}2eng-evalall-bt.submit; \
fi \
fi \
done
# if [ `find work-tatoeba/$$l-eng -name 'opus+bt.*model1.done' | wc -l` -gt 0 ]; then \
wiki-all2eng-with-bt-dist:
for l in ${WIKI_ENG2BT}; do \
if [ -d work-tatoeba/$$l-eng ]; then \
if [ `find work-tatoeba/$$l-eng -name 'opus+bt*best-perplexity.npz' | wc -l` -gt 0 ]; then \
echo "continue training $$l-eng with backtranslation data"; \
${MAKE} EMAIL= tatoeba-$${l}2eng-dist-bt; \
fi \
fi \
done
WIKI_ENG2BT_PARENTS = ${sort ${shell iso639 -m -n ${WIKI_ENG2BT} | xargs langgroup -p}}
wiki-allgroups2eng-with-bt:
for l in ${WIKI_ENG2BT_PARENTS}; do \
if [ ! `find work-tatoeba/$$l-eng -name 'opus1m+bt.*model1.done' | wc -l` -gt 0 ]; then \
if [ `find work-tatoeba/$$l-eng -name 'opus1m+bt*best-perplexity.npz' | wc -l` -gt 0 ]; then \
echo "continue training $$l-eng with backtranslation data"; \
${MAKE} EMAIL= tatoeba-$${l}2eng-train-bt-1m; \
else \
echo "start training $$l-eng with backtranslation data"; \
${MAKE} EMAIL= HPC_MEM=32g HPC_CORES=4 tatoeba-$${l}2eng-train-bt-1m.submitcpu; \
fi \
fi \
done
wiki-allgroups2eng-with-bt-continue:
for l in ${WIKI_ENG2BT_PARENTS}; do \
if [ -d work-tatoeba/$$l-eng ]; then \
if [ ! `find work-tatoeba/$$l-eng -name 'opus1m+bt.*model1.done' | wc -l` -gt 0 ]; then \
echo "continue training $$l-eng with backtranslation data"; \
${MAKE} EMAIL= tatoeba-$${l}2eng-train-bt-1m; \
fi \
fi \
done
wiki-allgroups2eng-with-bt-eval:
for l in ${WIKI_ENG2BT_PARENTS}; do \
if [ -d work-tatoeba/$$l-eng ]; then \
if [ `find work-tatoeba/$$l-eng -name 'opus1m+bt*best-perplexity.npz' | wc -l` -gt 0 ]; then \
${MAKE} EMAIL= WALLTIME=4 tatoeba-$${l}2eng-evalall-bt-1m.submit; \
fi \
fi \
done
# if [ `find work-tatoeba/$$l-eng -name 'opus1m+bt.*model1.done' | wc -l` -gt 0 ]; then \
wiki-allgroups2eng-with-bt-dist:
for l in ${WIKI_ENG2BT_PARENTS}; do \
if [ -d work-tatoeba/$$l-eng ]; then \
if [ `find work-tatoeba/$$l-eng -name 'opus1m+bt*best-perplexity.npz' | wc -l` -gt 0 ]; then \
echo "continue training $$l-eng with backtranslation data"; \
${MAKE} EMAIL= tatoeba-$${l}2eng-dist-bt-1m; \
fi \
fi \
done


@ -1,3 +1,8 @@
#
# opus-2020-01-20.zip
@ -15,6 +20,11 @@
| JW300.bcl.en | 56.8 | 0.705 |
# opus-2020-02-11.zip
* dataset: opus
@ -31,6 +41,11 @@
| JW300.bcl.en | 56.1 | 0.697 |
# opus+bt-2020-05-23.zip
* dataset: opus+bt
@ -58,6 +73,11 @@
| JW300.bcl.en | 57.6 | 0.712 |
# opus+nt-2021-03-29.zip
* dataset: opus+nt
@ -92,3 +112,190 @@
|---------|-------|-------|-------|--------|----|
| wikimedia.bcl-en | 10.4 | 0.320 | 525 | 27109 | 0.477 |
# opus+nt+bt-2021-04-01.zip
* dataset: opus+nt+bt
* model: transformer-align
* source language(s): bcl
* target language(s): en
* model: transformer-align
* pre-processing: normalization + SentencePiece (spm32k,spm32k)
* download: [opus+nt+bt-2021-04-01.zip](https://object.pouta.csc.fi/OPUS-MT-models/bcl-en/opus+nt+bt-2021-04-01.zip)
## Training data: opus+nt+bt
* bcl-en: JW300 (470468) new-testament (11623) wiki.aa.en-bcl (969821) wikinews.aa.en-bcl (357946)
* bcl-en: total size = 1809858
* total size (opus+nt+bt): 1809767
## Validation data
* bcl-en: wikimedia, 1153
* total-size-shuffled: 775
* devset-selected: top 250 lines of wikimedia.src.shuffled!
* testset-selected: next 525 lines of wikimedia.src.shuffled!
* devset-unused: added to traindata
* test set translations: [opus+nt+bt-2021-04-01.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/bcl-en/opus+nt+bt-2021-04-01.test.txt)
* test set scores: [opus+nt+bt-2021-04-01.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/bcl-en/opus+nt+bt-2021-04-01.eval.txt)
## Benchmarks
| testset | BLEU | chr-F | #sent | #words | BP |
|---------|-------|-------|-------|--------|----|
| wikimedia.bcl-en | 28.2 | 0.498 | 525 | 27109 | 0.799 |
# opus+nt+bt+bt-2021-04-03.zip
* dataset: opus+nt+bt+bt
* model: transformer-align
* source language(s): bcl
* target language(s): en
* model: transformer-align
* pre-processing: normalization + SentencePiece (spm32k,spm32k)
* download: [opus+nt+bt+bt-2021-04-03.zip](https://object.pouta.csc.fi/OPUS-MT-models/bcl-en/opus+nt+bt+bt-2021-04-03.zip)
## Training data: opus+nt+bt+bt
* bcl-en: JW300 (470468) new-testament (11623) wiki.aa.en-bcl (969821) wikibooks.aa.en-bcl (985129) wikinews.aa.en-bcl (357946) wikiquote.aa.en-bcl (987266) wikisource.aa.en-bcl (948077)
* bcl-en: total size = 4730330
* total size (opus+nt+bt+bt): 4730231
## Validation data
* bcl-en: wikimedia, 1153
* total-size-shuffled: 775
* devset-selected: top 250 lines of wikimedia.src.shuffled!
* testset-selected: next 525 lines of wikimedia.src.shuffled!
* devset-unused: added to traindata
* test set translations: [opus+nt+bt+bt-2021-04-03.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/bcl-en/opus+nt+bt+bt-2021-04-03.test.txt)
* test set scores: [opus+nt+bt+bt-2021-04-03.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/bcl-en/opus+nt+bt+bt-2021-04-03.eval.txt)
## Benchmarks
| testset | BLEU | chr-F | #sent | #words | BP |
|---------|-------|-------|-------|--------|----|
| wikimedia.bcl-en | 16.2 | 0.461 | 525 | 27109 | 1.000 |
# opus+nt+bt+bt+bt-2021-04-05.zip
* dataset: opus+nt+bt+bt+bt
* model: transformer-align
* source language(s): bcl
* target language(s): en
* model: transformer-align
* pre-processing: normalization + SentencePiece (spm32k,spm32k)
* download: [opus+nt+bt+bt+bt-2021-04-05.zip](https://object.pouta.csc.fi/OPUS-MT-models/bcl-en/opus+nt+bt+bt+bt-2021-04-05.zip)
## Training data: opus+nt+bt+bt+bt
* bcl-en: JW300 (470468) new-testament (11623) wiki.aa.en-bcl (969821) wikibooks.aa.en-bcl (985129) wikinews.aa.en-bcl (357946) wikiquote.aa.en-bcl (987266) wikisource.aa.en-bcl (948077)
* bcl-en: total size = 4730330
* total size (opus+nt+bt+bt+bt): 4730224
## Validation data
* bcl-en: wikimedia, 1153
* total-size-shuffled: 775
* devset-selected: top 250 lines of wikimedia.src.shuffled!
* testset-selected: next 525 lines of wikimedia.src.shuffled!
* devset-unused: added to traindata
* test set translations: [opus+nt+bt+bt+bt-2021-04-05.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/bcl-en/opus+nt+bt+bt+bt-2021-04-05.test.txt)
* test set scores: [opus+nt+bt+bt+bt-2021-04-05.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/bcl-en/opus+nt+bt+bt+bt-2021-04-05.eval.txt)
## Benchmarks
| testset | BLEU | chr-F | #sent | #words | BP |
|---------|-------|-------|-------|--------|----|
| wikimedia.bcl-en | 24.2 | 0.497 | 525 | 27109 | 1.000 |
# opus+nt+bt-2021-04-09.zip
* dataset: opus+nt+bt
* model: transformer-align
* source language(s): bcl
* target language(s): en
* model: transformer-align
* pre-processing: normalization + SentencePiece (spm32k,spm32k)
* download: [opus+nt+bt-2021-04-09.zip](https://object.pouta.csc.fi/OPUS-MT-models/bcl-en/opus+nt+bt-2021-04-09.zip)
## Training data: opus+nt+bt
* bcl-en: JW300 (470468) new-testament (11623) wiki.aa.en-bcl (969821) wikibooks.aa.en-bcl (985129) wikinews.aa.en-bcl (357946) wikiquote.aa.en-bcl (987266) wikisource.aa.en-bcl (948077)
* bcl-en: total size = 4730330
* unused dev/test data is added to training data
* total size (opus+nt+bt): 4731419
## Validation data
* bcl-en: wikimedia, 2767
* total-size-shuffled: 1966
* devset-selected: top 250 lines of wikimedia.src.shuffled!
* testset-selected: next 500 lines of wikimedia.src.shuffled!
* devset-unused: added to traindata
* test set translations: [opus+nt+bt-2021-04-09.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/bcl-en/opus+nt+bt-2021-04-09.test.txt)
* test set scores: [opus+nt+bt-2021-04-09.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/bcl-en/opus+nt+bt-2021-04-09.eval.txt)
## Benchmarks
| testset | BLEU | chr-F | #sent | #words | BP |
|---------|-------|-------|-------|--------|----|
| wikimedia.bcl-en | 33.5 | 0.562 | 500 | 28621 | 0.868 |
# opus+nt+bt-2021-04-12.zip
* dataset: opus+nt+bt
* model: transformer-align
* source language(s): bcl
* target language(s): en
* model: transformer-align
* pre-processing: normalization + SentencePiece (spm32k,spm32k)
* download: [opus+nt+bt-2021-04-12.zip](https://object.pouta.csc.fi/OPUS-MT-models/bcl-en/opus+nt+bt-2021-04-12.zip)
## Training data: opus+nt+bt
* bcl-en: JW300 (470468) new-testament (11623) wiki.aa.en-bcl (969821) wikibooks.aa.en-bcl (985129) wikinews.aa.en-bcl (357946) wikiquote.aa.en-bcl (987266) wikisource.aa.en-bcl (948077)
* bcl-en: total size = 4730330
* unused dev/test data is added to training data
* total size (opus+nt+bt): 4732437
## Validation data
* bcl-en: wikimedia, 5033
* total-size-shuffled: 4207
* devset-selected: top 1000 lines of wikimedia.src.shuffled!
* testset-selected: next 1000 lines of wikimedia.src.shuffled!
* devset-unused: added to traindata
* test set translations: [opus+nt+bt-2021-04-12.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/bcl-en/opus+nt+bt-2021-04-12.test.txt)
* test set scores: [opus+nt+bt-2021-04-12.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/bcl-en/opus+nt+bt-2021-04-12.eval.txt)
## Benchmarks
| testset | BLEU | chr-F | #sent | #words | BP |
|---------|-------|-------|-------|--------|----|
| wikimedia.bcl-en | 31.5 | 0.523 | 1000 | 31520 | 0.836 |


@ -1,86 +1,261 @@
# wikimedia-2020-01-17.zip
# opus+nt+bt-2021-03-30.zip
* dataset: wikimedia
* model: transformer-align
* pre-processing: normalization + SentencePiece
* download: [wikimedia-2020-01-17.zip](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/wikimedia-2020-01-17.zip)
* test set translations: [wikimedia-2020-01-17.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/wikimedia-2020-01-17.test.txt)
* test set scores: [wikimedia-2020-01-17.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/wikimedia-2020-01-17.eval.txt)
## Benchmarks
| testset | BLEU | chr-F |
|-----------------------|-------|-------|
| JW300.en.bcl | 55.3 | 0.729 |
# opus-2020-01-20.zip
* dataset: opus
* model: transformer-align
* pre-processing: normalization + SentencePiece
* download: [opus-2020-01-20.zip](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus-2020-01-20.zip)
* test set translations: [opus-2020-01-20.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus-2020-01-20.test.txt)
* test set scores: [opus-2020-01-20.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus-2020-01-20.eval.txt)
## Benchmarks
| testset | BLEU | chr-F |
|-----------------------|-------|-------|
| JW300.en.bcl | 55.3 | 0.729 |
# opus-2020-02-11.zip
* dataset: opus
* model: transformer-align
* pre-processing: normalization + SentencePiece
* download: [opus-2020-02-11.zip](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus-2020-02-11.zip)
* test set translations: [opus-2020-02-11.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus-2020-02-11.test.txt)
* test set scores: [opus-2020-02-11.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus-2020-02-11.eval.txt)
## Benchmarks
| testset | BLEU | chr-F |
|-----------------------|-------|-------|
| JW300.en.bcl | 53.8 | 0.719 |
# opus+bt-2020-02-26.zip
* dataset: opus+bt
* model: transformer-align
* pre-processing: normalization + SentencePiece
* download: [opus+bt-2020-02-26.zip](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus+bt-2020-02-26.zip)
* test set translations: [opus+bt-2020-02-26.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus+bt-2020-02-26.test.txt)
* test set scores: [opus+bt-2020-02-26.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus+bt-2020-02-26.eval.txt)
## Benchmarks
| testset | BLEU | chr-F |
|-----------------------|-------|-------|
| JW300.en.bcl | 54.3 | 0.722 |
# opus+bt-2020-05-23.zip
* dataset: opus+bt
* dataset: opus+nt+bt
* model: transformer-align
* source language(s): en
* target language(s): bcl
* model: transformer-align
* pre-processing: normalization + SentencePiece (spm4k,spm4k)
* download: [opus+bt-2020-05-23.zip](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus+bt-2020-05-23.zip)
* test set translations: [opus+bt-2020-05-23.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus+bt-2020-05-23.test.txt)
* test set scores: [opus+bt-2020-05-23.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus+bt-2020-05-23.eval.txt)
* pre-processing: normalization + SentencePiece (spm12k,spm32k)
* download: [opus+nt+bt-2021-03-30.zip](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus+nt+bt-2021-03-30.zip)
## Training data: opus+nt+bt
## Training data: opus+bt
* en-bcl: JW300 (470468) new-testament (11623) wiki.aa (43432)
* en-bcl: total size = 525523
* total size (opus+nt+bt): 525475
* en-bcl: wikimedia (1106)
* en-bcl: total size = 1106
* unused dev/test data is added to training data
* total size (opus+bt): 458304
## Validation data
* bcl-en: wikimedia, 1153
* total-size-shuffled: 775
* devset-selected: top 250 lines of wikimedia.src.shuffled!
* testset-selected: next 525 lines of wikimedia.src.shuffled!
* devset-unused: added to traindata
* test set translations: [opus+nt+bt-2021-03-30.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus+nt+bt-2021-03-30.test.txt)
* test set scores: [opus+nt+bt-2021-03-30.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus+nt+bt-2021-03-30.eval.txt)
## Benchmarks
| testset | BLEU | chr-F |
|-----------------------|-------|-------|
| JW300.en.bcl | 55.7 | 0.736 |
| testset | BLEU | chr-F | #sent | #words | BP |
|---------|-------|-------|-------|--------|----|
| wikimedia.en-bcl | 17.3 | 0.426 | 525 | 28399 | 0.840 |
# opus+nt+bt+bt-2021-04-01.zip
* dataset: opus+nt+bt+bt
* model: transformer-align
* source language(s): en
* target language(s): bcl
* model: transformer-align
* pre-processing: normalization + SentencePiece (spm12k,spm32k)
* download: [opus+nt+bt+bt-2021-04-01.zip](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus+nt+bt+bt-2021-04-01.zip)
## Training data: opus+nt+bt+bt
* en-bcl: JW300 (470468) new-testament (11623) wiki.aa (45474)
* en-bcl: total size = 527565
* total size (opus+nt+bt+bt): 527524
## Validation data
* bcl-en: wikimedia, 1153
* total-size-shuffled: 775
* devset-selected: top 250 lines of wikimedia.src.shuffled!
* testset-selected: next 525 lines of wikimedia.src.shuffled!
* devset-unused: added to traindata
* test set translations: [opus+nt+bt+bt-2021-04-01.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus+nt+bt+bt-2021-04-01.test.txt)
* test set scores: [opus+nt+bt+bt-2021-04-01.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus+nt+bt+bt-2021-04-01.eval.txt)
## Benchmarks
| testset | BLEU | chr-F | #sent | #words | BP |
|---------|-------|-------|-------|--------|----|
| wikimedia.en-bcl | 21.6 | 0.476 | 525 | 28399 | 0.789 |
# opus+nt+bt+bt+bt-2021-04-03.zip
* dataset: opus+nt+bt+bt+bt
* model: transformer-align
* source language(s): en
* target language(s): bcl
* model: transformer-align
* pre-processing: normalization + SentencePiece (spm12k,spm32k)
* download: [opus+nt+bt+bt+bt-2021-04-03.zip](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus+nt+bt+bt+bt-2021-04-03.zip)
## Training data: opus+nt+bt+bt+bt
* en-bcl: JW300 (470468) new-testament (11623) wiki.aa (45474)
* en-bcl: total size = 527565
* total size (opus+nt+bt+bt+bt): 527496
## Validation data
* bcl-en: wikimedia, 1153
* total-size-shuffled: 775
* devset-selected: top 250 lines of wikimedia.src.shuffled!
* testset-selected: next 525 lines of wikimedia.src.shuffled!
* devset-unused: added to traindata
* test set translations: [opus+nt+bt+bt+bt-2021-04-03.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus+nt+bt+bt+bt-2021-04-03.test.txt)
* test set scores: [opus+nt+bt+bt+bt-2021-04-03.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus+nt+bt+bt+bt-2021-04-03.eval.txt)
## Benchmarks
| testset | BLEU | chr-F | #sent | #words | BP |
|---------|-------|-------|-------|--------|----|
| wikimedia.en-bcl | 22.7 | 0.482 | 525 | 28399 | 0.895 |
# opus2+nt+bt+bt+bt-2021-04-03.zip
* dataset: opus2+nt+bt+bt+bt
* model: transformer-align
* source language(s): en
* target language(s): bcl
* model: transformer-align
* pre-processing: normalization + SentencePiece (spm12k,spm32k)
* download: [opus2+nt+bt+bt+bt-2021-04-03.zip](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus2+nt+bt+bt+bt-2021-04-03.zip)
## Training data: opus2+nt+bt+bt+bt
* en-bcl: JW300 (470468) new-testament (11623) wiki.aa (45474) wiki.aa_opus+nt+bt-2021-04-01 (45474)
* en-bcl: total size = 573039
* total size (opus2+nt+bt+bt+bt): 572969
## Validation data
* bcl-en: wikimedia, 1153
* total-size-shuffled: 775
* devset-selected: top 250 lines of wikimedia.src.shuffled!
* testset-selected: next 525 lines of wikimedia.src.shuffled!
* devset-unused: added to traindata
* test set translations: [opus2+nt+bt+bt+bt-2021-04-03.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus2+nt+bt+bt+bt-2021-04-03.test.txt)
* test set scores: [opus2+nt+bt+bt+bt-2021-04-03.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus2+nt+bt+bt+bt-2021-04-03.eval.txt)
## Benchmarks
| testset | BLEU | chr-F | #sent | #words | BP |
|---------|-------|-------|-------|--------|----|
| wikimedia.en-bcl | 23.9 | 0.497 | 525 | 28399 | 0.820 |
# opus+nt+bt+bt+bt+bt-2021-04-06.zip
* dataset: opus+nt+bt+bt+bt+bt
* model: transformer-align
* source language(s): en
* target language(s): bcl
* model: transformer-align
* pre-processing: normalization + SentencePiece (spm12k,spm32k)
* download: [opus+nt+bt+bt+bt+bt-2021-04-06.zip](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus+nt+bt+bt+bt+bt-2021-04-06.zip)
## Training data: opus+nt+bt+bt+bt+bt
* en-bcl: JW300 (470468) new-testament (11623) wiki.aa (45474) wiki.aa_opus+nt+bt+bt-2021-04-03 (45474) wiki.aa_opus+nt+bt-2021-04-01 (45474)
* en-bcl: total size = 618513
* total size (opus+nt+bt+bt+bt+bt): 618427
## Validation data
* bcl-en: wikimedia, 1153
* total-size-shuffled: 775
* devset-selected: top 250 lines of wikimedia.src.shuffled!
* testset-selected: next 525 lines of wikimedia.src.shuffled!
* devset-unused: added to traindata
* test set translations: [opus+nt+bt+bt+bt+bt-2021-04-06.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus+nt+bt+bt+bt+bt-2021-04-06.test.txt)
* test set scores: [opus+nt+bt+bt+bt+bt-2021-04-06.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus+nt+bt+bt+bt+bt-2021-04-06.eval.txt)
## Benchmarks
| testset | BLEU | chr-F | #sent | #words | BP |
|---------|-------|-------|-------|--------|----|
| wikimedia.en-bcl | 24.4 | 0.498 | 525 | 28399 | 0.805 |
# opus+nt+bt+bt-2021-04-10.zip
* dataset: opus+nt+bt+bt
* model: transformer-align
* source language(s): en
* target language(s): bcl
* model: transformer-align
* pre-processing: normalization + SentencePiece (spm12k,spm32k)
* download: [opus+nt+bt+bt-2021-04-10.zip](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus+nt+bt+bt-2021-04-10.zip)
## Training data: opus+nt+bt+bt
* en-bcl: JW300 (470468) new-testament (11623) wiki.aa (45494) wiki.aa_opus+nt+bt+bt+bt-2021-04-05 (45474) wiki.aa_opus+nt+bt+bt-2021-04-03 (45474) wiki.aa_opus+nt+bt-2021-04-01 (45474)
* en-bcl: total size = 664007
* unused dev/test data is added to training data
* total size (opus+nt+bt+bt): 665111
## Validation data
* bcl-en: wikimedia, 2767
* total-size-shuffled: 1966
* devset-selected: top 250 lines of wikimedia.src.shuffled!
* testset-selected: next 500 lines of wikimedia.src.shuffled!
* devset-unused: added to traindata
* test set translations: [opus+nt+bt+bt-2021-04-10.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus+nt+bt+bt-2021-04-10.test.txt)
* test set scores: [opus+nt+bt+bt-2021-04-10.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus+nt+bt+bt-2021-04-10.eval.txt)
## Benchmarks
| testset | BLEU | chr-F | #sent | #words | BP |
|---------|-------|-------|-------|--------|----|
| wikimedia.en-bcl | 30.7 | 0.572 | 500 | 29131 | 0.921 |
# opus+nt+bt-2021-04-11.zip
* dataset: opus+nt+bt
* model: transformer-align
* source language(s): en
* target language(s): bcl
* model: transformer-align
* pre-processing: normalization + SentencePiece (spm12k,spm32k)
* download: [opus+nt+bt-2021-04-11.zip](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus+nt+bt-2021-04-11.zip)
## Training data: opus+nt+bt
* en-bcl: JW300 (470468) new-testament (11623) wiki.aa (45494) wiki.aa_opus+nt+bt+bt+bt-2021-04-05 (45474) wiki.aa_opus+nt+bt+bt-2021-04-03 (45474) wiki.aa_opus+nt+bt-2021-04-01 (45474)
* en-bcl: total size = 664007
* unused dev/test data is added to training data
* total size (opus+nt+bt): 666118
## Validation data
* bcl-en: wikimedia, 5033
* total-size-shuffled: 4207
* devset-selected: top 1000 lines of wikimedia.src.shuffled!
* testset-selected: next 1000 lines of wikimedia.src.shuffled!
* devset-unused: added to traindata
* test set translations: [opus+nt+bt-2021-04-11.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus+nt+bt-2021-04-11.test.txt)
* test set scores: [opus+nt+bt-2021-04-11.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus+nt+bt-2021-04-11.eval.txt)
## Benchmarks
| testset | BLEU | chr-F | #sent | #words | BP |
|---------|-------|-------|-------|--------|----|
| wikimedia.en-bcl | 31.9 | 0.585 | 1000 | 27681 | 1.000 |


@ -0,0 +1,31 @@
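#
# compare released base models with their "+bt" (back-translation) counterparts;
# reads tab-separated score lines from stdin where field 4 is the model URL below
# https://object.pouta.csc.fi/Tatoeba-MT-models/ and fields 2 and 3 are numeric
# scores (presumably chr-F and BLEU); keeps the first entry per (field 1, model
# prefix) pair and prints the base line, the base+bt line and their score difference
#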
my %basemodel = ();
my %btmodel = ();
while (<>){
chomp;
s/https:\/\/object\.pouta\.csc\.fi\/Tatoeba\-MT\-models\///;
my @fields = split(/\t/);
if ($fields[3]=~/^(.*)\+bt-....-..-..\.zip/){
unless (exists $btmodel{"$fields[0]\t$1"}){
$btmodel{"$fields[0]\t$1"} = $_;
}
}
elsif ($fields[3]=~/^(.*)-....-..-..\.zip/){
unless (exists $basemodel{"$fields[0]\t$1"}){
$basemodel{"$fields[0]\t$1"} = $_;
}
}
}
foreach (sort keys %btmodel){
if (exists $basemodel{$_} and $btmodel{$_}){
print "base\t", $basemodel{$_},"\n";
print "base+bt\t", $btmodel{$_},"\n";
my @base = split(/\t/,$basemodel{$_});
my @bt = split(/\t/,$btmodel{$_});
$bt[1] = sprintf("%5.3f",$bt[1] - $base[1]);
$bt[2] = sprintf("%5.2f",$bt[2] - $base[2]);
print "diff\t", join("\t",@bt),"\n\n";
}
}

scripts/filter/filter-korean.sh Executable file

@ -0,0 +1,54 @@
#!/usr/bin/bash
#
# extra filtering for Korean data
# filter out data that contains characters outside the Hangul script (Hang)
#
# USAGE: filter-korean.sh srclangid trglangid dataset-name < tab-separated-bitext > filtered-bitext
#        (a dataset-name of "test" leaves the data untouched)
#
tmpsrc=`mktemp`
tmptrg=`mktemp`
tmplang=`mktemp`
if [ "$1" == "kor" ] || [ "$1" == "ko" ]; then
column=1
elif [ "$2" == "kor" ] || [ "$2" == "ko" ]; then
column=2
fi
## don't touch test sets
if [ "$3" == "test" ]; then
column=0
fi
if [ $column -gt 0 ]; then
echo "... filter Korean bitexts" >&2
perl -CIOE -pe 's/[\x{2060}\x{200B}\x{feff}]//g'
else
cat
fi
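# NOTE: the active branch above currently only strips zero-width spaces and BOMs;
# the script-based Hangul check below is kept as a comment because it is too slow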
## OLD: check script
## this is slow ....
# if [ $column -gt 0 ]; then
# echo "... filter Korean bitexts ($tmplang $tmpsrc $tmptrg)" >&2
# perl -CIOE -pe 's/[\x{2060}\x{200B}\x{feff}]//g' |
# tee >(cut -f1 > $tmpsrc) >(cut -f2 > $tmptrg) |
# cut -f$column |
# perl -CIOE -pe 'use utf8;s/\p{P}//g;s/[^\S\n]//g;s/▁//g;s/[0-9]//g' |
# langscript -a > $tmplang
# paste $tmplang $tmpsrc $tmptrg |
# grep $'Hang ([0-9]*)\s*\t' |
# cut -f2,3
# rm -f $tmplang $tmpsrc $tmptrg
# else
# cat
# fi
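#
# example call (as used from the Tatoeba data recipes; file names are illustrative):
#
#   paste corpus.kor corpus.eng | scripts/filter/filter-korean.sh kor eng wikimedia > corpus.kor-eng
#
# a third argument of "test" leaves the data untouched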


@ -93,6 +93,7 @@ else
-e 's/】/\]/g' \
-e 's/％/\%/g' |
perl -C -pe 's/(?!\n)\p{C}/ /g;' |
perl -CIOE -pe 's/[\x{2060}\x{200B}\x{feff}]//g' |\
sed 's/  */ /g;s/^ *//g;s/ *$//g' |
${SPMENCODE} --model $3 |
sed "s/^/>>$2<< /"


@ -53,6 +53,7 @@ sed -e 's/，/,/g' \
-e 's/】/\]/g' \
-e 's/％/\%/g' |
perl -C -pe 's/(?!\n)\p{C}/ /g;' |
perl -CIOE -pe 's/[\x{2060}\x{200B}\x{feff}]//g' |\
sed 's/  */ /g;s/^ *//g;s/ *$//g' |
${SPMENCODE} --model $2