backtranslation for Tatoeba data

2024-11-27 11:03:13 +03:00 · 2021-02-25 17:17:21 +02:00 · 2021-02-25 17:17:21 +02:00 · 6537fdea13
commit 6537fdea13
parent f81a2ad638
290 changed files with 767 additions and 15 deletions
--- a/backtranslate/Makefile
+++ b/backtranslate/Makefile
@ -40,16 +40,20 @@ MODELHOME ?= ${MODELSDIR}/${LANGPAIR}
 ##     we need the UTF8 sort order
 ## --> use bash sort and UTF8 locale
 # MODELZIP  = ${lastword ${sort ${wildcard ${MODELHOME}/*-20*.zip}}}
-MODELZIP  = ${lastword ${shell ls ${MODELHOME}/*-20*.zip | LANG=en_US.UTF-8 sort}}
-MODELNAME = ${patsubst %.zip,%,${notdir ${MODELZIP}}}
+ifneq (${wildcard ${MODELHOME}},)
+  MODELZIP  = ${lastword ${shell ls ${MODELHOME}/*-20*.zip 2>/dev/null | LANG=en_US.UTF-8 sort}}
+  MODELNAME = ${patsubst %.zip,%,${notdir ${MODELZIP}}}
+endif

 ifeq (${MODELNAME},)
+## fall back to the work directory, but only if it has models for this language pair
+ifneq (${wildcard ${WORKHOME}/models/${LANGPAIR}},)
  MODELHOME = ../${notdir ${WORKHOME}}/models/${LANGPAIR}
  # MODELZIP  = ${lastword ${sort ${wildcard ${MODELHOME}/*-20*.zip}}}
  # MODELZIP  = ${lastword ${sort ${wildcard ${MODELHOME}/opus-20*.zip}}}
-  MODELZIP  = ${lastword ${shell ls ${MODELHOME}/*-20*.zip | LANG=en_US.UTF-8 sort}}
+  MODELZIP  = ${lastword ${shell ls ${MODELHOME}/*-20*.zip 2>/dev/null | LANG=en_US.UTF-8 sort}}
  MODELNAME = ${patsubst %.zip,%,${notdir ${MODELZIP}}}
 endif
+endif

 ## set to 1 if the model for backtranslation is a multi-target model
 ## --> need to use pre-processing script differently
@ -263,7 +267,7 @@ all-wikis-all-langs: index.html
 	done


-## aux function to pring the selected modelname
+## aux function to print the selected modelname
 .PHONY: print-modelname
 print-modelname:
 	@echo ${MODELNAME}
--- a/bt-tatoeba/Makefile
+++ b/bt-tatoeba/Makefile
@ -0,0 +1,323 @@
+#
+# backtranslate wiki data with Tatoeba-MT challenge data
+#
+# only works with sentencepiece models!
+#
+
+PWD      := ${shell pwd}
+TOOLSDIR := ${PWD}/../tools
+
+include ../lib/env.mk
+include ../lib/config.mk
+include ../lib/slurm.mk
+
+## source and target language of the backtranslation model
+## (wiki text in the source language is translated into the target language)
+SRC ?= fin
+TRG ?= eng
+
+
+
+## download locations: shuffled wiki data, raw files of the Tatoeba-Challenge
+## git repository, the list of released models and the model storage
+# TATOEBA_STORAGE     = https://object.pouta.csc.fi/Tatoeba-Challenge
+TATOEBA_STORAGE       = https://object.pouta.csc.fi/Tatoeba-Challenge-WikiShuffled
+TATOEBA_GITRAW        = https://raw.githubusercontent.com/Helsinki-NLP/Tatoeba-Challenge/master
+TATOEBA_RELEASED      = ${TATOEBA_GITRAW}/models/released-model-results.txt
+TATOEBA_MODEL_STORAGE = https://object.pouta.csc.fi/Tatoeba-MT-models
+
+
+## various sources are available
+## can be general wikipedia, wikinews, wikibooks, ...
+## (the value is the file name prefix of the wiki text files)
+# WIKISOURCE ?= wikipedia
+WIKISOURCE ?= wiki
+
+## split size in nr-of-lines
+## (the wiki text is split into parts of this size: aa, ab, ...)
+## default part to be selected = aa
+SPLIT_SIZE ?= 1000000
+
+
+## maximum input length (number sentence piece segments)
+## --> longer lines are dropped during pre-processing
+## maximum number of sentences to be translated (top N lines)
+## NOTE(review): MAX_SENTENCES is not referenced by any rule
+##               in the visible part of this file - confirm
+MAX_LENGTH    ?= 100
+MAX_SENTENCES ?= ${SPLIT_SIZE}
+
+
+LANGPAIR = ${SRC}-${TRG}
+
+PWD := $(shell pwd)
+
+
+
+## look up the released model for the selected language pair:
+## take the first line of the release list that starts with the language pair
+## and use its 4th tab-separated field as the URL of the model zip file
+# MODELZIP = https://object.pouta.csc.fi/Tatoeba-Challenge/ang.tar
+MODELZIP := ${shell wget -qq -O - ${TATOEBA_RELEASED} |\
+		grep '^${LANGPAIR}' | head -1 | cut -f4}
+MODELINFO = ${patsubst ${TATOEBA_MODEL_STORAGE}/%.zip,${TATOEBA_GITRAW}/models/%.yml,${MODELZIP}}
+MODELNAME = ${patsubst %.zip,%,${notdir ${MODELZIP}}}
+
+## check the model info file for target-language labels (multi-target model)
+## --> skip the download if no released model was found
+##     (MODELINFO is empty in that case and wget would be called without URL)
+ifneq (${MODELZIP},)
+  MULTI_TARGET_MODEL := ${shell wget -qq -O - ${MODELINFO} | grep 'use-target-labels' | wc -l}
+else
+  MULTI_TARGET_MODEL := 0
+endif
+ifneq (${MULTI_TARGET_MODEL},0)
+  TARGET_LANG_LABEL := ${shell wget -qq -O - ${MODELINFO} | grep -o '>>${TRG}.*<<'}
+endif
+
+RELEASED_WIKIS := $(patsubst %.tar,%,${shell wget -qq -O - ${TATOEBA_GITRAW}/Wiki.md | \
+					grep -o 'WikiShuffled/...\.tar' | cut -f2 -d'/'})
+
+
+## file names for the selected wiki source and part:
+##   WIKI_TXT = one part of the original wiki text
+##   WIKI_PRE = pre-processed (SentencePiece) source text for the selected model
+##   WIKI_SRC = source text after merging the SentencePiece segments again
+##   WIKI_TRG = translations produced by the selected model
+WIKI_DIR     = ${PWD}/wiki
+LANGID       = ${SRC}
+PART         = aa
+OUTPUT_DIR   = ${LANGPAIR}
+WIKI_TXT     = ${WIKI_DIR}/${LANGID}/${WIKISOURCE}.${PART}.gz
+WIKI_SRC     = ${OUTPUT_DIR}/${WIKISOURCE}.${PART}_${MODELNAME}.${LANGPAIR}.${SRC}.gz
+WIKI_PRE     = ${OUTPUT_DIR}/${WIKISOURCE}.${PART}_${MODELNAME}.${LANGPAIR}.${SRC}.spm.gz
+WIKI_TRG     = ${OUTPUT_DIR}/${WIKISOURCE}.${PART}_${MODELNAME}.${LANGPAIR}.${TRG}.gz
+
+## copies of source and translation without the model name in the file name
+WIKI_LATEST_SRC = ${OUTPUT_DIR}/latest/${WIKISOURCE}.${PART}.${LANGPAIR}.${SRC}.gz
+WIKI_LATEST_TRG = ${OUTPUT_DIR}/latest/${WIKISOURCE}.${PART}.${LANGPAIR}.${TRG}.gz
+
+
+## all parts of this wiki
+## (two-character suffixes of the files that exist in the wiki directory)
+PARTS = ${sort ${patsubst ${WIKI_DIR}/${LANGID}/${WIKISOURCE}.%.gz,%,\
+		${wildcard ${WIKI_DIR}/${LANGID}/${WIKISOURCE}.??.gz}}}
+
+## all wiki sources that exist for this language
+## (taken from the *.txt.gz files and from the files of the selected part)
+# WIKISOURCES = wiki wikibooks wikinews wikiquote wikisource wiktionary
+WIKISOURCES = ${sort $(patsubst %.txt.gz,%,$(notdir ${wildcard ${WIKI_DIR}/${LANGID}/*.txt.gz})) \
+		$(patsubst %.${PART}.gz,%,$(notdir ${wildcard ${WIKI_DIR}/${LANGID}/*.${PART}.gz}))}
+
+
+## targets for all parts of the current wiki source
+
+ALLWIKIPARTS_TXT = ${patsubst %,${WIKI_DIR}/${LANGID}/${WIKISOURCE}.%.gz,${PARTS}}
+ALLWIKIPARTS_SRC = ${patsubst %,${OUTPUT_DIR}/${WIKISOURCE}.%_${MODELNAME}.${LANGPAIR}.${SRC}.gz,${PARTS}}
+ALLWIKIPARTS_PRE = ${patsubst %,${OUTPUT_DIR}/${WIKISOURCE}.%_${MODELNAME}.${LANGPAIR}.${SRC}.spm.gz,${PARTS}}
+ALLWIKIPARTS_TRG = ${patsubst %,${OUTPUT_DIR}/${WIKISOURCE}.%_${MODELNAME}.${LANGPAIR}.${TRG}.gz,${PARTS}}
+
+ALLWIKIPARTS_LATEST_SRC = ${patsubst %,${OUTPUT_DIR}/latest/${WIKISOURCE}.%.${LANGPAIR}.${SRC}.gz,${PARTS}}
+ALLWIKIPARTS_LATEST_TRG = ${patsubst %,${OUTPUT_DIR}/latest/${WIKISOURCE}.%.${LANGPAIR}.${TRG}.gz,${PARTS}}
+
+
+## all wiki sources for the selected part
+
+ALLWIKIS_TXT = ${patsubst %,${WIKI_DIR}/${LANGID}/%.${PART}.gz,${WIKISOURCES}}
+ALLWIKIS_SRC = ${patsubst %,${OUTPUT_DIR}/%.${PART}_${MODELNAME}.${LANGPAIR}.${SRC}.gz,${WIKISOURCES}}
+ALLWIKIS_PRE = ${patsubst %,${OUTPUT_DIR}/%.${PART}_${MODELNAME}.${LANGPAIR}.${SRC}.spm.gz,${WIKISOURCES}}
+ALLWIKIS_TRG = ${patsubst %,${OUTPUT_DIR}/%.${PART}_${MODELNAME}.${LANGPAIR}.${TRG}.gz,${WIKISOURCES}}
+
+
+
+## don't delete translated text if the process crashes
+.PRECIOUS: ${WIKI_TRG}
+
+
+## use the local scratch space for temporary files if it is defined
+ifdef LOCAL_SCRATCH
+  TMPDIR = ${LOCAL_SCRATCH}
+endif
+
+## commands for loading software modules on hosts in the bullx domain
+## (the value ends with && so that another command can be appended)
+## NOTE(review): LOAD_MODULES is not referenced in the visible part of this file;
+##               the translation recipe uses LOADMODS - confirm which one is intended
+ifeq (${shell hostname --domain 2>/dev/null},bullx)
+  LOAD_MODULES = module use -a /projappl/nlpl/software/modules/etc/ && \
+		 module load nlpl-udpipe nlpl-opus &&
+endif
+
+
+## default: translate the selected part of the selected wiki source
+.PHONY: all
+all: translate
+
+## fetch the wiki data, prepare all wiki sources
+## and create translation jobs for all parts of all wiki sources
+.PHONY: all-jobs
+all-jobs: fetch
+	${MAKE} prepare-allwikis
+	${MAKE} translate-all-jobs
+
+## print (but do not run) the make commands that create the jobs
+## for translating every released wiki language except English into English
+.PHONY: all2eng
+all2eng:
+	for w in ${filter-out eng,${RELEASED_WIKIS}}; do \
+	  echo "make SRC=$$w TRG=eng all-jobs"; \
+	done
+
+
+## download and unpack the wiki data of the source language
+.PHONY: fetch
+fetch: ${WIKI_DIR}/${SRC}
+
+## fetch the model and create the selected part of the selected wiki source
+.PHONY: prepare
+prepare: ${LANGPAIR}/${MODELNAME}/decoder.yml ${WIKI_TXT}
+
+## fetch the model and create the selected part of all wiki sources
+.PHONY: prepare-allwikis
+prepare-allwikis: ${LANGPAIR}/${MODELNAME}/decoder.yml ${ALLWIKIS_TXT}
+
+## translate the selected part:
+## the translations are made first (prerequisite),
+## the matching source text is made afterwards by a separate make call
+.PHONY: translate
+translate: ${WIKI_LATEST_TRG}
+	${MAKE} ${WIKI_LATEST_SRC}
+
+## translate all parts
+## NOTE(review): explicit rules for the files in latest/ are only visible
+##               for the selected PART - confirm that other parts can be made
+.PHONY: translate-all-parts
+translate-all-parts: ${ALLWIKIPARTS_LATEST_TRG}
+	${MAKE} ${ALLWIKIPARTS_LATEST_SRC}
+
+## translate all wikis and all parts
+## --> call translate-all-parts once for each wiki source
+.PHONY: translate-all
+translate-all:
+	for s in ${WIKISOURCES}; do \
+	  ${MAKE} WIKISOURCE=$$s translate-all-parts; \
+	done
+
+## create jobs for translating all parts
+## (one translate.submit job for each part of the selected wiki source)
+.PHONY: translate-all-parts-jobs
+translate-all-parts-jobs: ${LANGPAIR}/${MODELNAME}/decoder.yml
+	for p in ${PARTS}; do \
+	  ${MAKE} PART=$$p translate.submit; \
+	done
+
+## create jobs for translating all parts of all wikis
+.PHONY: translate-all-jobs
+translate-all-jobs: ${LANGPAIR}/${MODELNAME}/decoder.yml
+	for s in ${WIKISOURCES}; do \
+	  ${MAKE} WIKISOURCE=$$s translate-all-parts-jobs; \
+	done
+
+
+
+
+
+
+## show the selected model: name, zip file, info file,
+## the multi-target flag and the target language label
+.PHONY: print-modelinfo
+print-modelinfo:
+	@echo $(MODELNAME); \
+	 echo $(MODELZIP); \
+	 echo $(MODELINFO); \
+	 echo "multi-target model: $(MULTI_TARGET_MODEL)"; \
+	 echo "target language label: $(TARGET_LANG_LABEL)"
+
+
+
+
+
+## fetch the latest model
+## ---> TODO: should we fetch from ObjectStorage instead?
+##
+## download the model zip file and unpack it in the model directory;
+## then patch preprocess.sh: the rest of any line from 'perl -C -pe' on
+## is replaced by a perl command that keeps newline characters
+## when replacing characters of the unicode category \p{C} by spaces
+## (the original script is kept as preprocess-old.sh)
+## --> the recipe is empty if no released model was found (MODELZIP is empty)
+
+${LANGPAIR}/${MODELNAME}/decoder.yml:
+ifneq (${MODELZIP},)
+	mkdir -p ${dir $@}
+	wget -O ${dir $@}/model.zip ${MODELZIP}
+	cd ${dir $@} && unzip model.zip
+	rm -f ${dir $@}/model.zip
+	mv ${dir $@}/preprocess.sh ${dir $@}/preprocess-old.sh
+	sed 's#perl -C -pe.*$$#perl -C -pe  "s/(?!\\n)\\p{C}/ /g;" |#' \
+	< ${dir $@}/preprocess-old.sh > ${dir $@}/preprocess.sh
+	chmod +x ${dir $@}/preprocess.sh
+endif
+
+
+## pre-process data
+
+## arguments of the pre-processing script:
+## multi-target models need the target language as an additional argument
+## --> same test as the one used for setting TARGET_LANG_LABEL
+##     (any count other than 0 means multi-target model)
+ifneq (${MULTI_TARGET_MODEL},0)
+  PREPROCESS_ARGS = ${SRC} ${TRG} ${LANGPAIR}/${MODELNAME}/source.spm
+else
+  PREPROCESS_ARGS = ${SRC} ${LANGPAIR}/${MODELNAME}/source.spm
+endif
+
+
+
+
+## split the complete text of a wiki source (*.txt.gz) into parts
+## of SPLIT_SIZE lines each and compress all parts
+## --> this creates all parts (aa, ab, ...) and not only the selected one
+## --> the wiki directory is an order-only prerequisite;
+##     the *.txt.gz file itself is not listed as a prerequisite
+${WIKI_DIR}/${SRC}/%.${PART}.gz: | ${WIKI_DIR}/${SRC}
+	${GZCAT} ${@:.${PART}.gz=.txt.gz} |\
+	split -l ${SPLIT_SIZE} - ${patsubst %${PART}.gz,%,$@}
+	gzip -f ${patsubst %${PART}.gz,%,$@}??
+
+
+## the text files come with the downloaded wiki directory
+## --> the recipe itself only prints a message
+${WIKI_DIR}/${SRC}/%.txt.gz: ${WIKI_DIR}/${SRC}
+	echo "done!"
+
+
+## download and unpack the wiki data of the source language
+## --> the target directory is created only after the download and
+##     the unpacking succeeded; otherwise a failed download would leave
+##     an empty directory behind that make treats as an existing target
+## --> files unpacked into data/<lang> are moved to the target directory
+## --> the language ID is removed from file names like wiki.<lang>.aa.gz
+${WIKI_DIR}/${SRC}:
+	mkdir -p ${dir $@}
+	wget -O $@.tar ${TATOEBA_STORAGE}/${SRC}.tar
+	tar -C ${dir $@} -xf $@.tar
+	mkdir -p $@
+	if [ -d ${WIKI_DIR}/data/${SRC} ]; then \
+	  mv ${WIKI_DIR}/data/${SRC}/*.txt.gz $@/;\
+	  rm -f ${WIKI_DIR}/data/${SRC}/*;\
+	  rmdir ${WIKI_DIR}/data/${SRC};\
+	  rmdir ${WIKI_DIR}/data;\
+	fi
+	if [ -e ${WIKI_DIR}/${SRC}/wiki.${SRC}.aa.gz ]; then \
+	  for f in `ls ${WIKI_DIR}/${SRC}/*.${SRC}.*`; do \
+	    mv $$f `echo $$f | sed 's/\.${SRC}\././'`; \
+	  done \
+	fi
+	rm -f $@.tar
+
+## pre-process one part of the wiki text for the selected model:
+##  * make sure that the model is available (separate make call)
+##  * skip all lines that contain one of the characters < > { }
+##  * run the pre-processing script that comes with the model
+##  * skip lines with more than MAX_LENGTH whitespace-separated tokens
+## --> the recipe is empty if no released model was found (MODELZIP is empty)
+${OUTPUT_DIR}/%.${PART}_${MODELNAME}.${LANGPAIR}.${SRC}.spm.gz: ${WIKI_DIR}/${SRC}/%.${PART}.gz
+ifneq (${MODELZIP},)
+	mkdir -p ${dir $@}
+	${MAKE} ${LANGPAIR}/${MODELNAME}/decoder.yml
+	${GZCAT} $< |\
+	grep -v '[<>{}]' |\
+	${LANGPAIR}/${MODELNAME}/preprocess.sh ${PREPROCESS_ARGS} |\
+	perl -e 'while (<>){next if (split(/\s+/)>${MAX_LENGTH});print;}' |\
+	gzip -f > $@
+endif
+
+
+
+## merge SentencePiece segments in the source text
+## (Why? because we filter out some data from the original wiki text, see above)
+##  * remove all spaces and replace the segment marker by a space
+##  * remove leading and trailing spaces
+##  * remove a language label (>>xx<<) from the beginning of the line
+## --> the recipe is empty if no released model was found (MODELZIP is empty)
+
+${WIKI_SRC}: ${WIKI_PRE}
+ifneq (${MODELZIP},)
+	mkdir -p ${dir $@}
+	${GZCAT} $< |\
+	sed 's/ //g;s/▁/ /g' | \
+	sed 's/^ *//;s/ *$$//' |\
+	sed 's/^>>[a-z]*<< //' |\
+	gzip -c > $@
+endif
+
+
+
+
+## keep a copy of source and translation in latest/
+## (file names without the model name)
+## --> a new translation run replaces the previous copy, so that
+##     several iterations do not duplicate the data used in MT training
+
+$(WIKI_LATEST_SRC): $(WIKI_SRC)
+	mkdir -p $(@D) && cp $< $@
+
+$(WIKI_LATEST_TRG): $(WIKI_TRG)
+	mkdir -p $(@D) && cp $< $@
+
+
+
+## translate
+##  * make sure that the model is available (separate make call)
+##  * run the decoder inside of the model directory with the pre-processed
+##    text as input (absolute path because of the change of directory)
+##  * merge the SentencePiece segments in the output and compress it
+## --> the recipe is empty if no released model was found (MODELZIP is empty)
+## NOTE(review): LOADMODS is not defined in the visible part of this file
+##               (presumably set in one of the included files); LOAD_MODULES,
+##               which is defined here, is not used - confirm
+
+%.${LANGPAIR}.${TRG}.gz: %.${LANGPAIR}.${SRC}.spm.gz
+ifneq (${MODELZIP},)
+	mkdir -p ${dir $@}
+	${MAKE} ${LANGPAIR}/${MODELNAME}/decoder.yml
+	${LOADMODS} && cd ${LANGPAIR}/${MODELNAME} && ${MARIAN_DECODER} \
+		-i ${PWD}/$< \
+		-c decoder.yml \
+		-d ${MARIAN_GPUS} \
+		${MARIAN_DECODER_FLAGS} |\
+	sed 's/ //g;s/▁/ /g' | sed 's/^ *//;s/ *$$//' |\
+	gzip -c > ${PWD}/$@
+#ifneq (${LANGPAIR},)
+#ifneq (${MODELNAME},)
+#	rm -fr ${LANGPAIR}/${MODELNAME}
+#endif
+#endif
+endif
+
+
+
+## compare the number of lines in source and translation files
+## in all language-pair directories (names of the form xxx-yyy)
+## --> the name of the translation file is derived from the source file
+##     by replacing the language extension; this is done with shell
+##     parameter expansion because $$s and $$t are not expanded
+##     inside of a single-quoted sed expression
+.PHONY: check-length
+check-length:
+	for d in `find . -maxdepth 1 -type d -name '*-*' -printf "%f "`; do \
+	  s=`echo $$d | cut -f1 -d'-'`; \
+	  t=`echo $$d | cut -f2 -d'-'`; \
+	  echo "check $$d"; \
+	  for S in `ls $$d/*.$$s.gz`; do \
+	    T="$${S%.$$s.gz}.$$t.gz"; \
+	    echo "$$S -- $$T"; \
+	    ${GZCAT} $$S | wc -l; \
+	    ${GZCAT} $$T | wc -l; \
+	    if [ `${GZCAT} $$S | wc -l` != `${GZCAT} $$T | wc -l` ]; then \
+	      echo "$$S != $$T"; \
+	    fi \
+	  done \
+	done
+
+
--- a/lib/projects/memad.mk
+++ b/lib/projects/memad.mk
@ -36,6 +36,46 @@ tatoeba-memad-bilingual:



+## evaluate and release the MeMAD models (transformer-align):
+##  * multilingual 1m models: English to MEMAD_LANGS3, MEMAD_LANGS3 to English
+##    and MEMAD_LANGS3 to MEMAD_LANGS3 (without identical language pairs)
+##  * bilingual models for all pairs of different languages in MEMAD_LANGS3
+## each model is evaluated first (eval, compare and eval-testsets targets)
+## and then released with models-memad as release and models directory
+tatoeba-memad-dist:
+	${MAKE} TRGLANGS="${MEMAD_LANGS3}" SRCLANGS="eng" \
+		MODELTYPE=transformer-align \
+		tatoeba-multilingual-eval-1m compare-tatoeba-1m eval-testsets-tatoeba-1m
+	${MAKE} TRGLANGS="${MEMAD_LANGS3}" SRCLANGS="eng" \
+		TATOEBA_RELEASEDIR=models-memad \
+		TATOEBA_MODELSHOME=models-memad \
+		MODELTYPE=transformer-align release-tatoeba-1m
+	${MAKE} SRCLANGS="${MEMAD_LANGS3}" TRGLANGS="eng" \
+		MODELTYPE=transformer-align \
+		tatoeba-multilingual-eval-1m compare-tatoeba-1m eval-testsets-tatoeba-1m
+	${MAKE} SRCLANGS="${MEMAD_LANGS3}" TRGLANGS="eng" \
+		TATOEBA_RELEASEDIR=models-memad \
+		TATOEBA_MODELSHOME=models-memad \
+		MODELTYPE=transformer-align release-tatoeba-1m
+	${MAKE} SRCLANGS="${MEMAD_LANGS3}" TRGLANGS="${MEMAD_LANGS3}" \
+		SKIP_LANGPAIRS="deu-deu|eng-eng|fin-fin|fra-fra|nld-nld|swe-swe" \
+		MODELTYPE=transformer-align \
+		tatoeba-multilingual-eval-1m compare-tatoeba-1m eval-testsets-tatoeba-1m
+	${MAKE} SRCLANGS="${MEMAD_LANGS3}" TRGLANGS="${MEMAD_LANGS3}" \
+		SKIP_LANGPAIRS="deu-deu|eng-eng|fin-fin|fra-fra|nld-nld|swe-swe" \
+		TATOEBA_RELEASEDIR=models-memad \
+		TATOEBA_MODELSHOME=models-memad \
+		MODELTYPE=transformer-align release-tatoeba-1m
+	@for s in ${MEMAD_LANGS3}; do \
+	  for t in ${MEMAD_LANGS3}; do \
+	    if [ "$$s" != "$$t" ]; then \
+	      ${MAKE} SRCLANGS=$$s TRGLANGS=$$t \
+			MODELTYPE=transformer-align \
+		tatoeba-multilingual-eval compare-tatoeba eval-testsets-tatoeba; \
+	      ${MAKE} SRCLANGS=$$s TRGLANGS=$$t \
+			TATOEBA_RELEASEDIR=models-memad \
+			TATOEBA_MODELSHOME=models-memad \
+			MODELTYPE=transformer-align release-tatoeba; \
+	    fi \
+	  done \
+	done
+
+
+
 #----------------------------------------------------------------
 # fine-tuning on YLE subtitle data
 #----------------------------------------------------------------
@ -50,12 +90,22 @@ MEMAD_SUBTYPE  = FIN-SWE
 MEMAD_LANGPAIR = fin2swe
 MEMAD_TUNETASK = tune

+
 tatoeba-yletune-all: tatoeba-yletune-finswe-all tatoeba-yletune-swefin-all
 tatoeba-yletune-finswe-all: tatoeba-yletune-finswe tatoeba-yletune-fihswe \
 			tatoeba-yletune-finswh tatoeba-yletune-fihswh tatoeba-yletune-fisw
 tatoeba-yletune-swefin-all: tatoeba-yletune-swefin tatoeba-yletune-swefih \
 			tatoeba-yletune-swhfin tatoeba-yletune-swhfih tatoeba-yletune-swfi

+## run all YLE fine-tuning targets with the task tuneeval instead of tune
+tatoeba-yleeval-all:
+	${MAKE} MEMAD_TUNETASK=tuneeval tatoeba-yletune-all
+
+## run all YLE fine-tuning targets with the task tunedist
+## and models-memad-tuned as release and models directory
+tatoeba-yledist-all:
+	${MAKE} MEMAD_TUNETASK=tunedist \
+		TATOEBA_RELEASEDIR=models-memad-tuned \
+		TATOEBA_MODELSHOME=models-memad-tuned \
+	tatoeba-yletune-all
+

 tatoeba-yletune-finswe:
 	${MAKE} MEMAD_SUBTYPE=FIN-SWE MEMAD_LANGPAIR=fin2swe tatoeba-yletune
--- a/lib/projects/tatoeba.mk
+++ b/lib/projects/tatoeba.mk
@ -116,7 +116,8 @@ TATOEBA_DEVSET       = Tatoeba-dev
 TATOEBA_TESTSET      = Tatoeba-test
 TATOEBA_DEVSET_NAME  = Tatoeba-dev
 TATOEBA_TESTSET_NAME = Tatoeba-test
-
+TATOEBA_RELEASEDIR   = ${PWD}/models-tatoeba
+TATOEBA_MODELSHOME   = ${PWD}/models-tatoeba

 TATOEBA_PARAMS := TRAINSET=${TATOEBA_TRAINSET} \
 		DEVSET=${TATOEBA_DEVSET} \
@ -130,8 +131,8 @@ TATOEBA_PARAMS := TRAINSET=${TATOEBA_TRAINSET} \
 		TESTSIZE=10000 \
 		DEVMINSIZE=200 \
 		WORKHOME=${TATOEBA_WORK} \
-		MODELSHOME=${PWD}/models-tatoeba \
-		RELEASEDIR=${PWD}/models-tatoeba \
+		MODELSHOME=${TATOEBA_MODELSHOME} \
+		RELEASEDIR=${TATOEBA_RELEASEDIR} \
                MODELS_URL=https://object.pouta.csc.fi/${TATOEBA_MODEL_CONTAINER} \
 		MODEL_CONTAINER=${TATOEBA_MODEL_CONTAINER} \
 		ALT_MODEL_DIR=tatoeba \
@ -407,6 +408,40 @@ tatoeba-wiki2eng-macro:
 tatoeba-print-missing-wiki:
 	@echo $(filter-out ${WIKILANGS},${WIKIMACROLANGS})

+## check all wiki macro-languages that have no finished model (no *.done file)
+## for translating into English: if the training data exists but has less
+## than 100000 lines then print (but do not run) the make command for
+## training a 1m model for the group returned by 'langgroup -p'
+## (presumably the parent language group - confirm)
+tatoeba-wiki2eng-parent:
+	for l in ${WIKIMACROLANGS}; do \
+	  if [ ! `find work-tatoeba/$$l-eng -name '*.done' 2>/dev/null | wc -l` -gt 0 ]; then \
+	    echo "check $$l-eng"; \
+	    if [ `find work-tatoeba/$$l-eng/train -name 'opus.src.clean.spm*.gz' 2>/dev/null | wc -l` -gt 0 ]; then \
+	      echo "check data size of $$l-eng"; \
+	      if [ `find work-tatoeba/$$l-eng/train -name 'opus.src.clean.spm*.gz' 2>/dev/null | xargs zcat | head -100000 | wc -l` -lt 100000 ]; then \
+		p=`langgroup -p $$l`; \
+		echo "${MAKE} SRCLANGS=$$p TRGLANGS=eng tatoeba-$${p}2eng-train-1m"; \
+	      fi \
+	    fi \
+	  fi \
+	done
+
+## print the status of the models for translating wiki macro-languages
+## into English:
+##  * released models (zip file in models-tatoeba)
+##  * finished but unreleased models (*.done file in work-tatoeba)
+##    together with the first BLEU score found in the evaluation files,
+##    or with the command to run if test data or evaluation is missing
+tatoeba-wiki2eng-done:
+	for l in ${WIKIMACROLANGS}; do \
+	  if [ `find models-tatoeba/$$l-eng -name '*.zip' 2>/dev/null | wc -l` -gt 0 ]; then \
+	    echo "model available for $$l-eng"; \
+	  elif [ `find work-tatoeba/$$l-eng -name '*.done' 2>/dev/null | wc -l` -gt 0 ]; then \
+	    echo -n "model available for $$l-eng but not released"; \
+	    if [ `find work-tatoeba/$$l-eng -name '*.eval' 2>/dev/null | wc -l` -gt 0 ]; then \
+	      echo -n ", BLEU = "; \
+	      grep BLEU work-tatoeba/$$l-eng/*eval | head -1 | cut -f3 -d' '; \
+	    elif [ ! -e work-tatoeba/$$l-eng/test/Tatoeba-test.src ]; then \
+	      echo ", missing eval file"; \
+	      echo "make TATOEBA_WORK=work-tatoeba-tmp SRCLANGS=$$l TRGLANGS=eng data-tatoeba"; \
+	    else \
+	      echo ", run 'make tatoeba-$${l}2eng-evalall'"; \
+	    fi \
+	  fi \
+	done
+
+
+
 ###########################################################################################
 # language groups
 ###########################################################################################
@ -1266,7 +1301,7 @@ KEEP_LANGIDS         = bos_Cyrl cmn cnr cnr_Latn csb diq dnj dty fas fqs ful fur
 			nor nor_Latn oss_Latn pan plt pnb_Guru pob prs qug quw quy quz qvi rmn rmy ruk san swa swc \
 			syr syr_Syrc tgk_Latn thy tlh tmh toi tuk_Cyrl urd_Deva xal_Latn yid_Latn zho zlm
 SKIP_LANGIDS         = ${filter-out ${KEEP_LANGIDS},${TRAIN_ONLY_LANGIDS}} \
-			ang ara_Latn aze_Latn bul_Latn ell_Latn heb_Latn rus_Latn
+			ang ara_Latn bul_Latn ell_Latn heb_Latn rus_Latn
 SKIP_LANGIDS_PATTERN = ^\(${subst ${SPACE},\|,${SKIP_LANGIDS}}\)$$

 ## modify language IDs in training data to adjust them to test sets
@ -1279,7 +1314,7 @@ FIXLANGIDS = 	| sed 's/zho\(.*\)_HK/yue\1/g;s/zho\(.*\)_CN/cmn\1/g;s/zho\(.*\)_T
 		| sed 's/\_[A-Z][A-Z]//g' \
 		| sed 's/\-[a-z]*//g' \
 		| sed 's/jpn_[A-Za-z]*/jpn/g' \
-		| sed 's/ara_Latn/ara/;s/arq_Latn/arq/;s/apc_Latn/apc/' \
+		| sed 's/ara_Latn/ara/;s/arq_Latn/arq/;' \
 		| sed 's/kor_[A-Za-z]*/kor/g' \
 		| sed 's/nor_Latn/nor/g' \
 		| sed 's/nor/nob/g' \
@ -1336,6 +1371,7 @@ ${TATOEBA_MONO}/%.labels:
 	-tar -C $@.d -xf $@.d/train.tar
 	rm -f $@.d/train.tar
 	if [ -e $@.d/data/${LANGPAIR}/test.src ]; then \
+	  echo "........ move test files to ${dir $@}Tatoeba-test.${LANGPAIR}.clean.*"; \
 	  mv $@.d/data/${LANGPAIR}/test.src ${dir $@}Tatoeba-test.${LANGPAIR}.clean.${SRCEXT}; \
 	  mv $@.d/data/${LANGPAIR}/test.trg ${dir $@}Tatoeba-test.${LANGPAIR}.clean.${TRGEXT}; \
 	  cat $@.d/data/${LANGPAIR}/test.id $(FIXLANGIDS) > ${dir $@}Tatoeba-test.${LANGPAIR}.clean.id; \
@ -1400,11 +1436,13 @@ ${TATOEBA_MONO}/%.labels:
 	        echo "extract $$s-$$t data"; \
 	        for d in dev test train; do \
 		  if [ -e ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.id ]; then \
+	            echo "........ make ${dir $@}Tatoeba-$$d.$$s-$$t"; \
 	            paste ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.id \
 		          ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${SRCEXT} \
 		          ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${TRGEXT} |\
 	            grep -P "$$s\t$$t\t" > ${dir $@}Tatoeba-$$d.$$s-$$t; \
 	            if [ -s ${dir $@}Tatoeba-$$d.$$s-$$t ]; then \
+	              echo "........ compress to ${dir $@}Tatoeba-$$d.$$s-$$t.clean.*.gz"; \
 	              cut -f3 ${dir $@}Tatoeba-$$d.$$s-$$t | ${GZIP} -c > ${dir $@}Tatoeba-$$d.$$s-$$t.clean.$$s.gz; \
 	              cut -f4 ${dir $@}Tatoeba-$$d.$$s-$$t | ${GZIP} -c > ${dir $@}Tatoeba-$$d.$$s-$$t.clean.$$t.gz; \
 	            fi; \
@ -1415,11 +1453,13 @@ ${TATOEBA_MONO}/%.labels:
 	        echo "extract $$t-$$s data"; \
 	        for d in dev test train; do \
 		  if [ -e ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.id ]; then \
+	            echo "........ make ${dir $@}Tatoeba-$$d.$$t-$$s"; \
 	            paste ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.id \
 		          ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${TRGEXT} \
 		          ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${SRCEXT} |\
 	            grep -P "$$s\t$$t\t" > ${dir $@}Tatoeba-$$d.$$t-$$s; \
 	            if [ -s ${dir $@}Tatoeba-$$d.$$t-$$s ]; then \
+	              echo "........ compress to ${dir $@}Tatoeba-$$d.$$t-$$s.clean.*.gz"; \
 	              cut -f3 ${dir $@}Tatoeba-$$d.$$t-$$s | ${GZIP} -c > ${dir $@}Tatoeba-$$d.$$t-$$s.clean.$$t.gz; \
 	              cut -f4 ${dir $@}Tatoeba-$$d.$$t-$$s | ${GZIP} -c > ${dir $@}Tatoeba-$$d.$$t-$$s.clean.$$s.gz; \
 	            fi; \
@ -1439,6 +1479,7 @@ ${TATOEBA_MONO}/%.labels:
 	for d in dev test train; do \
 	  if [ -e ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${SRCEXT} ]; then \
 	    if [ ! -e ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${SRCEXT}.gz ]; then \
+	      echo "........... compress ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${SRCEXT}"; \
 	      ${GZIP} ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${SRCEXT}; \
 	    else \
 	      rm -f ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${SRCEXT}; \
@ -1446,6 +1487,7 @@ ${TATOEBA_MONO}/%.labels:
 	  fi; \
 	  if [ -e ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${TRGEXT} ]; then \
 	    if [ ! -e ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${TRGEXT}.gz ]; then \
+	      echo "........... compress ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${TRGEXT}"; \
 	      ${GZIP} ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${TRGEXT}; \
 	    else \
 	      rm -f ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${TRGEXT}; \
@ -1482,6 +1524,7 @@ ${TATOEBA_MONO}/%.labels:



+
 test-tune-data: 
 	make SRCEXT=bre TRGEXT=eng LANGPAIR=bre-eng \
 	 work-tatoeba-test/data/simple/Tatoeba-OpenSubtitles-train.bre-eng.clean.bre.gz
@ -1964,3 +2007,30 @@ fixlabels.sh:
 	    fi \
 	  fi \
 	done 
+
+
+## re-create missing test files for language pairs that have a model:
+## the model files are copied to a temporary work directory, the data
+## is created there and the test files are copied back
+## --> use ${MAKE} for the recursive call (passes on flags and the jobserver)
+## --> make sure that the test directory exists before copying files into it
+tatoeba-missing-test:
+	for d in `find work-tatoeba/ -maxdepth 1 -type d -name '???-???' | cut -f2 -d/`; do \
+	  if [ ! -e work-tatoeba/$$d/test/Tatoeba-test.src ]; then \
+	    if [ `find work-tatoeba/$$d/train -name '*-model' | wc -l` -gt 0 ]; then \
+	      p=`echo $$d | sed 's/-/2/'`; \
+	      echo "missing eval file for $$d"; \
+	      mkdir -p work-tatoeba-tmp/$$d/train; \
+	      rsync -av work-tatoeba/$$d/train/*model* work-tatoeba-tmp/$$d/train/; \
+	      ${MAKE} FIT_DATA_SIZE=1000 LANGGROUP_FIT_DATA_SIZE=1000 TATOEBA_WORK=work-tatoeba-tmp tatoeba-$$p-data; \
+	      mkdir -p work-tatoeba/$$d/test; \
+	      cp work-tatoeba-tmp/$$d/test/Tatoeba-test.* work-tatoeba/$$d/test/; \
+	      rm -fr work-tatoeba-tmp/$$d; \
+	    fi \
+	  fi \
+	done
+
+
+## set the time stamps of the test files (source and target side)
+## to the time stamp of the dev set source file
+## for all language pairs that have both files
+tatoeba-touch-test:
+	for d in `find work-tatoeba/ -maxdepth 1 -type d -name '???-???' | cut -f2 -d/`; do \
+	  if [ -e work-tatoeba/$$d/test/Tatoeba-test.src ]; then \
+	    if [ -e work-tatoeba/$$d/val/Tatoeba-dev.src ]; then \
+	      touch -r work-tatoeba/$$d/val/Tatoeba-dev.src work-tatoeba/$$d/test/Tatoeba-test.src*; \
+	      touch -r work-tatoeba/$$d/val/Tatoeba-dev.src work-tatoeba/$$d/test/Tatoeba-test.trg*; \
+	    fi \
+	  fi \
+	done
--- a/lib/slurm.mk
+++ b/lib/slurm.mk
@ -61,9 +61,9 @@ endif
 %.submitcpu:
 	mkdir -p ${WORKDIR}
 	echo '#!/bin/bash -l' > $@
-	echo '#SBATCH -J "${LANGPAIRSTR}${@:.submitcpu=}"' >>$@
-	echo '#SBATCH -o ${LANGPAIRSTR}${@:.submitcpu=}.out.%j' >> $@
-	echo '#SBATCH -e ${LANGPAIRSTR}${@:.submitcpu=}.err.%j' >> $@
+	echo '#SBATCH -J "$(subst -,,${LANGPAIRSTR})${@:.submitcpu=}"' >>$@
+	echo '#SBATCH -o $(subst -,,${LANGPAIRSTR})${@:.submitcpu=}.out.%j' >> $@
+	echo '#SBATCH -e $(subst -,,${LANGPAIRSTR})${@:.submitcpu=}.err.%j' >> $@
 	echo '#SBATCH --mem=${HPC_MEM}' >> $@
 ifdef EMAIL
 	echo '#SBATCH --mail-type=END' >> $@
--- a/scripts/preprocess-spm-multi-target.sh
+++ b/scripts/preprocess-spm-multi-target.sh
@ -2,8 +2,15 @@
 #
 # USAGE preprocess.sh source-langid target-langid spmodel [noflags] < input > output
 #
-#
 # replace SPMENCODE with your own setup! 
+#
+# CHANGES
+#
+#  * issue with the perl code that removes control characters:
+#    the unicode property Other (\p{C}) also matches newline
+#    characters --> add a negative lookahead
+#    to avoid removing newline characters!
+

 if [ `hostname -d` == "bullx" ]; then
  APPLHOME=/projappl/project_2001569
@ -90,7 +97,7 @@ else
 	-e 's/【/\[/g' \
 	-e 's/】/\]/g' \
 	-e 's/％/\%/g' |    
-	perl -C -pe 's/\p{C}/ /g;' |
+	perl -C -pe  's/(?!\n)\p{C}/ /g;' |
 	sed 's/  */ /g;s/^ *//g;s/ *$//g' |
 	${SPMENCODE} --model $3 |
 	sed "s/^/>>$2<< /"
--- a/scripts/preprocess-spm.sh
+++ b/scripts/preprocess-spm.sh
@ -3,6 +3,14 @@
 # USAGE preprocess.sh langid spmodel < input > output
 #
 # replace SPMENCODE with your own setup! 
+#
+# CHANGES
+#
+#  * issue with the perl code that removes control characters:
+#    the unicode property Other (\p{C}) also matches newline
+#    characters --> add a negative lookahead
+#    to avoid removing newline characters!
+#

 if [ `hostname -d` == "bullx" ]; then
  APPLHOME=/projappl/project_2001569
@ -49,7 +57,7 @@ sed -e 's/，/,/g' \
    -e 's/【/\[/g' \
    -e 's/】/\]/g' \
    -e 's/％/\%/g' |    
-perl -C -pe 's/\p{C}/ /g;' |
+perl -C -pe  's/(?!\n)\p{C}/ /g;' |
 sed 's/  */ /g;s/^ *//g;s/ *$//g' |
 ${SPMENCODE} --model $2

--- a/testsets/Makefile
+++ b/testsets/Makefile
@ -30,6 +30,18 @@ ${TICO19_TEST}: %/tico19-test.en.gz: tico19-testset/test/test.%.tsv
 	cut -f4 $< | tail -n +2 | sed 's/^ *//;s/ *$$//' | gzip -c > ${@:en.gz=${patsubst en-%/,%,$(dir $@)}}.gz


+## all directories that contain tico19 test sets
+TICODATADIRS = $(sort $(subst /,,${dir ${wildcard */tico19-test.*}}))
+
+## link the tico19 test sets into the directory of the reverse language pair
+## --> the links are created in a subshell: the working directory of the loop
+##     does not change, even if the directory cannot be entered
+## --> errors are ignored (for example links that exist already)
+.PHONY: crosslink-tico
+crosslink-tico:
+	-for d in ${TICODATADIRS}; do \
+	  s=`echo "$$d" | cut -f1 -d'-'`; \
+	  t=`echo "$$d" | cut -f2 -d'-'`; \
+	  mkdir -p $$t-$$s; \
+	  ( cd $$t-$$s && ln -s ../$$d/tico19* . ); \
+	done
+
+



--- a/testsets/am-en/tico19-test.am.gz
+++ b/testsets/am-en/tico19-test.am.gz
@ -0,0 +1 @@
+../en-am/tico19-test.am.gz
--- a/testsets/am-en/tico19-test.amh.gz
+++ b/testsets/am-en/tico19-test.amh.gz
@ -0,0 +1 @@
+../en-am/tico19-test.amh.gz
--- a/testsets/am-en/tico19-test.en.gz
+++ b/testsets/am-en/tico19-test.en.gz
@ -0,0 +1 @@
+../en-am/tico19-test.en.gz
--- a/testsets/am-en/tico19-test.eng.gz
+++ b/testsets/am-en/tico19-test.eng.gz
@ -0,0 +1 @@
+../en-am/tico19-test.eng.gz
--- a/testsets/amh-eng/tico19-test.am.gz
+++ b/testsets/amh-eng/tico19-test.am.gz
@ -0,0 +1 @@
+../eng-amh/tico19-test.am.gz
--- a/testsets/amh-eng/tico19-test.amh.gz
+++ b/testsets/amh-eng/tico19-test.amh.gz
@ -0,0 +1 @@
+../eng-amh/tico19-test.amh.gz
--- a/testsets/amh-eng/tico19-test.en.gz
+++ b/testsets/amh-eng/tico19-test.en.gz
@ -0,0 +1 @@
+../eng-amh/tico19-test.en.gz
--- a/testsets/amh-eng/tico19-test.eng.gz
+++ b/testsets/amh-eng/tico19-test.eng.gz
@ -0,0 +1 @@
+../eng-amh/tico19-test.eng.gz
--- a/testsets/ar-en/tico19-test.ar.gz
+++ b/testsets/ar-en/tico19-test.ar.gz
@ -0,0 +1 @@
+../en-ar/tico19-test.ar.gz
--- a/testsets/ar-en/tico19-test.ara.gz
+++ b/testsets/ar-en/tico19-test.ara.gz
@ -0,0 +1 @@
+../en-ar/tico19-test.ara.gz
--- a/testsets/ar-en/tico19-test.en.gz
+++ b/testsets/ar-en/tico19-test.en.gz
@ -0,0 +1 @@
+../en-ar/tico19-test.en.gz
--- a/testsets/ar-en/tico19-test.eng.gz
+++ b/testsets/ar-en/tico19-test.eng.gz
@ -0,0 +1 @@
+../en-ar/tico19-test.eng.gz
--- a/testsets/ara-eng/tico19-test.ar.gz
+++ b/testsets/ara-eng/tico19-test.ar.gz
@ -0,0 +1 @@
+../eng-ara/tico19-test.ar.gz
--- a/testsets/ara-eng/tico19-test.ara.gz
+++ b/testsets/ara-eng/tico19-test.ara.gz
@ -0,0 +1 @@
+../eng-ara/tico19-test.ara.gz
--- a/testsets/ara-eng/tico19-test.en.gz
+++ b/testsets/ara-eng/tico19-test.en.gz
@ -0,0 +1 @@
+../eng-ara/tico19-test.en.gz
--- a/testsets/ara-eng/tico19-test.eng.gz
+++ b/testsets/ara-eng/tico19-test.eng.gz
@ -0,0 +1 @@
+../eng-ara/tico19-test.eng.gz
--- a/testsets/ben-eng/tico19-test.ben.gz
+++ b/testsets/ben-eng/tico19-test.ben.gz
@ -0,0 +1 @@
+../eng-ben/tico19-test.ben.gz
--- a/testsets/ben-eng/tico19-test.bn.gz
+++ b/testsets/ben-eng/tico19-test.bn.gz
@ -0,0 +1 @@
+../eng-ben/tico19-test.bn.gz
--- a/testsets/ben-eng/tico19-test.en.gz
+++ b/testsets/ben-eng/tico19-test.en.gz
@ -0,0 +1 @@
+../eng-ben/tico19-test.en.gz
--- a/testsets/ben-eng/tico19-test.eng.gz
+++ b/testsets/ben-eng/tico19-test.eng.gz
@ -0,0 +1 @@
+../eng-ben/tico19-test.eng.gz
--- a/testsets/bn-en/tico19-test.ben.gz
+++ b/testsets/bn-en/tico19-test.ben.gz
@ -0,0 +1 @@
+../en-bn/tico19-test.ben.gz
--- a/testsets/bn-en/tico19-test.bn.gz
+++ b/testsets/bn-en/tico19-test.bn.gz
@ -0,0 +1 @@
+../en-bn/tico19-test.bn.gz
--- a/testsets/bn-en/tico19-test.en.gz
+++ b/testsets/bn-en/tico19-test.en.gz
@ -0,0 +1 @@
+../en-bn/tico19-test.en.gz
--- a/testsets/bn-en/tico19-test.eng.gz
+++ b/testsets/bn-en/tico19-test.eng.gz
@ -0,0 +1 @@
+../en-bn/tico19-test.eng.gz
--- a/testsets/en-es/tico19-test.en.gz
+++ b/testsets/en-es/tico19-test.en.gz
@ -0,0 +1 @@
+../en-es_LA/tico19-test.en.gz
--- a/testsets/en-es/tico19-test.eng.gz
+++ b/testsets/en-es/tico19-test.eng.gz
@ -0,0 +1 @@
+../en-es_LA/tico19-test.eng.gz
--- a/testsets/en-es/tico19-test.es.gz
+++ b/testsets/en-es/tico19-test.es.gz
@ -0,0 +1 @@
+../en-es_LA/tico19-test.es-LA.gz
--- a/testsets/en-es/tico19-test.spa.gz
+++ b/testsets/en-es/tico19-test.spa.gz
@ -0,0 +1 @@
+../en-es_LA/tico19-test.spa.gz
--- a/testsets/en-es_LA/tico19-test.en.gz
+++ b/testsets/en-es_LA/tico19-test.en.gz
--- a/testsets/en-es_LA/tico19-test.eng.gz
+++ b/testsets/en-es_LA/tico19-test.eng.gz
@ -0,0 +1 @@
+tico19-test.en.gz
--- a/testsets/en-es_LA/tico19-test.es-LA.gz
+++ b/testsets/en-es_LA/tico19-test.es-LA.gz
--- a/testsets/en-es_LA/tico19-test.spa.gz
+++ b/testsets/en-es_LA/tico19-test.spa.gz
@ -0,0 +1 @@
+tico19-test.es-LA.gz
--- a/testsets/en-pt/tico19-test.en.gz
+++ b/testsets/en-pt/tico19-test.en.gz
@ -0,0 +1 @@
+../en-pt_BR/tico19-test.en.gz
--- a/testsets/en-pt/tico19-test.eng.gz
+++ b/testsets/en-pt/tico19-test.eng.gz
@ -0,0 +1 @@
+../en-pt_BR/tico19-test.en.gz
--- a/testsets/en-pt/tico19-test.por.gz
+++ b/testsets/en-pt/tico19-test.por.gz
@ -0,0 +1 @@
+../en-pt_BR/tico19-test.pt-BR.gz
--- a/testsets/en-pt/tico19-test.pt.gz
+++ b/testsets/en-pt/tico19-test.pt.gz
@ -0,0 +1 @@
+../en-pt_BR/tico19-test.pt-BR.gz
--- a/testsets/en-pt_BR/tico19-test.en.gz
+++ b/testsets/en-pt_BR/tico19-test.en.gz
--- a/testsets/en-pt_BR/tico19-test.eng.gz
+++ b/testsets/en-pt_BR/tico19-test.eng.gz
@ -0,0 +1 @@
+tico19-test.en.gz
--- a/testsets/en-pt_BR/tico19-test.pob.gz
+++ b/testsets/en-pt_BR/tico19-test.pob.gz
@ -0,0 +1 @@
+tico19-test.pt-BR.gz
--- a/testsets/en-pt_BR/tico19-test.pt-BR.gz
+++ b/testsets/en-pt_BR/tico19-test.pt-BR.gz
--- a/testsets/en-ti/tico19-test.en-ti_ER.en.gz
+++ b/testsets/en-ti/tico19-test.en-ti_ER.en.gz
@ -0,0 +1 @@
+../en-ti_ER/tico19-test.en.gz
--- a/testsets/en-ti/tico19-test.en-ti_ER.eng.gz
+++ b/testsets/en-ti/tico19-test.en-ti_ER.eng.gz
@ -0,0 +1 @@
+tico19-test.en-ti_ER.en.gz
--- a/testsets/en-ti/tico19-test.en-ti_ER.ti.gz
+++ b/testsets/en-ti/tico19-test.en-ti_ER.ti.gz
@ -0,0 +1 @@
+../en-ti_ER/tico19-test.ti_ER.gz
--- a/testsets/en-ti/tico19-test.en-ti_ER.tir.gz
+++ b/testsets/en-ti/tico19-test.en-ti_ER.tir.gz
@ -0,0 +1 @@
+tico19-test.en-ti_ER.ti.gz
--- a/testsets/en-ti/tico19-test.en-ti_ET.en.gz
+++ b/testsets/en-ti/tico19-test.en-ti_ET.en.gz
@ -0,0 +1 @@
+../en-ti_ET/tico19-test.en.gz
--- a/testsets/en-ti/tico19-test.en-ti_ET.eng.gz
+++ b/testsets/en-ti/tico19-test.en-ti_ET.eng.gz
@ -0,0 +1 @@
+tico19-test.en-ti_ET.en.gz
--- a/testsets/en-ti/tico19-test.en-ti_ET.ti.gz
+++ b/testsets/en-ti/tico19-test.en-ti_ET.ti.gz
@ -0,0 +1 @@
+../en-ti_ET/tico19-test.ti_ET.gz
--- a/testsets/en-ti/tico19-test.en-ti_ET.tir.gz
+++ b/testsets/en-ti/tico19-test.en-ti_ET.tir.gz
@ -0,0 +1 @@
+tico19-test.en-ti_ET.ti.gz
--- a/testsets/eng-pob
+++ b/testsets/eng-pob
@ -0,0 +1 @@
+en-pt_BR
--- a/testsets/eng-por
+++ b/testsets/eng-por
@ -0,0 +1 @@
+en-pt
--- a/testsets/es-en/tico19-test.en.gz
+++ b/testsets/es-en/tico19-test.en.gz
@ -0,0 +1 @@
+../en-es/tico19-test.en.gz
--- a/testsets/es-en/tico19-test.eng.gz
+++ b/testsets/es-en/tico19-test.eng.gz
@ -0,0 +1 @@
+../en-es/tico19-test.eng.gz
--- a/testsets/es-en/tico19-test.es.gz
+++ b/testsets/es-en/tico19-test.es.gz
@ -0,0 +1 @@
+../en-es/tico19-test.es.gz
--- a/testsets/es-en/tico19-test.spa.gz
+++ b/testsets/es-en/tico19-test.spa.gz
@ -0,0 +1 @@
+../en-es/tico19-test.spa.gz
--- a/testsets/es_LA-en/tico19-test.en.gz
+++ b/testsets/es_LA-en/tico19-test.en.gz
@ -0,0 +1 @@
+../en-es_LA/tico19-test.en.gz
--- a/testsets/es_LA-en/tico19-test.eng.gz
+++ b/testsets/es_LA-en/tico19-test.eng.gz
@ -0,0 +1 @@
+../en-es_LA/tico19-test.eng.gz
--- a/testsets/es_LA-en/tico19-test.es-LA.gz
+++ b/testsets/es_LA-en/tico19-test.es-LA.gz
@ -0,0 +1 @@
+../en-es_LA/tico19-test.es-LA.gz
--- a/testsets/es_LA-en/tico19-test.spa.gz
+++ b/testsets/es_LA-en/tico19-test.spa.gz
@ -0,0 +1 @@
+../en-es_LA/tico19-test.spa.gz
--- a/testsets/fa-en/tico19-test.en.gz
+++ b/testsets/fa-en/tico19-test.en.gz
@ -0,0 +1 @@
+../en-fa/tico19-test.en.gz
--- a/testsets/fa-en/tico19-test.eng.gz
+++ b/testsets/fa-en/tico19-test.eng.gz
@ -0,0 +1 @@
+../en-fa/tico19-test.eng.gz
--- a/testsets/fa-en/tico19-test.fa.gz
+++ b/testsets/fa-en/tico19-test.fa.gz
@ -0,0 +1 @@
+../en-fa/tico19-test.fa.gz
--- a/testsets/fa-en/tico19-test.fas.gz
+++ b/testsets/fa-en/tico19-test.fas.gz
@ -0,0 +1 @@
+../en-fa/tico19-test.fas.gz
--- a/testsets/fas-eng/tico19-test.en.gz
+++ b/testsets/fas-eng/tico19-test.en.gz
@ -0,0 +1 @@
+../eng-fas/tico19-test.en.gz
--- a/testsets/fas-eng/tico19-test.eng.gz
+++ b/testsets/fas-eng/tico19-test.eng.gz
@ -0,0 +1 @@
+../eng-fas/tico19-test.eng.gz
--- a/testsets/fas-eng/tico19-test.fa.gz
+++ b/testsets/fas-eng/tico19-test.fa.gz
@ -0,0 +1 @@
+../eng-fas/tico19-test.fa.gz
--- a/testsets/fas-eng/tico19-test.fas.gz
+++ b/testsets/fas-eng/tico19-test.fas.gz
@ -0,0 +1 @@
+../eng-fas/tico19-test.fas.gz
--- a/testsets/fr-en/tico19-test.en.gz
+++ b/testsets/fr-en/tico19-test.en.gz
@ -0,0 +1 @@
+../en-fr/tico19-test.en.gz
--- a/testsets/fr-en/tico19-test.eng.gz
+++ b/testsets/fr-en/tico19-test.eng.gz
@ -0,0 +1 @@
+../en-fr/tico19-test.eng.gz
--- a/testsets/fr-en/tico19-test.fr.gz
+++ b/testsets/fr-en/tico19-test.fr.gz
@ -0,0 +1 @@
+../en-fr/tico19-test.fr.gz
--- a/testsets/fr-en/tico19-test.fra.gz
+++ b/testsets/fr-en/tico19-test.fra.gz
@ -0,0 +1 @@
+../en-fr/tico19-test.fra.gz
--- a/testsets/ha-en/tico19-test.en.gz
+++ b/testsets/ha-en/tico19-test.en.gz
@ -0,0 +1 @@
+../en-ha/tico19-test.en.gz
--- a/testsets/ha-en/tico19-test.eng.gz
+++ b/testsets/ha-en/tico19-test.eng.gz
@ -0,0 +1 @@
+../en-ha/tico19-test.eng.gz
--- a/testsets/ha-en/tico19-test.ha.gz
+++ b/testsets/ha-en/tico19-test.ha.gz
@ -0,0 +1 @@
+../en-ha/tico19-test.ha.gz
--- a/testsets/ha-en/tico19-test.hau.gz
+++ b/testsets/ha-en/tico19-test.hau.gz
@ -0,0 +1 @@
+../en-ha/tico19-test.hau.gz
--- a/testsets/hau-eng/tico19-test.en.gz
+++ b/testsets/hau-eng/tico19-test.en.gz
@ -0,0 +1 @@
+../eng-hau/tico19-test.en.gz
--- a/testsets/hau-eng/tico19-test.eng.gz
+++ b/testsets/hau-eng/tico19-test.eng.gz
@ -0,0 +1 @@
+../eng-hau/tico19-test.eng.gz
--- a/testsets/hau-eng/tico19-test.ha.gz
+++ b/testsets/hau-eng/tico19-test.ha.gz
@ -0,0 +1 @@
+../eng-hau/tico19-test.ha.gz
--- a/testsets/hau-eng/tico19-test.hau.gz
+++ b/testsets/hau-eng/tico19-test.hau.gz
@ -0,0 +1 @@
+../eng-hau/tico19-test.hau.gz
--- a/testsets/hi-en/tico19-test.en.gz
+++ b/testsets/hi-en/tico19-test.en.gz
@ -0,0 +1 @@
+../en-hi/tico19-test.en.gz
--- a/testsets/hi-en/tico19-test.eng.gz
+++ b/testsets/hi-en/tico19-test.eng.gz
@ -0,0 +1 @@
+../en-hi/tico19-test.eng.gz
--- a/testsets/hi-en/tico19-test.hi.gz
+++ b/testsets/hi-en/tico19-test.hi.gz
@ -0,0 +1 @@
+../en-hi/tico19-test.hi.gz
--- a/testsets/hi-en/tico19-test.hin.gz
+++ b/testsets/hi-en/tico19-test.hin.gz
@ -0,0 +1 @@
+../en-hi/tico19-test.hin.gz
--- a/testsets/id-en/tico19-test.en.gz
+++ b/testsets/id-en/tico19-test.en.gz
@ -0,0 +1 @@
+../en-id/tico19-test.en.gz
--- a/testsets/id-en/tico19-test.eng.gz
+++ b/testsets/id-en/tico19-test.eng.gz
@ -0,0 +1 @@
+../en-id/tico19-test.eng.gz
--- a/testsets/id-en/tico19-test.id.gz
+++ b/testsets/id-en/tico19-test.id.gz
@ -0,0 +1 @@
+../en-id/tico19-test.id.gz
--- a/testsets/id-en/tico19-test.msa.gz
+++ b/testsets/id-en/tico19-test.msa.gz
@ -0,0 +1 @@
+../en-id/tico19-test.msa.gz
--- a/testsets/kau-eng/tico19-test.en.gz
+++ b/testsets/kau-eng/tico19-test.en.gz
@ -0,0 +1 @@
+../eng-kau/tico19-test.en.gz
--- a/testsets/kau-eng/tico19-test.eng.gz
+++ b/testsets/kau-eng/tico19-test.eng.gz
@ -0,0 +1 @@
+../eng-kau/tico19-test.eng.gz
--- a/testsets/kau-eng/tico19-test.kau.gz
+++ b/testsets/kau-eng/tico19-test.kau.gz
@ -0,0 +1 @@
+../eng-kau/tico19-test.kau.gz
--- a/testsets/kau-eng/tico19-test.kr.gz
+++ b/testsets/kau-eng/tico19-test.kr.gz
@ -0,0 +1 @@
+../eng-kau/tico19-test.kr.gz
--- a/testsets/khm-eng/tico19-test.en.gz
+++ b/testsets/khm-eng/tico19-test.en.gz
@ -0,0 +1 @@
+../eng-khm/tico19-test.en.gz
--- a/testsets/khm-eng/tico19-test.eng.gz
+++ b/testsets/khm-eng/tico19-test.eng.gz
@ -0,0 +1 @@
+../eng-khm/tico19-test.eng.gz
--- a/Show More
+++ b/Show More