backtranslation for Tatoeba data

Joerg Tiedemann 2021-02-25 17:17:21 +02:00
parent f81a2ad638
commit 6537fdea13
290 changed files with 767 additions and 15 deletions

View File

@ -40,16 +40,20 @@ MODELHOME ?= ${MODELSDIR}/${LANGPAIR}
## we need the UTF8 sort order
## --> use bash sort and UTF8 locale
# MODELZIP = ${lastword ${sort ${wildcard ${MODELHOME}/*-20*.zip}}}
MODELZIP = ${lastword ${shell ls ${MODELHOME}/*-20*.zip | LANG=en_US.UTF-8 sort}}
MODELNAME = ${patsubst %.zip,%,${notdir ${MODELZIP}}}
ifneq (${wildcard ${MODELHOME}},)
MODELZIP = ${lastword ${shell ls ${MODELHOME}/*-20*.zip 2>/dev/null | LANG=en_US.UTF-8 sort}}
MODELNAME = ${patsubst %.zip,%,${notdir ${MODELZIP}}}
endif
ifeq (${MODELNAME},)
ifneq (${wildcard ${WORKHOME/models/${LANGPAIR}}},)
MODELHOME = ../${notdir ${WORKHOME}}/models/${LANGPAIR}
# MODELZIP = ${lastword ${sort ${wildcard ${MODELHOME}/*-20*.zip}}}
# MODELZIP = ${lastword ${sort ${wildcard ${MODELHOME}/opus-20*.zip}}}
MODELZIP = ${lastword ${shell ls ${MODELHOME}/*-20*.zip | LANG=en_US.UTF-8 sort}}
MODELZIP = ${lastword ${shell ls ${MODELHOME}/*-20*.zip 2>/dev/null | LANG=en_US.UTF-8 sort}}
MODELNAME = ${patsubst %.zip,%,${notdir ${MODELZIP}}}
endif
endif
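A side note on the UTF-8 sort comment above: GNU make's built-in sort compares plain byte values and ignores the locale, while the shell pipeline used here collates according to LANG, so the zip picked by lastword can differ. A minimal sketch with made-up zip names:
printf 'opus+bt-2021-01-15.zip\nopus-2021-02-12.zip\n' | LANG=C sort | tail -1            # byte order
printf 'opus+bt-2021-01-15.zip\nopus-2021-02-12.zip\n' | LANG=en_US.UTF-8 sort | tail -1  # locale-aware collation
Depending on the locale's collation rules, the two calls can return a different "latest" model.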
## set to 1 if the model for backtranslation is a multi-target model
## --> need to use pre-processing script differently
@ -263,7 +267,7 @@ all-wikis-all-langs: index.html
done
## aux function to pring the selected modelname
## aux function to print the selected modelname
.PHONY: print-modelname
print-modelname:
@echo ${MODELNAME}

bt-tatoeba/Makefile Normal file
View File

@ -0,0 +1,323 @@
#
# backtranslate wiki data with Tatoeba-MT challenge data
#
# only works with sentencepiece models!
#
PWD := ${shell pwd}
TOOLSDIR := ${PWD}/../tools
include ../lib/env.mk
include ../lib/config.mk
include ../lib/slurm.mk
SRC ?= fin
TRG ?= eng
# TATOEBA_STORAGE = https://object.pouta.csc.fi/Tatoeba-Challenge
TATOEBA_STORAGE = https://object.pouta.csc.fi/Tatoeba-Challenge-WikiShuffled
TATOEBA_GITRAW = https://raw.githubusercontent.com/Helsinki-NLP/Tatoeba-Challenge/master
TATOEBA_RELEASED = ${TATOEBA_GITRAW}/models/released-model-results.txt
TATOEBA_MODEL_STORAGE = https://object.pouta.csc.fi/Tatoeba-MT-models
## various sources are available
## can be general wikipedia, wikinews, wikibooks, ...
# WIKISOURCE ?= wikipedia
WIKISOURCE ?= wiki
## split size in nr-of-lines
## default part to be selected = aa
SPLIT_SIZE ?= 1000000
## maximum input length (number of sentence piece segments)
## maximum number of sentences to be translated (top N lines)
MAX_LENGTH ?= 100
MAX_SENTENCES ?= ${SPLIT_SIZE}
LANGPAIR = ${SRC}-${TRG}
PWD := $(shell pwd)
# MODELZIP = https://object.pouta.csc.fi/Tatoeba-Challenge/ang.tar
MODELZIP := ${shell wget -qq -O - ${TATOEBA_GITRAW}/models/released-model-results.txt |\
grep '^${LANGPAIR}' | head -1 | cut -f4}
MODELINFO = ${patsubst ${TATOEBA_MODEL_STORAGE}/%.zip,${TATOEBA_GITRAW}/models/%.yml,${MODELZIP}}
MODELNAME = ${patsubst %.zip,%,${notdir ${MODELZIP}}}
MULTI_TARGET_MODEL := ${shell wget -qq -O - ${MODELINFO} | grep 'use-target-labels' | wc -l}
ifneq (${MULTI_TARGET_MODEL},0)
TARGET_LANG_LABEL := ${shell wget -qq -O - ${MODELINFO} | grep -o '>>${TRG}.*<<'}
endif
RELEASED_WIKIS := $(patsubst %.tar,%,${shell wget -qq -O - ${TATOEBA_GITRAW}/Wiki.md | \
grep -o 'WikiShuffled/...\.tar' | cut -f2 -d'/'})
WIKI_DIR = ${PWD}/wiki
LANGID = ${SRC}
PART = aa
OUTPUT_DIR = ${LANGPAIR}
WIKI_TXT = ${WIKI_DIR}/${LANGID}/${WIKISOURCE}.${PART}.gz
WIKI_SRC = ${OUTPUT_DIR}/${WIKISOURCE}.${PART}_${MODELNAME}.${LANGPAIR}.${SRC}.gz
WIKI_PRE = ${OUTPUT_DIR}/${WIKISOURCE}.${PART}_${MODELNAME}.${LANGPAIR}.${SRC}.spm.gz
WIKI_TRG = ${OUTPUT_DIR}/${WIKISOURCE}.${PART}_${MODELNAME}.${LANGPAIR}.${TRG}.gz
WIKI_LATEST_SRC = ${OUTPUT_DIR}/latest/${WIKISOURCE}.${PART}.${LANGPAIR}.${SRC}.gz
WIKI_LATEST_TRG = ${OUTPUT_DIR}/latest/${WIKISOURCE}.${PART}.${LANGPAIR}.${TRG}.gz
## all parts of this wiki
PARTS = ${sort ${patsubst ${WIKI_DIR}/${LANGID}/${WIKISOURCE}.%.gz,%,\
${wildcard ${WIKI_DIR}/${LANGID}/${WIKISOURCE}.??.gz}}}
# WIKISOURCES = wiki wikibooks wikinews wikiquote wikisource wiktionary
WIKISOURCES = ${sort $(patsubst %.txt.gz,%,$(notdir ${wildcard ${WIKI_DIR}/${LANGID}/*.txt.gz})) \
$(patsubst %.${PART}.gz,%,$(notdir ${wildcard ${WIKI_DIR}/${LANGID}/*.${PART}.gz}))}
## targets for all parts of the current wiki source
ALLWIKIPARTS_TXT = ${patsubst %,${WIKI_DIR}/${LANGID}/${WIKISOURCE}.%.gz,${PARTS}}
ALLWIKIPARTS_SRC = ${patsubst %,${OUTPUT_DIR}/${WIKISOURCE}.%_${MODELNAME}.${LANGPAIR}.${SRC}.gz,${PARTS}}
ALLWIKIPARTS_PRE = ${patsubst %,${OUTPUT_DIR}/${WIKISOURCE}.%_${MODELNAME}.${LANGPAIR}.${SRC}.spm.gz,${PARTS}}
ALLWIKIPARTS_TRG = ${patsubst %,${OUTPUT_DIR}/${WIKISOURCE}.%_${MODELNAME}.${LANGPAIR}.${TRG}.gz,${PARTS}}
ALLWIKIPARTS_LATEST_SRC = ${patsubst %,${OUTPUT_DIR}/latest/${WIKISOURCE}.%.${LANGPAIR}.${SRC}.gz,${PARTS}}
ALLWIKIPARTS_LATEST_TRG = ${patsubst %,${OUTPUT_DIR}/latest/${WIKISOURCE}.%.${LANGPAIR}.${TRG}.gz,${PARTS}}
## all wiki sources for the selected part
ALLWIKIS_TXT = ${patsubst %,${WIKI_DIR}/${LANGID}/%.${PART}.gz,${WIKISOURCES}}
ALLWIKIS_SRC = ${patsubst %,${OUTPUT_DIR}/%.${PART}_${MODELNAME}.${LANGPAIR}.${SRC}.gz,${WIKISOURCES}}
ALLWIKIS_PRE = ${patsubst %,${OUTPUT_DIR}/%.${PART}_${MODELNAME}.${LANGPAIR}.${SRC}.spm.gz,${WIKISOURCES}}
ALLWIKIS_TRG = ${patsubst %,${OUTPUT_DIR}/%.${PART}_${MODELNAME}.${LANGPAIR}.${TRG}.gz,${WIKISOURCES}}
## don't delete translated text if the process crashes
.PRECIOUS: ${WIKI_TRG}
ifdef LOCAL_SCRATCH
TMPDIR = ${LOCAL_SCRATCH}
endif
ifeq (${shell hostname --domain 2>/dev/null},bullx)
LOAD_MODULES = module use -a /projappl/nlpl/software/modules/etc/ && \
module load nlpl-udpipe nlpl-opus &&
endif
.PHONY: all
all: translate
all-jobs: fetch
${MAKE} prepare-allwikis
${MAKE} translate-all-jobs
all2eng:
for w in ${filter-out eng,${RELEASED_WIKIS}}; do \
echo "make SRC=$$w TRG=eng all-jobs"; \
done
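Note that all2eng only echoes the make commands, one per released wiki language; presumably the intent is to inspect the list and then run or submit the jobs, e.g.:
make all2eng            # list the per-language commands
make all2eng | bash     # run them (assumption about the intended workflow)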
.PHONY: fetch
fetch: ${WIKI_DIR}/${SRC}
.PHONY: prepare
prepare: ${LANGPAIR}/${MODELNAME}/decoder.yml ${WIKI_TXT}
.PHONY: prepare-allwikis
prepare-allwikis: ${LANGPAIR}/${MODELNAME}/decoder.yml ${ALLWIKIS_TXT}
.PHONY: translate
translate: ${WIKI_LATEST_TRG}
${MAKE} ${WIKI_LATEST_SRC}
## translate all parts
.PHONY: translate-all-parts
translate-all-parts: ${ALLWIKIPARTS_LATEST_TRG}
${MAKE} ${ALLWIKIPARTS_LATEST_SRC}
## translate all wikis and all parts
.PHONY: translate-all
translate-all:
for s in ${WIKISOURCES}; do \
${MAKE} WIKISOURCE=$$s translate-all-parts; \
done
## create jobs for translating all parts
.PHONY: translate-all-parts-jobs
translate-all-parts-jobs: ${LANGPAIR}/${MODELNAME}/decoder.yml
for p in ${PARTS}; do \
${MAKE} PART=$$p translate.submit; \
done
## create jobs for translating all parts of all wikis
.PHONY: translate-all-jobs
translate-all-jobs: ${LANGPAIR}/${MODELNAME}/decoder.yml
for s in ${WIKISOURCES}; do \
${MAKE} WIKISOURCE=$$s translate-all-parts-jobs; \
done
.PHONY: print-modelinfo
print-modelinfo:
@echo ${MODELNAME}
@echo ${MODELZIP}
@echo ${MODELINFO}
@echo "multi-target model: ${MULTI_TARGET_MODEL}"
@echo "target language label: ${TARGET_LANG_LABEL}"
## fetch the latest model
## ---> TODO: should we fetch from ObjectStorage instead?
${LANGPAIR}/${MODELNAME}/decoder.yml:
ifneq (${MODELZIP},)
mkdir -p ${dir $@}
wget -O ${dir $@}/model.zip ${MODELZIP}
cd ${dir $@} && unzip model.zip
rm -f ${dir $@}/model.zip
mv ${dir $@}/preprocess.sh ${dir $@}/preprocess-old.sh
sed 's#perl -C -pe.*$$#perl -C -pe "s/(?!\\n)\\p{C}/ /g;" |#' \
< ${dir $@}/preprocess-old.sh > ${dir $@}/preprocess.sh
chmod +x ${dir $@}/preprocess.sh
endif
## pre-process data
ifeq (${MULTI_TARGET_MODEL},1)
PREPROCESS_ARGS = ${SRC} ${TRG} ${LANGPAIR}/${MODELNAME}/source.spm
else
PREPROCESS_ARGS = ${SRC} ${LANGPAIR}/${MODELNAME}/source.spm
endif
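For illustration, roughly how the two variants are invoked in the .spm.gz rule below, here for fin-eng and with MODELNAME and the input file name as placeholders:
# multi-target model: source and target language IDs; the script prepends the >>eng<< label
gzip -cd wiki.aa.gz | fin-eng/$MODELNAME/preprocess.sh fin eng fin-eng/$MODELNAME/source.spm
# bilingual model: only the source language ID and the SentencePiece model are needed
gzip -cd wiki.aa.gz | fin-eng/$MODELNAME/preprocess.sh fin fin-eng/$MODELNAME/source.spm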
${WIKI_DIR}/${SRC}/%.${PART}.gz: | ${WIKI_DIR}/${SRC}
${GZCAT} ${@:.${PART}.gz=.txt.gz} |\
split -l ${SPLIT_SIZE} - ${patsubst %${PART}.gz,%,$@}
gzip -f ${patsubst %${PART}.gz,%,$@}??
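A small aside on the naming: split uses two-letter suffixes starting at aa, which is why PART defaults to aa above. For example:
seq 25 | split -l 10 - chunk.
ls chunk.*          # chunk.aa  chunk.ab  chunk.ac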
${WIKI_DIR}/${SRC}/%.txt.gz: ${WIKI_DIR}/${SRC}
echo "done!"
${WIKI_DIR}/${SRC}:
mkdir -p $@
wget -O $@.tar ${TATOEBA_STORAGE}/${SRC}.tar
tar -C ${dir $@} -xf $@.tar
if [ -d ${WIKI_DIR}/data/${SRC} ]; then \
mv ${WIKI_DIR}/data/${SRC}/*.txt.gz $@/;\
rm -f ${WIKI_DIR}/data/${SRC}/*;\
rmdir ${WIKI_DIR}/data/${SRC};\
rmdir ${WIKI_DIR}/data;\
fi
if [ -e ${WIKI_DIR}/${SRC}/wiki.${SRC}.aa.gz ]; then \
for f in `ls ${WIKI_DIR}/${SRC}/*.${SRC}.*`; do \
mv $$f `echo $$f | sed 's/\.${SRC}\././'`; \
done \
fi
rm -f $@.tar
${OUTPUT_DIR}/%.${PART}_${MODELNAME}.${LANGPAIR}.${SRC}.spm.gz: ${WIKI_DIR}/${SRC}/%.${PART}.gz
ifneq (${MODELZIP},)
mkdir -p ${dir $@}
${MAKE} ${LANGPAIR}/${MODELNAME}/decoder.yml
${GZCAT} $< |\
grep -v '[<>{}]' |\
${LANGPAIR}/${MODELNAME}/preprocess.sh ${PREPROCESS_ARGS} |\
perl -e 'while (<>){next if (split(/\s+/)>${MAX_LENGTH});print;}' |\
gzip -f > $@
endif
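The perl one-liner above implements the MAX_LENGTH cut-off: lines with more whitespace-separated SentencePiece segments than MAX_LENGTH are dropped. A quick illustration with a limit of 4:
printf 'a b c\na b c d e\n' | perl -e 'while (<>){next if (split(/\s+/)>4);print;}'
# prints only "a b c"; the five-segment line is skipped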
## merge SentencePiece segments in the source text
## (Why? because we filter out some data from the original wiki text, see above)
${WIKI_SRC}: ${WIKI_PRE}
ifneq (${MODELZIP},)
mkdir -p ${dir $@}
${GZCAT} $< |\
sed 's/ //g;s/▁/ /g' | \
sed 's/^ *//;s/ *$$//' |\
sed 's/^>>[a-z]*<< //' |\
gzip -c > $@
endif
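For illustration, the effect of the sed pipeline above on one SentencePiece-encoded line (the last sed strips the target-language label that multi-target models require):
echo '>>eng<< ▁Hyvää ▁päivää !' | sed 's/ //g;s/▁/ /g' | sed 's/^ *//;s/ *$//' | sed 's/^>>[a-z]*<< //'
# -> Hyvää päivää!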
## overwrite the file with the latest translations
## --> this allows multiple translation iterations
## without duplicating the data we want to use in MT training
${WIKI_LATEST_SRC}: ${WIKI_SRC}
mkdir -p ${dir $@}
cp $< $@
${WIKI_LATEST_TRG}: ${WIKI_TRG}
mkdir -p ${dir $@}
cp $< $@
## translate
%.${LANGPAIR}.${TRG}.gz: %.${LANGPAIR}.${SRC}.spm.gz
ifneq (${MODELZIP},)
mkdir -p ${dir $@}
${MAKE} ${LANGPAIR}/${MODELNAME}/decoder.yml
${LOADMODS} && cd ${LANGPAIR}/${MODELNAME} && ${MARIAN_DECODER} \
-i ${PWD}/$< \
-c decoder.yml \
-d ${MARIAN_GPUS} \
${MARIAN_DECODER_FLAGS} |\
sed 's/ //g;s/▁/ /g' | sed 's/^ *//;s/ *$$//' |\
gzip -c > ${PWD}/$@
#ifneq (${LANGPAIR},)
#ifneq (${MODELNAME},)
# rm -fr ${LANGPAIR}/${MODELNAME}
#endif
#endif
endif
check-length:
for d in `find . -maxdepth 1 -type d -name '*-*' -printf "%f "`; do \
s=`echo $$d | cut -f1 -d'-'`; \
t=`echo $$d | cut -f2 -d'-'`; \
echo "check $$d"; \
for S in `ls $$d/*.$$s.gz`; do \
T=`echo $$S | sed 's/.$$s.gz/.$$t.gz/'`; \
echo "$$S -- $$T"; \
${GZCAT} $$S | wc -l; \
${GZCAT} $$T | wc -l; \
if [ `${GZCAT} $$S | wc -l` != `${GZCAT} $$T | wc -l` ]; then \
echo "$$S != $$T"; \
fi \
done \
done

View File

@ -36,6 +36,46 @@ tatoeba-memad-bilingual:
tatoeba-memad-dist:
${MAKE} TRGLANGS="${MEMAD_LANGS3}" SRCLANGS="eng" \
MODELTYPE=transformer-align \
tatoeba-multilingual-eval-1m compare-tatoeba-1m eval-testsets-tatoeba-1m
${MAKE} TRGLANGS="${MEMAD_LANGS3}" SRCLANGS="eng" \
TATOEBA_RELEASEDIR=models-memad \
TATOEBA_MODELSHOME=models-memad \
MODELTYPE=transformer-align release-tatoeba-1m
${MAKE} SRCLANGS="${MEMAD_LANGS3}" TRGLANGS="eng" \
MODELTYPE=transformer-align \
tatoeba-multilingual-eval-1m compare-tatoeba-1m eval-testsets-tatoeba-1m
${MAKE} SRCLANGS="${MEMAD_LANGS3}" TRGLANGS="eng" \
TATOEBA_RELEASEDIR=models-memad \
TATOEBA_MODELSHOME=models-memad \
MODELTYPE=transformer-align release-tatoeba-1m
${MAKE} SRCLANGS="${MEMAD_LANGS3}" TRGLANGS="${MEMAD_LANGS3}" \
SKIP_LANGPAIRS="deu-deu|eng-eng|fin-fin|fra-fra|nld-nld|swe-swe" \
MODELTYPE=transformer-align \
tatoeba-multilingual-eval-1m compare-tatoeba-1m eval-testsets-tatoeba-1m
${MAKE} SRCLANGS="${MEMAD_LANGS3}" TRGLANGS="${MEMAD_LANGS3}" \
SKIP_LANGPAIRS="deu-deu|eng-eng|fin-fin|fra-fra|nld-nld|swe-swe" \
TATOEBA_RELEASEDIR=models-memad \
TATOEBA_MODELSHOME=models-memad \
MODELTYPE=transformer-align release-tatoeba-1m
@for s in ${MEMAD_LANGS3}; do \
for t in ${MEMAD_LANGS3}; do \
if [ "$$s" != "$$t" ]; then \
${MAKE} SRCLANGS=$$s TRGLANGS=$$t \
MODELTYPE=transformer-align \
tatoeba-multilingual-eval compare-tatoeba eval-testsets-tatoeba; \
${MAKE} SRCLANGS=$$s TRGLANGS=$$t \
TATOEBA_RELEASEDIR=models-memad \
TATOEBA_MODELSHOME=models-memad \
MODELTYPE=transformer-align release-tatoeba; \
fi \
done \
done
#----------------------------------------------------------------
# fine-tuning on YLE subtitle data
#----------------------------------------------------------------
@ -50,12 +90,22 @@ MEMAD_SUBTYPE = FIN-SWE
MEMAD_LANGPAIR = fin2swe
MEMAD_TUNETASK = tune
tatoeba-yletune-all: tatoeba-yletune-finswe-all tatoeba-yletune-swefin-all
tatoeba-yletune-finswe-all: tatoeba-yletune-finswe tatoeba-yletune-fihswe \
tatoeba-yletune-finswh tatoeba-yletune-fihswh tatoeba-yletune-fisw
tatoeba-yletune-swefin-all: tatoeba-yletune-swefin tatoeba-yletune-swefih \
tatoeba-yletune-swhfin tatoeba-yletune-swhfih tatoeba-yletune-swfi
tatoeba-yleeval-all:
${MAKE} MEMAD_TUNETASK=tuneeval tatoeba-yletune-all
tatoeba-yledist-all:
${MAKE} MEMAD_TUNETASK=tunedist \
TATOEBA_RELEASEDIR=models-memad-tuned \
TATOEBA_MODELSHOME=models-memad-tuned \
tatoeba-yletune-all
tatoeba-yletune-finswe:
${MAKE} MEMAD_SUBTYPE=FIN-SWE MEMAD_LANGPAIR=fin2swe tatoeba-yletune

View File

@ -116,7 +116,8 @@ TATOEBA_DEVSET = Tatoeba-dev
TATOEBA_TESTSET = Tatoeba-test
TATOEBA_DEVSET_NAME = Tatoeba-dev
TATOEBA_TESTSET_NAME = Tatoeba-test
TATOEBA_RELEASEDIR = ${PWD}/models-tatoeba
TATOEBA_MODELSHOME = ${PWD}/models-tatoeba
TATOEBA_PARAMS := TRAINSET=${TATOEBA_TRAINSET} \
DEVSET=${TATOEBA_DEVSET} \
@ -130,8 +131,8 @@ TATOEBA_PARAMS := TRAINSET=${TATOEBA_TRAINSET} \
TESTSIZE=10000 \
DEVMINSIZE=200 \
WORKHOME=${TATOEBA_WORK} \
MODELSHOME=${PWD}/models-tatoeba \
RELEASEDIR=${PWD}/models-tatoeba \
MODELSHOME=${TATOEBA_MODELSHOME} \
RELEASEDIR=${TATOEBA_RELEASEDIR} \
MODELS_URL=https://object.pouta.csc.fi/${TATOEBA_MODEL_CONTAINER} \
MODEL_CONTAINER=${TATOEBA_MODEL_CONTAINER} \
ALT_MODEL_DIR=tatoeba \
@ -407,6 +408,40 @@ tatoeba-wiki2eng-macro:
tatoeba-print-missing-wiki:
@echo $(filter-out ${WIKILANGS},${WIKIMACROLANGS})
tatoeba-wiki2eng-parent:
for l in ${WIKIMACROLANGS}; do \
if [ ! `find work-tatoeba/$$l-eng -name '*.done' 2>/dev/null | wc -l` -gt 0 ]; then \
echo "check $$l-eng"; \
if [ `find work-tatoeba/$$l-eng/train -name 'opus.src.clean.spm*.gz' 2>/dev/null | wc -l` -gt 0 ]; then \
echo "check data size of $$l-eng"; \
if [ `find work-tatoeba/$$l-eng/train -name 'opus.src.clean.spm*.gz' 2>/dev/null | xargs zcat | head -100000 | wc -l` -lt 100000 ]; then \
p=`langgroup -p $$l`; \
echo "${MAKE} SRCLANGS=$$p TRGLANGS=eng tatoeba-$${p}2eng-train-1m"; \
fi \
fi \
fi \
done
tatoeba-wiki2eng-done:
for l in ${WIKIMACROLANGS}; do \
if [ `find models-tatoeba/$$l-eng -name '*.zip' 2>/dev/null | wc -l` -gt 0 ]; then \
echo "model available for $$l-eng"; \
elif [ `find work-tatoeba/$$l-eng -name '*.done' 2>/dev/null | wc -l` -gt 0 ]; then \
echo -n "model aivailbale for $$l-eng but not released"; \
if [ `find work-tatoeba/$$l-eng -name '*.eval' 2>/dev/null | wc -l` -gt 0 ]; then \
echo -n ", BLEU = "; \
grep BLEU work-tatoeba/$$l-eng/*eval | head -1 | cut -f3 -d' '; \
elif [ ! -e work-tatoeba/$$l-eng/test/Tatoeba-test.src ]; then \
echo ", missing eval file"; \
echo "make TATOEBA_WORK=work-tatoeba-tmp SRCLANGS=$$l TRGLANGS=eng data-tatoeba"; \
else \
echo ", run 'make tatoeba-$${l}2eng-evalall'"; \
fi \
fi \
done
###########################################################################################
# language groups
###########################################################################################
@ -1266,7 +1301,7 @@ KEEP_LANGIDS = bos_Cyrl cmn cnr cnr_Latn csb diq dnj dty fas fqs ful fur
nor nor_Latn oss_Latn pan plt pnb_Guru pob prs qug quw quy quz qvi rmn rmy ruk san swa swc \
syr syr_Syrc tgk_Latn thy tlh tmh toi tuk_Cyrl urd_Deva xal_Latn yid_Latn zho zlm
SKIP_LANGIDS = ${filter-out ${KEEP_LANGIDS},${TRAIN_ONLY_LANGIDS}} \
ang ara_Latn aze_Latn bul_Latn ell_Latn heb_Latn rus_Latn
ang ara_Latn bul_Latn ell_Latn heb_Latn rus_Latn
SKIP_LANGIDS_PATTERN = ^\(${subst ${SPACE},\|,${SKIP_LANGIDS}}\)$$
## modify language IDs in training data to adjust them to test sets
@ -1279,7 +1314,7 @@ FIXLANGIDS = | sed 's/zho\(.*\)_HK/yue\1/g;s/zho\(.*\)_CN/cmn\1/g;s/zho\(.*\)_T
| sed 's/\_[A-Z][A-Z]//g' \
| sed 's/\-[a-z]*//g' \
| sed 's/jpn_[A-Za-z]*/jpn/g' \
| sed 's/ara_Latn/ara/;s/arq_Latn/arq/;s/apc_Latn/apc/' \
| sed 's/ara_Latn/ara/;s/arq_Latn/arq/;' \
| sed 's/kor_[A-Za-z]*/kor/g' \
| sed 's/nor_Latn/nor/g' \
| sed 's/nor/nob/g' \
@ -1336,6 +1371,7 @@ ${TATOEBA_MONO}/%.labels:
-tar -C $@.d -xf $@.d/train.tar
rm -f $@.d/train.tar
if [ -e $@.d/data/${LANGPAIR}/test.src ]; then \
echo "........ move test files to ${dir $@}Tatoeba-test.${LANGPAIR}.clean.*"; \
mv $@.d/data/${LANGPAIR}/test.src ${dir $@}Tatoeba-test.${LANGPAIR}.clean.${SRCEXT}; \
mv $@.d/data/${LANGPAIR}/test.trg ${dir $@}Tatoeba-test.${LANGPAIR}.clean.${TRGEXT}; \
cat $@.d/data/${LANGPAIR}/test.id $(FIXLANGIDS) > ${dir $@}Tatoeba-test.${LANGPAIR}.clean.id; \
@ -1400,11 +1436,13 @@ ${TATOEBA_MONO}/%.labels:
echo "extract $$s-$$t data"; \
for d in dev test train; do \
if [ -e ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.id ]; then \
echo "........ make ${dir $@}Tatoeba-$$d.$$s-$$t"; \
paste ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.id \
${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${SRCEXT} \
${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${TRGEXT} |\
grep -P "$$s\t$$t\t" > ${dir $@}Tatoeba-$$d.$$s-$$t; \
if [ -s ${dir $@}Tatoeba-$$d.$$s-$$t ]; then \
echo "........ compress to ${dir $@}Tatoeba-$$d.$$s-$$t.clean.*.gz"; \
cut -f3 ${dir $@}Tatoeba-$$d.$$s-$$t | ${GZIP} -c > ${dir $@}Tatoeba-$$d.$$s-$$t.clean.$$s.gz; \
cut -f4 ${dir $@}Tatoeba-$$d.$$s-$$t | ${GZIP} -c > ${dir $@}Tatoeba-$$d.$$s-$$t.clean.$$t.gz; \
fi; \
@ -1415,11 +1453,13 @@ ${TATOEBA_MONO}/%.labels:
echo "extract $$t-$$s data"; \
for d in dev test train; do \
if [ -e ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.id ]; then \
echo "........ make ${dir $@}Tatoeba-$$d.$$t-$$s"; \
paste ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.id \
${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${TRGEXT} \
${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${SRCEXT} |\
grep -P "$$s\t$$t\t" > ${dir $@}Tatoeba-$$d.$$t-$$s; \
if [ -s ${dir $@}Tatoeba-$$d.$$t-$$s ]; then \
echo "........ compress to ${dir $@}Tatoeba-$$d.$$t-$$s.clean.*.gz"; \
cut -f3 ${dir $@}Tatoeba-$$d.$$t-$$s | ${GZIP} -c > ${dir $@}Tatoeba-$$d.$$t-$$s.clean.$$t.gz; \
cut -f4 ${dir $@}Tatoeba-$$d.$$t-$$s | ${GZIP} -c > ${dir $@}Tatoeba-$$d.$$t-$$s.clean.$$s.gz; \
fi; \
@ -1439,6 +1479,7 @@ ${TATOEBA_MONO}/%.labels:
for d in dev test train; do \
if [ -e ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${SRCEXT} ]; then \
if [ ! -e ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${SRCEXT}.gz ]; then \
echo "........... compress ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${SRCEXT}"; \
${GZIP} ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${SRCEXT}; \
else \
rm -f ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${SRCEXT}; \
@ -1446,6 +1487,7 @@ ${TATOEBA_MONO}/%.labels:
fi; \
if [ -e ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${TRGEXT} ]; then \
if [ ! -e ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${TRGEXT}.gz ]; then \
echo "........... compress ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${TRGEXT}"; \
${GZIP} ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${TRGEXT}; \
else \
rm -f ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${TRGEXT}; \
@ -1482,6 +1524,7 @@ ${TATOEBA_MONO}/%.labels:
test-tune-data:
make SRCEXT=bre TRGEXT=eng LANGPAIR=bre-eng \
work-tatoeba-test/data/simple/Tatoeba-OpenSubtitles-train.bre-eng.clean.bre.gz
@ -1964,3 +2007,30 @@ fixlabels.sh:
fi \
fi \
done
tatoeba-missing-test:
for d in `find work-tatoeba/ -maxdepth 1 -type d -name '???-???' | cut -f2 -d/`; do \
if [ ! -e work-tatoeba/$$d/test/Tatoeba-test.src ]; then \
if [ `find work-tatoeba/$$d/train -name '*-model' | wc -l` -gt 0 ]; then \
p=`echo $$d | sed 's/-/2/'`; \
echo "missing eval file for $$d"; \
mkdir -p work-tatoeba-tmp/$$d/train; \
rsync -av work-tatoeba/$$d/train/*model* work-tatoeba-tmp/$$d/train/; \
make FIT_DATA_SIZE=1000 LANGGROUP_FIT_DATA_SIZE=1000 TATOEBA_WORK=work-tatoeba-tmp tatoeba-$$p-data; \
cp work-tatoeba-tmp/$$d/test/Tatoeba-test.* work-tatoeba/$$d/test/; \
rm -fr work-tatoeba-tmp/$$d; \
fi \
fi \
done
tatoeba-touch-test:
for d in `find work-tatoeba/ -maxdepth 1 -type d -name '???-???' | cut -f2 -d/`; do \
if [ -e work-tatoeba/$$d/test/Tatoeba-test.src ]; then \
if [ -e work-tatoeba/$$d/val/Tatoeba-dev.src ]; then \
touch -r work-tatoeba/$$d/val/Tatoeba-dev.src work-tatoeba/$$d/test/Tatoeba-test.src*; \
touch -r work-tatoeba/$$d/val/Tatoeba-dev.src work-tatoeba/$$d/test/Tatoeba-test.trg*; \
fi \
fi \
done

View File

@ -61,9 +61,9 @@ endif
%.submitcpu:
mkdir -p ${WORKDIR}
echo '#!/bin/bash -l' > $@
echo '#SBATCH -J "${LANGPAIRSTR}${@:.submitcpu=}"' >>$@
echo '#SBATCH -o ${LANGPAIRSTR}${@:.submitcpu=}.out.%j' >> $@
echo '#SBATCH -e ${LANGPAIRSTR}${@:.submitcpu=}.err.%j' >> $@
echo '#SBATCH -J "$(subst -,,${LANGPAIRSTR})${@:.submitcpu=}"' >>$@
echo '#SBATCH -o $(subst -,,${LANGPAIRSTR})${@:.submitcpu=}.out.%j' >> $@
echo '#SBATCH -e $(subst -,,${LANGPAIRSTR})${@:.submitcpu=}.err.%j' >> $@
echo '#SBATCH --mem=${HPC_MEM}' >> $@
ifdef EMAIL
echo '#SBATCH --mail-type=END' >> $@

View File

@ -2,8 +2,15 @@
#
# USAGE preprocess.sh source-langid target-langid spmodel [noflags] < input > output
#
#
# replace SPMENCODE with your own setup!
#
# CHANGES
#
# * issue with perl code that removes control characters
# (unicode property Other = \p{C}) seems to remove
# newline characters as well --> add negative lookahead
# to avoid removing newline characters!
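A quick shell illustration of the issue described above: with perl -p the trailing newline is part of each input line, so \p{C} replaces it and glues lines together, while the negative lookahead leaves it alone:
printf 'one\ntwo\n' | perl -C -pe 's/\p{C}/ /g;'        # newlines become spaces: output is "one two " on a single line
printf 'one\ntwo\n' | perl -C -pe 's/(?!\n)\p{C}/ /g;'  # newlines survive; only other control characters are replaced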
if [ `hostname -d` == "bullx" ]; then
APPLHOME=/projappl/project_2001569
@ -90,7 +97,7 @@ else
-e 's/【/\[/g' \
-e 's/】/\]/g' \
-e 's/％/\%/g' |
perl -C -pe 's/\p{C}/ /g;' |
perl -C -pe 's/(?!\n)\p{C}/ /g;' |
sed 's/ */ /g;s/^ *//g;s/ *$//g' |
${SPMENCODE} --model $3 |
sed "s/^/>>$2<< /"

View File

@ -3,6 +3,14 @@
# USAGE preprocess.sh langid spmodel < input > output
#
# replace SPMENCODE with your own setup!
#
# CHANGES
#
# * issue with perl code that removes control characters
# (unicode property Other = \p{C}) seems to remove
# newline characters as well --> add negative lookahead
# to avoid removing newline characters!
#
if [ `hostname -d` == "bullx" ]; then
APPLHOME=/projappl/project_2001569
@ -49,7 +57,7 @@ sed -e 's/，/,/g' \
-e 's/【/\[/g' \
-e 's/】/\]/g' \
-e 's/％/\%/g' |
perl -C -pe 's/\p{C}/ /g;' |
perl -C -pe 's/(?!\n)\p{C}/ /g;' |
sed 's/ */ /g;s/^ *//g;s/ *$//g' |
${SPMENCODE} --model $2

View File

@ -30,6 +30,18 @@ ${TICO19_TEST}: %/tico19-test.en.gz: tico19-testset/test/test.%.tsv
cut -f4 $< | tail -n +2 | sed 's/^ *//;s/ *$$//' | gzip -c > ${@:en.gz=${patsubst en-%/,%,$(dir $@)}}.gz
TICODATADIRS = $(sort $(subst /,,${dir ${wildcard */tico19-test.*}}))
crosslink-tico:
-for d in ${TICODATADIRS}; do \
s=`echo "$$d" | cut -f1 -d'-'`; \
t=`echo "$$d" | cut -f2 -d'-'`; \
mkdir -p $$t-$$s; \
cd $$t-$$s; \
ln -s ../$$d/tico19* .; \
cd ..; \
done

View File

@ -0,0 +1 @@
../en-am/tico19-test.am.gz

View File

@ -0,0 +1 @@
../en-am/tico19-test.amh.gz

View File

@ -0,0 +1 @@
../en-am/tico19-test.en.gz

View File

@ -0,0 +1 @@
../en-am/tico19-test.eng.gz

View File

@ -0,0 +1 @@
../eng-amh/tico19-test.am.gz

View File

@ -0,0 +1 @@
../eng-amh/tico19-test.amh.gz

View File

@ -0,0 +1 @@
../eng-amh/tico19-test.en.gz

View File

@ -0,0 +1 @@
../eng-amh/tico19-test.eng.gz

View File

@ -0,0 +1 @@
../en-ar/tico19-test.ar.gz

View File

@ -0,0 +1 @@
../en-ar/tico19-test.ara.gz

View File

@ -0,0 +1 @@
../en-ar/tico19-test.en.gz

View File

@ -0,0 +1 @@
../en-ar/tico19-test.eng.gz

View File

@ -0,0 +1 @@
../eng-ara/tico19-test.ar.gz

View File

@ -0,0 +1 @@
../eng-ara/tico19-test.ara.gz

View File

@ -0,0 +1 @@
../eng-ara/tico19-test.en.gz

View File

@ -0,0 +1 @@
../eng-ara/tico19-test.eng.gz

View File

@ -0,0 +1 @@
../eng-ben/tico19-test.ben.gz

View File

@ -0,0 +1 @@
../eng-ben/tico19-test.bn.gz

View File

@ -0,0 +1 @@
../eng-ben/tico19-test.en.gz

View File

@ -0,0 +1 @@
../eng-ben/tico19-test.eng.gz

View File

@ -0,0 +1 @@
../en-bn/tico19-test.ben.gz

View File

@ -0,0 +1 @@
../en-bn/tico19-test.bn.gz

View File

@ -0,0 +1 @@
../en-bn/tico19-test.en.gz

View File

@ -0,0 +1 @@
../en-bn/tico19-test.eng.gz

View File

@ -0,0 +1 @@
../en-es_LA/tico19-test.en.gz

View File

@ -0,0 +1 @@
../en-es_LA/tico19-test.eng.gz

View File

@ -0,0 +1 @@
../en-es_LA/tico19-test.es-LA.gz

View File

@ -0,0 +1 @@
../en-es_LA/tico19-test.spa.gz

View File

@ -0,0 +1 @@
tico19-test.en.gz

View File

@ -0,0 +1 @@
tico19-test.es-LA.gz

View File

@ -0,0 +1 @@
../en-pt_BR/tico19-test.en.gz

View File

@ -0,0 +1 @@
../en-pt_BR/tico19-test.en.gz

View File

@ -0,0 +1 @@
../en-pt_BR/tico19-test.pt-BR.gz

View File

@ -0,0 +1 @@
../en-pt_BR/tico19-test.pt-BR.gz

View File

@ -0,0 +1 @@
tico19-test.en.gz

View File

@ -0,0 +1 @@
tico19-test.pt-BR.gz

View File

@ -0,0 +1 @@
../en-ti_ER/tico19-test.en.gz

View File

@ -0,0 +1 @@
tico19-test.en-ti_ER.en.gz

View File

@ -0,0 +1 @@
../en-ti_ER/tico19-test.ti_ER.gz

View File

@ -0,0 +1 @@
tico19-test.en-ti_ER.ti.gz

View File

@ -0,0 +1 @@
../en-ti_ET/tico19-test.en.gz

View File

@ -0,0 +1 @@
tico19-test.en-ti_ET.en.gz

View File

@ -0,0 +1 @@
../en-ti_ET/tico19-test.ti_ET.gz

View File

@ -0,0 +1 @@
tico19-test.en-ti_ET.ti.gz

testsets/eng-pob Symbolic link
View File

@ -0,0 +1 @@
en-pt_BR

testsets/eng-por Symbolic link
View File

@ -0,0 +1 @@
en-pt

View File

@ -0,0 +1 @@
../en-es/tico19-test.en.gz

View File

@ -0,0 +1 @@
../en-es/tico19-test.eng.gz

View File

@ -0,0 +1 @@
../en-es/tico19-test.es.gz

View File

@ -0,0 +1 @@
../en-es/tico19-test.spa.gz

View File

@ -0,0 +1 @@
../en-es_LA/tico19-test.en.gz

View File

@ -0,0 +1 @@
../en-es_LA/tico19-test.eng.gz

View File

@ -0,0 +1 @@
../en-es_LA/tico19-test.es-LA.gz

View File

@ -0,0 +1 @@
../en-es_LA/tico19-test.spa.gz

View File

@ -0,0 +1 @@
../en-fa/tico19-test.en.gz

View File

@ -0,0 +1 @@
../en-fa/tico19-test.eng.gz

View File

@ -0,0 +1 @@
../en-fa/tico19-test.fa.gz

View File

@ -0,0 +1 @@
../en-fa/tico19-test.fas.gz

View File

@ -0,0 +1 @@
../eng-fas/tico19-test.en.gz

View File

@ -0,0 +1 @@
../eng-fas/tico19-test.eng.gz

View File

@ -0,0 +1 @@
../eng-fas/tico19-test.fa.gz

View File

@ -0,0 +1 @@
../eng-fas/tico19-test.fas.gz

View File

@ -0,0 +1 @@
../en-fr/tico19-test.en.gz

View File

@ -0,0 +1 @@
../en-fr/tico19-test.eng.gz

View File

@ -0,0 +1 @@
../en-fr/tico19-test.fr.gz

View File

@ -0,0 +1 @@
../en-fr/tico19-test.fra.gz

View File

@ -0,0 +1 @@
../en-ha/tico19-test.en.gz

View File

@ -0,0 +1 @@
../en-ha/tico19-test.eng.gz

View File

@ -0,0 +1 @@
../en-ha/tico19-test.ha.gz

View File

@ -0,0 +1 @@
../en-ha/tico19-test.hau.gz

View File

@ -0,0 +1 @@
../eng-hau/tico19-test.en.gz

View File

@ -0,0 +1 @@
../eng-hau/tico19-test.eng.gz

View File

@ -0,0 +1 @@
../eng-hau/tico19-test.ha.gz

View File

@ -0,0 +1 @@
../eng-hau/tico19-test.hau.gz

View File

@ -0,0 +1 @@
../en-hi/tico19-test.en.gz

View File

@ -0,0 +1 @@
../en-hi/tico19-test.eng.gz

View File

@ -0,0 +1 @@
../en-hi/tico19-test.hi.gz

View File

@ -0,0 +1 @@
../en-hi/tico19-test.hin.gz

View File

@ -0,0 +1 @@
../en-id/tico19-test.en.gz

View File

@ -0,0 +1 @@
../en-id/tico19-test.eng.gz

View File

@ -0,0 +1 @@
../en-id/tico19-test.id.gz

View File

@ -0,0 +1 @@
../en-id/tico19-test.msa.gz

View File

@ -0,0 +1 @@
../eng-kau/tico19-test.en.gz

View File

@ -0,0 +1 @@
../eng-kau/tico19-test.eng.gz

View File

@ -0,0 +1 @@
../eng-kau/tico19-test.kau.gz

View File

@ -0,0 +1 @@
../eng-kau/tico19-test.kr.gz

View File

@ -0,0 +1 @@
../eng-khm/tico19-test.en.gz

View File

@ -0,0 +1 @@
../eng-khm/tico19-test.eng.gz

Some files were not shown because too many files have changed in this diff.