added recipes for tatoeba models other than English

Joerg Tiedemann 2021-05-04 08:49:16 +03:00
parent cde8f0d0af
commit 200662863e
13 changed files with 971 additions and 308 deletions


@ -523,17 +523,17 @@ endif
@echo "" >> ${dir ${DEV_SRC}}/README.md
@echo -n "* devset-selected: top " >> ${dir ${DEV_SRC}}/README.md
@wc -l < ${DEV_SRC} | tr "\n" ' ' >> ${dir ${DEV_SRC}}/README.md
@echo " lines of ${notdir $@}.shuffled!" >> ${dir ${DEV_SRC}}/README.md
@echo " lines of ${notdir $@}.shuffled" >> ${dir ${DEV_SRC}}/README.md
ifeq (${DEVSET},${TESTSET})
@echo -n "* testset-selected: next " >> ${dir ${DEV_SRC}}/README.md
@wc -l < ${TEST_SRC} | tr "\n" ' ' >> ${dir ${DEV_SRC}}/README.md
@echo " lines of ${notdir $@}.shuffled!" >> ${dir ${DEV_SRC}}/README.md
@echo " lines of ${notdir $@}.shuffled " >> ${dir ${DEV_SRC}}/README.md
@echo "* devset-unused: added to traindata" >> ${dir ${DEV_SRC}}/README.md
@echo "# Test data" > ${dir ${TEST_SRC}}/README.md
@echo "" >> ${dir ${TEST_SRC}}/README.md
@echo -n "testset-selected: next " >> ${dir ${TEST_SRC}}/README.md
@wc -l < ${TEST_SRC} | tr "\n" ' ' >> ${dir ${TEST_SRC}}/README.md
@echo " lines of ../val/${notdir $@}.shuffled!" >> ${dir ${TEST_SRC}}/README.md
@echo " lines of ../val/${notdir $@}.shuffled" >> ${dir ${TEST_SRC}}/README.md
endif


@ -34,6 +34,8 @@ get-model-distro = ${shell echo ${wildcard ${1}/${2}/*.zip} | tr ' ' "\n" | LAN
find-model:
@echo ${call get-model-dist,${LANGPAIRSTR}}
@ -42,7 +44,12 @@ find-model:
MIN_BLEU_SCORE = 20
.PHONY: dist local-dist global-dist release
dist: ${DIST_PACKAGE}
## create a symbolic link to the latest model
## and make the package
dist:
${MAKE} link-latest-model
${MAKE} ${DIST_PACKAGE}
## local distribution in workhome, no restrictions about BLEU
local-dist:
@ -206,9 +213,9 @@ RAWTRGLANGS = ${sort ${basename ${basename ${subst _,.,${subst -,.,${TRGLANGS}}}
## advantage: list all labels that are valid in the model
## disadvantage: can be misleading because we may have labels that are not trained
##
LANGUAGELABELS = ${shell grep '">>.*<<"' ${MODEL_SRCVOCAB} | cut -f1 -d: | sed 's/"//g'}
LANGUAGELABELSRAW = ${shell echo "${LANGUAGELABELS}" | sed 's/>>//g;s/<<//g'}
LANGUAGELABELS = ${shell grep '">>.*<<"' ${MODEL_SRCVOCAB} | cut -f1 -d: | sed 's/"//g'}
LANGUAGELABELSRAW = ${shell echo "${LANGUAGELABELS}" | sed 's/>>//g;s/<<//g'}
LANGUAGELABELSUSED = $(filter ${TRGLANGS},${LANGUAGELABELSRAW})
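##
## example of what the pipeline above extracts (vocab file name and entries are
## illustrative, not taken from a specific model): a Marian yml vocab may contain
## entries such as
##   ">>fin<<": 7
##   ">>swe<<": 8
## from which the grep/cut/sed pipeline yields
##   LANGUAGELABELS    = >>fin<< >>swe<<
##   LANGUAGELABELSRAW = fin swe
##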
model-yml: ${MODEL_YML}
@ -249,24 +256,15 @@ ${MODEL_YML}: ${MODEL_FINAL}
@echo "dataset-name: $(DATASET)" >> $@
@echo "modeltype: $(MODELTYPE)" >> $@
@echo "vocabulary:" >> $@
@echo " source: ${notdir ${MODEL_SRCVOCAB}}" >> $@
@echo " target: ${notdir ${MODEL_TRGVOCAB}}" >> $@
@echo " source: ${notdir ${MODEL_SRCVOCAB}}" >> $@
@echo " target: ${notdir ${MODEL_TRGVOCAB}}" >> $@
@echo "pre-processing: ${PREPROCESS_DESCRIPTION}" >> $@
@echo "subwords:" >> $@
@echo " source: ${PRE_SRC}" >> $@
@echo " target: ${PRE_TRG}" >> $@
@echo " source: ${PRE_SRC}" >> $@
@echo " target: ${PRE_TRG}" >> $@
@echo "subword-models:" >> $@
@echo " source: source.${SUBWORD_TYPE}" >> $@
@echo " target: target.${SUBWORD_TYPE}" >> $@
ifdef USE_TARGET_LABELS
@echo "use-target-labels:" >> $@
@for t in ${LANGUAGELABELSRAW}; do \
echo " - \">>$$t<<\"" >> $@; \
done
# @for t in ${TRGLANGS}; do \
# echo " - '>>$$t<<'" >> $@; \
# done
endif
@echo " source: source.${SUBWORD_TYPE}" >> $@
@echo " target: target.${SUBWORD_TYPE}" >> $@
@echo "source-languages:" >> $@
@for s in ${RAWSRCLANGS}; do\
echo " - $$s" >> $@; \
@ -275,17 +273,26 @@ endif
@for t in ${RAWTRGLANGS}; do\
echo " - $$t" >> $@; \
done
ifdef USE_TARGET_LABELS
@echo "use-target-labels:" >> $@
@for t in ${LANGUAGELABELSUSED}; do \
echo " - \">>$$t<<\"" >> $@; \
done
endif
ifneq ("$(wildcard ${WORKDIR}/train/README.md)","")
@echo "training-data:" >> $@
@tr "\n" "~" < ${WORKDIR}/train/README.md |\
tr "#" "\n" | grep '^ ${DATASET}~' | \
tail -1 | tr "~" "\n" | grep '^\* ' | \
grep -v ': *$$' | grep -v ' 0$$' | \
grep -v 'unused dev/test' | \
grep -v 'total size' | sed 's/^\* / /' >> $@
endif
ifneq ("$(wildcard ${WORKDIR}/val/README.md)","")
@echo "validation-data:" >> $@
grep '^\* ' ${WORKDIR}/val/README.md | \
sed 's/total size of shuffled dev data:/total-size-shuffled:/' | \
sed 's/devset =/devset-selected:/' | \
grep -v ' 0$$' | \
sed 's/^\* / /' >> $@
endif
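##
## example of the README-to-YAML rewrite above (values are illustrative):
##   "* total size of shuffled dev data: 775"        becomes  "total-size-shuffled: 775"
##   "* devset = top 250 lines of xxx.src.shuffled"  becomes  "devset-selected: top 250 lines of xxx.src.shuffled"
## the leading "* " is replaced by indentation so that the lines nest under
## the validation-data: (or training-data:) key
##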
@ -390,7 +397,12 @@ endif
link-latest-model:
if [ `ls ${patsubst %.zip,%-*,${DIST_PACKAGE}} 2>/dev/null | wc -l` -gt 0 ]; then \
cd ${dir ${DIST_PACKAGE}}; \
ln -s `ls -t ${patsubst %.zip,%-*.zip,$(notdir ${DIST_PACKAGE})} | head -1` \
${notdir ${DIST_PACKAGE}}; \
fi
${DIST_PACKAGE}: ${MODEL_FINAL}
@ -505,182 +517,6 @@ endif
##### ------------------------------------
##### OLD release recipe: all in one
##### ------------------------------------
# ${DIST_PACKAGE}: ${MODEL_FINAL}
# ifneq (${SKIP_DIST_EVAL},1)
# @${MAKE} $(TEST_EVALUATION)
# @${MAKE} $(TEST_COMPARISON)
# endif
# @mkdir -p ${dir $@}
# @touch ${WORKDIR}/source.tcmodel
# @cp ${PREPROCESS_SRCMODEL} ${WORKDIR}/source.${SUBWORD_TYPE}
# @cp ${PREPROCESS_TRGMODEL} ${WORKDIR}/target.${SUBWORD_TYPE}
# @cp ${PREPROCESS_SCRIPT} ${WORKDIR}/preprocess.sh
# @cp ${POSTPROCESS_SCRIPT} ${WORKDIR}/postprocess.sh
# ##-----------------------------
# ## create YML file
# ##-----------------------------
# @echo "release: ${LANGPAIRSTR}/$(notdir ${@:.zip=})-${DATE}.zip" > ${@:.zip=}-${DATE}.yml
# @echo "release-date: $(DATE)" >> ${@:.zip=}-${DATE}.yml
# @echo "dataset-name: $(DATASET)" >> ${@:.zip=}-${DATE}.yml
# @echo "modeltype: $(MODELTYPE)" >> ${@:.zip=}-${DATE}.yml
# @echo "pre-processing: ${PREPROCESS_DESCRIPTION}" >> ${@:.zip=}-${DATE}.yml
# @echo "subwords:" >> ${@:.zip=}-${DATE}.yml
# @echo " - source: ${PRE_SRC}" >> ${@:.zip=}-${DATE}.yml
# @echo " - target: ${PRE_TRG}" >> ${@:.zip=}-${DATE}.yml
# @echo "subword-models:" >> ${@:.zip=}-${DATE}.yml
# @echo " - source: source.${SUBWORD_TYPE}" >> ${@:.zip=}-${DATE}.yml
# @echo " - target: target.${SUBWORD_TYPE}" >> ${@:.zip=}-${DATE}.yml
# ifdef USE_TARGET_LABELS
# @echo "use-target-labels:" >> ${@:.zip=}-${DATE}.yml
# @for t in ${TRGLANGS}; do \
# echo " - >>$$t<<" >> ${@:.zip=}-${DATE}.yml; \
# done
# endif
# @echo "source-languages:" >> ${@:.zip=}-${DATE}.yml
# @for s in ${RAWSRCLANGS}; do\
# echo " - $$s" >> ${@:.zip=}-${DATE}.yml; \
# done
# @echo "target-languages:" >> ${@:.zip=}-${DATE}.yml
# @for t in ${RAWTRGLANGS}; do\
# echo " - $$t" >> ${@:.zip=}-${DATE}.yml; \
# done
# ifneq ("$(wildcard ${WORKDIR}/train/README.md)","")
# @echo "training-data:" >> ${@:.zip=}-${DATE}.yml
# @tr "\n" "~" < ${WORKDIR}/train/README.md |\
# tr "#" "\n" | grep '^ ${DATASET}~' | \
# tail -1 | tr "~" "\n" | grep '^\* ' | \
# grep -v ': *$$' | grep -v ' 0$$' | \
# grep -v 'total size' | sed 's/^\* / - /' >> ${@:.zip=}-${DATE}.yml
# endif
# ifneq ("$(wildcard ${WORKDIR}/val/README.md)","")
# @echo "validation-data:" >> ${@:.zip=}-${DATE}.yml
# grep '^\* ' ${WORKDIR}/val/README.md | \
# grep -v ' 0$$' | \
# sed 's/^\* / - /' >> ${@:.zip=}-${DATE}.yml
# endif
# ##-----------------------------
# ## create README-file
# ##-----------------------------
# @echo "# $(notdir ${@:.zip=})-${DATE}.zip" > ${WORKDIR}/README.md
# @echo '' >> ${WORKDIR}/README.md
# @echo "* dataset: ${DATASET}" >> ${WORKDIR}/README.md
# @echo "* model: ${MODELTYPE}" >> ${WORKDIR}/README.md
# @echo "* source language(s): ${RAWSRCLANGS}" >> ${WORKDIR}/README.md
# @echo "* target language(s): ${RAWTRGLANGS}" >> ${WORKDIR}/README.md
# @echo "* model: ${MODELTYPE}" >> ${WORKDIR}/README.md
# @echo "* pre-processing: ${PREPROCESS_DESCRIPTION}" >> ${WORKDIR}/README.md
# ifdef USE_TARGET_LABELS
# echo '* a sentence initial language token is required in the form of `>>id<<` (id = valid target language ID)' >> ${WORKDIR}/README.md
# @echo "* valid language labels: ${LANGUAGELABELS}" >> ${WORKDIR}/README.md
# endif
# @echo "* download: [$(notdir ${@:.zip=})-${DATE}.zip](${MODELS_URL}/${LANGPAIRSTR}/$(notdir ${@:.zip=})-${DATE}.zip)" >> ${WORKDIR}/README.md
# ifneq (${SKIP_DATA_DETAILS},1)
# ifneq ("$(wildcard ${WORKDIR}/train/README.md)","")
# @echo -n "## Training data: " >> ${WORKDIR}/README.md
# @tr "\n" "~" < ${WORKDIR}/train/README.md |\
# tr "#" "\n" | grep '${DATASET}' | \
# tail -1 | tr "~" "\n" >> ${WORKDIR}/README.md
# @echo '' >> ${WORKDIR}/README.md
# endif
# ifneq ("$(wildcard ${WORKDIR}/val/README.md)","")
# @echo -n "#" >> ${WORKDIR}/README.md
# @cat ${WORKDIR}/val/README.md >> ${WORKDIR}/README.md
# @echo '' >> ${WORKDIR}/README.md
# endif
# endif
# ##-----------------------------
# ## add benchmark results
# ##-----------------------------
# ifneq ("$(wildcard ${TEST_EVALUATION})","")
# @echo "* test set translations: [$(notdir ${@:.zip=})-${DATE}.test.txt](${MODELS_URL}/${LANGPAIRSTR}/$(notdir ${@:.zip=})-${DATE}.test.txt)" >> ${WORKDIR}/README.md
# @echo "* test set scores: [$(notdir ${@:.zip=})-${DATE}.eval.txt](${MODELS_URL}/${LANGPAIRSTR}/$(notdir ${@:.zip=})-${DATE}.eval.txt)" >> ${WORKDIR}/README.md
# @echo '' >> ${WORKDIR}/README.md
# @echo '## Benchmarks' >> ${WORKDIR}/README.md
# @echo '' >> ${WORKDIR}/README.md
# ## grep and normalise test set names
# ## ugly perl script that does some tansformation of language codes
# @grep -H BLEU ${WORKDIR}/*.${DATASET}.${PRE_SRC}-${PRE_TRG}${NR}.${MODELTYPE}.*.eval | \
# sed 's#^${WORKDIR}/\(.*\)\.${DATASET}.${PRE_SRC}-${PRE_TRG}${NR}.${MODELTYPE}\.\(.*\)\.eval:.*$$#\1.\2#' | \
# perl -pe 'if (/\.([^\.]+)\.([^\.\s]+)$$/){$$s=$$1;$$t=$$2;s/[\-\.]$$s?\-?$$t\.$$s\.$$t?$$/.$$s.$$t/;s/\.$$s\.$$t$$/.$$s-$$t/}' > $@.1
# @grep BLEU ${WORKDIR}/*.${DATASET}.${PRE_SRC}-${PRE_TRG}${NR}.${MODELTYPE}.*.eval | \
# cut -f3 -d ' ' > $@.2
# @grep chrF ${WORKDIR}/*.${DATASET}.${PRE_SRC}-${PRE_TRG}${NR}.${MODELTYPE}.*.eval | \
# cut -f3 -d ' ' > $@.3
# @ls ${WORKDIR}/*.${DATASET}.${PRE_SRC}-${PRE_TRG}${NR}.${MODELTYPE}.*.eval | \
# sed 's/\.eval//' | xargs wc -l | grep -v total | sed 's/^ *//' | cut -f1 -d' ' > $@.4
# @grep BLEU ${WORKDIR}/*.${DATASET}.${PRE_SRC}-${PRE_TRG}${NR}.${MODELTYPE}.*.eval | \
# cut -f16 -d ' ' | sed 's/)//' > $@.5
# @grep BLEU ${WORKDIR}/*.${DATASET}.${PRE_SRC}-${PRE_TRG}${NR}.${MODELTYPE}.*.eval | \
# cut -f7 -d ' ' > $@.6
# @paste -d '/' $@.4 $@.5 > $@.7
# @echo '| testset | BLEU | chr-F | #sent | #words | BP |' >> ${WORKDIR}/README.md
# @echo '|---------|-------|-------|-------|--------|----|' >> ${WORKDIR}/README.md
# @paste $@.1 $@.2 $@.3 $@.4 $@.5 $@.6 | \
# sed "s/\t/ | /g;s/^/| /;s/$$/ |/" | \
# sort | uniq >> ${WORKDIR}/README.md
# @echo "test-data:" >> ${@:.zip=}-${DATE}.yml
# @paste -d' ' $@.1 $@.7 | sed 's/ /: /;s/^/ - /;' >> ${@:.zip=}-${DATE}.yml
# @echo "BLEU-scores:" >> ${@:.zip=}-${DATE}.yml
# @paste -d' ' $@.1 $@.2 | sed 's/ /: /;s/^/ - /' >> ${@:.zip=}-${DATE}.yml
# @echo "chr-F-scores:" >> ${@:.zip=}-${DATE}.yml
# @paste -d' ' $@.1 $@.3 | sed 's/ /: /;s/^/ - /' >> ${@:.zip=}-${DATE}.yml
# @rm -f $@.1 $@.2 $@.3 $@.4 $@.5 $@.6 $@.7 $@.testsize $@.testset
# endif
# ##-----------------------------
# ## create the package
# ##-----------------------------
# @cat ${WORKDIR}/README.md >> ${dir $@}README.md
# @echo '' >> ${dir $@}README.md
# @cp models/LICENSE ${WORKDIR}/
# @chmod +x ${WORKDIR}/preprocess.sh
# @sed -e 's# - .*/\([^/]*\)$$# - \1#' \
# -e 's/beam-size: [0-9]*$$/beam-size: 6/' \
# -e 's/mini-batch: [0-9]*$$/mini-batch: 1/' \
# -e 's/maxi-batch: [0-9]*$$/maxi-batch: 1/' \
# -e 's/relative-paths: false/relative-paths: true/' \
# < ${MODEL_DECODER} > ${WORKDIR}/decoder.yml
# cd ${WORKDIR} && zip ${notdir $@} \
# README.md LICENSE \
# ${notdir ${MODEL_FINAL}} \
# ${notdir ${MODEL_SRCVOCAB}} \
# ${notdir ${MODEL_TRGVOCAB}} \
# ${notdir ${MODEL_VALIDLOG}} \
# ${notdir ${MODEL_TRAINLOG}} \
# source.* target.* decoder.yml \
# preprocess.sh postprocess.sh
# ifneq ("$(wildcard ${WORKDIR}/${MODELCONFIG})","")
# @cd ${WORKDIR} && zip -u ${notdir $@} ${MODELCONFIG}
# endif
# ##-----------------------------
# ## move files to release dir and cleanup
# ##-----------------------------
# @mkdir -p ${dir $@}
# @mv -f ${WORKDIR}/${notdir $@} ${@:.zip=}-${DATE}.zip
# @cd ${dir $@} && zip -u ${notdir ${@:.zip=}-${DATE}.zip} ${notdir ${@:.zip=}-${DATE}.yml}
# ifneq ("$(wildcard ${TEST_EVALUATION})","")
# @cp $(TEST_EVALUATION) ${@:.zip=}-${DATE}.eval.txt
# @cp $(TEST_COMPARISON) ${@:.zip=}-${DATE}.test.txt
# endif
# @rm -f $@
# @cd ${dir $@} && ln -s $(notdir ${@:.zip=})-${DATE}.zip ${notdir $@}
# @rm -f ${WORKDIR}/decoder.yml ${WORKDIR}/source.* ${WORKDIR}/target.*
# @rm -f ${WORKDIR}/preprocess.sh ${WORKDIR}/postprocess.sh
## do this only if the flag is set
## --> avoid expensive wildcard searches each time make is called
@ -695,6 +531,7 @@ endif
# source project_2000661-openrc.sh
#
# - make upload ......... released models = all sub-dirs in models/
# - make upload-model ... upload model for current language pair
# - make upload-models .. trained models in current WORKHOME to OPUS-MT-dev
# - make upload-scores .. score file with benchmark results to OPUS-MT-eval
# - make upload-eval .... benchmark tests from models in WORKHOME
@ -712,6 +549,17 @@ upload:
swift upload ${MODEL_CONTAINER} index.txt
rm -f index.txt
.PHONY: upload-model
upload-model:
find ${RELEASEDIR}/ -type l | tar -cf models-links.tar -T -
find ${RELEASEDIR}/ -type l -delete
cd ${RELEASEDIR} && swift upload ${MODEL_CONTAINER} --changed --skip-identical ${LANGPAIRSTR}
tar -xf models-links.tar
rm -f models-links.tar
swift post ${MODEL_CONTAINER} --read-acl ".r:*"
swift list ${MODEL_CONTAINER} > index.txt
swift upload ${MODEL_CONTAINER} index.txt
rm -f index.txt
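## note: the symbolic links in the release directory are saved to a tar archive
## and deleted before the swift upload, then restored afterwards, presumably so
## that the undated "latest" links created by link-latest-model are not uploaded
## as duplicate copies of the model packages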
.PHONY: upload-models
upload-models:
@ -968,7 +816,7 @@ dist-remove-no-date-dist:
dist-remove-old-yml:
swift list Tatoeba-MT-models > index.txt
for d in `grep old-yml index.txt`; do \
for d in `grep yml-old index.txt`; do \
swift delete Tatoeba-MT-models $$d; \
done
@ -993,3 +841,21 @@ dist-fix-preprocess:
rm -f $$d; \
done )
## fix yet another error in YAML files
# YMLFILES = ${wildcard models-tatoeba/eng-*/*-2021-04-10.yml}
# OLDYMLFILES = ${patsubst %.yml,%.yml-old,${YMLFILES}}
# ${OLDYMLFILES}: %.yml-old: %.yml
# mv $< $@
# sed -e 's/devset =/devset-selected:/' \
# -e 's/testset =/testset-selected:/' \
# -e 's/total size of shuffled dev data:/total-size-shuffled:/' < $@ |\
# grep -v 'unused dev/test' > $<
# touch $@
# fix-yml-files: ${OLDYMLFILES}


@ -68,8 +68,8 @@ WORKHOME = ${PWD}/work
ifeq (${shell hostname -d 2>/dev/null},mahti.csc.fi)
# CSCPROJECT = project_2002688
CSCPROJECT = project_2003093
CSCPROJECT = project_2002688
# CSCPROJECT = project_2003093
# CSCPROJECT = project_2002982
WORKHOME = ${shell realpath ${PWD}/work}
APPLHOME = /projappl/project_2003093/
@ -182,7 +182,6 @@ MARIAN_VOCAB = ${MARIAN_HOME}marian-vocab
TOKENIZER = ${MOSESSCRIPTS}/tokenizer
## BPE
SUBWORD_BPE ?= ${shell which subword-nmt 2>/dev/null || echo ${TOOLSDIR}/subword-nmt/subword_nmt/subword_nmt.py}
SUBWORD_HOME ?= ${dir ${SUBWORD_BPE}}


@ -270,6 +270,18 @@ endif
# CLEAN_TRAIN_TRG="${CLEAN_TRAIN_TRG} ${BACKTRANS_TRG}" \
## train on back-translations only
%-btonly:
rm -f ${WORKHOME}/${LANGPAIRSTR}/train.submit
${MAKE} DATASET=${DATASET}+btonly \
USE_BACKTRANS=1 \
CONTINUE_EXISTING=1 \
MODELCONFIG=config-bt.mk \
TRAINSET= TATOEBA_TRAINSET= \
${@:-btonly=}
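##
## example (hypothetical target stem): "make tatoeba-eng2fin-train-btonly" would
## re-run "tatoeba-eng2fin-train" with DATASET=${DATASET}+btonly, USE_BACKTRANS=1,
## CONTINUE_EXISTING=1 and empty TRAINSET/TATOEBA_TRAINSET, i.e. it fine-tunes the
## existing model on back-translations alone
##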
PIVOT_MODEL = ${MODEL_SUBDIR}${DATASET}+pivot${TRAINSIZE}.${PRE_SRC}-${PRE_TRG}
PIVOT_MODEL_BASE = ${PIVOT_MODEL}.${MODELTYPE}.model${NR}


@ -13,12 +13,18 @@
## - should we increase the length filter when cleaning later? How much?
## - should we apply some other cleanup scripts here to get rid of some messy stuff?
## should we remove zero-width spaces?
## perl -CIOE -pe 's/[\x{2060}\x{200B}\x{feff}]//g'
%.clean.${SRCEXT}.gz: %.${SRCEXT}.${PRE} %.${TRGEXT}.${PRE}
cat ${word 1,$^} |\
perl -CS -pe 'tr[\x{9}\x{A}\x{D}\x{20}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}][]cd;' |\
perl -CIOE -pe 's/[\x{2060}\x{200B}\x{feff}]//g' |\
perl -CS -pe 's/\&\s*\#\s*160\s*\;/ /g' > $@.1
cat ${word 2,$^} |\
perl -CS -pe 'tr[\x{9}\x{A}\x{D}\x{20}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}][]cd;' |\
perl -CIOE -pe 's/[\x{2060}\x{200B}\x{feff}]//g' |\
perl -CS -pe 's/\&\s*\#\s*160\s*\;/ /g' > $@.2
paste $@.1 $@.2 |\
scripts/filter/bitext-match-lang.py -s ${SRC} -t ${TRG} > $@.bitext


@ -106,6 +106,24 @@ TATOEBA_WORK ?= ${PWD}/work-tatoeba
TATOEBA_DATA ?= ${TATOEBA_WORK}/data/${PRE}
TATOEBA_MONO ?= ${TATOEBA_WORK}/data/mono
## data count files (file basename)
TATOEBA_DATA_COUNT_BASE = ${TATOEBA_RAWGIT}/data/release/${TATOEBA_VERSION}/released-bitexts
## all released language pairs with test sets > 200 test pairs
## also extract all source languages that are available for a given target language
## and vice versa
TATOEBA_RELEASED_DATA = $(shell wget -qq -O - ${TATOEBA_DATA_COUNT_BASE}-min200.txt | cut -f1)
TATOEBA_AVAILABLE_TRG = ${sort ${filter-out ${SRC},${subst -, ,${filter %-${SRC} ${SRC}-%,${TATOEBA_RELEASED_DATA}}}}}
TATOEBA_AVAILABLE_SRC = ${sort ${filter-out ${TRG},${subst -, ,${filter %-${TRG} ${TRG}-%,${TATOEBA_RELEASED_DATA}}}}}
## extract language pairs for a specific subset
TATOEBA_SUBSET = lower
TATOEBA_RELEASED_SUBSET = $(shell wget -qq -O - ${TATOEBA_DATA_COUNT_BASE}-${TATOEBA_SUBSET}.txt | cut -f1)
TATOEBA_AVAILABLE_SUBSET_TRG = ${sort ${filter-out ${SRC},${subst -, ,${filter %-${SRC} ${SRC}-%,${TATOEBA_RELEASED_SUBSET}}}}}
TATOEBA_AVAILABLE_SUBSET_SRC = ${sort ${filter-out ${TRG},${subst -, ,${filter %-${TRG} ${TRG}-%,${TATOEBA_RELEASED_SUBSET}}}}}
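##
## example (hypothetical values): with SRC=deu and TATOEBA_SUBSET=lower,
## TATOEBA_RELEASED_SUBSET holds the language pairs from column 1 of
## released-bitexts-lower.txt and TATOEBA_AVAILABLE_SUBSET_TRG expands to all
## languages paired with deu in that list (TATOEBA_AVAILABLE_SUBSET_SRC works
## the same way for a fixed TRG)
##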
WIKILANGS ?= ${notdir ${wildcard backtranslate/wiki-iso639-3/*}}
WIKIMACROLANGS ?= $(sort ${shell ${GET_ISO_CODE} ${WIKILANGS}})
@ -119,7 +137,7 @@ TATOEBA_TESTSET_NAME = Tatoeba-test
TATOEBA_RELEASEDIR = ${PWD}/models-tatoeba
TATOEBA_MODELSHOME = ${PWD}/models-tatoeba
TATOEBA_BTHOME = ${PWD}/bt-tatoeba
TATOEBA_MIN_BLEU = 10
## file with the source and target languages in the current model
@ -163,7 +181,7 @@ TATOEBA_PARAMS := TRAINSET=${TATOEBA_TRAINSET} \
MODEL_CONTAINER=${TATOEBA_MODEL_CONTAINER} \
ALT_MODEL_DIR=tatoeba \
SKIP_DATA_DETAILS=1 \
MIN_BLEU_SCORE=10
MIN_BLEU_SCORE=${TATOEBA_MIN_BLEU}
@ -340,6 +358,92 @@ tatoeba-refresh-finished:
done
###########################################################################################
# start combinations with a specific source/target language
###########################################################################################
#
# make SRC=deu tatoeba-src2all-reasonable
# make SRC=deu tatoeba-src2all-small
#
# make TRG=deu tatoeba-all2trg-reasonable
# make TRG=deu tatoeba-all2trg-small
#
tatoeba-src2all:
for l in ${TATOEBA_AVAILABLE_SUBSET_TRG}; do \
${MAKE} tatoeba-${SRC}2$$l-train; \
done
tatoeba-src2langgroup:
for l in ${sort ${shell langgroup -p -n ${TATOEBA_AVAILABLE_SUBSET_TRG} 2>/dev/null}}; do \
${MAKE} tatoeba-${SRC}2$$l-train-1m; \
done
tatoeba-all2trg:
for l in ${TATOEBA_AVAILABLE_SUBSET_SRC}; do \
${MAKE} tatoeba-$${l}2${TRG}-train; \
done
tatoeba-all2trg-print:
for l in ${TATOEBA_AVAILABLE_SUBSET_SRC}; do \
echo "${MAKE} tatoeba-$${l}2${TRG}-train"; \
done
tatoeba-langgroup2trg:
for l in ${sort ${shell langgroup -p -n ${TATOEBA_AVAILABLE_SUBSET_SRC} 2>/dev/null}}; do \
${MAKE} tatoeba-$${l}2${TRG}-train-1m; \
done
## all subsets
tatoeba-src2all-subsets:
${MAKE} TATOEBA_SUBSET=lowest tatoeba-src2all
${MAKE} TATOEBA_SUBSET=lower tatoeba-src2all
${MAKE} TATOEBA_SUBSET=medium tatoeba-src2all
${MAKE} TATOEBA_SUBSET=higher tatoeba-src2all
${MAKE} TATOEBA_SUBSET=highest tatoeba-src2all
tatoeba-all2trg-subsets:
${MAKE} TATOEBA_SUBSET=lowest tatoeba-all2trg
${MAKE} TATOEBA_SUBSET=lower tatoeba-all2trg
${MAKE} TATOEBA_SUBSET=medium tatoeba-all2trg
${MAKE} TATOEBA_SUBSET=higher tatoeba-all2trg
${MAKE} TATOEBA_SUBSET=highest tatoeba-all2trg
## reasonable size (all except lower and lowest)
tatoeba-src2all-reasonable:
${MAKE} TATOEBA_SUBSET=medium tatoeba-src2all
${MAKE} TATOEBA_SUBSET=higher tatoeba-src2all
${MAKE} TATOEBA_SUBSET=highest tatoeba-src2all
tatoeba-all2trg-reasonable:
${MAKE} TATOEBA_SUBSET=medium tatoeba-all2trg
${MAKE} TATOEBA_SUBSET=higher tatoeba-all2trg
${MAKE} TATOEBA_SUBSET=highest tatoeba-all2trg
## backoff to multilingual models and language groups
## lower / lowest resource languages and zero-shot
tatoeba-src2all-small:
${MAKE} TATOEBA_SUBSET=lower tatoeba-src2langgroup
${MAKE} TATOEBA_SUBSET=lowest tatoeba-src2langgroup
${MAKE} TATOEBA_SUBSET=zero-shot tatoeba-src2langgroup
tatoeba-all2trg-small:
${MAKE} TATOEBA_SUBSET=lower tatoeba-langgroup2trg
${MAKE} TATOEBA_SUBSET=lowest tatoeba-langgroup2trg
${MAKE} TATOEBA_SUBSET=zero-shot tatoeba-langgroup2trg
###########################################################################################
# models for backtranslation
@ -1732,11 +1836,12 @@ ${TATOEBA_MONO}/%.labels:
paste ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.id \
${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${SRCEXT} \
${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${TRGEXT} |\
grep -P "$$s\t$$t\t" > ${dir $@}Tatoeba-$$d.$$s-$$t; \
grep -P "$$s\t$$t\t" | cut -f3,4 |\
scripts/filter/filter-korean.sh ${SRC} ${TRG} $$d > ${dir $@}Tatoeba-$$d.$$s-$$t; \
if [ -s ${dir $@}Tatoeba-$$d.$$s-$$t ]; then \
echo "........ make ${dir $@}Tatoeba-$$d.$$s-$$t.clean.*.gz"; \
cut -f3 ${dir $@}Tatoeba-$$d.$$s-$$t | ${GZIP} -c > ${dir $@}Tatoeba-$$d.$$s-$$t.clean.$$s.gz; \
cut -f4 ${dir $@}Tatoeba-$$d.$$s-$$t | ${GZIP} -c > ${dir $@}Tatoeba-$$d.$$s-$$t.clean.$$t.gz; \
cut -f1 ${dir $@}Tatoeba-$$d.$$s-$$t | ${GZIP} -c > ${dir $@}Tatoeba-$$d.$$s-$$t.clean.$$s.gz; \
cut -f2 ${dir $@}Tatoeba-$$d.$$s-$$t | ${GZIP} -c > ${dir $@}Tatoeba-$$d.$$s-$$t.clean.$$t.gz; \
fi; \
rm -f ${dir $@}Tatoeba-$$d.$$s-$$t; \
fi \
@ -1748,11 +1853,12 @@ ${TATOEBA_MONO}/%.labels:
paste ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.id \
${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${TRGEXT} \
${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${SRCEXT} |\
grep -P "$$s\t$$t\t" > ${dir $@}Tatoeba-$$d.$$t-$$s; \
grep -P "$$s\t$$t\t" | cut -f3,4 |\
scripts/filter/filter-korean.sh ${TRG} ${SRC} $$d > ${dir $@}Tatoeba-$$d.$$t-$$s; \
if [ -s ${dir $@}Tatoeba-$$d.$$t-$$s ]; then \
echo "........ make ${dir $@}Tatoeba-$$d.$$t-$$s.clean.*.gz"; \
cut -f3 ${dir $@}Tatoeba-$$d.$$t-$$s | ${GZIP} -c > ${dir $@}Tatoeba-$$d.$$t-$$s.clean.$$t.gz; \
cut -f4 ${dir $@}Tatoeba-$$d.$$t-$$s | ${GZIP} -c > ${dir $@}Tatoeba-$$d.$$t-$$s.clean.$$s.gz; \
cut -f1 ${dir $@}Tatoeba-$$d.$$t-$$s | ${GZIP} -c > ${dir $@}Tatoeba-$$d.$$t-$$s.clean.$$t.gz; \
cut -f2 ${dir $@}Tatoeba-$$d.$$t-$$s | ${GZIP} -c > ${dir $@}Tatoeba-$$d.$$t-$$s.clean.$$s.gz; \
fi; \
rm -f ${dir $@}Tatoeba-$$d.$$t-$$s; \
fi \


@ -9,6 +9,11 @@ as-en:
${MAKE} reverse-data-as-en
${MAKE} train-dynamic-en-as
BCL_DEVSIZE = 1000
BCL_TESTSIZE = 1000
en-bcl:
${MAKE} SRCLANGS="en" TRGLANGS="bcl" DEVSET=wikimedia all-job
@ -29,25 +34,31 @@ bcl-en-nt:
DEVSET=wikimedia all-job
%-en-bcl:
${MAKE} SRCLANGS="en" TRGLANGS="bcl" DEVSET=wikimedia ${@:-en-bcl=}
${MAKE} SRCLANGS="en" TRGLANGS="bcl" \
DEVSET=wikimedia \
DEVSIZE=${BCL_DEVSIZE} TESTSIZE=${BCL_TESTSIZE} \
USE_REST_DEVDATA=1 ${@:-en-bcl=}
%-bcl-en:
${MAKE} SRCLANGS="en" TRGLANGS="bcl" DEVSET=wikimedia ${@:-bcl-en=}
${MAKE} SRCLANGS="en" TRGLANGS="bcl" \
DEVSET=wikimedia \
DEVSIZE=${BCL_DEVSIZE} TESTSIZE=${BCL_TESTSIZE} \
USE_REST_DEVDATA=1 ${@:-bcl-en=}
%-en-bcl-nt:
${MAKE} SRCLANGS="en" TRGLANGS="bcl" \
DATASET=${DATASET}+nt \
EXTRA_TRAINSET="new-testament" \
DEVSET=wikimedia \
DEVSET=wikimedia DEVSIZE=${BCL_DEVSIZE} TESTSIZE=${BCL_TESTSIZE} USE_REST_DEVDATA=1 \
${@:-en-bcl-nt=}
%-bcl-en-nt:
${MAKE} SRCLANGS="bcl" TRGLANGS="en" \
DATASET=${DATASET}+nt \
EXTRA_TRAINSET="new-testament" \
DEVSET=wikimedia \
DEVSET=wikimedia DEVSIZE=${BCL_DEVSIZE} TESTSIZE=${BCL_TESTSIZE} USE_REST_DEVDATA=1 \
${@:-bcl-en-nt=}
@ -72,34 +83,228 @@ ENBCL_BPE = 1000
#-----------------------------------------------------------------------------
# start jobs for all languages where we have back-translations into English
#-----------------------------------------------------------------------------
## languages for which we have back-translated wiki data into English
WIKI_BT2ENG = abk ady afr amh ang ara arg asm ast awa aze bak bam bar bel ben bod bre bul cat ceb ces cha che chr chv cor cos crh csb cym dan deu dsb ell epo est eus ewe ext fao fas fij fin fra frr fry ful gla gle glg glv got grn guj hat hau haw hbs heb hif hin hsb hun hye ibo ido iku ile ilo ina isl ita jam jav jbo jpn kab kal kan kat kaz khm kin kir kom kor ksh kur lad lao lat lav lfn lij lin lit lmo ltz lug mah mai mal mar mdf mkd mlg mlt mnw mon mri msa mwl mya myv nau nav nds nep nld nor nov nya oci ori oss pag pan pap pdc pms pol por pus que roh rom ron rue run rus sag sah san scn sco sin slv sme smo sna snd som spa sqi stq sun swa swe tah tam tat tel tet tgk tgl tha tir ton tpi tso tuk tur tyv udm uig ukr urd uzb vec vie vol war wln wol xal xho yid yor zho zul bos_Latn cmn_Hans cmn_Hant hrv ind nno nob srp_Cyrl srp_Latn
WIKI_BT2ENG = abk ady afr amh ang ara arg asm ast awa aze bak bam bar bel ben bod bre bul cat ceb ces cha che chr chv cor cos crh csb cym dan deu dsb ell epo est eus ewe ext fao fas fij fin fra frr fry ful gla gle glg glv got grn guj hat hau haw hbs heb hif hin hsb hun hye ibo ido iku ile ilo ina isl ita jam jav jbo jpn kab kal kan kat kaz khm kin kir kom kor ksh kur lad lao lat lav lfn lij lin lit lmo ltz lug mah mai mal mar mdf mkd mlg mlt mnw mon mri msa mwl mya myv nau nav nds nep nld nor nov nya oci ori oss pag pan pap pdc pms pol por pus que roh rom ron rue run rus sag sah san scn sco sin slv sme smo sna snd som spa sqi stq sun swa swe tah tam tat tel tet tgk tgl tha tir ton tpi tso tuk tur tyv udm uig ukr urd uzb vec vie vol war wln wol xal xho yid yor zho zul
## start jobs for all languages where we have back-translations
## start jobs for all languages where we have back-translations into English
wiki-eng2all-with-bt:
for l in ${WIKI_BT2ENG}; do \
if [ -d work-tatoeba/$$l-eng ]; then \
if [ `cat work-tatoeba/$$l-eng/opus-langlabels.src | tr " " "\n" | grep . | wc -l` -eq 1 ]; then \
echo "fetch back-translations for $$l-eng"; \
${MAKE} -C bt-tatoeba SRC=$$l TRG=eng fetch-bt; \
echo "start training eng-$$l with backtranslation data"; \
${MAKE} HPC_MEM=32g HPC_CORES=4 tatoeba-eng2$$l-train-bt.submitcpu; \
echo "fetch $$l wiki backtranslations"; \
${MAKE} -C bt-tatoeba SRC=$$l TRG=eng fetch-bt; \
done
for l in ${sort ${shell iso639 -m -n ${WIKI_BT2ENG}}}; do \
echo "start training eng-$$l with backtranslation data"; \
${MAKE} EMAIL= HPC_MEM=32g HPC_CORES=4 tatoeba-eng2$$l-train-bt.submitcpu; \
done
# for l in ${WIKI_BT2ENG}; do \
# if [ -d work-tatoeba/$$l-eng ]; then \
# if [ `cat work-tatoeba/$$l-eng/opus-langlabels.src | tr " " "\n" | grep . | wc -l` -eq 1 ]; then \
# echo "fetch back-translations for $$l-eng"; \
# ${MAKE} -C bt-tatoeba SRC=$$l TRG=eng fetch-bt; \
# echo "start training eng-$$l with backtranslation data"; \
# ${MAKE} EMAIL= HPC_MEM=32g HPC_CORES=4 tatoeba-eng2$$l-train-bt.submitcpu; \
# fi \
# fi \
# done
wiki-eng2all-with-bt-continue:
for l in ${WIKI_BT2ENG}; do \
if [ -d work-tatoeba/eng-$$l ]; then \
if [ ! `find work-tatoeba/eng-$$l -name 'opus+bt.*model1.done' | wc -l` -gt 0 ]; then \
echo "continue training eng-$$l with backtranslation data"; \
${MAKE} EMAIL= tatoeba-eng2$$l-train-bt; \
fi \
fi \
done
WIKI_BT2ENG_PARENTS = ${sort ${shell langgroup -p ${WIKI_BT2ENG}}}
wiki-eng2all-with-bt-eval:
for l in ${WIKI_BT2ENG}; do \
if [ -d work-tatoeba/eng-$$l ]; then \
if [ `find work-tatoeba/eng-$$l -name 'opus+bt*best-perplexity.npz' | wc -l` -gt 0 ]; then \
${MAKE} EMAIL= WALLTIME=4 tatoeba-eng2$$l-evalall-bt.submit; \
fi \
fi \
done
wiki-eng2allgroups-with-bt:
for l in $(filter-out roa,${WIKI_BT2ENG_PARENTS}); do \
# if [ `find work-tatoeba/eng-$$l -name 'opus+bt.*model1.done' | wc -l` -gt 0 ]; then \
wiki-eng2all-with-bt-dist:
for l in ${WIKI_BT2ENG}; do \
if [ -d work-tatoeba/eng-$$l ]; then \
echo "mv work-tatoeba/eng-$$l work-tatoeba-old"; \
mv work-tatoeba/eng-$$l work-tatoeba-old; \
fi; \
echo "start training eng-$$l with backtranslation data"; \
${MAKE} HPC_MEM=32g HPC_CORES=4 tatoeba-eng2$$l-train-bt-1m.submitcpu; \
if [ `find work-tatoeba/eng-$$l -name 'opus+bt*best-perplexity.npz' | wc -l` -gt 0 ]; then \
echo "continue training eng-$$l with backtranslation data"; \
${MAKE} EMAIL= tatoeba-eng2$$l-dist-bt; \
fi \
fi \
done
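##
## typical sequence for these targets (example invocations; the same pattern is
## used for the language-group and X-to-English variants further down):
##
##   make wiki-eng2all-with-bt            # fetch BT data and submit training jobs
##   make wiki-eng2all-with-bt-continue   # resubmit models without a *model1.done file
##   make wiki-eng2all-with-bt-eval       # evaluate models with a best-perplexity checkpoint
##   make wiki-eng2all-with-bt-dist       # package evaluated models for release
##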
#-----------------------------------------------------------------------------
# models for translating English into language groups with backtranslations
# (does not fetch back-translations - they need to be available in bt-tatoeba!)
#-----------------------------------------------------------------------------
WIKI_BT2ENG_PARENTS = ${sort ${shell iso639 -m -n ${WIKI_BT2ENG} | xargs langgroup -p}}
wiki-eng2allgroups-with-bt:
for l in ${WIKI_BT2ENG_PARENTS}; do \
echo "start training eng-$$l with backtranslation data"; \
${MAKE} EMAIL= HPC_MEM=32g HPC_CORES=4 tatoeba-eng2$$l-train-bt-1m.submitcpu; \
done
wiki-eng2allgroups-with-bt-continue:
for l in ${WIKI_BT2ENG_PARENTS}; do \
if [ -d work-tatoeba/eng-$$l ]; then \
if [ ! `find work-tatoeba/eng-$$l -name 'opus1m+bt.*model1.done' | wc -l` -gt 0 ]; then \
echo "continue training eng-$$l with backtranslation data"; \
${MAKE} EMAIL= tatoeba-eng2$$l-train-bt-1m; \
fi \
fi \
done
wiki-eng2allgroups-with-bt-eval:
for l in ${WIKI_BT2ENG_PARENTS}; do \
if [ -d work-tatoeba/eng-$$l ]; then \
if [ `find work-tatoeba/eng-$$l -name 'opus1m+bt*best-perplexity.npz' | wc -l` -gt 0 ]; then \
${MAKE} EMAIL= WALLTIME=8 tatoeba-eng2$$l-evalall-bt-1m.submit; \
fi \
fi \
done
# if [ `find work-tatoeba/eng-$$l -name 'opus1m+bt.*model1.done' | wc -l` -gt 0 ]; then \
wiki-eng2allgroups-with-bt-dist:
for l in ${WIKI_BT2ENG_PARENTS}; do \
if [ -d work-tatoeba/eng-$$l ]; then \
if [ `find work-tatoeba/eng-$$l -name 'opus1m+bt*best-perplexity.npz' | wc -l` -gt 0 ]; then \
echo "continue training eng-$$l with backtranslation data"; \
${MAKE} EMAIL= tatoeba-eng2$$l-dist-bt-1m; \
fi \
fi \
done
#-----------------------------------------------------------------------------
# start jobs for all languages where we have back-translations from English
#-----------------------------------------------------------------------------
## languages for which we have back-translated wiki data from English
WIKI_ENG2BT = afr ara aze bel ben bos_Latn bre bul cat ceb ces cmn_Hans cmn_Hant cym dan deu ell epo est eus fao fin fra fry gle glg heb hin hrv hun hye ido ilo ina ind isl ita lav lit ltz mal mar mkd mlt msa nds nld nno nob pol por ron run rus spa sqi srp_Cyrl srp_Latn swa swe tam tgl tha tur ukr urd uzb_Latn vie war zho zsm_Latn
wiki-all2eng-with-bt:
for l in ${WIKI_ENG2BT}; do \
echo "fetch $$l wiki backtranslations"; \
${MAKE} -C bt-tatoeba TRG=$$l SRC=eng fetch-bt; \
done
for l in ${sort ${shell iso639 -m -n ${WIKI_ENG2BT}}}; do \
if [ ! `find work-tatoeba/$$l-eng -name 'opus+bt.*model1.done' | wc -l` -gt 0 ]; then \
if [ `find work-tatoeba/$$l-eng -name 'opus+bt*best-perplexity.npz' | wc -l` -gt 0 ]; then \
echo "continue training $$l-eng with backtranslation data"; \
${MAKE} EMAIL= tatoeba-$${l}2eng-train-bt; \
else \
echo "start training $$l-eng with backtranslation data"; \
${MAKE} EMAIL= HPC_MEM=32g HPC_CORES=4 tatoeba-$${l}2eng-train-bt.submitcpu; \
fi \
fi \
done
wiki-all2eng-with-bt-continue:
for l in ${WIKI_ENG2BT}; do \
if [ -d work-tatoeba/$$l-eng ]; then \
if [ ! `find work-tatoeba/$$l-eng -name 'opus+bt.*model1.done' | wc -l` -gt 0 ]; then \
echo "continue training $$l-eng with backtranslation data"; \
${MAKE} EMAIL= tatoeba-$${l}2eng-train-bt; \
fi \
fi \
done
wiki-all2eng-with-bt-eval:
for l in ${WIKI_ENG2BT}; do \
if [ -d work-tatoeba/$$l-eng ]; then \
if [ `find work-tatoeba/$$l-eng -name 'opus+bt*best-perplexity.npz' | wc -l` -gt 0 ]; then \
${MAKE} EMAIL= WALLTIME=4 tatoeba-$${l}2eng-evalall-bt.submit; \
fi \
fi \
done
# if [ `find work-tatoeba/$$l-eng -name 'opus+bt.*model1.done' | wc -l` -gt 0 ]; then \
wiki-all2eng-with-bt-dist:
for l in ${WIKI_ENG2BT}; do \
if [ -d work-tatoeba/$$l-eng ]; then \
if [ `find work-tatoeba/$$l-eng -name 'opus+bt*best-perplexity.npz' | wc -l` -gt 0 ]; then \
echo "continue training $$l-eng with backtranslation data"; \
${MAKE} EMAIL= tatoeba-$${l}2eng-dist-bt; \
fi \
fi \
done
WIKI_ENG2BT_PARENTS = ${sort ${shell iso639 -m -n ${WIKI_ENG2BT} | xargs langgroup -p}}
wiki-allgroups2eng-with-bt:
for l in ${WIKI_ENG2BT_PARENTS}; do \
if [ ! `find work-tatoeba/$$l-eng -name 'opus1m+bt.*model1.done' | wc -l` -gt 0 ]; then \
if [ `find work-tatoeba/$$l-eng -name 'opus1m+bt*best-perplexity.npz' | wc -l` -gt 0 ]; then \
echo "continue training $$l-eng with backtranslation data"; \
${MAKE} EMAIL= tatoeba-$${l}2eng-train-bt-1m; \
else \
echo "start training $$l-eng with backtranslation data"; \
${MAKE} EMAIL= HPC_MEM=32g HPC_CORES=4 tatoeba-$${l}2eng-train-bt-1m.submitcpu; \
fi \
fi \
done
wiki-allgroups2eng-with-bt-continue:
for l in ${WIKI_ENG2BT_PARENTS}; do \
if [ -d work-tatoeba/$$l-eng ]; then \
if [ ! `find work-tatoeba/$$l-eng -name 'opus1m+bt.*model1.done' | wc -l` -gt 0 ]; then \
echo "continue training $$l-eng with backtranslation data"; \
${MAKE} EMAIL= tatoeba-$${l}2eng-train-bt-1m; \
fi \
fi \
done
wiki-allgroups2eng-with-bt-eval:
for l in ${WIKI_ENG2BT_PARENTS}; do \
if [ -d work-tatoeba/$$l-eng ]; then \
if [ `find work-tatoeba/$$l-eng -name 'opus1m+bt*best-perplexity.npz' | wc -l` -gt 0 ]; then \
${MAKE} EMAIL= WALLTIME=4 tatoeba-$${l}2eng-evalall-bt-1m.submit; \
fi \
fi \
done
# if [ `find work-tatoeba/$$l-eng -name 'opus1m+bt.*model1.done' | wc -l` -gt 0 ]; then \
wiki-allgroups2eng-with-bt-dist:
for l in ${WIKI_ENG2BT_PARENTS}; do \
if [ -d work-tatoeba/$$l-eng ]; then \
if [ `find work-tatoeba/$$l-eng -name 'opus1m+bt*best-perplexity.npz' | wc -l` -gt 0 ]; then \
echo "continue training $$l-eng with backtranslation data"; \
${MAKE} EMAIL= tatoeba-$${l}2eng-dist-bt-1m; \
fi \
fi \
done


@ -1,3 +1,8 @@
#
# opus-2020-01-20.zip
@ -15,6 +20,11 @@
| JW300.bcl.en | 56.8 | 0.705 |
# opus-2020-02-11.zip
* dataset: opus
@ -31,6 +41,11 @@
| JW300.bcl.en | 56.1 | 0.697 |
# opus+bt-2020-05-23.zip
* dataset: opus+bt
@ -58,6 +73,11 @@
| JW300.bcl.en | 57.6 | 0.712 |
# opus+nt-2021-03-29.zip
* dataset: opus+nt
@ -92,3 +112,190 @@
|---------|-------|-------|-------|--------|----|
| wikimedia.bcl-en | 10.4 | 0.320 | 525 | 27109 | 0.477 |
# opus+nt+bt-2021-04-01.zip
* dataset: opus+nt+bt
* model: transformer-align
* source language(s): bcl
* target language(s): en
* model: transformer-align
* pre-processing: normalization + SentencePiece (spm32k,spm32k)
* download: [opus+nt+bt-2021-04-01.zip](https://object.pouta.csc.fi/OPUS-MT-models/bcl-en/opus+nt+bt-2021-04-01.zip)
## Training data: opus+nt+bt
* bcl-en: JW300 (470468) new-testament (11623) wiki.aa.en-bcl (969821) wikinews.aa.en-bcl (357946)
* bcl-en: total size = 1809858
* total size (opus+nt+bt): 1809767
## Validation data
* bcl-en: wikimedia, 1153
* total-size-shuffled: 775
* devset-selected: top 250 lines of wikimedia.src.shuffled!
* testset-selected: next 525 lines of wikimedia.src.shuffled!
* devset-unused: added to traindata
* test set translations: [opus+nt+bt-2021-04-01.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/bcl-en/opus+nt+bt-2021-04-01.test.txt)
* test set scores: [opus+nt+bt-2021-04-01.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/bcl-en/opus+nt+bt-2021-04-01.eval.txt)
## Benchmarks
| testset | BLEU | chr-F | #sent | #words | BP |
|---------|-------|-------|-------|--------|----|
| wikimedia.bcl-en | 28.2 | 0.498 | 525 | 27109 | 0.799 |
# opus+nt+bt+bt-2021-04-03.zip
* dataset: opus+nt+bt+bt
* model: transformer-align
* source language(s): bcl
* target language(s): en
* model: transformer-align
* pre-processing: normalization + SentencePiece (spm32k,spm32k)
* download: [opus+nt+bt+bt-2021-04-03.zip](https://object.pouta.csc.fi/OPUS-MT-models/bcl-en/opus+nt+bt+bt-2021-04-03.zip)
## Training data: opus+nt+bt+bt
* bcl-en: JW300 (470468) new-testament (11623) wiki.aa.en-bcl (969821) wikibooks.aa.en-bcl (985129) wikinews.aa.en-bcl (357946) wikiquote.aa.en-bcl (987266) wikisource.aa.en-bcl (948077)
* bcl-en: total size = 4730330
* total size (opus+nt+bt+bt): 4730231
## Validation data
* bcl-en: wikimedia, 1153
* total-size-shuffled: 775
* devset-selected: top 250 lines of wikimedia.src.shuffled!
* testset-selected: next 525 lines of wikimedia.src.shuffled!
* devset-unused: added to traindata
* test set translations: [opus+nt+bt+bt-2021-04-03.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/bcl-en/opus+nt+bt+bt-2021-04-03.test.txt)
* test set scores: [opus+nt+bt+bt-2021-04-03.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/bcl-en/opus+nt+bt+bt-2021-04-03.eval.txt)
## Benchmarks
| testset | BLEU | chr-F | #sent | #words | BP |
|---------|-------|-------|-------|--------|----|
| wikimedia.bcl-en | 16.2 | 0.461 | 525 | 27109 | 1.000 |
# opus+nt+bt+bt+bt-2021-04-05.zip
* dataset: opus+nt+bt+bt+bt
* model: transformer-align
* source language(s): bcl
* target language(s): en
* model: transformer-align
* pre-processing: normalization + SentencePiece (spm32k,spm32k)
* download: [opus+nt+bt+bt+bt-2021-04-05.zip](https://object.pouta.csc.fi/OPUS-MT-models/bcl-en/opus+nt+bt+bt+bt-2021-04-05.zip)
## Training data: opus+nt+bt+bt+bt
* bcl-en: JW300 (470468) new-testament (11623) wiki.aa.en-bcl (969821) wikibooks.aa.en-bcl (985129) wikinews.aa.en-bcl (357946) wikiquote.aa.en-bcl (987266) wikisource.aa.en-bcl (948077)
* bcl-en: total size = 4730330
* total size (opus+nt+bt+bt+bt): 4730224
## Validation data
* bcl-en: wikimedia, 1153
* total-size-shuffled: 775
* devset-selected: top 250 lines of wikimedia.src.shuffled!
* testset-selected: next 525 lines of wikimedia.src.shuffled!
* devset-unused: added to traindata
* test set translations: [opus+nt+bt+bt+bt-2021-04-05.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/bcl-en/opus+nt+bt+bt+bt-2021-04-05.test.txt)
* test set scores: [opus+nt+bt+bt+bt-2021-04-05.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/bcl-en/opus+nt+bt+bt+bt-2021-04-05.eval.txt)
## Benchmarks
| testset | BLEU | chr-F | #sent | #words | BP |
|---------|-------|-------|-------|--------|----|
| wikimedia.bcl-en | 24.2 | 0.497 | 525 | 27109 | 1.000 |
# opus+nt+bt-2021-04-09.zip
* dataset: opus+nt+bt
* model: transformer-align
* source language(s): bcl
* target language(s): en
* model: transformer-align
* pre-processing: normalization + SentencePiece (spm32k,spm32k)
* download: [opus+nt+bt-2021-04-09.zip](https://object.pouta.csc.fi/OPUS-MT-models/bcl-en/opus+nt+bt-2021-04-09.zip)
## Training data: opus+nt+bt
* bcl-en: JW300 (470468) new-testament (11623) wiki.aa.en-bcl (969821) wikibooks.aa.en-bcl (985129) wikinews.aa.en-bcl (357946) wikiquote.aa.en-bcl (987266) wikisource.aa.en-bcl (948077)
* bcl-en: total size = 4730330
* unused dev/test data is added to training data
* total size (opus+nt+bt): 4731419
## Validation data
* bcl-en: wikimedia, 2767
* total-size-shuffled: 1966
* devset-selected: top 250 lines of wikimedia.src.shuffled!
* testset-selected: next 500 lines of wikimedia.src.shuffled!
* devset-unused: added to traindata
* test set translations: [opus+nt+bt-2021-04-09.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/bcl-en/opus+nt+bt-2021-04-09.test.txt)
* test set scores: [opus+nt+bt-2021-04-09.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/bcl-en/opus+nt+bt-2021-04-09.eval.txt)
## Benchmarks
| testset | BLEU | chr-F | #sent | #words | BP |
|---------|-------|-------|-------|--------|----|
| wikimedia.bcl-en | 33.5 | 0.562 | 500 | 28621 | 0.868 |
# opus+nt+bt-2021-04-12.zip
* dataset: opus+nt+bt
* model: transformer-align
* source language(s): bcl
* target language(s): en
* model: transformer-align
* pre-processing: normalization + SentencePiece (spm32k,spm32k)
* download: [opus+nt+bt-2021-04-12.zip](https://object.pouta.csc.fi/OPUS-MT-models/bcl-en/opus+nt+bt-2021-04-12.zip)
## Training data: opus+nt+bt
* bcl-en: JW300 (470468) new-testament (11623) wiki.aa.en-bcl (969821) wikibooks.aa.en-bcl (985129) wikinews.aa.en-bcl (357946) wikiquote.aa.en-bcl (987266) wikisource.aa.en-bcl (948077)
* bcl-en: total size = 4730330
* unused dev/test data is added to training data
* total size (opus+nt+bt): 4732437
## Validation data
* bcl-en: wikimedia, 5033
* total-size-shuffled: 4207
* devset-selected: top 1000 lines of wikimedia.src.shuffled!
* testset-selected: next 1000 lines of wikimedia.src.shuffled!
* devset-unused: added to traindata
* test set translations: [opus+nt+bt-2021-04-12.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/bcl-en/opus+nt+bt-2021-04-12.test.txt)
* test set scores: [opus+nt+bt-2021-04-12.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/bcl-en/opus+nt+bt-2021-04-12.eval.txt)
## Benchmarks
| testset | BLEU | chr-F | #sent | #words | BP |
|---------|-------|-------|-------|--------|----|
| wikimedia.bcl-en | 31.5 | 0.523 | 1000 | 31520 | 0.836 |


@ -1,86 +1,261 @@
# wikimedia-2020-01-17.zip
# opus+nt+bt-2021-03-30.zip
* dataset: wikimedia
* model: transformer-align
* pre-processing: normalization + SentencePiece
* download: [wikimedia-2020-01-17.zip](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/wikimedia-2020-01-17.zip)
* test set translations: [wikimedia-2020-01-17.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/wikimedia-2020-01-17.test.txt)
* test set scores: [wikimedia-2020-01-17.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/wikimedia-2020-01-17.eval.txt)
## Benchmarks
| testset | BLEU | chr-F |
|-----------------------|-------|-------|
| JW300.en.bcl | 55.3 | 0.729 |
# opus-2020-01-20.zip
* dataset: opus
* model: transformer-align
* pre-processing: normalization + SentencePiece
* download: [opus-2020-01-20.zip](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus-2020-01-20.zip)
* test set translations: [opus-2020-01-20.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus-2020-01-20.test.txt)
* test set scores: [opus-2020-01-20.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus-2020-01-20.eval.txt)
## Benchmarks
| testset | BLEU | chr-F |
|-----------------------|-------|-------|
| JW300.en.bcl | 55.3 | 0.729 |
# opus-2020-02-11.zip
* dataset: opus
* model: transformer-align
* pre-processing: normalization + SentencePiece
* download: [opus-2020-02-11.zip](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus-2020-02-11.zip)
* test set translations: [opus-2020-02-11.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus-2020-02-11.test.txt)
* test set scores: [opus-2020-02-11.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus-2020-02-11.eval.txt)
## Benchmarks
| testset | BLEU | chr-F |
|-----------------------|-------|-------|
| JW300.en.bcl | 53.8 | 0.719 |
# opus+bt-2020-02-26.zip
* dataset: opus+bt
* model: transformer-align
* pre-processing: normalization + SentencePiece
* download: [opus+bt-2020-02-26.zip](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus+bt-2020-02-26.zip)
* test set translations: [opus+bt-2020-02-26.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus+bt-2020-02-26.test.txt)
* test set scores: [opus+bt-2020-02-26.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus+bt-2020-02-26.eval.txt)
## Benchmarks
| testset | BLEU | chr-F |
|-----------------------|-------|-------|
| JW300.en.bcl | 54.3 | 0.722 |
# opus+bt-2020-05-23.zip
* dataset: opus+bt
* dataset: opus+nt+bt
* model: transformer-align
* source language(s): en
* target language(s): bcl
* model: transformer-align
* pre-processing: normalization + SentencePiece (spm4k,spm4k)
* download: [opus+bt-2020-05-23.zip](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus+bt-2020-05-23.zip)
* test set translations: [opus+bt-2020-05-23.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus+bt-2020-05-23.test.txt)
* test set scores: [opus+bt-2020-05-23.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus+bt-2020-05-23.eval.txt)
* pre-processing: normalization + SentencePiece (spm12k,spm32k)
* download: [opus+nt+bt-2021-03-30.zip](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus+nt+bt-2021-03-30.zip)
## Training data: opus+nt+bt
## Training data: opus+bt
* en-bcl: JW300 (470468) new-testament (11623) wiki.aa (43432)
* en-bcl: total size = 525523
* total size (opus+nt+bt): 525475
* en-bcl: wikimedia (1106)
* en-bcl: total size = 1106
* unused dev/test data is added to training data
* total size (opus+bt): 458304
## Validation data
* bcl-en: wikimedia, 1153
* total-size-shuffled: 775
* devset-selected: top 250 lines of wikimedia.src.shuffled!
* testset-selected: next 525 lines of wikimedia.src.shuffled!
* devset-unused: added to traindata
* test set translations: [opus+nt+bt-2021-03-30.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus+nt+bt-2021-03-30.test.txt)
* test set scores: [opus+nt+bt-2021-03-30.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus+nt+bt-2021-03-30.eval.txt)
## Benchmarks
| testset | BLEU | chr-F |
|-----------------------|-------|-------|
| JW300.en.bcl | 55.7 | 0.736 |
| testset | BLEU | chr-F | #sent | #words | BP |
|---------|-------|-------|-------|--------|----|
| wikimedia.en-bcl | 17.3 | 0.426 | 525 | 28399 | 0.840 |
# opus+nt+bt+bt-2021-04-01.zip
* dataset: opus+nt+bt+bt
* model: transformer-align
* source language(s): en
* target language(s): bcl
* model: transformer-align
* pre-processing: normalization + SentencePiece (spm12k,spm32k)
* download: [opus+nt+bt+bt-2021-04-01.zip](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus+nt+bt+bt-2021-04-01.zip)
## Training data: opus+nt+bt+bt
* en-bcl: JW300 (470468) new-testament (11623) wiki.aa (45474)
* en-bcl: total size = 527565
* total size (opus+nt+bt+bt): 527524
## Validation data
* bcl-en: wikimedia, 1153
* total-size-shuffled: 775
* devset-selected: top 250 lines of wikimedia.src.shuffled!
* testset-selected: next 525 lines of wikimedia.src.shuffled!
* devset-unused: added to traindata
* test set translations: [opus+nt+bt+bt-2021-04-01.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus+nt+bt+bt-2021-04-01.test.txt)
* test set scores: [opus+nt+bt+bt-2021-04-01.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus+nt+bt+bt-2021-04-01.eval.txt)
## Benchmarks
| testset | BLEU | chr-F | #sent | #words | BP |
|---------|-------|-------|-------|--------|----|
| wikimedia.en-bcl | 21.6 | 0.476 | 525 | 28399 | 0.789 |
# opus+nt+bt+bt+bt-2021-04-03.zip
* dataset: opus+nt+bt+bt+bt
* model: transformer-align
* source language(s): en
* target language(s): bcl
* model: transformer-align
* pre-processing: normalization + SentencePiece (spm12k,spm32k)
* download: [opus+nt+bt+bt+bt-2021-04-03.zip](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus+nt+bt+bt+bt-2021-04-03.zip)
## Training data: opus+nt+bt+bt+bt
* en-bcl: JW300 (470468) new-testament (11623) wiki.aa (45474)
* en-bcl: total size = 527565
* total size (opus+nt+bt+bt+bt): 527496
## Validation data
* bcl-en: wikimedia, 1153
* total-size-shuffled: 775
* devset-selected: top 250 lines of wikimedia.src.shuffled!
* testset-selected: next 525 lines of wikimedia.src.shuffled!
* devset-unused: added to traindata
* test set translations: [opus+nt+bt+bt+bt-2021-04-03.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus+nt+bt+bt+bt-2021-04-03.test.txt)
* test set scores: [opus+nt+bt+bt+bt-2021-04-03.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus+nt+bt+bt+bt-2021-04-03.eval.txt)
## Benchmarks
| testset | BLEU | chr-F | #sent | #words | BP |
|---------|-------|-------|-------|--------|----|
| wikimedia.en-bcl | 22.7 | 0.482 | 525 | 28399 | 0.895 |
# opus2+nt+bt+bt+bt-2021-04-03.zip
* dataset: opus2+nt+bt+bt+bt
* model: transformer-align
* source language(s): en
* target language(s): bcl
* model: transformer-align
* pre-processing: normalization + SentencePiece (spm12k,spm32k)
* download: [opus2+nt+bt+bt+bt-2021-04-03.zip](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus2+nt+bt+bt+bt-2021-04-03.zip)
## Training data: opus2+nt+bt+bt+bt
* en-bcl: JW300 (470468) new-testament (11623) wiki.aa (45474) wiki.aa_opus+nt+bt-2021-04-01 (45474)
* en-bcl: total size = 573039
* total size (opus2+nt+bt+bt+bt): 572969
## Validation data
* bcl-en: wikimedia, 1153
* total-size-shuffled: 775
* devset-selected: top 250 lines of wikimedia.src.shuffled!
* testset-selected: next 525 lines of wikimedia.src.shuffled!
* devset-unused: added to traindata
* test set translations: [opus2+nt+bt+bt+bt-2021-04-03.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus2+nt+bt+bt+bt-2021-04-03.test.txt)
* test set scores: [opus2+nt+bt+bt+bt-2021-04-03.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus2+nt+bt+bt+bt-2021-04-03.eval.txt)
## Benchmarks
| testset | BLEU | chr-F | #sent | #words | BP |
|---------|-------|-------|-------|--------|----|
| wikimedia.en-bcl | 23.9 | 0.497 | 525 | 28399 | 0.820 |
# opus+nt+bt+bt+bt+bt-2021-04-06.zip
* dataset: opus+nt+bt+bt+bt+bt
* model: transformer-align
* source language(s): en
* target language(s): bcl
* model: transformer-align
* pre-processing: normalization + SentencePiece (spm12k,spm32k)
* download: [opus+nt+bt+bt+bt+bt-2021-04-06.zip](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus+nt+bt+bt+bt+bt-2021-04-06.zip)
## Training data: opus+nt+bt+bt+bt+bt
* en-bcl: JW300 (470468) new-testament (11623) wiki.aa (45474) wiki.aa_opus+nt+bt+bt-2021-04-03 (45474) wiki.aa_opus+nt+bt-2021-04-01 (45474)
* en-bcl: total size = 618513
* total size (opus+nt+bt+bt+bt+bt): 618427
## Validation data
* bcl-en: wikimedia, 1153
* total-size-shuffled: 775
* devset-selected: top 250 lines of wikimedia.src.shuffled!
* testset-selected: next 525 lines of wikimedia.src.shuffled!
* devset-unused: added to traindata
* test set translations: [opus+nt+bt+bt+bt+bt-2021-04-06.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus+nt+bt+bt+bt+bt-2021-04-06.test.txt)
* test set scores: [opus+nt+bt+bt+bt+bt-2021-04-06.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus+nt+bt+bt+bt+bt-2021-04-06.eval.txt)
## Benchmarks
| testset | BLEU | chr-F | #sent | #words | BP |
|---------|-------|-------|-------|--------|----|
| wikimedia.en-bcl | 24.4 | 0.498 | 525 | 28399 | 0.805 |
# opus+nt+bt+bt-2021-04-10.zip
* dataset: opus+nt+bt+bt
* model: transformer-align
* source language(s): en
* target language(s): bcl
* model: transformer-align
* pre-processing: normalization + SentencePiece (spm12k,spm32k)
* download: [opus+nt+bt+bt-2021-04-10.zip](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus+nt+bt+bt-2021-04-10.zip)
## Training data: opus+nt+bt+bt
* en-bcl: JW300 (470468) new-testament (11623) wiki.aa (45494) wiki.aa_opus+nt+bt+bt+bt-2021-04-05 (45474) wiki.aa_opus+nt+bt+bt-2021-04-03 (45474) wiki.aa_opus+nt+bt-2021-04-01 (45474)
* en-bcl: total size = 664007
* unused dev/test data is added to training data
* total size (opus+nt+bt+bt): 665111
## Validation data
* bcl-en: wikimedia, 2767
* total-size-shuffled: 1966
* devset-selected: top 250 lines of wikimedia.src.shuffled!
* testset-selected: next 500 lines of wikimedia.src.shuffled!
* devset-unused: added to traindata
* test set translations: [opus+nt+bt+bt-2021-04-10.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus+nt+bt+bt-2021-04-10.test.txt)
* test set scores: [opus+nt+bt+bt-2021-04-10.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus+nt+bt+bt-2021-04-10.eval.txt)
## Benchmarks
| testset | BLEU | chr-F | #sent | #words | BP |
|---------|-------|-------|-------|--------|----|
| wikimedia.en-bcl | 30.7 | 0.572 | 500 | 29131 | 0.921 |
# opus+nt+bt-2021-04-11.zip
* dataset: opus+nt+bt
* model: transformer-align
* source language(s): en
* target language(s): bcl
* model: transformer-align
* pre-processing: normalization + SentencePiece (spm12k,spm32k)
* download: [opus+nt+bt-2021-04-11.zip](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus+nt+bt-2021-04-11.zip)
## Training data: opus+nt+bt
* en-bcl: JW300 (470468) new-testament (11623) wiki.aa (45494) wiki.aa_opus+nt+bt+bt+bt-2021-04-05 (45474) wiki.aa_opus+nt+bt+bt-2021-04-03 (45474) wiki.aa_opus+nt+bt-2021-04-01 (45474)
* en-bcl: total size = 664007
* unused dev/test data is added to training data
* total size (opus+nt+bt): 666118
## Validation data
* bcl-en: wikimedia, 5033
* total-size-shuffled: 4207
* devset-selected: top 1000 lines of wikimedia.src.shuffled!
* testset-selected: next 1000 lines of wikimedia.src.shuffled!
* devset-unused: added to traindata
* test set translations: [opus+nt+bt-2021-04-11.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus+nt+bt-2021-04-11.test.txt)
* test set scores: [opus+nt+bt-2021-04-11.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus+nt+bt-2021-04-11.eval.txt)
## Benchmarks
| testset | BLEU | chr-F | #sent | #words | BP |
|---------|-------|-------|-------|--------|----|
| wikimedia.en-bcl | 31.9 | 0.585 | 1000 | 27681 | 1.000 |


@ -0,0 +1,31 @@
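#
# compare released base models with their "+bt" (back-translation) counterparts;
# reads tab-separated score lines from stdin where field 4 is the model URL below
# https://object.pouta.csc.fi/Tatoeba-MT-models/ and fields 2 and 3 are numeric
# scores (presumably chr-F and BLEU); keeps the first entry per (field 1, model
# prefix) pair and prints the base line, the base+bt line and their score difference
#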
my %basemodel = ();
my %btmodel = ();
while (<>){
chomp;
s/https:\/\/object\.pouta\.csc\.fi\/Tatoeba\-MT\-models\///;
my @fields = split(/\t/);
if ($fields[3]=~/^(.*)\+bt-....-..-..\.zip/){
unless (exists $btmodel{"$fields[0]\t$1"}){
$btmodel{"$fields[0]\t$1"} = $_;
}
}
elsif ($fields[3]=~/^(.*)-....-..-..\.zip/){
unless (exists $basemodel{"$fields[0]\t$1"}){
$basemodel{"$fields[0]\t$1"} = $_;
}
}
}
foreach (sort keys %btmodel){
if (exists $basemodel{$_} and $btmodel{$_}){
print "base\t", $basemodel{$_},"\n";
print "base+bt\t", $btmodel{$_},"\n";
my @base = split(/\t/,$basemodel{$_});
my @bt = split(/\t/,$btmodel{$_});
$bt[1] = sprintf("%5.3f",$bt[1] - $base[1]);
$bt[2] = sprintf("%5.2f",$bt[2] - $base[2]);
print "diff\t", join("\t",@bt),"\n\n";
}
}

scripts/filter/filter-korean.sh Executable file

@ -0,0 +1,54 @@
#!/usr/bin/bash
#
# extra filtering for Korean data
# filter out data that contains characters outside the Hangul script (Hang)
#
# USAGE: filter-korean.sh srclangid trglangid dataset-name < tab-separated-bitext > filtered-bitext
#        (a dataset-name of "test" leaves the data untouched)
#
tmpsrc=`mktemp`
tmptrg=`mktemp`
tmplang=`mktemp`
if [ "$1" == "kor" ] || [ "$1" == "ko" ]; then
column=1
elif [ "$2" == "kor" ] || [ "$2" == "ko" ]; then
column=2
fi
## don't touch test sets
if [ "$3" == "test" ]; then
column=0
fi
if [ $column -gt 0 ]; then
echo "... filter Korean bitexts" >&2
perl -CIOE -pe 's/[\x{2060}\x{200B}\x{feff}]//g'
else
cat
fi
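# NOTE: the active branch above currently only strips zero-width spaces and BOMs;
# the script-based Hangul check below is kept as a comment because it is too slow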
## OLD: check script
## this is slow ....
# if [ $column -gt 0 ]; then
# echo "... filter Korean bitexts ($tmplang $tmpsrc $tmptrg)" >&2
# perl -CIOE -pe 's/[\x{2060}\x{200B}\x{feff}]//g' |
# tee >(cut -f1 > $tmpsrc) >(cut -f2 > $tmptrg) |
# cut -f$column |
# perl -CIOE -pe 'use utf8;s/\p{P}//g;s/[^\S\n]//g;s/▁//g;s/[0-9]//g' |
# langscript -a > $tmplang
# paste $tmplang $tmpsrc $tmptrg |
# grep $'Hang ([0-9]*)\s*\t' |
# cut -f2,3
# rm -f $tmplang $tmpsrc $tmptrg
# else
# cat
# fi
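#
# example call (as used from the Tatoeba data recipes; file names are illustrative):
#
#   paste corpus.kor corpus.eng | scripts/filter/filter-korean.sh kor eng wikimedia > corpus.kor-eng
#
# a third argument of "test" leaves the data untouched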


@ -93,6 +93,7 @@ else
-e 's/】/\]/g' \
-e 's/％/\%/g' |
perl -C -pe 's/(?!\n)\p{C}/ /g;' |
perl -CIOE -pe 's/[\x{2060}\x{200B}\x{feff}]//g' |\
sed 's/  */ /g;s/^ *//g;s/ *$//g' |
${SPMENCODE} --model $3 |
sed "s/^/>>$2<< /"


@ -53,6 +53,7 @@ sed -e 's/，/,/g' \
-e 's/】/\]/g' \
-e 's/％/\%/g' |
perl -C -pe 's/(?!\n)\p{C}/ /g;' |
perl -CIOE -pe 's/[\x{2060}\x{200B}\x{feff}]//g' |\
sed 's/  */ /g;s/^ *//g;s/ *$//g' |
${SPMENCODE} --model $2