Mirror of https://github.com/Helsinki-NLP/OPUS-MT-train.git (synced 2024-10-05 16:47:21 +03:00)

added recipes for tatoeba models other than English

This commit is contained in:
parent cde8f0d0af
commit 200662863e
@@ -523,17 +523,17 @@ endif
	@echo "" >> ${dir ${DEV_SRC}}/README.md
	@echo -n "* devset-selected: top " >> ${dir ${DEV_SRC}}/README.md
	@wc -l < ${DEV_SRC} | tr "\n" ' ' >> ${dir ${DEV_SRC}}/README.md
	@echo " lines of ${notdir $@}.shuffled!" >> ${dir ${DEV_SRC}}/README.md
	@echo " lines of ${notdir $@}.shuffled" >> ${dir ${DEV_SRC}}/README.md
ifeq (${DEVSET},${TESTSET})
	@echo -n "* testset-selected: next " >> ${dir ${DEV_SRC}}/README.md
	@wc -l < ${TEST_SRC} | tr "\n" ' ' >> ${dir ${DEV_SRC}}/README.md
	@echo " lines of ${notdir $@}.shuffled!" >> ${dir ${DEV_SRC}}/README.md
	@echo " lines of ${notdir $@}.shuffled " >> ${dir ${DEV_SRC}}/README.md
	@echo "* devset-unused: added to traindata" >> ${dir ${DEV_SRC}}/README.md
	@echo "# Test data" > ${dir ${TEST_SRC}}/README.md
	@echo "" >> ${dir ${TEST_SRC}}/README.md
	@echo -n "testset-selected: next " >> ${dir ${TEST_SRC}}/README.md
	@wc -l < ${TEST_SRC} | tr "\n" ' ' >> ${dir ${TEST_SRC}}/README.md
	@echo " lines of ../val/${notdir $@}.shuffled!" >> ${dir ${TEST_SRC}}/README.md
	@echo " lines of ../val/${notdir $@}.shuffled" >> ${dir ${TEST_SRC}}/README.md
endif
lib/dist.mk (260 changed lines)
@@ -34,6 +34,8 @@ get-model-distro = ${shell echo ${wildcard ${1}/${2}/*.zip} | tr ' ' "\n" | LAN

find-model:
	@echo ${call get-model-dist,${LANGPAIRSTR}}

@@ -42,7 +44,12 @@ find-model:

MIN_BLEU_SCORE = 20

.PHONY: dist local-dist global-dist release
dist: ${DIST_PACKAGE}

## create a symbolic link to the latest model
## and make the package
dist:
	${MAKE} link-latest-model
	${MAKE} ${DIST_PACKAGE}
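# Sketch (not part of the original recipe): after "make dist" the release
# directory would typically contain a dated package plus the undated symlink
# that link-latest-model creates (file names illustrative only):
#
#   models/xxx-yyy/opus-2021-04-12.zip
#   models/xxx-yyy/opus.zip -> opus-2021-04-12.zip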

## local distribution in workhome, no restrictions about BLEU
local-dist:
@@ -206,9 +213,9 @@ RAWTRGLANGS = ${sort ${basename ${basename ${subst _,.,${subst -,.,${TRGLANGS}}}
## advantage: list all labels that are valid in the model
## disadvantage: can be misleading because we may have labels that are not trained
##
LANGUAGELABELS = ${shell grep '">>.*<<"' ${MODEL_SRCVOCAB} | cut -f1 -d: | sed 's/"//g'}
LANGUAGELABELSRAW = ${shell echo "${LANGUAGELABELS}" | sed 's/>>//g;s/<<//g'}

LANGUAGELABELS = ${shell grep '">>.*<<"' ${MODEL_SRCVOCAB} | cut -f1 -d: | sed 's/"//g'}
LANGUAGELABELSRAW = ${shell echo "${LANGUAGELABELS}" | sed 's/>>//g;s/<<//g'}
LANGUAGELABELSUSED = $(filter ${TRGLANGS},${LANGUAGELABELSRAW})
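# Sketch of what the label extraction above does, on a toy vocabulary file
# (file name and entries invented for illustration; the pipeline is the one
# used in LANGUAGELABELS):
#
#   $ cat toy.vocab.yml
#   "</s>": 0
#   ">>deu<<": 1
#   ">>fra<<": 2
#   "hello": 3
#   $ grep '">>.*<<"' toy.vocab.yml | cut -f1 -d: | sed 's/"//g'
#   >>deu<<
#   >>fra<<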

model-yml: ${MODEL_YML}
@@ -249,24 +256,15 @@ ${MODEL_YML}: ${MODEL_FINAL}
	@echo "dataset-name: $(DATASET)" >> $@
	@echo "modeltype: $(MODELTYPE)" >> $@
	@echo "vocabulary:" >> $@
	@echo " source: ${notdir ${MODEL_SRCVOCAB}}" >> $@
	@echo " target: ${notdir ${MODEL_TRGVOCAB}}" >> $@
	@echo " source: ${notdir ${MODEL_SRCVOCAB}}" >> $@
	@echo " target: ${notdir ${MODEL_TRGVOCAB}}" >> $@
	@echo "pre-processing: ${PREPROCESS_DESCRIPTION}" >> $@
	@echo "subwords:" >> $@
	@echo " source: ${PRE_SRC}" >> $@
	@echo " target: ${PRE_TRG}" >> $@
	@echo " source: ${PRE_SRC}" >> $@
	@echo " target: ${PRE_TRG}" >> $@
	@echo "subword-models:" >> $@
	@echo " source: source.${SUBWORD_TYPE}" >> $@
	@echo " target: target.${SUBWORD_TYPE}" >> $@
ifdef USE_TARGET_LABELS
	@echo "use-target-labels:" >> $@
	@for t in ${LANGUAGELABELSRAW}; do \
	  echo " - \">>$$t<<\"" >> $@; \
	done
#	@for t in ${TRGLANGS}; do \
#	  echo " - '>>$$t<<'" >> $@; \
#	done
endif
	@echo " source: source.${SUBWORD_TYPE}" >> $@
	@echo " target: target.${SUBWORD_TYPE}" >> $@
	@echo "source-languages:" >> $@
	@for s in ${RAWSRCLANGS}; do\
	  echo " - $$s" >> $@; \
@@ -275,17 +273,26 @@ endif
	@for t in ${RAWTRGLANGS}; do\
	  echo " - $$t" >> $@; \
	done
ifdef USE_TARGET_LABELS
	@echo "use-target-labels:" >> $@
	@for t in ${LANGUAGELABELSUSED}; do \
	  echo " - \">>$$t<<\"" >> $@; \
	done
endif
ifneq ("$(wildcard ${WORKDIR}/train/README.md)","")
	@echo "training-data:" >> $@
	@tr "\n" "~" < ${WORKDIR}/train/README.md |\
	tr "#" "\n" | grep '^ ${DATASET}~' | \
	tail -1 | tr "~" "\n" | grep '^\* ' | \
	grep -v ': *$$' | grep -v ' 0$$' | \
	grep -v 'unused dev/test' | \
	grep -v 'total size' | sed 's/^\* / /' >> $@
endif
ifneq ("$(wildcard ${WORKDIR}/val/README.md)","")
	@echo "validation-data:" >> $@
	grep '^\* ' ${WORKDIR}/val/README.md | \
	sed 's/total size of shuffled dev data:/total-size-shuffled:/' | \
	sed 's/devset =/devset-selected:/' | \
	grep -v ' 0$$' | \
	sed 's/^\* / /' >> $@
endif
@@ -390,7 +397,12 @@ endif

link-latest-model:
	if [ `ls ${patsubst %.zip,%-*,${DIST_PACKAGE}} 2>/dev/null | wc -l` -gt 0 ]; then \
	  cd ${dir ${DIST_PACKAGE}}; \
	  ln -s `ls -t ${patsubst %.zip,%-*.zip,$(notdir ${DIST_PACKAGE})} | head -1` \
	        ${notdir ${DIST_PACKAGE}}; \
	fi

${DIST_PACKAGE}: ${MODEL_FINAL}
@ -505,182 +517,6 @@ endif
|
||||
|
||||
|
||||
|
||||
##### ------------------------------------
|
||||
##### OLD release recipe: all in one
|
||||
##### ------------------------------------
|
||||
|
||||
|
||||
# ${DIST_PACKAGE}: ${MODEL_FINAL}
|
||||
# ifneq (${SKIP_DIST_EVAL},1)
|
||||
# @${MAKE} $(TEST_EVALUATION)
|
||||
# @${MAKE} $(TEST_COMPARISON)
|
||||
# endif
|
||||
# @mkdir -p ${dir $@}
|
||||
# @touch ${WORKDIR}/source.tcmodel
|
||||
# @cp ${PREPROCESS_SRCMODEL} ${WORKDIR}/source.${SUBWORD_TYPE}
|
||||
# @cp ${PREPROCESS_TRGMODEL} ${WORKDIR}/target.${SUBWORD_TYPE}
|
||||
# @cp ${PREPROCESS_SCRIPT} ${WORKDIR}/preprocess.sh
|
||||
# @cp ${POSTPROCESS_SCRIPT} ${WORKDIR}/postprocess.sh
|
||||
# ##-----------------------------
|
||||
# ## create YML file
|
||||
# ##-----------------------------
|
||||
# @echo "release: ${LANGPAIRSTR}/$(notdir ${@:.zip=})-${DATE}.zip" > ${@:.zip=}-${DATE}.yml
|
||||
# @echo "release-date: $(DATE)" >> ${@:.zip=}-${DATE}.yml
|
||||
# @echo "dataset-name: $(DATASET)" >> ${@:.zip=}-${DATE}.yml
|
||||
# @echo "modeltype: $(MODELTYPE)" >> ${@:.zip=}-${DATE}.yml
|
||||
# @echo "pre-processing: ${PREPROCESS_DESCRIPTION}" >> ${@:.zip=}-${DATE}.yml
|
||||
# @echo "subwords:" >> ${@:.zip=}-${DATE}.yml
|
||||
# @echo " - source: ${PRE_SRC}" >> ${@:.zip=}-${DATE}.yml
|
||||
# @echo " - target: ${PRE_TRG}" >> ${@:.zip=}-${DATE}.yml
|
||||
# @echo "subword-models:" >> ${@:.zip=}-${DATE}.yml
|
||||
# @echo " - source: source.${SUBWORD_TYPE}" >> ${@:.zip=}-${DATE}.yml
|
||||
# @echo " - target: target.${SUBWORD_TYPE}" >> ${@:.zip=}-${DATE}.yml
|
||||
# ifdef USE_TARGET_LABELS
|
||||
# @echo "use-target-labels:" >> ${@:.zip=}-${DATE}.yml
|
||||
# @for t in ${TRGLANGS}; do \
|
||||
# echo " - >>$$t<<" >> ${@:.zip=}-${DATE}.yml; \
|
||||
# done
|
||||
# endif
|
||||
# @echo "source-languages:" >> ${@:.zip=}-${DATE}.yml
|
||||
# @for s in ${RAWSRCLANGS}; do\
|
||||
# echo " - $$s" >> ${@:.zip=}-${DATE}.yml; \
|
||||
# done
|
||||
# @echo "target-languages:" >> ${@:.zip=}-${DATE}.yml
|
||||
# @for t in ${RAWTRGLANGS}; do\
|
||||
# echo " - $$t" >> ${@:.zip=}-${DATE}.yml; \
|
||||
# done
|
||||
# ifneq ("$(wildcard ${WORKDIR}/train/README.md)","")
|
||||
# @echo "training-data:" >> ${@:.zip=}-${DATE}.yml
|
||||
# @tr "\n" "~" < ${WORKDIR}/train/README.md |\
|
||||
# tr "#" "\n" | grep '^ ${DATASET}~' | \
|
||||
# tail -1 | tr "~" "\n" | grep '^\* ' | \
|
||||
# grep -v ': *$$' | grep -v ' 0$$' | \
|
||||
# grep -v 'total size' | sed 's/^\* / - /' >> ${@:.zip=}-${DATE}.yml
|
||||
# endif
|
||||
# ifneq ("$(wildcard ${WORKDIR}/val/README.md)","")
|
||||
# @echo "validation-data:" >> ${@:.zip=}-${DATE}.yml
|
||||
# grep '^\* ' ${WORKDIR}/val/README.md | \
|
||||
# grep -v ' 0$$' | \
|
||||
# sed 's/^\* / - /' >> ${@:.zip=}-${DATE}.yml
|
||||
# endif
|
||||
# ##-----------------------------
|
||||
# ## create README-file
|
||||
# ##-----------------------------
|
||||
# @echo "# $(notdir ${@:.zip=})-${DATE}.zip" > ${WORKDIR}/README.md
|
||||
# @echo '' >> ${WORKDIR}/README.md
|
||||
# @echo "* dataset: ${DATASET}" >> ${WORKDIR}/README.md
|
||||
# @echo "* model: ${MODELTYPE}" >> ${WORKDIR}/README.md
|
||||
# @echo "* source language(s): ${RAWSRCLANGS}" >> ${WORKDIR}/README.md
|
||||
# @echo "* target language(s): ${RAWTRGLANGS}" >> ${WORKDIR}/README.md
|
||||
# @echo "* model: ${MODELTYPE}" >> ${WORKDIR}/README.md
|
||||
# @echo "* pre-processing: ${PREPROCESS_DESCRIPTION}" >> ${WORKDIR}/README.md
|
||||
# ifdef USE_TARGET_LABELS
|
||||
# echo '* a sentence initial language token is required in the form of `>>id<<` (id = valid target language ID)' >> ${WORKDIR}/README.md
|
||||
# @echo "* valid language labels: ${LANGUAGELABELS}" >> ${WORKDIR}/README.md
|
||||
# endif
|
||||
# @echo "* download: [$(notdir ${@:.zip=})-${DATE}.zip](${MODELS_URL}/${LANGPAIRSTR}/$(notdir ${@:.zip=})-${DATE}.zip)" >> ${WORKDIR}/README.md
|
||||
# ifneq (${SKIP_DATA_DETAILS},1)
|
||||
# ifneq ("$(wildcard ${WORKDIR}/train/README.md)","")
|
||||
# @echo -n "## Training data: " >> ${WORKDIR}/README.md
|
||||
# @tr "\n" "~" < ${WORKDIR}/train/README.md |\
|
||||
# tr "#" "\n" | grep '${DATASET}' | \
|
||||
# tail -1 | tr "~" "\n" >> ${WORKDIR}/README.md
|
||||
# @echo '' >> ${WORKDIR}/README.md
|
||||
# endif
|
||||
# ifneq ("$(wildcard ${WORKDIR}/val/README.md)","")
|
||||
# @echo -n "#" >> ${WORKDIR}/README.md
|
||||
# @cat ${WORKDIR}/val/README.md >> ${WORKDIR}/README.md
|
||||
# @echo '' >> ${WORKDIR}/README.md
|
||||
# endif
|
||||
# endif
|
||||
# ##-----------------------------
|
||||
# ## add benchmark results
|
||||
# ##-----------------------------
|
||||
# ifneq ("$(wildcard ${TEST_EVALUATION})","")
|
||||
# @echo "* test set translations: [$(notdir ${@:.zip=})-${DATE}.test.txt](${MODELS_URL}/${LANGPAIRSTR}/$(notdir ${@:.zip=})-${DATE}.test.txt)" >> ${WORKDIR}/README.md
|
||||
# @echo "* test set scores: [$(notdir ${@:.zip=})-${DATE}.eval.txt](${MODELS_URL}/${LANGPAIRSTR}/$(notdir ${@:.zip=})-${DATE}.eval.txt)" >> ${WORKDIR}/README.md
|
||||
# @echo '' >> ${WORKDIR}/README.md
|
||||
# @echo '## Benchmarks' >> ${WORKDIR}/README.md
|
||||
# @echo '' >> ${WORKDIR}/README.md
|
||||
# ## grep and normalise test set names
|
||||
# ## ugly perl script that does some transformation of language codes
|
||||
# @grep -H BLEU ${WORKDIR}/*.${DATASET}.${PRE_SRC}-${PRE_TRG}${NR}.${MODELTYPE}.*.eval | \
|
||||
# sed 's#^${WORKDIR}/\(.*\)\.${DATASET}.${PRE_SRC}-${PRE_TRG}${NR}.${MODELTYPE}\.\(.*\)\.eval:.*$$#\1.\2#' | \
|
||||
# perl -pe 'if (/\.([^\.]+)\.([^\.\s]+)$$/){$$s=$$1;$$t=$$2;s/[\-\.]$$s?\-?$$t\.$$s\.$$t?$$/.$$s.$$t/;s/\.$$s\.$$t$$/.$$s-$$t/}' > $@.1
|
||||
# @grep BLEU ${WORKDIR}/*.${DATASET}.${PRE_SRC}-${PRE_TRG}${NR}.${MODELTYPE}.*.eval | \
|
||||
# cut -f3 -d ' ' > $@.2
|
||||
# @grep chrF ${WORKDIR}/*.${DATASET}.${PRE_SRC}-${PRE_TRG}${NR}.${MODELTYPE}.*.eval | \
|
||||
# cut -f3 -d ' ' > $@.3
|
||||
# @ls ${WORKDIR}/*.${DATASET}.${PRE_SRC}-${PRE_TRG}${NR}.${MODELTYPE}.*.eval | \
|
||||
# sed 's/\.eval//' | xargs wc -l | grep -v total | sed 's/^ *//' | cut -f1 -d' ' > $@.4
|
||||
# @grep BLEU ${WORKDIR}/*.${DATASET}.${PRE_SRC}-${PRE_TRG}${NR}.${MODELTYPE}.*.eval | \
|
||||
# cut -f16 -d ' ' | sed 's/)//' > $@.5
|
||||
# @grep BLEU ${WORKDIR}/*.${DATASET}.${PRE_SRC}-${PRE_TRG}${NR}.${MODELTYPE}.*.eval | \
|
||||
# cut -f7 -d ' ' > $@.6
|
||||
# @paste -d '/' $@.4 $@.5 > $@.7
|
||||
# @echo '| testset | BLEU | chr-F | #sent | #words | BP |' >> ${WORKDIR}/README.md
|
||||
# @echo '|---------|-------|-------|-------|--------|----|' >> ${WORKDIR}/README.md
|
||||
# @paste $@.1 $@.2 $@.3 $@.4 $@.5 $@.6 | \
|
||||
# sed "s/\t/ | /g;s/^/| /;s/$$/ |/" | \
|
||||
# sort | uniq >> ${WORKDIR}/README.md
|
||||
# @echo "test-data:" >> ${@:.zip=}-${DATE}.yml
|
||||
# @paste -d' ' $@.1 $@.7 | sed 's/ /: /;s/^/ - /;' >> ${@:.zip=}-${DATE}.yml
|
||||
# @echo "BLEU-scores:" >> ${@:.zip=}-${DATE}.yml
|
||||
# @paste -d' ' $@.1 $@.2 | sed 's/ /: /;s/^/ - /' >> ${@:.zip=}-${DATE}.yml
|
||||
# @echo "chr-F-scores:" >> ${@:.zip=}-${DATE}.yml
|
||||
# @paste -d' ' $@.1 $@.3 | sed 's/ /: /;s/^/ - /' >> ${@:.zip=}-${DATE}.yml
|
||||
# @rm -f $@.1 $@.2 $@.3 $@.4 $@.5 $@.6 $@.7 $@.testsize $@.testset
|
||||
# endif
|
||||
# ##-----------------------------
|
||||
# ## create the package
|
||||
# ##-----------------------------
|
||||
# @cat ${WORKDIR}/README.md >> ${dir $@}README.md
|
||||
# @echo '' >> ${dir $@}README.md
|
||||
# @cp models/LICENSE ${WORKDIR}/
|
||||
# @chmod +x ${WORKDIR}/preprocess.sh
|
||||
# @sed -e 's# - .*/\([^/]*\)$$# - \1#' \
|
||||
# -e 's/beam-size: [0-9]*$$/beam-size: 6/' \
|
||||
# -e 's/mini-batch: [0-9]*$$/mini-batch: 1/' \
|
||||
# -e 's/maxi-batch: [0-9]*$$/maxi-batch: 1/' \
|
||||
# -e 's/relative-paths: false/relative-paths: true/' \
|
||||
# < ${MODEL_DECODER} > ${WORKDIR}/decoder.yml
|
||||
# cd ${WORKDIR} && zip ${notdir $@} \
|
||||
# README.md LICENSE \
|
||||
# ${notdir ${MODEL_FINAL}} \
|
||||
# ${notdir ${MODEL_SRCVOCAB}} \
|
||||
# ${notdir ${MODEL_TRGVOCAB}} \
|
||||
# ${notdir ${MODEL_VALIDLOG}} \
|
||||
# ${notdir ${MODEL_TRAINLOG}} \
|
||||
# source.* target.* decoder.yml \
|
||||
# preprocess.sh postprocess.sh
|
||||
# ifneq ("$(wildcard ${WORKDIR}/${MODELCONFIG})","")
|
||||
# @cd ${WORKDIR} && zip -u ${notdir $@} ${MODELCONFIG}
|
||||
# endif
|
||||
# ##-----------------------------
|
||||
# ## move files to release dir and cleanup
|
||||
# ##-----------------------------
|
||||
# @mkdir -p ${dir $@}
|
||||
# @mv -f ${WORKDIR}/${notdir $@} ${@:.zip=}-${DATE}.zip
|
||||
# @cd ${dir $@} && zip -u ${notdir ${@:.zip=}-${DATE}.zip} ${notdir ${@:.zip=}-${DATE}.yml}
|
||||
# ifneq ("$(wildcard ${TEST_EVALUATION})","")
|
||||
# @cp $(TEST_EVALUATION) ${@:.zip=}-${DATE}.eval.txt
|
||||
# @cp $(TEST_COMPARISON) ${@:.zip=}-${DATE}.test.txt
|
||||
# endif
|
||||
# @rm -f $@
|
||||
# @cd ${dir $@} && ln -s $(notdir ${@:.zip=})-${DATE}.zip ${notdir $@}
|
||||
# @rm -f ${WORKDIR}/decoder.yml ${WORKDIR}/source.* ${WORKDIR}/target.*
|
||||
# @rm -f ${WORKDIR}/preprocess.sh ${WORKDIR}/postprocess.sh
|
||||
## do this only if the flag is set
## --> avoid expensive wildcard searches each time make is called

@@ -695,6 +531,7 @@ endif

# source project_2000661-openrc.sh
#
# - make upload ......... released models = all sub-dirs in models/
# - make upload-model ... upload model for current language pair
# - make upload-models .. trained models in current WORKHOME to OPUS-MT-dev
# - make upload-scores .. score file with benchmark results to OPUS-MT-eval
# - make upload-eval .... benchmark tests from models in WORKHOME
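#
# A typical upload session, as a sketch based on the comments above (the
# openrc file name comes from those comments; the swift containers are
# configured elsewhere in this makefile):
#
#   source project_2000661-openrc.sh
#   make upload-model      # release package for the current language pair
#   make upload-scores     # benchmark score files to OPUS-MT-eval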

@@ -712,6 +549,17 @@ upload:
	swift upload ${MODEL_CONTAINER} index.txt
	rm -f index.txt

.PHONY: upload-model
upload-model:
	find ${RELEASEDIR}/ -type l | tar -cf models-links.tar -T -
	find ${RELEASEDIR}/ -type l -delete
	cd ${RELEASEDIR} && swift upload ${MODEL_CONTAINER} --changed --skip-identical ${LANGPAIRSTR}
	tar -xf models-links.tar
	rm -f models-links.tar
	swift post ${MODEL_CONTAINER} --read-acl ".r:*"
	swift list ${MODEL_CONTAINER} > index.txt
	swift upload ${MODEL_CONTAINER} index.txt
	rm -f index.txt

.PHONY: upload-models
upload-models:
@@ -968,7 +816,7 @@ dist-remove-no-date-dist:

dist-remove-old-yml:
	swift list Tatoeba-MT-models > index.txt
	for d in `grep old-yml index.txt`; do \
	for d in `grep yml-old index.txt`; do \
	  swift delete Tatoeba-MT-models $$d; \
	done

@@ -993,3 +841,21 @@ dist-fix-preprocess:
	  rm -f $$d; \
	done )

## fix yet another error in YAML files

# YMLFILES = ${wildcard models-tatoeba/eng-*/*-2021-04-10.yml}
# OLDYMLFILES = ${patsubst %.yml,%.yml-old,${YMLFILES}}

# ${OLDYMLFILES}: %.yml-old: %.yml
#	mv $< $@
#	sed -e 's/devset =/devset-selected:/' \
#	    -e 's/testset =/testset-selected:/' \
#	    -e 's/total size of shuffled dev data:/total-size-shuffled:/' < $@ |\
#	grep -v 'unused dev/test' > $<
#	touch $@

# fix-yml-files: ${OLDYMLFILES}

@@ -68,8 +68,8 @@ WORKHOME = ${PWD}/work

ifeq (${shell hostname -d 2>/dev/null},mahti.csc.fi)
# CSCPROJECT = project_2002688
CSCPROJECT = project_2003093
CSCPROJECT = project_2002688
# CSCPROJECT = project_2003093
# CSCPROJECT = project_2002982
WORKHOME = ${shell realpath ${PWD}/work}
APPLHOME = /projappl/project_2003093/
@@ -182,7 +182,6 @@ MARIAN_VOCAB = ${MARIAN_HOME}marian-vocab
TOKENIZER = ${MOSESSCRIPTS}/tokenizer

## BPE
SUBWORD_BPE ?= ${shell which subword-nmt 2>/dev/null || echo ${TOOLSDIR}/subword-nmt/subword_nmt/subword_nmt.py}
SUBWORD_HOME ?= ${dir ${SUBWORD_BPE}}
@@ -270,6 +270,18 @@ endif
#	CLEAN_TRAIN_TRG="${CLEAN_TRAIN_TRG} ${BACKTRANS_TRG}" \

## train on back-translations only
%-btonly:
	rm -f ${WORKHOME}/${LANGPAIRSTR}/train.submit
	${MAKE} DATASET=${DATASET}+btonly \
		USE_BACKTRANS=1 \
		CONTINUE_EXISTING=1 \
		MODELCONFIG=config-bt.mk \
		TRAINSET= TATOEBA_TRAINSET= \
	${@:-btonly=}

PIVOT_MODEL = ${MODEL_SUBDIR}${DATASET}+pivot${TRAINSIZE}.${PRE_SRC}-${PRE_TRG}
PIVOT_MODEL_BASE = ${PIVOT_MODEL}.${MODELTYPE}.model${NR}

@@ -13,12 +13,18 @@
## - should we increase the length filter when cleaning later? How much?
## - should we apply some other cleanup scripts here to get rid of some messy stuff?

## should we remove zero-width spaces?
## perl -CIOE -pe 's/[\x{2060}\x{200B}\x{feff}]//g'
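##
## quick sanity check of that one-liner (illustrative only; the sample string
## contains a U+200B zero-width space between "foo" and "bar"):
##
##   printf 'foo\342\200\213bar\n' | perl -CIOE -pe 's/[\x{2060}\x{200B}\x{feff}]//g'
##   foobar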

%.clean.${SRCEXT}.gz: %.${SRCEXT}.${PRE} %.${TRGEXT}.${PRE}
	cat ${word 1,$^} |\
	perl -CS -pe 'tr[\x{9}\x{A}\x{D}\x{20}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}][]cd;' |\
	perl -CIOE -pe 's/[\x{2060}\x{200B}\x{feff}]//g' |\
	perl -CS -pe 's/\&\s*\#\s*160\s*\;/ /g' > $@.1
	cat ${word 2,$^} |\
	perl -CS -pe 'tr[\x{9}\x{A}\x{D}\x{20}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}][]cd;' |\
	perl -CIOE -pe 's/[\x{2060}\x{200B}\x{feff}]//g' |\
	perl -CS -pe 's/\&\s*\#\s*160\s*\;/ /g' > $@.2
	paste $@.1 $@.2 |\
	scripts/filter/bitext-match-lang.py -s ${SRC} -t ${TRG} > $@.bitext
@@ -106,6 +106,24 @@ TATOEBA_WORK ?= ${PWD}/work-tatoeba
TATOEBA_DATA ?= ${TATOEBA_WORK}/data/${PRE}
TATOEBA_MONO ?= ${TATOEBA_WORK}/data/mono

## data count files (file basename)
TATOEBA_DATA_COUNT_BASE = ${TATOEBA_RAWGIT}/data/release/${TATOEBA_VERSION}/released-bitexts

## all released language pairs with test sets > 200 test pairs
## also extract all source languages that are available for a given target language
## and vice versa
TATOEBA_RELEASED_DATA = $(shell wget -qq -O - ${TATOEBA_DATA_COUNT_BASE}-min200.txt | cut -f1)
TATOEBA_AVAILABLE_TRG = ${sort ${filter-out ${SRC},${subst -, ,${filter %-${SRC} ${SRC}-%,${TATOEBA_RELEASED_DATA}}}}}
TATOEBA_AVAILABLE_SRC = ${sort ${filter-out ${TRG},${subst -, ,${filter %-${TRG} ${TRG}-%,${TATOEBA_RELEASED_DATA}}}}}

## extract language pairs for a specific subset
TATOEBA_SUBSET = lower
TATOEBA_RELEASED_SUBSET = $(shell wget -qq -O - ${TATOEBA_DATA_COUNT_BASE}-${TATOEBA_SUBSET}.txt | cut -f1)
TATOEBA_AVAILABLE_SUBSET_TRG = ${sort ${filter-out ${SRC},${subst -, ,${filter %-${SRC} ${SRC}-%,${TATOEBA_RELEASED_SUBSET}}}}}
TATOEBA_AVAILABLE_SUBSET_SRC = ${sort ${filter-out ${TRG},${subst -, ,${filter %-${TRG} ${TRG}-%,${TATOEBA_RELEASED_SUBSET}}}}}
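#
# Toy illustration of the filter/filter-out pattern used above (variable names
# and data invented; only the pattern matches the definitions above):
#
#   TOY_PAIRS := eng-deu deu-fra spa-eng
#   SRC       := deu
#   ${filter %-${SRC} ${SRC}-%,${TOY_PAIRS}}   ->  eng-deu deu-fra
#   ${subst -, ,...}                           ->  eng deu deu fra
#   ${sort ${filter-out ${SRC},...}}           ->  eng fra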

WIKILANGS ?= ${notdir ${wildcard backtranslate/wiki-iso639-3/*}}
WIKIMACROLANGS ?= $(sort ${shell ${GET_ISO_CODE} ${WIKILANGS}})
@@ -119,7 +137,7 @@ TATOEBA_TESTSET_NAME = Tatoeba-test
TATOEBA_RELEASEDIR = ${PWD}/models-tatoeba
TATOEBA_MODELSHOME = ${PWD}/models-tatoeba
TATOEBA_BTHOME = ${PWD}/bt-tatoeba

TATOEBA_MIN_BLEU = 10

## file with the source and target languages in the current model

@@ -163,7 +181,7 @@ TATOEBA_PARAMS := TRAINSET=${TATOEBA_TRAINSET} \
		MODEL_CONTAINER=${TATOEBA_MODEL_CONTAINER} \
		ALT_MODEL_DIR=tatoeba \
		SKIP_DATA_DETAILS=1 \
		MIN_BLEU_SCORE=10
		MIN_BLEU_SCORE=${TATOEBA_MIN_BLEU}

@@ -340,6 +358,92 @@ tatoeba-refresh-finished:
	done

###########################################################################################
# start combinations with a specific source/target language
###########################################################################################
#
# make SRC=deu tatoeba-src2all-reasonable
# make SRC=deu tatoeba-src2all-small
#
# make TRG=deu tatoeba-all2trg-reasonable
# make TRG=deu tatoeba-all2trg-small
#

tatoeba-src2all:
	for l in ${TATOEBA_AVAILABLE_SUBSET_TRG}; do \
	  ${MAKE} tatoeba-${SRC}2$$l-train; \
	done

tatoeba-src2langgroup:
	for l in ${sort ${shell langgroup -p -n ${TATOEBA_AVAILABLE_SUBSET_TRG} 2>/dev/null}}; do \
	  ${MAKE} tatoeba-${SRC}2$$l-train-1m; \
	done

tatoeba-all2trg:
	for l in ${TATOEBA_AVAILABLE_SUBSET_SRC}; do \
	  ${MAKE} tatoeba-$${l}2${TRG}-train; \
	done

tatoeba-all2trg-print:
	for l in ${TATOEBA_AVAILABLE_SUBSET_SRC}; do \
	  echo "${MAKE} tatoeba-$${l}2${TRG}-train"; \
	done

tatoeba-langgroup2trg:
	for l in ${sort ${shell langgroup -p -n ${TATOEBA_AVAILABLE_SUBSET_SRC} 2>/dev/null}}; do \
	  ${MAKE} tatoeba-$${l}2${TRG}-train-1m; \
	done

## all subsets

tatoeba-src2all-subsets:
	${MAKE} TATOEBA_SUBSET=lowest tatoeba-src2all
	${MAKE} TATOEBA_SUBSET=lower tatoeba-src2all
	${MAKE} TATOEBA_SUBSET=medium tatoeba-src2all
	${MAKE} TATOEBA_SUBSET=higher tatoeba-src2all
	${MAKE} TATOEBA_SUBSET=highest tatoeba-src2all

tatoeba-all2trg-subsets:
	${MAKE} TATOEBA_SUBSET=lowest tatoeba-all2trg
	${MAKE} TATOEBA_SUBSET=lower tatoeba-all2trg
	${MAKE} TATOEBA_SUBSET=medium tatoeba-all2trg
	${MAKE} TATOEBA_SUBSET=higher tatoeba-all2trg
	${MAKE} TATOEBA_SUBSET=highest tatoeba-all2trg

## reasonable size (all except lower and lowest)

tatoeba-src2all-reasonable:
	${MAKE} TATOEBA_SUBSET=medium tatoeba-src2all
	${MAKE} TATOEBA_SUBSET=higher tatoeba-src2all
	${MAKE} TATOEBA_SUBSET=highest tatoeba-src2all

tatoeba-all2trg-reasonable:
	${MAKE} TATOEBA_SUBSET=medium tatoeba-all2trg
	${MAKE} TATOEBA_SUBSET=higher tatoeba-all2trg
	${MAKE} TATOEBA_SUBSET=highest tatoeba-all2trg

## backoff to multilingual models and language groups
## lower / lowest resource languages and zero-shot

tatoeba-src2all-small:
	${MAKE} TATOEBA_SUBSET=lower tatoeba-src2langgroup
	${MAKE} TATOEBA_SUBSET=lowest tatoeba-src2langgroup
	${MAKE} TATOEBA_SUBSET=zero-shot tatoeba-src2langgroup

tatoeba-all2trg-small:
	${MAKE} TATOEBA_SUBSET=lower tatoeba-langgroup2trg
	${MAKE} TATOEBA_SUBSET=lowest tatoeba-langgroup2trg
	${MAKE} TATOEBA_SUBSET=zero-shot tatoeba-langgroup2trg
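#
# Sketch of the intended effect (group codes illustrative; they come from the
# external "langgroup -p" tool used above): with TRG=deu and
# TATOEBA_SUBSET=lowest, tatoeba-langgroup2trg would start jobs like
#
#   make tatoeba-gmw2deu-train-1m
#
# i.e. one multilingual model per language group instead of one per pair.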

###########################################################################################
# models for backtranslation
@ -1732,11 +1836,12 @@ ${TATOEBA_MONO}/%.labels:
|
||||
paste ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.id \
|
||||
${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${SRCEXT} \
|
||||
${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${TRGEXT} |\
|
||||
grep -P "$$s\t$$t\t" > ${dir $@}Tatoeba-$$d.$$s-$$t; \
|
||||
grep -P "$$s\t$$t\t" | cut -f3,4 |\
|
||||
scripts/filter/filter-korean.sh ${SRC} ${TRG} $$d > ${dir $@}Tatoeba-$$d.$$s-$$t; \
|
||||
if [ -s ${dir $@}Tatoeba-$$d.$$s-$$t ]; then \
|
||||
echo "........ make ${dir $@}Tatoeba-$$d.$$s-$$t.clean.*.gz"; \
|
||||
cut -f3 ${dir $@}Tatoeba-$$d.$$s-$$t | ${GZIP} -c > ${dir $@}Tatoeba-$$d.$$s-$$t.clean.$$s.gz; \
|
||||
cut -f4 ${dir $@}Tatoeba-$$d.$$s-$$t | ${GZIP} -c > ${dir $@}Tatoeba-$$d.$$s-$$t.clean.$$t.gz; \
|
||||
cut -f1 ${dir $@}Tatoeba-$$d.$$s-$$t | ${GZIP} -c > ${dir $@}Tatoeba-$$d.$$s-$$t.clean.$$s.gz; \
|
||||
cut -f2 ${dir $@}Tatoeba-$$d.$$s-$$t | ${GZIP} -c > ${dir $@}Tatoeba-$$d.$$s-$$t.clean.$$t.gz; \
|
||||
fi; \
|
||||
rm -f ${dir $@}Tatoeba-$$d.$$s-$$t; \
|
||||
fi \
|
||||
@ -1748,11 +1853,12 @@ ${TATOEBA_MONO}/%.labels:
|
||||
paste ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.id \
|
||||
${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${TRGEXT} \
|
||||
${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${SRCEXT} |\
|
||||
grep -P "$$s\t$$t\t" > ${dir $@}Tatoeba-$$d.$$t-$$s; \
|
||||
grep -P "$$s\t$$t\t" | cut -f3,4 |\
|
||||
scripts/filter/filter-korean.sh ${TRG} ${SRC} $$d > ${dir $@}Tatoeba-$$d.$$t-$$s; \
|
||||
if [ -s ${dir $@}Tatoeba-$$d.$$t-$$s ]; then \
|
||||
echo "........ make ${dir $@}Tatoeba-$$d.$$t-$$s.clean.*.gz"; \
|
||||
cut -f3 ${dir $@}Tatoeba-$$d.$$t-$$s | ${GZIP} -c > ${dir $@}Tatoeba-$$d.$$t-$$s.clean.$$t.gz; \
|
||||
cut -f4 ${dir $@}Tatoeba-$$d.$$t-$$s | ${GZIP} -c > ${dir $@}Tatoeba-$$d.$$t-$$s.clean.$$s.gz; \
|
||||
cut -f1 ${dir $@}Tatoeba-$$d.$$t-$$s | ${GZIP} -c > ${dir $@}Tatoeba-$$d.$$t-$$s.clean.$$t.gz; \
|
||||
cut -f2 ${dir $@}Tatoeba-$$d.$$t-$$s | ${GZIP} -c > ${dir $@}Tatoeba-$$d.$$t-$$s.clean.$$s.gz; \
|
||||
fi; \
|
||||
rm -f ${dir $@}Tatoeba-$$d.$$t-$$s; \
|
||||
fi \
|
||||
|
@ -9,6 +9,11 @@ as-en:
|
||||
${MAKE} reverse-data-as-en
|
||||
${MAKE} train-dynamic-en-as
|
||||
|
||||
|
||||
BCL_DEVSIZE = 1000
|
||||
BCL_TESTSIZE = 1000
|
||||
|
||||
|
||||
en-bcl:
|
||||
${MAKE} SRCLANGS="en" TRGLANGS="bcl" DEVSET=wikimedia all-job
|
||||
|
||||
@ -29,25 +34,31 @@ bcl-en-nt:
|
||||
DEVSET=wikimedia all-job
|
||||
|
||||
%-en-bcl:
|
||||
${MAKE} SRCLANGS="en" TRGLANGS="bcl" DEVSET=wikimedia ${@:-en-bcl=}
|
||||
${MAKE} SRCLANGS="en" TRGLANGS="bcl" \
|
||||
DEVSET=wikimedia \
|
||||
DEVSIZE=${BCL_DEVSIZE} TESTSIZE=${BCL_TESTSIZE} \
|
||||
USE_REST_DEVDATA=1 ${@:-en-bcl=}
|
||||
|
||||
|
||||
%-bcl-en:
|
||||
${MAKE} SRCLANGS="en" TRGLANGS="bcl" DEVSET=wikimedia ${@:-bcl-en=}
|
||||
${MAKE} SRCLANGS="en" TRGLANGS="bcl" \
|
||||
DEVSET=wikimedia \
|
||||
DEVSIZE=${BCL_DEVSIZE} TESTSIZE=${BCL_TESTSIZE} \
|
||||
USE_REST_DEVDATA=1 ${@:-bcl-en=}
|
||||
|
||||
|
||||
%-en-bcl-nt:
|
||||
${MAKE} SRCLANGS="en" TRGLANGS="bcl" \
|
||||
DATASET=${DATASET}+nt \
|
||||
EXTRA_TRAINSET="new-testament" \
|
||||
DEVSET=wikimedia \
|
||||
DEVSET=wikimedia DEVSIZE=${BCL_DEVSIZE} TESTSIZE=${BCL_TESTSIZE} USE_REST_DEVDATA=1 \
|
||||
${@:-en-bcl-nt=}
|
||||
|
||||
%-bcl-en-nt:
|
||||
${MAKE} SRCLANGS="bcl" TRGLANGS="en" \
|
||||
DATASET=${DATASET}+nt \
|
||||
EXTRA_TRAINSET="new-testament" \
|
||||
DEVSET=wikimedia \
|
||||
DEVSET=wikimedia DEVSIZE=${BCL_DEVSIZE} TESTSIZE=${BCL_TESTSIZE} USE_REST_DEVDATA=1 \
|
||||
${@:-bcl-en-nt=}
|
||||
|
||||
|
||||
@ -72,34 +83,228 @@ ENBCL_BPE = 1000
|
||||
|
||||
|
||||
|
||||
#-----------------------------------------------------------------------------
|
||||
# start jobs for all languages where we have back-translations into English
|
||||
#-----------------------------------------------------------------------------
|
||||
|
||||
## languages for which we have back translated wiki data into English
|
||||
WIKI_BT2ENG = abk ady afr amh ang ara arg asm ast awa aze bak bam bar bel ben bod bre bul cat ceb ces cha che chr chv cor cos crh csb cym dan deu dsb ell epo est eus ewe ext fao fas fij fin fra frr fry ful gla gle glg glv got grn guj hat hau haw hbs heb hif hin hsb hun hye ibo ido iku ile ilo ina isl ita jam jav jbo jpn kab kal kan kat kaz khm kin kir kom kor ksh kur lad lao lat lav lfn lij lin lit lmo ltz lug mah mai mal mar mdf mkd mlg mlt mnw mon mri msa mwl mya myv nau nav nds nep nld nor nov nya oci ori oss pag pan pap pdc pms pol por pus que roh rom ron rue run rus sag sah san scn sco sin slv sme smo sna snd som spa sqi stq sun swa swe tah tam tat tel tet tgk tgl tha tir ton tpi tso tuk tur tyv udm uig ukr urd uzb vec vie vol war wln wol xal xho yid yor zho zul bos_Latn cmn_Hans cmn_Hant hrv ind nno nob srp_Cyrl srp_Latn
|
||||
|
||||
|
||||
WIKI_BT2ENG = abk ady afr amh ang ara arg asm ast awa aze bak bam bar bel ben bod bre bul cat ceb ces cha che chr chv cor cos crh csb cym dan deu dsb ell epo est eus ewe ext fao fas fij fin fra frr fry ful gla gle glg glv got grn guj hat hau haw hbs heb hif hin hsb hun hye ibo ido iku ile ilo ina isl ita jam jav jbo jpn kab kal kan kat kaz khm kin kir kom kor ksh kur lad lao lat lav lfn lij lin lit lmo ltz lug mah mai mal mar mdf mkd mlg mlt mnw mon mri msa mwl mya myv nau nav nds nep nld nor nov nya oci ori oss pag pan pap pdc pms pol por pus que roh rom ron rue run rus sag sah san scn sco sin slv sme smo sna snd som spa sqi stq sun swa swe tah tam tat tel tet tgk tgl tha tir ton tpi tso tuk tur tyv udm uig ukr urd uzb vec vie vol war wln wol xal xho yid yor zho zul
|
||||
|
||||
|
||||
## start jobs for all languages where we have back-translations
|
||||
|
||||
## start jobs for all languages where we have back-translations into English
|
||||
wiki-eng2all-with-bt:
|
||||
for l in ${WIKI_BT2ENG}; do \
|
||||
if [ -d work-tatoeba/$$l-eng ]; then \
|
||||
if [ `cat work-tatoeba/$$l-eng/opus-langlabels.src | tr " " "\n" | grep . | wc -l` -eq 1 ]; then \
|
||||
echo "fetch back-translations for $$l-eng"; \
|
||||
${MAKE} -C bt-tatoeba SRC=$$l TRG=eng fetch-bt; \
|
||||
echo "start training eng-$$l with backtranslation data"; \
|
||||
${MAKE} HPC_MEM=32g HPC_CORES=4 tatoeba-eng2$$l-train-bt.submitcpu; \
|
||||
echo "fetch $$l wiki backtranslations"; \
|
||||
${MAKE} -C bt-tatoeba SRC=$$l TRG=eng fetch-bt; \
|
||||
done
|
||||
for l in ${sort ${shell iso639 -m -n ${WIKI_BT2ENG}}}; do \
|
||||
echo "start training eng-$$l with backtranslation data"; \
|
||||
${MAKE} EMAIL= HPC_MEM=32g HPC_CORES=4 tatoeba-eng2$$l-train-bt.submitcpu; \
|
||||
done
|
||||
|
||||
# for l in ${WIKI_BT2ENG}; do \
|
||||
# if [ -d work-tatoeba/$$l-eng ]; then \
|
||||
# if [ `cat work-tatoeba/$$l-eng/opus-langlabels.src | tr " " "\n" | grep . | wc -l` -eq 1 ]; then \
|
||||
# echo "fetch back-translations for $$l-eng"; \
|
||||
# ${MAKE} -C bt-tatoeba SRC=$$l TRG=eng fetch-bt; \
|
||||
# echo "start training eng-$$l with backtranslation data"; \
|
||||
# ${MAKE} EMAIL= HPC_MEM=32g HPC_CORES=4 tatoeba-eng2$$l-train-bt.submitcpu; \
|
||||
# fi \
|
||||
# fi \
|
||||
# done
|
||||
|
||||
wiki-eng2all-with-bt-continue:
|
||||
for l in ${WIKI_BT2ENG}; do \
|
||||
if [ -d work-tatoeba/eng-$$l ]; then \
|
||||
if [ ! `find work-tatoeba/eng-$$l -name 'opus+bt.*model1.done' | wc -l` -gt 0 ]; then \
|
||||
echo "continue training eng-$$l with backtranslation data"; \
|
||||
${MAKE} EMAIL= tatoeba-eng2$$l-train-bt; \
|
||||
fi \
|
||||
fi \
|
||||
done
|
||||
|
||||
WIKI_BT2ENG_PARENTS = ${sort ${shell langgroup -p ${WIKI_BT2ENG}}}
|
||||
wiki-eng2all-with-bt-eval:
|
||||
for l in ${WIKI_BT2ENG}; do \
|
||||
if [ -d work-tatoeba/eng-$$l ]; then \
|
||||
if [ `find work-tatoeba/eng-$$l -name 'opus+bt*best-perplexity.npz' | wc -l` -gt 0 ]; then \
|
||||
${MAKE} EMAIL= WALLTIME=4 tatoeba-eng2$$l-evalall-bt.submit; \
|
||||
fi \
|
||||
fi \
|
||||
done
|
||||
|
||||
wiki-eng2allgroups-with-bt:
|
||||
for l in $(filter-out roa,${WIKI_BT2ENG_PARENTS}); do \
|
||||
# if [ `find work-tatoeba/eng-$$l -name 'opus+bt.*model1.done' | wc -l` -gt 0 ]; then \
|
||||
|
||||
wiki-eng2all-with-bt-dist:
|
||||
for l in ${WIKI_BT2ENG}; do \
|
||||
if [ -d work-tatoeba/eng-$$l ]; then \
|
||||
echo "mv work-tatoeba/eng-$$l work-tatoeba-old"; \
|
||||
mv work-tatoeba/eng-$$l work-tatoeba-old; \
|
||||
fi; \
|
||||
echo "start training eng-$$l with backtranslation data"; \
|
||||
${MAKE} HPC_MEM=32g HPC_CORES=4 tatoeba-eng2$$l-train-bt-1m.submitcpu; \
|
||||
if [ `find work-tatoeba/eng-$$l -name 'opus+bt*best-perplexity.npz' | wc -l` -gt 0 ]; then \
|
||||
echo "continue training eng-$$l with backtranslation data"; \
|
||||
${MAKE} EMAIL= tatoeba-eng2$$l-dist-bt; \
|
||||
fi \
|
||||
fi \
|
||||
done
|
||||
|
||||
|
||||
|
||||
#-----------------------------------------------------------------------------
|
||||
# models for translating English into language groups with backtranslations
|
||||
# (does not fetch back-translations - they need to be available in bt-tatoeba!)
|
||||
#-----------------------------------------------------------------------------
|
||||
|
||||
WIKI_BT2ENG_PARENTS = ${sort ${shell iso639 -m -n ${WIKI_BT2ENG} | xargs langgroup -p}}
|
||||
|
||||
wiki-eng2allgroups-with-bt:
|
||||
for l in ${WIKI_BT2ENG_PARENTS}; do \
|
||||
echo "start training eng-$$l with backtranslation data"; \
|
||||
${MAKE} EMAIL= HPC_MEM=32g HPC_CORES=4 tatoeba-eng2$$l-train-bt-1m.submitcpu; \
|
||||
done
|
||||
|
||||
wiki-eng2allgroups-with-bt-continue:
|
||||
for l in ${WIKI_BT2ENG_PARENTS}; do \
|
||||
if [ -d work-tatoeba/eng-$$l ]; then \
|
||||
if [ ! `find work-tatoeba/eng-$$l -name 'opus1m+bt.*model1.done' | wc -l` -gt 0 ]; then \
|
||||
echo "continue training eng-$$l with backtranslation data"; \
|
||||
${MAKE} EMAIL= tatoeba-eng2$$l-train-bt-1m; \
|
||||
fi \
|
||||
fi \
|
||||
done
|
||||
|
||||
wiki-eng2allgroups-with-bt-eval:
|
||||
for l in ${WIKI_BT2ENG_PARENTS}; do \
|
||||
if [ -d work-tatoeba/eng-$$l ]; then \
|
||||
if [ `find work-tatoeba/eng-$$l -name 'opus1m+bt*best-perplexity.npz' | wc -l` -gt 0 ]; then \
|
||||
${MAKE} EMAIL= WALLTIME=8 tatoeba-eng2$$l-evalall-bt-1m.submit; \
|
||||
fi \
|
||||
fi \
|
||||
done
|
||||
|
||||
# if [ `find work-tatoeba/eng-$$l -name 'opus1m+bt.*model1.done' | wc -l` -gt 0 ]; then \
|
||||
|
||||
wiki-eng2allgroups-with-bt-dist:
|
||||
for l in ${WIKI_BT2ENG_PARENTS}; do \
|
||||
if [ -d work-tatoeba/eng-$$l ]; then \
|
||||
if [ `find work-tatoeba/eng-$$l -name 'opus1m+bt*best-perplexity.npz' | wc -l` -gt 0 ]; then \
|
||||
echo "continue training eng-$$l with backtranslation data"; \
|
||||
${MAKE} EMAIL= tatoeba-eng2$$l-dist-bt-1m; \
|
||||
fi \
|
||||
fi \
|
||||
done
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
#-----------------------------------------------------------------------------
|
||||
# start jobs for all languages where we have back-translations from English
|
||||
#-----------------------------------------------------------------------------
|
||||
|
||||
## languages for which we have back translated wiki data from English
|
||||
|
||||
WIKI_ENG2BT = afr ara aze bel ben bos_Latn bre bul cat ceb ces cmn_Hans cmn_Hant cym dan deu ell epo est eus fao fin fra fry gle glg heb hin hrv hun hye ido ilo ina ind isl ita lav lit ltz mal mar mkd mlt msa nds nld nno nob pol por ron run rus spa sqi srp_Cyrl srp_Latn swa swe tam tgl tha tur ukr urd uzb_Latn vie war zho zsm_Latn
|
||||
|
||||
|
||||
wiki-all2eng-with-bt:
|
||||
for l in ${WIKI_ENG2BT}; do \
|
||||
echo "fetch $$l wiki backtranslations"; \
|
||||
${MAKE} -C bt-tatoeba TRG=$$l SRC=eng fetch-bt; \
|
||||
done
|
||||
for l in ${sort ${shell iso639 -m -n ${WIKI_ENG2BT}}}; do \
|
||||
if [ ! `find work-tatoeba/$$l-eng -name 'opus+bt.*model1.done' | wc -l` -gt 0 ]; then \
|
||||
if [ `find work-tatoeba/$$l-eng -name 'opus+bt*best-perplexity.npz' | wc -l` -gt 0 ]; then \
|
||||
echo "continue training $$l-eng with backtranslation data"; \
|
||||
${MAKE} EMAIL= tatoeba-$${l}2eng-train-bt; \
|
||||
else \
|
||||
echo "start training $$l-eng with backtranslation data"; \
|
||||
${MAKE} EMAIL= HPC_MEM=32g HPC_CORES=4 tatoeba-$${l}2eng-train-bt.submitcpu; \
|
||||
fi \
|
||||
fi \
|
||||
done
|
||||
|
||||
|
||||
wiki-all2eng-with-bt-continue:
|
||||
for l in ${WIKI_ENG2BT}; do \
|
||||
if [ -d work-tatoeba/eng-$$l ]; then \
|
||||
if [ ! `find work-tatoeba/$$l-eng -name 'opus+bt.*model1.done' | wc -l` -gt 0 ]; then \
|
||||
echo "continue training $$l-eng with backtranslation data"; \
|
||||
${MAKE} EMAIL= tatoeba-$${l}2eng-train-bt; \
|
||||
fi \
|
||||
fi \
|
||||
done
|
||||
|
||||
wiki-all2eng-with-bt-eval:
|
||||
for l in ${WIKI_ENG2BT}; do \
|
||||
if [ -d work-tatoeba/$$l-eng ]; then \
|
||||
if [ `find work-tatoeba/$$l-eng -name 'opus+bt*best-perplexity.npz' | wc -l` -gt 0 ]; then \
|
||||
${MAKE} EMAIL= WALLTIME=4 tatoeba-$${l}2eng-evalall-bt.submit; \
|
||||
fi \
|
||||
fi \
|
||||
done
|
||||
|
||||
# if [ `find work-tatoeba/$$l-eng -name 'opus+bt.*model1.done' | wc -l` -gt 0 ]; then \
|
||||
|
||||
wiki-all2eng-with-bt-dist:
|
||||
for l in ${WIKI_ENG2BT}; do \
|
||||
if [ -d work-tatoeba/$$l-eng ]; then \
|
||||
if [ `find work-tatoeba/$$l-eng -name 'opus+bt*best-perplexity.npz' | wc -l` -gt 0 ]; then \
|
||||
echo "continue training $$l-eng with backtranslation data"; \
|
||||
${MAKE} EMAIL= tatoeba-$${l}2eng-dist-bt; \
|
||||
fi \
|
||||
fi \
|
||||
done
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
WIKI_ENG2BT_PARENTS = ${sort ${shell iso639 -m -n ${WIKI_ENG2BT} | xargs langgroup -p}}
|
||||
|
||||
wiki-allgroups2eng-with-bt:
|
||||
for l in ${WIKI_ENG2BT_PARENTS}; do \
|
||||
if [ ! `find work-tatoeba/$$l-eng -name 'opus1m+bt.*model1.done' | wc -l` -gt 0 ]; then \
|
||||
if [ `find work-tatoeba/$$l-eng -name 'opus1m+bt*best-perplexity.npz' | wc -l` -gt 0 ]; then \
|
||||
echo "continue training $$l-eng with backtranslation data"; \
|
||||
${MAKE} EMAIL= tatoeba-$${l}2eng-train-bt-1m; \
|
||||
else \
|
||||
echo "start training $$l-eng with backtranslation data"; \
|
||||
${MAKE} EMAIL= HPC_MEM=32g HPC_CORES=4 tatoeba-$${l}2eng-train-bt-1m.submitcpu; \
|
||||
fi \
|
||||
fi \
|
||||
done
|
||||
|
||||
|
||||
wiki-allgroups2eng-with-bt-continue:
|
||||
for l in ${WIKI_ENG2BT_PARENTS}; do \
|
||||
if [ -d work-tatoeba/eng-$$l ]; then \
|
||||
if [ ! `find work-tatoeba/$$l-eng -name 'opus1m+bt.*model1.done' | wc -l` -gt 0 ]; then \
|
||||
echo "continue training $$l-eng with backtranslation data"; \
|
||||
${MAKE} EMAIL= tatoeba-$${l}2eng-train-bt-1m; \
|
||||
fi \
|
||||
fi \
|
||||
done
|
||||
|
||||
wiki-allgroups2eng-with-bt-eval:
|
||||
for l in ${WIKI_ENG2BT_PARENTS}; do \
|
||||
if [ -d work-tatoeba/$$l-eng ]; then \
|
||||
if [ `find work-tatoeba/$$l-eng -name 'opus1m+bt*best-perplexity.npz' | wc -l` -gt 0 ]; then \
|
||||
${MAKE} EMAIL= WALLTIME=4 tatoeba-$${l}2eng-evalall-bt-1m.submit; \
|
||||
fi \
|
||||
fi \
|
||||
done
|
||||
|
||||
# if [ `find work-tatoeba/$$l-eng -name 'opus1m+bt.*model1.done' | wc -l` -gt 0 ]; then \
|
||||
|
||||
wiki-allgroups2eng-with-bt-dist:
|
||||
for l in ${WIKI_ENG2BT_PARENTS}; do \
|
||||
if [ -d work-tatoeba/$$l-eng ]; then \
|
||||
if [ `find work-tatoeba/$$l-eng -name 'opus1m+bt*best-perplexity.npz' | wc -l` -gt 0 ]; then \
|
||||
echo "continue training $$l-eng with backtranslation data"; \
|
||||
${MAKE} EMAIL= tatoeba-$${l}2eng-dist-bt-1m; \
|
||||
fi \
|
||||
fi \
|
||||
done
|
||||
|
@ -1,3 +1,8 @@
|
||||
#
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# opus-2020-01-20.zip
|
||||
|
||||
@ -15,6 +20,11 @@
|
||||
| JW300.bcl.en | 56.8 | 0.705 |
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# opus-2020-02-11.zip
|
||||
|
||||
* dataset: opus
|
||||
@ -31,6 +41,11 @@
|
||||
| JW300.bcl.en | 56.1 | 0.697 |
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# opus+bt-2020-05-23.zip
|
||||
|
||||
* dataset: opus+bt
|
||||
@ -58,6 +73,11 @@
|
||||
| JW300.bcl.en | 57.6 | 0.712 |
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# opus+nt-2021-03-29.zip
|
||||
|
||||
* dataset: opus+nt
|
||||
@ -92,3 +112,190 @@
|
||||
|---------|-------|-------|-------|--------|----|
|
||||
| wikimedia.bcl-en | 10.4 | 0.320 | 525 | 27109 | 0.477 |
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# opus+nt+bt-2021-04-01.zip
|
||||
|
||||
* dataset: opus+nt+bt
|
||||
* model: transformer-align
|
||||
* source language(s): bcl
|
||||
* target language(s): en
|
||||
* model: transformer-align
|
||||
* pre-processing: normalization + SentencePiece (spm32k,spm32k)
|
||||
* download: [opus+nt+bt-2021-04-01.zip](https://object.pouta.csc.fi/OPUS-MT-models/bcl-en/opus+nt+bt-2021-04-01.zip)
|
||||
## Training data: opus+nt+bt
|
||||
|
||||
* bcl-en: JW300 (470468) new-testament (11623) wiki.aa.en-bcl (969821) wikinews.aa.en-bcl (357946)
|
||||
* bcl-en: total size = 1809858
|
||||
* total size (opus+nt+bt): 1809767
|
||||
|
||||
|
||||
## Validation data
|
||||
|
||||
* bcl-en: wikimedia, 1153
|
||||
* total-size-shuffled: 775
|
||||
|
||||
* devset-selected: top 250 lines of wikimedia.src.shuffled!
|
||||
* testset-selected: next 525 lines of wikimedia.src.shuffled!
|
||||
* devset-unused: added to traindata
|
||||
|
||||
* test set translations: [opus+nt+bt-2021-04-01.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/bcl-en/opus+nt+bt-2021-04-01.test.txt)
|
||||
* test set scores: [opus+nt+bt-2021-04-01.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/bcl-en/opus+nt+bt-2021-04-01.eval.txt)
|
||||
|
||||
## Benchmarks
|
||||
|
||||
| testset | BLEU | chr-F | #sent | #words | BP |
|
||||
|---------|-------|-------|-------|--------|----|
|
||||
| wikimedia.bcl-en | 28.2 | 0.498 | 525 | 27109 | 0.799 |
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# opus+nt+bt+bt-2021-04-03.zip
|
||||
|
||||
* dataset: opus+nt+bt+bt
|
||||
* model: transformer-align
|
||||
* source language(s): bcl
|
||||
* target language(s): en
|
||||
* model: transformer-align
|
||||
* pre-processing: normalization + SentencePiece (spm32k,spm32k)
|
||||
* download: [opus+nt+bt+bt-2021-04-03.zip](https://object.pouta.csc.fi/OPUS-MT-models/bcl-en/opus+nt+bt+bt-2021-04-03.zip)
|
||||
## Training data: opus+nt+bt+bt
|
||||
|
||||
* bcl-en: JW300 (470468) new-testament (11623) wiki.aa.en-bcl (969821) wikibooks.aa.en-bcl (985129) wikinews.aa.en-bcl (357946) wikiquote.aa.en-bcl (987266) wikisource.aa.en-bcl (948077)
|
||||
* bcl-en: total size = 4730330
|
||||
* total size (opus+nt+bt+bt): 4730231
|
||||
|
||||
|
||||
## Validation data
|
||||
|
||||
* bcl-en: wikimedia, 1153
|
||||
* total-size-shuffled: 775
|
||||
|
||||
* devset-selected: top 250 lines of wikimedia.src.shuffled!
|
||||
* testset-selected: next 525 lines of wikimedia.src.shuffled!
|
||||
* devset-unused: added to traindata
|
||||
|
||||
* test set translations: [opus+nt+bt+bt-2021-04-03.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/bcl-en/opus+nt+bt+bt-2021-04-03.test.txt)
|
||||
* test set scores: [opus+nt+bt+bt-2021-04-03.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/bcl-en/opus+nt+bt+bt-2021-04-03.eval.txt)
|
||||
|
||||
## Benchmarks
|
||||
|
||||
| testset | BLEU | chr-F | #sent | #words | BP |
|
||||
|---------|-------|-------|-------|--------|----|
|
||||
| wikimedia.bcl-en | 16.2 | 0.461 | 525 | 27109 | 1.000 |
|
||||
|
||||
|
||||
|
||||
|
||||
# opus+nt+bt+bt+bt-2021-04-05.zip
|
||||
|
||||
* dataset: opus+nt+bt+bt+bt
|
||||
* model: transformer-align
|
||||
* source language(s): bcl
|
||||
* target language(s): en
|
||||
* model: transformer-align
|
||||
* pre-processing: normalization + SentencePiece (spm32k,spm32k)
|
||||
* download: [opus+nt+bt+bt+bt-2021-04-05.zip](https://object.pouta.csc.fi/OPUS-MT-models/bcl-en/opus+nt+bt+bt+bt-2021-04-05.zip)
|
||||
## Training data: opus+nt+bt+bt+bt
|
||||
|
||||
* bcl-en: JW300 (470468) new-testament (11623) wiki.aa.en-bcl (969821) wikibooks.aa.en-bcl (985129) wikinews.aa.en-bcl (357946) wikiquote.aa.en-bcl (987266) wikisource.aa.en-bcl (948077)
|
||||
* bcl-en: total size = 4730330
|
||||
* total size (opus+nt+bt+bt+bt): 4730224
|
||||
|
||||
|
||||
## Validation data
|
||||
|
||||
* bcl-en: wikimedia, 1153
|
||||
* total-size-shuffled: 775
|
||||
|
||||
* devset-selected: top 250 lines of wikimedia.src.shuffled!
|
||||
* testset-selected: next 525 lines of wikimedia.src.shuffled!
|
||||
* devset-unused: added to traindata
|
||||
|
||||
* test set translations: [opus+nt+bt+bt+bt-2021-04-05.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/bcl-en/opus+nt+bt+bt+bt-2021-04-05.test.txt)
|
||||
* test set scores: [opus+nt+bt+bt+bt-2021-04-05.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/bcl-en/opus+nt+bt+bt+bt-2021-04-05.eval.txt)
|
||||
|
||||
## Benchmarks
|
||||
|
||||
| testset | BLEU | chr-F | #sent | #words | BP |
|
||||
|---------|-------|-------|-------|--------|----|
|
||||
| wikimedia.bcl-en | 24.2 | 0.497 | 525 | 27109 | 1.000 |
|
||||
|
||||
|
||||
|
||||
# opus+nt+bt-2021-04-09.zip
|
||||
|
||||
* dataset: opus+nt+bt
|
||||
* model: transformer-align
|
||||
* source language(s): bcl
|
||||
* target language(s): en
|
||||
* model: transformer-align
|
||||
* pre-processing: normalization + SentencePiece (spm32k,spm32k)
|
||||
* download: [opus+nt+bt-2021-04-09.zip](https://object.pouta.csc.fi/OPUS-MT-models/bcl-en/opus+nt+bt-2021-04-09.zip)
|
||||
## Training data: opus+nt+bt
|
||||
|
||||
* bcl-en: JW300 (470468) new-testament (11623) wiki.aa.en-bcl (969821) wikibooks.aa.en-bcl (985129) wikinews.aa.en-bcl (357946) wikiquote.aa.en-bcl (987266) wikisource.aa.en-bcl (948077)
|
||||
* bcl-en: total size = 4730330
|
||||
* unused dev/test data is added to training data
|
||||
* total size (opus+nt+bt): 4731419
|
||||
|
||||
|
||||
## Validation data
|
||||
|
||||
* bcl-en: wikimedia, 2767
|
||||
* total-size-shuffled: 1966
|
||||
|
||||
* devset-selected: top 250 lines of wikimedia.src.shuffled!
|
||||
* testset-selected: next 500 lines of wikimedia.src.shuffled!
|
||||
* devset-unused: added to traindata
|
||||
|
||||
* test set translations: [opus+nt+bt-2021-04-09.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/bcl-en/opus+nt+bt-2021-04-09.test.txt)
|
||||
* test set scores: [opus+nt+bt-2021-04-09.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/bcl-en/opus+nt+bt-2021-04-09.eval.txt)
|
||||
|
||||
## Benchmarks
|
||||
|
||||
| testset | BLEU | chr-F | #sent | #words | BP |
|
||||
|---------|-------|-------|-------|--------|----|
|
||||
| wikimedia.bcl-en | 33.5 | 0.562 | 500 | 28621 | 0.868 |
|
||||
|
||||
|
||||
# opus+nt+bt-2021-04-12.zip
|
||||
|
||||
* dataset: opus+nt+bt
|
||||
* model: transformer-align
|
||||
* source language(s): bcl
|
||||
* target language(s): en
|
||||
* model: transformer-align
|
||||
* pre-processing: normalization + SentencePiece (spm32k,spm32k)
|
||||
* download: [opus+nt+bt-2021-04-12.zip](https://object.pouta.csc.fi/OPUS-MT-models/bcl-en/opus+nt+bt-2021-04-12.zip)
|
||||
## Training data: opus+nt+bt
|
||||
|
||||
* bcl-en: JW300 (470468) new-testament (11623) wiki.aa.en-bcl (969821) wikibooks.aa.en-bcl (985129) wikinews.aa.en-bcl (357946) wikiquote.aa.en-bcl (987266) wikisource.aa.en-bcl (948077)
|
||||
* bcl-en: total size = 4730330
|
||||
* unused dev/test data is added to training data
|
||||
* total size (opus+nt+bt): 4732437
|
||||
|
||||
|
||||
## Validation data
|
||||
|
||||
* bcl-en: wikimedia, 5033
|
||||
* total-size-shuffled: 4207
|
||||
|
||||
* devset-selected: top 1000 lines of wikimedia.src.shuffled!
|
||||
* testset-selected: next 1000 lines of wikimedia.src.shuffled!
|
||||
* devset-unused: added to traindata
|
||||
|
||||
* test set translations: [opus+nt+bt-2021-04-12.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/bcl-en/opus+nt+bt-2021-04-12.test.txt)
|
||||
* test set scores: [opus+nt+bt-2021-04-12.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/bcl-en/opus+nt+bt-2021-04-12.eval.txt)
|
||||
|
||||
## Benchmarks
|
||||
|
||||
| testset | BLEU | chr-F | #sent | #words | BP |
|
||||
|---------|-------|-------|-------|--------|----|
|
||||
| wikimedia.bcl-en | 31.5 | 0.523 | 1000 | 31520 | 0.836 |
|
||||
|
||||
|
@ -1,86 +1,261 @@
|
||||
# wikimedia-2020-01-17.zip
|
||||
# opus+nt+bt-2021-03-30.zip
|
||||
|
||||
* dataset: wikimedia
|
||||
* model: transformer-align
|
||||
* pre-processing: normalization + SentencePiece
|
||||
* download: [wikimedia-2020-01-17.zip](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/wikimedia-2020-01-17.zip)
|
||||
* test set translations: [wikimedia-2020-01-17.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/wikimedia-2020-01-17.test.txt)
|
||||
* test set scores: [wikimedia-2020-01-17.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/wikimedia-2020-01-17.eval.txt)
|
||||
|
||||
## Benchmarks
|
||||
|
||||
| testset | BLEU | chr-F |
|
||||
|-----------------------|-------|-------|
|
||||
| JW300.en.bcl | 55.3 | 0.729 |
|
||||
|
||||
# opus-2020-01-20.zip
|
||||
|
||||
* dataset: opus
|
||||
* model: transformer-align
|
||||
* pre-processing: normalization + SentencePiece
|
||||
* download: [opus-2020-01-20.zip](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus-2020-01-20.zip)
|
||||
* test set translations: [opus-2020-01-20.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus-2020-01-20.test.txt)
|
||||
* test set scores: [opus-2020-01-20.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus-2020-01-20.eval.txt)
|
||||
|
||||
## Benchmarks
|
||||
|
||||
| testset | BLEU | chr-F |
|
||||
|-----------------------|-------|-------|
|
||||
| JW300.en.bcl | 55.3 | 0.729 |
|
||||
|
||||
# opus-2020-02-11.zip
|
||||
|
||||
* dataset: opus
|
||||
* model: transformer-align
|
||||
* pre-processing: normalization + SentencePiece
|
||||
* download: [opus-2020-02-11.zip](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus-2020-02-11.zip)
|
||||
* test set translations: [opus-2020-02-11.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus-2020-02-11.test.txt)
|
||||
* test set scores: [opus-2020-02-11.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus-2020-02-11.eval.txt)
|
||||
|
||||
## Benchmarks
|
||||
|
||||
| testset | BLEU | chr-F |
|
||||
|-----------------------|-------|-------|
|
||||
| JW300.en.bcl | 53.8 | 0.719 |
|
||||
|
||||
# opus+bt-2020-02-26.zip
|
||||
|
||||
* dataset: opus+bt
|
||||
* model: transformer-align
|
||||
* pre-processing: normalization + SentencePiece
|
||||
* download: [opus+bt-2020-02-26.zip](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus+bt-2020-02-26.zip)
|
||||
* test set translations: [opus+bt-2020-02-26.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus+bt-2020-02-26.test.txt)
|
||||
* test set scores: [opus+bt-2020-02-26.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus+bt-2020-02-26.eval.txt)
|
||||
|
||||
## Benchmarks
|
||||
|
||||
| testset | BLEU | chr-F |
|
||||
|-----------------------|-------|-------|
|
||||
| JW300.en.bcl | 54.3 | 0.722 |
|
||||
|
||||
# opus+bt-2020-05-23.zip
|
||||
|
||||
* dataset: opus+bt
|
||||
* dataset: opus+nt+bt
|
||||
* model: transformer-align
|
||||
* source language(s): en
|
||||
* target language(s): bcl
|
||||
* model: transformer-align
|
||||
* pre-processing: normalization + SentencePiece (spm4k,spm4k)
|
||||
* download: [opus+bt-2020-05-23.zip](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus+bt-2020-05-23.zip)
|
||||
* test set translations: [opus+bt-2020-05-23.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus+bt-2020-05-23.test.txt)
|
||||
* test set scores: [opus+bt-2020-05-23.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus+bt-2020-05-23.eval.txt)
|
||||
* pre-processing: normalization + SentencePiece (spm12k,spm32k)
|
||||
* download: [opus+nt+bt-2021-03-30.zip](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus+nt+bt-2021-03-30.zip)
|
||||
## Training data: opus+nt+bt
|
||||
|
||||
## Training data: opus+bt
|
||||
* en-bcl: JW300 (470468) new-testament (11623) wiki.aa (43432)
|
||||
* en-bcl: total size = 525523
|
||||
* total size (opus+nt+bt): 525475
|
||||
|
||||
* en-bcl: wikimedia (1106)
|
||||
* en-bcl: total size = 1106
|
||||
* unused dev/test data is added to training data
|
||||
* total size (opus+bt): 458304
|
||||
|
||||
## Validation data
|
||||
|
||||
* bcl-en: wikimedia, 1153
|
||||
* total-size-shuffled: 775
|
||||
|
||||
* devset-selected: top 250 lines of wikimedia.src.shuffled!
|
||||
* testset-selected: next 525 lines of wikimedia.src.shuffled!
|
||||
* devset-unused: added to traindata
|
||||
|
||||
* test set translations: [opus+nt+bt-2021-03-30.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus+nt+bt-2021-03-30.test.txt)
|
||||
* test set scores: [opus+nt+bt-2021-03-30.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus+nt+bt-2021-03-30.eval.txt)
|
||||
|
||||
## Benchmarks
|
||||
|
||||
| testset | BLEU | chr-F |
|
||||
|-----------------------|-------|-------|
|
||||
| JW300.en.bcl | 55.7 | 0.736 |
|
||||
| testset | BLEU | chr-F | #sent | #words | BP |
|
||||
|---------|-------|-------|-------|--------|----|
|
||||
| wikimedia.en-bcl | 17.3 | 0.426 | 525 | 28399 | 0.840 |
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# opus+nt+bt+bt-2021-04-01.zip
|
||||
|
||||
* dataset: opus+nt+bt+bt
|
||||
* model: transformer-align
|
||||
* source language(s): en
|
||||
* target language(s): bcl
|
||||
* model: transformer-align
|
||||
* pre-processing: normalization + SentencePiece (spm12k,spm32k)
|
||||
* download: [opus+nt+bt+bt-2021-04-01.zip](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus+nt+bt+bt-2021-04-01.zip)
|
||||
## Training data: opus+nt+bt+bt
|
||||
|
||||
* en-bcl: JW300 (470468) new-testament (11623) wiki.aa (45474)
|
||||
* en-bcl: total size = 527565
|
||||
* total size (opus+nt+bt+bt): 527524
|
||||
|
||||
|
||||
## Validation data
|
||||
|
||||
* bcl-en: wikimedia, 1153
|
||||
* total-size-shuffled: 775
|
||||
|
||||
* devset-selected: top 250 lines of wikimedia.src.shuffled!
|
||||
* testset-selected: next 525 lines of wikimedia.src.shuffled!
|
||||
* devset-unused: added to traindata
|
||||
|
||||
* test set translations: [opus+nt+bt+bt-2021-04-01.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus+nt+bt+bt-2021-04-01.test.txt)
|
||||
* test set scores: [opus+nt+bt+bt-2021-04-01.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus+nt+bt+bt-2021-04-01.eval.txt)
|
||||
|
||||
## Benchmarks
|
||||
|
||||
| testset | BLEU | chr-F | #sent | #words | BP |
|
||||
|---------|-------|-------|-------|--------|----|
|
||||
| wikimedia.en-bcl | 21.6 | 0.476 | 525 | 28399 | 0.789 |
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# opus+nt+bt+bt+bt-2021-04-03.zip
|
||||
|
||||
* dataset: opus+nt+bt+bt+bt
|
||||
* model: transformer-align
|
||||
* source language(s): en
|
||||
* target language(s): bcl
|
||||
* model: transformer-align
|
||||
* pre-processing: normalization + SentencePiece (spm12k,spm32k)
|
||||
* download: [opus+nt+bt+bt+bt-2021-04-03.zip](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus+nt+bt+bt+bt-2021-04-03.zip)
|
||||
## Training data: opus+nt+bt+bt+bt
|
||||
|
||||
* en-bcl: JW300 (470468) new-testament (11623) wiki.aa (45474)
|
||||
* en-bcl: total size = 527565
|
||||
* total size (opus+nt+bt+bt+bt): 527496
|
||||
|
||||
|
||||
## Validation data
|
||||
|
||||
* bcl-en: wikimedia, 1153
|
||||
* total-size-shuffled: 775
|
||||
|
||||
* devset-selected: top 250 lines of wikimedia.src.shuffled!
|
||||
* testset-selected: next 525 lines of wikimedia.src.shuffled!
|
||||
* devset-unused: added to traindata
|
||||
|
||||
* test set translations: [opus+nt+bt+bt+bt-2021-04-03.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus+nt+bt+bt+bt-2021-04-03.test.txt)
|
||||
* test set scores: [opus+nt+bt+bt+bt-2021-04-03.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus+nt+bt+bt+bt-2021-04-03.eval.txt)
|
||||
|
||||
## Benchmarks
|
||||
|
||||
| testset | BLEU | chr-F | #sent | #words | BP |
|
||||
|---------|-------|-------|-------|--------|----|
|
||||
| wikimedia.en-bcl | 22.7 | 0.482 | 525 | 28399 | 0.895 |
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# opus2+nt+bt+bt+bt-2021-04-03.zip

* dataset: opus2+nt+bt+bt+bt
* model: transformer-align
* source language(s): en
* target language(s): bcl
* pre-processing: normalization + SentencePiece (spm12k,spm32k)
* download: [opus2+nt+bt+bt+bt-2021-04-03.zip](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus2+nt+bt+bt+bt-2021-04-03.zip)

## Training data: opus2+nt+bt+bt+bt

* en-bcl: JW300 (470468) new-testament (11623) wiki.aa (45474) wiki.aa_opus+nt+bt-2021-04-01 (45474)
* en-bcl: total size = 573039
* total size (opus2+nt+bt+bt+bt): 572969

## Validation data

* bcl-en: wikimedia, 1153
* total-size-shuffled: 775
* devset-selected: top 250 lines of wikimedia.src.shuffled
* testset-selected: next 525 lines of wikimedia.src.shuffled
* devset-unused: added to traindata

* test set translations: [opus2+nt+bt+bt+bt-2021-04-03.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus2+nt+bt+bt+bt-2021-04-03.test.txt)
* test set scores: [opus2+nt+bt+bt+bt-2021-04-03.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus2+nt+bt+bt+bt-2021-04-03.eval.txt)

## Benchmarks

| testset | BLEU | chr-F | #sent | #words | BP |
|---------|-------|-------|-------|--------|----|
| wikimedia.en-bcl | 23.9 | 0.497 | 525 | 28399 | 0.820 |


# opus+nt+bt+bt+bt+bt-2021-04-06.zip

* dataset: opus+nt+bt+bt+bt+bt
* model: transformer-align
* source language(s): en
* target language(s): bcl
* pre-processing: normalization + SentencePiece (spm12k,spm32k)
* download: [opus+nt+bt+bt+bt+bt-2021-04-06.zip](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus+nt+bt+bt+bt+bt-2021-04-06.zip)

## Training data: opus+nt+bt+bt+bt+bt

* en-bcl: JW300 (470468) new-testament (11623) wiki.aa (45474) wiki.aa_opus+nt+bt+bt-2021-04-03 (45474) wiki.aa_opus+nt+bt-2021-04-01 (45474)
* en-bcl: total size = 618513
* total size (opus+nt+bt+bt+bt+bt): 618427

## Validation data

* bcl-en: wikimedia, 1153
* total-size-shuffled: 775
* devset-selected: top 250 lines of wikimedia.src.shuffled
* testset-selected: next 525 lines of wikimedia.src.shuffled
* devset-unused: added to traindata

* test set translations: [opus+nt+bt+bt+bt+bt-2021-04-06.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus+nt+bt+bt+bt+bt-2021-04-06.test.txt)
* test set scores: [opus+nt+bt+bt+bt+bt-2021-04-06.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus+nt+bt+bt+bt+bt-2021-04-06.eval.txt)

## Benchmarks

| testset | BLEU | chr-F | #sent | #words | BP |
|---------|-------|-------|-------|--------|----|
| wikimedia.en-bcl | 24.4 | 0.498 | 525 | 28399 | 0.805 |


# opus+nt+bt+bt-2021-04-10.zip

* dataset: opus+nt+bt+bt
* model: transformer-align
* source language(s): en
* target language(s): bcl
* pre-processing: normalization + SentencePiece (spm12k,spm32k)
* download: [opus+nt+bt+bt-2021-04-10.zip](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus+nt+bt+bt-2021-04-10.zip)

## Training data: opus+nt+bt+bt

* en-bcl: JW300 (470468) new-testament (11623) wiki.aa (45494) wiki.aa_opus+nt+bt+bt+bt-2021-04-05 (45474) wiki.aa_opus+nt+bt+bt-2021-04-03 (45474) wiki.aa_opus+nt+bt-2021-04-01 (45474)
* en-bcl: total size = 664007
* unused dev/test data is added to training data
* total size (opus+nt+bt+bt): 665111

## Validation data

* bcl-en: wikimedia, 2767
* total-size-shuffled: 1966
* devset-selected: top 250 lines of wikimedia.src.shuffled
* testset-selected: next 500 lines of wikimedia.src.shuffled
* devset-unused: added to traindata

* test set translations: [opus+nt+bt+bt-2021-04-10.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus+nt+bt+bt-2021-04-10.test.txt)
* test set scores: [opus+nt+bt+bt-2021-04-10.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus+nt+bt+bt-2021-04-10.eval.txt)

## Benchmarks

| testset | BLEU | chr-F | #sent | #words | BP |
|---------|-------|-------|-------|--------|----|
| wikimedia.en-bcl | 30.7 | 0.572 | 500 | 29131 | 0.921 |


# opus+nt+bt-2021-04-11.zip

* dataset: opus+nt+bt
* model: transformer-align
* source language(s): en
* target language(s): bcl
* pre-processing: normalization + SentencePiece (spm12k,spm32k)
* download: [opus+nt+bt-2021-04-11.zip](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus+nt+bt-2021-04-11.zip)

## Training data: opus+nt+bt

* en-bcl: JW300 (470468) new-testament (11623) wiki.aa (45494) wiki.aa_opus+nt+bt+bt+bt-2021-04-05 (45474) wiki.aa_opus+nt+bt+bt-2021-04-03 (45474) wiki.aa_opus+nt+bt-2021-04-01 (45474)
* en-bcl: total size = 664007
* unused dev/test data is added to training data
* total size (opus+nt+bt): 666118

## Validation data

* bcl-en: wikimedia, 5033
* total-size-shuffled: 4207
* devset-selected: top 1000 lines of wikimedia.src.shuffled
* testset-selected: next 1000 lines of wikimedia.src.shuffled
* devset-unused: added to traindata

* test set translations: [opus+nt+bt-2021-04-11.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus+nt+bt-2021-04-11.test.txt)
* test set scores: [opus+nt+bt-2021-04-11.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus+nt+bt-2021-04-11.eval.txt)

## Benchmarks

| testset | BLEU | chr-F | #sent | #words | BP |
|---------|-------|-------|-------|--------|----|
| wikimedia.en-bcl | 31.9 | 0.585 | 1000 | 27681 | 1.000 |


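The linked test-set translations and evaluation logs can be fetched directly from the object storage URLs listed above; a minimal sketch (assuming `wget` is available on the system):

```bash
# download and inspect the published scores for the 2021-04-11 release
wget https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus+nt+bt-2021-04-11.eval.txt
cat opus+nt+bt-2021-04-11.eval.txt
```
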
31  scripts/evaluate/compare-bt.pl  Normal file
@ -0,0 +1,31 @@

my %basemodel = ();
my %btmodel = ();

## read a tab-separated score list and keep the first entry per
## (first column, base dataset name): models trained with back-translated
## data (+bt in the dataset name) go into %btmodel, all others into %basemodel
while (<>){
    chomp;
    s/https:\/\/object\.pouta\.csc\.fi\/Tatoeba\-MT\-models\///;
    my @fields = split(/\t/);
    if ($fields[3]=~/^(.*)\+bt-....-..-..\.zip/){
        unless (exists $btmodel{"$fields[0]\t$1"}){
            $btmodel{"$fields[0]\t$1"} = $_;
        }
    }
    elsif ($fields[3]=~/^(.*)-....-..-..\.zip/){
        unless (exists $basemodel{"$fields[0]\t$1"}){
            $basemodel{"$fields[0]\t$1"} = $_;
        }
    }
}

## print each base / base+bt pair together with the difference
## of the two score columns (fields 2 and 3)
foreach (sort keys %btmodel){
    if (exists $basemodel{$_} and $btmodel{$_}){
        print "base\t", $basemodel{$_},"\n";
        print "base+bt\t", $btmodel{$_},"\n";
        my @base = split(/\t/,$basemodel{$_});
        my @bt = split(/\t/,$btmodel{$_});
        $bt[1] = sprintf("%5.3f",$bt[1] - $base[1]);
        $bt[2] = sprintf("%5.2f",$bt[2] - $base[2]);
        print "diff\t", join("\t",@bt),"\n\n";
    }
}

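A usage sketch for the comparison script above. The input file name is a placeholder; judging from the field indices in the script, it expects a tab-separated list with a test set or language pair in the first column, two score columns, and the model download URL in the fourth column:

```bash
# compare base models with their +bt counterparts from a tab-separated score list
perl scripts/evaluate/compare-bt.pl bt-scores.txt

# the script also reads from standard input
cat bt-scores.txt | perl scripts/evaluate/compare-bt.pl
```
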
54  scripts/filter/filter-korean.sh  Executable file
@ -0,0 +1,54 @@

#!/usr/bin/bash
#
# extra filtering for Korean data
# filter out data that has characters other than Hang
#
# USAGE: filter-korean.sh srclangid trglangid [test] < tab-separated-bitext > filtered-bitext
#
#        the optional third argument "test" passes test sets through unchanged
#
# NOTE: the active filter below only strips zero-width characters;
#       the original script-based Hangul check is kept (commented out) at the end


## temp files are only needed by the old check script at the end
tmpsrc=`mktemp`
tmptrg=`mktemp`
tmplang=`mktemp`


## default: no filtering unless one side is Korean
column=0

if [ "$1" == "kor" ] || [ "$1" == "ko" ]; then
    column=1
elif [ "$2" == "kor" ] || [ "$2" == "ko" ]; then
    column=2
fi

## don't touch test sets
if [ "$3" == "test" ]; then
    column=0
fi


if [ $column -gt 0 ]; then
    echo "... filter Korean bitexts" >&2
    perl -CIOE -pe 's/[\x{2060}\x{200B}\x{feff}]//g'
else
    cat
fi


## OLD: check script
## this is slow ....

# if [ $column -gt 0 ]; then
#   echo "... filter Korean bitexts ($tmplang $tmpsrc $tmptrg)" >&2
#   perl -CIOE -pe 's/[\x{2060}\x{200B}\x{feff}]//g' |
#   tee >(cut -f1 > $tmpsrc) >(cut -f2 > $tmptrg) |
#   cut -f$column |
#   perl -CIOE -pe 'use utf8;s/\p{P}//g;s/[^\S\n]//g;s/▁//g;s/[0-9]//g' |
#   langscript -a > $tmplang

#   paste $tmplang $tmpsrc $tmptrg |
#   grep $'Hang ([0-9]*)\s*\t' |
#   cut -f2,3

#   rm -f $tmplang $tmpsrc $tmptrg
# else
#   cat
# fi

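An invocation sketch for the filter script; the bitext file names are placeholders and not files shipped with the repository:

```bash
# filter a tab-separated English-Korean training bitext read from stdin
scripts/filter/filter-korean.sh eng kor < train.en-ko.tsv > train.en-ko.filtered.tsv

# test sets are passed through unchanged when "test" is given as the third argument
scripts/filter/filter-korean.sh eng kor test < test.en-ko.tsv > test.en-ko.out.tsv
```
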
@ -93,6 +93,7 @@ else
	    -e 's/】/\]/g' \
	    -e 's/%/\%/g' |
	perl -C -pe 's/(?!\n)\p{C}/ /g;' |
	perl -CIOE -pe 's/[\x{2060}\x{200B}\x{feff}]//g' |\
	sed 's/  */ /g;s/^ *//g;s/ *$//g' |
	${SPMENCODE} --model $3 |
	sed "s/^/>>$2<< /"

@ -53,6 +53,7 @@ sed -e 's/，/,/g' \
	    -e 's/】/\]/g' \
	    -e 's/%/\%/g' |
	perl -C -pe 's/(?!\n)\p{C}/ /g;' |
	perl -CIOE -pe 's/[\x{2060}\x{200B}\x{feff}]//g' |\
	sed 's/  */ /g;s/^ *//g;s/ *$//g' |
	${SPMENCODE} --model $2
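
Both hunks add the same zero-width-character cleanup before SentencePiece encoding; a stand-alone check of what the new perl expression removes (this snippet is illustrative and not part of the repository):

```bash
# strip zero-width characters (U+2060, U+200B, U+FEFF) as in the preprocessing pipelines
perl -CO -e 'print "zero\x{200B}width\x{FEFF} test\x{2060}\n"' \
  | perl -CIOE -pe 's/[\x{2060}\x{200B}\x{feff}]//g'
# expected output: "zerowidth test"
```
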