mirror of
https://github.com/Helsinki-NLP/OPUS-MT-train.git
synced 2024-10-05 16:47:21 +03:00
train with backtranslations
This commit is contained in:
parent
0185534823
commit
596cae8922
218
Makefile
218
Makefile
@ -91,6 +91,7 @@ include Makefile.dist
|
||||
include Makefile.tasks
|
||||
include Makefile.data
|
||||
include Makefile.doclevel
|
||||
include Makefile.generic
|
||||
include Makefile.slurm
|
||||
|
||||
|
||||
@ -133,17 +134,6 @@ translate-ensemble: ${WORKDIR}/${TESTSET}.${MODEL}${NR}.${MODELTYPE}.ensemble.${
|
||||
eval-ensemble: ${WORKDIR}/${TESTSET}.${MODEL}${NR}.${MODELTYPE}.ensemble.${SRC}.${TRG}.eval
|
||||
|
||||
|
||||
## resume training on an existing model
|
||||
resume:
|
||||
if [ -e ${WORKDIR}/${MODEL}.${MODELTYPE}.model${NR}.npz.best-perplexity.npz ]; then \
|
||||
cp ${WORKDIR}/${MODEL}.${MODELTYPE}.model${NR}.npz.best-perplexity.npz \
|
||||
${WORKDIR}/${MODEL}.${MODELTYPE}.model${NR}.npz; \
|
||||
fi
|
||||
sleep 1
|
||||
rm -f ${WORKDIR}/${MODEL}.${MODELTYPE}.model${NR}.done
|
||||
${MAKE} train
|
||||
|
||||
|
||||
#------------------------------------------------------------------------
|
||||
# translate and evaluate all test sets in testsets/
|
||||
#------------------------------------------------------------------------
|
||||
@ -152,7 +142,6 @@ resume:
|
||||
## and all tokenized test sets that can be found in that directory
|
||||
TESTSET_HOME = ${PWD}/testsets
|
||||
TESTSET_DIR = ${TESTSET_HOME}/${SRC}-${TRG}
|
||||
# TESTSETS = $(patsubst ${TESTSET_DIR}/%.${SRC}.tok.gz,%,${wildcard ${TESTSET_DIR}/*.${SRC}.tok.gz})
|
||||
TESTSETS = $(patsubst ${TESTSET_DIR}/%.${SRC}.gz,%,${wildcard ${TESTSET_DIR}/*.${SRC}.gz})
|
||||
TESTSETS_PRESRC = $(patsubst %.gz,%.${PRE}.gz,${sort $(subst .${PRE},,${wildcard ${TESTSET_DIR}/*.${SRC}.gz})})
|
||||
TESTSETS_PRETRG = $(patsubst %.gz,%.${PRE}.gz,${sort $(subst .${PRE},,${wildcard ${TESTSET_DIR}/*.${TRG}.gz})})
|
||||
@ -190,197 +179,17 @@ finished:
|
||||
fi
|
||||
|
||||
|
||||
## extension -all: run something over all language pairs, e.g.
|
||||
## make wordalign-all
|
||||
## this goes sequentially over all language pairs
|
||||
## for the parallelizable version of this: look at %-all-parallel
|
||||
%-all:
|
||||
for l in ${ALL_LANG_PAIRS}; do \
|
||||
${MAKE} SRCLANGS="`echo $$l | cut -f1 -d'-' | sed 's/\\+/ /g'`" \
|
||||
TRGLANGS="`echo $$l | cut -f2 -d'-' | sed 's/\\+/ /g'`" ${@:-all=}; \
|
||||
done
|
||||
|
||||
# run something over all language pairs that have trained models
|
||||
## - make eval-allmodels
|
||||
## - make dist-allmodels
|
||||
%-allmodels:
|
||||
for l in ${ALL_LANG_PAIRS}; do \
|
||||
if [ `find ${WORKHOME}/$$l -name '*.${PRE_SRC}-${PRE_TRG}.*.npz' | wc -l` -gt 0 ]; then \
|
||||
${MAKE} SRCLANGS="`echo $$l | cut -f1 -d'-' | sed 's/\\+/ /g'`" \
|
||||
TRGLANGS="`echo $$l | cut -f2 -d'-' | sed 's/\\+/ /g'`" ${@:-allmodels=}; \
|
||||
fi \
|
||||
done
|
||||
|
||||
## only bilingual models
|
||||
%-allbilingual:
|
||||
for l in ${ALL_BILINGUAL_MODELS}; do \
|
||||
if [ `find ${WORKHOME}/$$l -name '*.${PRE_SRC}-${PRE_TRG}.*.npz' | wc -l` -gt 0 ]; then \
|
||||
${MAKE} SRCLANGS="`echo $$l | cut -f1 -d'-' | sed 's/\\+/ /g'`" \
|
||||
TRGLANGS="`echo $$l | cut -f2 -d'-' | sed 's/\\+/ /g'`" ${@:-allbilingual=}; \
|
||||
fi \
|
||||
done
|
||||
|
||||
## only multilingual models
|
||||
%-allmultilingual:
|
||||
for l in ${ALL_MULTILINGUAL_MODELS}; do \
|
||||
if [ `find ${WORKHOME}/$$l -name '*.${PRE_SRC}-${PRE_TRG}.*.npz' | wc -l` -gt 0 ]; then \
|
||||
${MAKE} SRCLANGS="`echo $$l | cut -f1 -d'-' | sed 's/\\+/ /g'`" \
|
||||
TRGLANGS="`echo $$l | cut -f2 -d'-' | sed 's/\\+/ /g'`" ${@:-allmultilingual=}; \
|
||||
fi \
|
||||
done
|
||||
|
||||
|
||||
## run something over all language pairs but make it possible to do it in parallel, for example
|
||||
## - make dist-all-parallel
|
||||
%-all-parallel:
|
||||
${MAKE} $(subst -all-parallel,,${patsubst %,$@__%-run-for-langpair,${ALL_LANG_PAIRS}})
|
||||
|
||||
## run a command that includes the langpair, for example
|
||||
## make wordalign__en-da+sv-run-for-langpair ...... runs wordalign with SRCLANGS="en" TRGLANGS="da sv"
|
||||
## What is this good for?
|
||||
## ---> can run many lang-pairs in parallel instead of having a for loop and running sequentially
|
||||
%-run-for-langpair:
|
||||
${MAKE} SRCLANGS='$(subst +, ,$(firstword $(subst -, ,${lastword ${subst __, ,${@:-run-for-langpair=}}})))' \
|
||||
TRGLANGS='$(subst +, ,$(lastword $(subst -, ,${lastword ${subst __, ,${@:-run-for-langpair=}}})))' \
|
||||
${shell echo $@ | sed 's/__.*$$//'}
|
||||
|
||||
|
||||
## right-to-left model
|
||||
%-RL:
|
||||
${MAKE} MODEL=${MODEL}-RL \
|
||||
MARIAN_EXTRA="${MARIAN_EXTRA} --right-left" \
|
||||
${@:-RL=}
|
||||
|
||||
|
||||
## run a multigpu job (2 or 4 GPUs)
|
||||
|
||||
%-multigpu %-0123:
|
||||
${MAKE} NR_GPUS=4 MARIAN_GPUS='0 1 2 3' $(subst -gpu0123,,${@:-multigpu=})
|
||||
|
||||
%-twogpu %-gpu01:
|
||||
${MAKE} NR_GPUS=2 MARIAN_GPUS='0 1' $(subst -gpu01,,${@:-twogpu=})
|
||||
|
||||
%-gpu23:
|
||||
${MAKE} NR_GPUS=2 MARIAN_GPUS='2 3' ${@:-gpu23=}
|
||||
|
||||
|
||||
## run on CPUs (translate-cpu, eval-cpu, translate-ensemble-cpu, ...)
|
||||
%-cpu:
|
||||
${MAKE} MARIAN=${MARIANCPU} \
|
||||
LOADMODS='${LOADCPU}' \
|
||||
MARIAN_DECODER_FLAGS="${MARIAN_DECODER_CPU}" \
|
||||
${@:-cpu=}
|
||||
|
||||
|
||||
## document level models
|
||||
%-doc:
|
||||
${MAKE} WORKHOME=${shell realpath ${PWD}/work-spm} \
|
||||
PRE=norm \
|
||||
PRE_SRC=spm${SRCBPESIZE:000=}k.doc${CONTEXT_SIZE} \
|
||||
PRE_TRG=spm${TRGBPESIZE:000=}k.doc${CONTEXT_SIZE} \
|
||||
${@:-doc=}
|
||||
|
||||
|
||||
## sentence-piece models
|
||||
%-spm:
|
||||
${MAKE} WORKHOME=${shell realpath ${PWD}/work-spm} \
|
||||
PRE=norm \
|
||||
PRE_SRC=spm${SRCBPESIZE:000=}k \
|
||||
PRE_TRG=spm${TRGBPESIZE:000=}k \
|
||||
${@:-spm=}
|
||||
|
||||
%-spm-noalign:
|
||||
${MAKE} WORKHOME=${shell realpath ${PWD}/work-spm-noalign} \
|
||||
MODELTYPE=transformer \
|
||||
PRE=norm \
|
||||
PRE_SRC=spm${SRCBPESIZE:000=}k \
|
||||
PRE_TRG=spm${TRGBPESIZE:000=}k \
|
||||
${@:-spm-noalign=}
|
||||
|
||||
|
||||
|
||||
## BPE models
|
||||
%-bpe:
|
||||
${MAKE} WORKHOME=${shell realpath ${PWD}/work-bpe} \
|
||||
PRE=tok \
|
||||
MODELTYPE=transformer \
|
||||
PRE_SRC=bpe${SRCBPESIZE:000=}k \
|
||||
PRE_TRG=bpe${TRGBPESIZE:000=}k \
|
||||
${@:-bpe=}
|
||||
|
||||
%-bpe-align:
|
||||
${MAKE} WORKHOME=${shell realpath ${PWD}/work-bpe-align} \
|
||||
PRE=tok \
|
||||
PRE_SRC=bpe${SRCBPESIZE:000=}k \
|
||||
PRE_TRG=bpe${TRGBPESIZE:000=}k \
|
||||
${@:-bpe-align=}
|
||||
|
||||
%-bpe-memad:
|
||||
${MAKE} WORKHOME=${shell realpath ${PWD}/work-bpe-memad} \
|
||||
PRE=tok \
|
||||
MODELTYPE=transformer \
|
||||
PRE_SRC=bpe${SRCBPESIZE:000=}k \
|
||||
PRE_TRG=bpe${TRGBPESIZE:000=}k \
|
||||
${@:-bpe-memad=}
|
||||
|
||||
%-bpe-old:
|
||||
${MAKE} WORKHOME=${shell realpath ${PWD}/work-bpe-old} \
|
||||
PRE=tok \
|
||||
MODELTYPE=transformer \
|
||||
PRE_SRC=bpe${SRCBPESIZE:000=}k \
|
||||
PRE_TRG=bpe${TRGBPESIZE:000=}k \
|
||||
${@:-bpe-old=}
|
||||
|
||||
|
||||
## for the inbuilt sentence-piece segmentation:
|
||||
# PRE_SRC=txt PRE_TRG=txt
|
||||
# MARIAN=${MARIAN}-spm
|
||||
# MODEL_VOCABTYPE=spm
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
## continue document-level training with a new context size
|
||||
|
||||
ifndef NEW_CONTEXT
|
||||
NEW_CONTEXT = $$(($(CONTEXT_SIZE) + $(CONTEXT_SIZE)))
|
||||
endif
|
||||
|
||||
continue-doctrain:
|
||||
mkdir -p ${WORKDIR}/${MODEL}
|
||||
cp ${MODEL_VOCAB} ${WORKDIR}/${MODEL}/$(subst .doc${CONTEXT_SIZE},.doc${NEW_CONTEXT},${notdir ${MODEL_VOCAB}})
|
||||
cp ${MODEL_FINAL} ${WORKDIR}/${MODEL}/$(subst .doc${CONTEXT_SIZE},.doc${NEW_CONTEXT},$(notdir ${MODEL_BASENAME})).npz
|
||||
${MAKE} MODEL_SUBDIR=${MODEL}/ CONTEXT_SIZE=$(NEW_CONTEXT) train-doc
|
||||
|
||||
|
||||
|
||||
|
||||
## continue training with a new dataset
|
||||
|
||||
ifndef NEW_DATASET
|
||||
NEW_DATASET = OpenSubtitles
|
||||
endif
|
||||
|
||||
continue-datatrain:
|
||||
mkdir -p ${WORKDIR}/${MODEL}
|
||||
cp ${MODEL_VOCAB} ${WORKDIR}/${MODEL}/$(patsubst ${DATASET}%,${NEW_DATASET}%,${notdir ${MODEL_VOCAB}})
|
||||
cp ${MODEL_FINAL} ${WORKDIR}/${MODEL}/$(patsubst ${DATASET}%,${NEW_DATASET}%,${MODEL_BASENAME}).npz
|
||||
if [ -e ${BPESRCMODEL} ]; then \
|
||||
cp ${BPESRCMODEL} $(patsubst ${WORKDIR}/train/${DATASET}%,${WORKDIR}/train/${NEW_DATASET}%,${BPESRCMODEL}); \
|
||||
cp ${BPETRGMODEL} $(patsubst ${WORKDIR}/train/${DATASET}%,${WORKDIR}/train/${NEW_DATASET}%,${BPETRGMODEL}); \
|
||||
## resume training on an existing model
|
||||
resume:
|
||||
if [ -e ${WORKDIR}/${MODEL}.${MODELTYPE}.model${NR}.npz.best-perplexity.npz ]; then \
|
||||
cp ${WORKDIR}/${MODEL}.${MODELTYPE}.model${NR}.npz.best-perplexity.npz \
|
||||
${WORKDIR}/${MODEL}.${MODELTYPE}.model${NR}.npz; \
|
||||
fi
|
||||
if [ -e ${SPMSRCMODEL} ]; then \
|
||||
cp ${SPMSRCMODEL} $(patsubst ${WORKDIR}/train/${DATASET}%,${WORKDIR}/train/${NEW_DATASET}%,${SPMSRCMODEL}); \
|
||||
cp ${SPMTRGMODEL} $(patsubst ${WORKDIR}/train/${DATASET}%,${WORKDIR}/train/${NEW_DATASET}%,${SPMTRGMODEL}); \
|
||||
fi
|
||||
${MAKE} MODEL_SUBDIR=${MODEL}/ DATASET=$(NEW_DATASET) train
|
||||
sleep 1
|
||||
rm -f ${WORKDIR}/${MODEL}.${MODELTYPE}.model${NR}.done
|
||||
${MAKE} train
|
||||
|
||||
|
||||
# MARIAN_EXTRA="${MARIAN_EXTRA} --no-restore-corpus"
|
||||
|
||||
|
||||
|
||||
@ -548,13 +357,6 @@ endif
|
||||
rm -f $@.input $@.output
|
||||
|
||||
|
||||
# %.eval: % ${TEST_TRG}
|
||||
# grep . ${TEST_TRG} > $@.ref
|
||||
# grep . $< > $@.sys
|
||||
# cat $@.sys | sacrebleu $@.ref > $@
|
||||
# cat $@.sys | sacrebleu --metrics=chrf --width=3 $@.ref >> $@
|
||||
# rm -f $@.ref $@.sys
|
||||
|
||||
|
||||
%.eval: % ${TEST_TRG}
|
||||
paste ${TEST_SRC}.${PRE_SRC} ${TEST_TRG} | grep $$'.\t' | cut -f2 > $@.ref
|
||||
@ -575,5 +377,3 @@ endif
|
||||
-e 's/&/&/g' |\
|
||||
sed 'n;n;G;' > $@
|
||||
rm -f $@.1 $@.2 $@.3
|
||||
|
||||
# paste -d "\n" ${TEST_SRC} ${TEST_TRG} ${<:.eval=} |\
|
||||
|
@ -215,6 +215,11 @@ MODEL_VOCABTYPE = yml
|
||||
MODEL_VOCAB = ${WORKDIR}/${MODEL}.vocab.${MODEL_VOCABTYPE}
|
||||
MODEL_DECODER = ${MODEL_FINAL}.decoder.yml
|
||||
|
||||
## OPUS model (in case we want to continue training with other data)
|
||||
OPUSMODEL = ${MODEL_SUBDIR}opus${TRAINSIZE}.${PRE_SRC}-${PRE_TRG}
|
||||
OPUSMODEL_BASE = ${OPUSMODEL}.${MODELTYPE}.model${NR}
|
||||
OPUSMODEL_FINAL = ${WORKDIR}/${OPUSMODEL_BASE}.npz.best-perplexity.npz
|
||||
|
||||
|
||||
## test set translation and scores
|
||||
|
||||
|
@ -31,6 +31,10 @@ DATA_TRG := ${sort ${CLEAN_TRAIN_TRG} ${CLEAN_TUNE_TRG} ${CLEAN_DEV_TRG} ${CLEAN
|
||||
|
||||
|
||||
|
||||
BACKTRANS_DIR = backtranslate/${TRG}-${SRC}
|
||||
|
||||
BACKTRANS_SRC = ${sort ${wildcard ${BACKTRANS_DIR}/*.${SRC}.gz}}
|
||||
BACKTRANS_TRG = ${patsubst %.${SRC}.gz,%.${TRG}.gz,${BACKTRANS_SRC}}
|
||||
|
||||
## make data in reverse direction without re-doing word alignment etc ...
|
||||
## ---> this is dangerous when things run in parallel
|
||||
|
@ -125,12 +125,6 @@ endif
|
||||
endif
|
||||
|
||||
|
||||
ttt:
|
||||
@echo ${PREPROCESS_SRCMODEL}
|
||||
@echo ${PREPROCESS_TRGMODEL}
|
||||
@echo ${PREPROCESS_SCRIPT}
|
||||
@echo ${POSTPROCESS_SCRIPT}
|
||||
|
||||
|
||||
${DIST_PACKAGE}: ${MODEL_FINAL}
|
||||
ifneq (${SKIP_DIST_EVAL},1)
|
||||
|
@ -4,6 +4,49 @@
|
||||
DOCLEVEL_BENCHMARK_DATA = https://zenodo.org/record/3525366/files/doclevel-MT-benchmark-discomt2019.zip
|
||||
|
||||
|
||||
|
||||
## continue document-level training with a new context size
|
||||
|
||||
ifndef NEW_CONTEXT
|
||||
NEW_CONTEXT = $$(($(CONTEXT_SIZE) + $(CONTEXT_SIZE)))
|
||||
endif
|
||||
|
||||
continue-doctrain:
|
||||
mkdir -p ${WORKDIR}/${MODEL}
|
||||
cp ${MODEL_VOCAB} ${WORKDIR}/${MODEL}/$(subst .doc${CONTEXT_SIZE},.doc${NEW_CONTEXT},${notdir ${MODEL_VOCAB}})
|
||||
cp ${MODEL_FINAL} ${WORKDIR}/${MODEL}/$(subst .doc${CONTEXT_SIZE},.doc${NEW_CONTEXT},$(notdir ${MODEL_BASENAME})).npz
|
||||
${MAKE} MODEL_SUBDIR=${MODEL}/ CONTEXT_SIZE=$(NEW_CONTEXT) train-doc
|
||||
|
||||
|
||||
|
||||
|
||||
## continue training with a new dataset
|
||||
|
||||
ifndef NEW_DATASET
|
||||
NEW_DATASET = OpenSubtitles
|
||||
endif
|
||||
|
||||
continue-datatrain:
|
||||
mkdir -p ${WORKDIR}/${MODEL}
|
||||
cp ${MODEL_VOCAB} ${WORKDIR}/${MODEL}/$(patsubst ${DATASET}%,${NEW_DATASET}%,${notdir ${MODEL_VOCAB}})
|
||||
cp ${MODEL_FINAL} ${WORKDIR}/${MODEL}/$(patsubst ${DATASET}%,${NEW_DATASET}%,${MODEL_BASENAME}).npz
|
||||
if [ -e ${BPESRCMODEL} ]; then \
|
||||
cp ${BPESRCMODEL} $(patsubst ${WORKDIR}/train/${DATASET}%,${WORKDIR}/train/${NEW_DATASET}%,${BPESRCMODEL}); \
|
||||
cp ${BPETRGMODEL} $(patsubst ${WORKDIR}/train/${DATASET}%,${WORKDIR}/train/${NEW_DATASET}%,${BPETRGMODEL}); \
|
||||
fi
|
||||
if [ -e ${SPMSRCMODEL} ]; then \
|
||||
cp ${SPMSRCMODEL} $(patsubst ${WORKDIR}/train/${DATASET}%,${WORKDIR}/train/${NEW_DATASET}%,${SPMSRCMODEL}); \
|
||||
cp ${SPMTRGMODEL} $(patsubst ${WORKDIR}/train/${DATASET}%,${WORKDIR}/train/${NEW_DATASET}%,${SPMTRGMODEL}); \
|
||||
fi
|
||||
${MAKE} MODEL_SUBDIR=${MODEL}/ DATASET=$(NEW_DATASET) train
|
||||
|
||||
|
||||
# MARIAN_EXTRA="${MARIAN_EXTRA} --no-restore-corpus"
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
## use the doclevel benchmark data sets
|
||||
%-ost:
|
||||
${MAKE} ost-datasets
|
||||
|
174
Makefile.generic
Normal file
174
Makefile.generic
Normal file
@ -0,0 +1,174 @@
|
||||
# -*-makefile-*-
|
||||
#
|
||||
# generic implicit targets that make our life a bit easier
|
||||
|
||||
|
||||
|
||||
|
||||
## extension -all: run something over all language pairs, e.g.
|
||||
## make wordalign-all
|
||||
## this goes sequentially over all language pairs
|
||||
## for the parallelizable version of this: look at %-all-parallel
|
||||
%-all:
|
||||
for l in ${ALL_LANG_PAIRS}; do \
|
||||
${MAKE} SRCLANGS="`echo $$l | cut -f1 -d'-' | sed 's/\\+/ /g'`" \
|
||||
TRGLANGS="`echo $$l | cut -f2 -d'-' | sed 's/\\+/ /g'`" ${@:-all=}; \
|
||||
done
|
||||
|
||||
# run something over all language pairs that have trained models
|
||||
## - make eval-allmodels
|
||||
## - make dist-allmodels
|
||||
%-allmodels:
|
||||
for l in ${ALL_LANG_PAIRS}; do \
|
||||
if [ `find ${WORKHOME}/$$l -name '*.${PRE_SRC}-${PRE_TRG}.*.npz' | wc -l` -gt 0 ]; then \
|
||||
${MAKE} SRCLANGS="`echo $$l | cut -f1 -d'-' | sed 's/\\+/ /g'`" \
|
||||
TRGLANGS="`echo $$l | cut -f2 -d'-' | sed 's/\\+/ /g'`" ${@:-allmodels=}; \
|
||||
fi \
|
||||
done
|
||||
|
||||
## only bilingual models
|
||||
%-allbilingual:
|
||||
for l in ${ALL_BILINGUAL_MODELS}; do \
|
||||
if [ `find ${WORKHOME}/$$l -name '*.${PRE_SRC}-${PRE_TRG}.*.npz' | wc -l` -gt 0 ]; then \
|
||||
${MAKE} SRCLANGS="`echo $$l | cut -f1 -d'-' | sed 's/\\+/ /g'`" \
|
||||
TRGLANGS="`echo $$l | cut -f2 -d'-' | sed 's/\\+/ /g'`" ${@:-allbilingual=}; \
|
||||
fi \
|
||||
done
|
||||
|
||||
## only multilingual models
|
||||
%-allmultilingual:
|
||||
for l in ${ALL_MULTILINGUAL_MODELS}; do \
|
||||
if [ `find ${WORKHOME}/$$l -name '*.${PRE_SRC}-${PRE_TRG}.*.npz' | wc -l` -gt 0 ]; then \
|
||||
${MAKE} SRCLANGS="`echo $$l | cut -f1 -d'-' | sed 's/\\+/ /g'`" \
|
||||
TRGLANGS="`echo $$l | cut -f2 -d'-' | sed 's/\\+/ /g'`" ${@:-allmultilingual=}; \
|
||||
fi \
|
||||
done
|
||||
|
||||
|
||||
## run something over all language pairs but make it possible to do it in parallel, for example
|
||||
## - make dist-all-parallel
|
||||
%-all-parallel:
|
||||
${MAKE} $(subst -all-parallel,,${patsubst %,$@__%-run-for-langpair,${ALL_LANG_PAIRS}})
|
||||
|
||||
## run a command that includes the langpair, for example
|
||||
## make wordalign__en-da+sv-run-for-langpair ...... runs wordalign with SRCLANGS="en" TRGLANGS="da sv"
|
||||
## What is this good for?
|
||||
## ---> can run many lang-pairs in parallel instead of having a for loop and running sequentially
|
||||
%-run-for-langpair:
|
||||
${MAKE} SRCLANGS='$(subst +, ,$(firstword $(subst -, ,${lastword ${subst __, ,${@:-run-for-langpair=}}})))' \
|
||||
TRGLANGS='$(subst +, ,$(lastword $(subst -, ,${lastword ${subst __, ,${@:-run-for-langpair=}}})))' \
|
||||
${shell echo $@ | sed 's/__.*$$//'}
|
||||
|
||||
|
||||
## right-to-left model
|
||||
%-RL:
|
||||
${MAKE} MODEL=${MODEL}-RL \
|
||||
MARIAN_EXTRA="${MARIAN_EXTRA} --right-left" \
|
||||
${@:-RL=}
|
||||
|
||||
|
||||
|
||||
## include all backtranslation data as well in training
|
||||
## start from the pre-trained opus model if it exists
|
||||
|
||||
%-add-backtranslations:
|
||||
ifneq (${wildcard ${OPUSMODEL_FINAL}},)
|
||||
cp ${OPUSMODEL_FINAL} ${MODEL_BASENAME}.gz
|
||||
endif
|
||||
${MAKE} DATASET=opus+bt \
|
||||
CLEAN_TRAIN_SRC="${CLEAN_TRAIN_SRC} ${BACKTRANS_SRC}" \
|
||||
CLEAN_TRAIN_TRG="${CLEAN_TRAIN_TRG} ${BACKTRANS_TRG}" \
|
||||
${@:-add-backtranslations=}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
## run a multigpu job (2 or 4 GPUs)
|
||||
|
||||
%-multigpu %-0123:
|
||||
${MAKE} NR_GPUS=4 MARIAN_GPUS='0 1 2 3' $(subst -gpu0123,,${@:-multigpu=})
|
||||
|
||||
%-twogpu %-gpu01:
|
||||
${MAKE} NR_GPUS=2 MARIAN_GPUS='0 1' $(subst -gpu01,,${@:-twogpu=})
|
||||
|
||||
%-gpu23:
|
||||
${MAKE} NR_GPUS=2 MARIAN_GPUS='2 3' ${@:-gpu23=}
|
||||
|
||||
|
||||
## run on CPUs (translate-cpu, eval-cpu, translate-ensemble-cpu, ...)
|
||||
%-cpu:
|
||||
${MAKE} MARIAN=${MARIANCPU} \
|
||||
LOADMODS='${LOADCPU}' \
|
||||
MARIAN_DECODER_FLAGS="${MARIAN_DECODER_CPU}" \
|
||||
${@:-cpu=}
|
||||
|
||||
|
||||
## document level models
|
||||
%-doc:
|
||||
${MAKE} WORKHOME=${shell realpath ${PWD}/work-spm} \
|
||||
PRE=norm \
|
||||
PRE_SRC=spm${SRCBPESIZE:000=}k.doc${CONTEXT_SIZE} \
|
||||
PRE_TRG=spm${TRGBPESIZE:000=}k.doc${CONTEXT_SIZE} \
|
||||
${@:-doc=}
|
||||
|
||||
|
||||
## sentence-piece models
|
||||
%-spm:
|
||||
${MAKE} WORKHOME=${shell realpath ${PWD}/work-spm} \
|
||||
PRE=norm \
|
||||
PRE_SRC=spm${SRCBPESIZE:000=}k \
|
||||
PRE_TRG=spm${TRGBPESIZE:000=}k \
|
||||
${@:-spm=}
|
||||
|
||||
%-spm-noalign:
|
||||
${MAKE} WORKHOME=${shell realpath ${PWD}/work-spm-noalign} \
|
||||
MODELTYPE=transformer \
|
||||
PRE=norm \
|
||||
PRE_SRC=spm${SRCBPESIZE:000=}k \
|
||||
PRE_TRG=spm${TRGBPESIZE:000=}k \
|
||||
${@:-spm-noalign=}
|
||||
|
||||
|
||||
|
||||
## BPE models
|
||||
%-bpe:
|
||||
${MAKE} WORKHOME=${shell realpath ${PWD}/work-bpe} \
|
||||
PRE=tok \
|
||||
MODELTYPE=transformer \
|
||||
PRE_SRC=bpe${SRCBPESIZE:000=}k \
|
||||
PRE_TRG=bpe${TRGBPESIZE:000=}k \
|
||||
${@:-bpe=}
|
||||
|
||||
%-bpe-align:
|
||||
${MAKE} WORKHOME=${shell realpath ${PWD}/work-bpe-align} \
|
||||
PRE=tok \
|
||||
PRE_SRC=bpe${SRCBPESIZE:000=}k \
|
||||
PRE_TRG=bpe${TRGBPESIZE:000=}k \
|
||||
${@:-bpe-align=}
|
||||
|
||||
%-bpe-memad:
|
||||
${MAKE} WORKHOME=${shell realpath ${PWD}/work-bpe-memad} \
|
||||
PRE=tok \
|
||||
MODELTYPE=transformer \
|
||||
PRE_SRC=bpe${SRCBPESIZE:000=}k \
|
||||
PRE_TRG=bpe${TRGBPESIZE:000=}k \
|
||||
${@:-bpe-memad=}
|
||||
|
||||
%-bpe-old:
|
||||
${MAKE} WORKHOME=${shell realpath ${PWD}/work-bpe-old} \
|
||||
PRE=tok \
|
||||
MODELTYPE=transformer \
|
||||
PRE_SRC=bpe${SRCBPESIZE:000=}k \
|
||||
PRE_TRG=bpe${TRGBPESIZE:000=}k \
|
||||
${@:-bpe-old=}
|
||||
|
||||
|
||||
## for the inbuilt sentence-piece segmentation:
|
||||
# PRE_SRC=txt PRE_TRG=txt
|
||||
# MARIAN=${MARIAN}-spm
|
||||
# MODEL_VOCABTYPE=spm
|
||||
|
||||
|
||||
|
||||
|
111
html/index.php
Normal file
111
html/index.php
Normal file
@ -0,0 +1,111 @@
|
||||
<?php
|
||||
|
||||
$lines = file("https://object.pouta.csc.fi/OPUS-MT-models/index.txt");
|
||||
foreach ($lines as $line){
|
||||
$line = rtrim($line);
|
||||
if (substr($line, -4) === '.zip'){
|
||||
$parts = explode('/',$line);
|
||||
$langs = explode('-',$parts[0]);
|
||||
if (strpos($langs[0],'+') || strpos($langs[1],'+')){
|
||||
$multilingual["$langs[0]-$langs[1]"]=1;
|
||||
$src = explode('+',$langs[0]);
|
||||
$trg = explode('+',$langs[1]);
|
||||
foreach ($src as $s){
|
||||
foreach ($trg as $t){
|
||||
if (!array_key_exists("$s$t",$models)){
|
||||
$models["$s$t"] = "$langs[0]-$langs[1]";
|
||||
$nrlangpairs++;
|
||||
}
|
||||
$nrmultipairs++;
|
||||
$srclangs[$s]=1;
|
||||
$trglangs[$t]=1;
|
||||
$languages[$s]=1;
|
||||
$languages[$t]=1;
|
||||
}
|
||||
}
|
||||
}
|
||||
else{
|
||||
if (!array_key_exists("$langs[0]$langs[1]",$models)){
|
||||
$nrlangpairs++;
|
||||
}
|
||||
$bilingual["$langs[0]-$langs[1]"]=1;
|
||||
$models["$langs[0]$langs[1]"] = "$langs[0]-$langs[1]";
|
||||
// $models["$langs[0]$langs[1]"] = $line;
|
||||
$srclangs[$langs[0]]=1;
|
||||
$trglangs[$langs[1]]=1;
|
||||
$languages[$langs[0]]=1;
|
||||
$languages[$langs[1]]=1;
|
||||
$nrmodels++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ksort($languages);
|
||||
ksort($srclangs);
|
||||
ksort($trglangs);
|
||||
ksort($multilingual);
|
||||
|
||||
echo("<html><head></head><body>");
|
||||
echo("<h1>Pre-trained Opus-MT Models</h1><ul>");
|
||||
// echo("<li>Number of bilingual models: $nrmodels</li>");
|
||||
|
||||
echo("<li>Number of bilingual models: ");
|
||||
echo(count($bilingual));
|
||||
echo("</li>");
|
||||
|
||||
echo("<li>Number of multilingual models: ");
|
||||
echo(count($multilingual));
|
||||
echo("</li>");
|
||||
|
||||
echo("<li>Number of supported source languages: ");
|
||||
echo(count($srclangs));
|
||||
echo("</li>");
|
||||
echo("<li>Number of supported target languages: ");
|
||||
echo(count($trglangs));
|
||||
echo("</li>");
|
||||
|
||||
echo("<li>Number of supported language pairs: $nrlangpairs</li>");
|
||||
echo("<li>Language pairs supported by multilingual models: $nrmultipairs</li>");
|
||||
echo('</ul>');
|
||||
|
||||
|
||||
echo("<h2>Multilingual models</h2><ul>");
|
||||
foreach ($multilingual as $model => $c){
|
||||
echo("<li><a href=\"https://github.com/Helsinki-NLP/OPUS-MT-train/tree/master/models/$model\">$model</a></li>");
|
||||
}
|
||||
echo('</ul>');
|
||||
|
||||
|
||||
echo("<h2>Language pairs</h2><ul>");
|
||||
|
||||
echo('<table><tr><th></th>');
|
||||
foreach ($trglangs as $language => $count){
|
||||
echo '<th>';
|
||||
echo $language;
|
||||
echo '</th>';
|
||||
}
|
||||
echo('</tr>');
|
||||
foreach ($srclangs as $src => $count){
|
||||
echo "<tr><td>$src</td>";
|
||||
foreach ($trglangs as $trg => $count){
|
||||
if (array_key_exists("$src$trg",$models)){
|
||||
echo("<td><a href=\"https://github.com/Helsinki-NLP/OPUS-MT-train/tree/master/models/");
|
||||
echo($models["$src$trg"]);
|
||||
if ($models["$src$trg"] != "$src-$trg"){
|
||||
echo("\">multi</a></td>\n");
|
||||
}
|
||||
else{
|
||||
echo("\">$src$trg</a></td>\n");
|
||||
}
|
||||
}
|
||||
else{
|
||||
echo("<td>-</td>");
|
||||
}
|
||||
}
|
||||
echo('</tr>');
|
||||
}
|
||||
|
||||
echo('</table>');
|
||||
echo('</body></html>');
|
||||
|
||||
?>
|
0
preprocess-bpe-multi-target.sh
Normal file → Executable file
0
preprocess-bpe-multi-target.sh
Normal file → Executable file
0
preprocess-spm-multi-target.sh
Normal file → Executable file
0
preprocess-spm-multi-target.sh
Normal file → Executable file
@ -1,40 +1,49 @@
|
||||
ab de JW300.ab.de 1.4 0.148
|
||||
ab en JW300.ab.en 2.9 0.144
|
||||
ab en Tatoeba.ab.en 2.3 0.097
|
||||
ab fi JW300.ab.fi 1.5 0.147
|
||||
ab fr JW300.ab.fr 1.8 0.129
|
||||
ab sv JW300.ab.sv 2.4 0.147
|
||||
ach de JW300.ach.de 1.8 0.142
|
||||
ach en JW300.ach.en 5.4 0.207
|
||||
ach es JW300.ach.es 2.6 0.153
|
||||
ach fi JW300.ach.fi 1.7 0.163
|
||||
ach fr JW300.ach.fr 3.5 0.159
|
||||
ach sv JW300.ach.sv 2.7 0.160
|
||||
acu en bible-uedin.acu.en 3.8 0.202
|
||||
ada de JW300.ada.de 1.6 0.139
|
||||
ada en JW300.ada.en 4.3 0.182
|
||||
ada es JW300.ada.es 2.7 0.153
|
||||
ada fi JW300.ada.fi 1.7 0.154
|
||||
ada fr JW300.ada.fr 3.1 0.152
|
||||
ada sv JW300.ada.sv 2.1 0.146
|
||||
aed de JW300.aed.de 2.1 0.149
|
||||
aed en JW300.aed.en 4.0 0.177
|
||||
aed es JW300.aed.es 89.1 0.915
|
||||
aed fi JW300.aed.fi 2.2 0.163
|
||||
aed fr JW300.aed.fr 3.5 0.165
|
||||
aed sv JW300.aed.sv 3.3 0.170
|
||||
af de Tatoeba.af.de 48.6 0.681
|
||||
af en Tatoeba.af.en 60.8 0.736
|
||||
af es JW300.af.es 35.7 0.554
|
||||
af fi JW300.af.fi 32.3 0.576
|
||||
af fr JW300.af.fr 35.3 0.543
|
||||
af sv JW300.af.sv 40.4 0.599
|
||||
agr en bible-uedin.agr.en 4.5 0.222
|
||||
am de JW300.am.de 15.1 0.339
|
||||
am en GlobalVoices.am.en 6.1 0.286
|
||||
am en Tatoeba.am.en 63.8 0.744
|
||||
am es GlobalVoices.am.es 3.9 0.251
|
||||
am fi JW300.am.fi 18.1 0.394
|
||||
am fr GlobalVoices.am.fr 3.4 0.233
|
||||
am sv JW300.am.sv 21.0 0.377
|
||||
ar de Tatoeba.ar.de 43.0 0.614
|
||||
ar en Tatoeba.ar.en 49.4 0.661
|
||||
ar fi JW300.ar.fi 18.4 0.415
|
||||
ar fr Tatoeba.ar.fr 43.2 0.600
|
||||
ar sv GlobalVoices.ar.sv 12.9 0.386
|
||||
as de JW300.as.de 1.1 0.176
|
||||
ase de JW300.ase.de 27.2 0.478
|
||||
ase en JW300.ase.en 99.5 0.997
|
||||
ase fr JW300.ase.fr 37.8 0.553
|
||||
as en JW300.as.en 1.7 0.137
|
||||
@ -45,16 +54,19 @@ as fi JW300.as.fi 1.1 0.167
|
||||
as fr JW300.as.fr 1.4 0.154
|
||||
as sv JW300.as.sv 1.0 0.148
|
||||
ast en Tatoeba.ast.en 81.4 0.858
|
||||
ay de JW300.ay.de 5.0 0.191
|
||||
ay en JW300.ay.en 7.2 0.202
|
||||
ay es JW300.ay.es 11.3 0.265
|
||||
ay fi JW300.ay.fi 6.5 0.222
|
||||
ay fr JW300.ay.fr 6.4 0.203
|
||||
ay sv JW300.ay.sv 6.8 0.212
|
||||
ba de JW300.ba.de 1.4 0.146
|
||||
ba en JW300.ba.en 2.8 0.144
|
||||
ba en Tatoeba.ba.en 0.8 0.134
|
||||
ba es JW300.ba.es 2.0 0.141
|
||||
ba fi JW300.ba.fi 1.7 0.164
|
||||
ba fr JW300.ba.fr 3.1 0.150
|
||||
bas de JW300.bas.de 2.3 0.161
|
||||
bas en JW300.bas.en 5.8 0.207
|
||||
bas es JW300.bas.es 4.0 0.175
|
||||
bas fi JW300.bas.fi 2.4 0.174
|
||||
@ -64,16 +76,19 @@ ba sv JW300.ba.sv 2.2 0.139
|
||||
bbc en JW300.bbc.en 6.7 0.204
|
||||
bbc es JW300.bbc.es 4.4 0.178
|
||||
bbc fr JW300.bbc.fr 4.4 0.172
|
||||
bci de JW300.bci.de 5.0 0.215
|
||||
bci en JW300.bci.en 13.9 0.269
|
||||
bci es JW300.bci.es 5.9 0.223
|
||||
bci fi JW300.bci.fi 5.8 0.242
|
||||
bci fr JW300.bci.fr 6.9 0.216
|
||||
bci sv JW300.bci.sv 7.6 0.235
|
||||
bcl de JW300.bcl.de 30.3 0.510
|
||||
bcl en JW300.bcl.en 56.8 0.705
|
||||
bcl es JW300.bcl.es 37.0 0.551
|
||||
bcl fi JW300.bcl.fi 33.3 0.573
|
||||
bcl fr JW300.bcl.fr 35.0 0.527
|
||||
bcl sv JW300.bcl.sv 38.0 0.565
|
||||
bem de JW300.bem.de 18.9 0.379
|
||||
bem en JW300.bem.en 33.4 0.491
|
||||
bem es JW300.bem.es 22.8 0.403
|
||||
bem fi JW300.bem.fi 22.8 0.439
|
||||
@ -83,16 +98,19 @@ ber en Tatoeba.ber.en 37.3 0.566
|
||||
ber es Tatoeba.ber.es 33.8 0.487
|
||||
ber fr Tatoeba.ber.fr 60.2 0.754
|
||||
bfi en JW300.bfi.en 20.0 0.423
|
||||
bg de GlobalVoices.bg.de 19.9 0.484
|
||||
bg en Tatoeba.bg.en 59.4 0.727
|
||||
bg es GlobalVoices.bg.es 24.5 0.526
|
||||
bg fi JW300.bg.fi 23.7 0.505
|
||||
bg fr GlobalVoices.bg.fr 20.9 0.480
|
||||
bg sv JW300.bg.sv 29.1 0.494
|
||||
bhw en JW300.bhw.en 7.7 0.235
|
||||
bi de JW300.bi.de 15.9 0.355
|
||||
bi en JW300.bi.en 30.3 0.458
|
||||
bi fi JW300.bi.fi 0.6 0.124
|
||||
bi fr JW300.bi.fr 21.5 0.382
|
||||
bi sv JW300.bi.sv 22.7 0.403
|
||||
bn de GlobalVoices.bn.de 3.4 0.228
|
||||
bn en Tatoeba.bn.en 49.8 0.644
|
||||
bn es GlobalVoices.bn.es 12.7 0.372
|
||||
bn fi JW300.bn.fi 5.5 0.214
|
||||
@ -104,39 +122,47 @@ bs en GNOME.bs.en 71.9 0.789
|
||||
bs en Tatoeba.bs.en 64.9 0.784
|
||||
bsn en bible-uedin.bsn.en 1.2 0.117
|
||||
btx en JW300.btx.en 7.0 0.236
|
||||
bum de JW300.bum.de 1.9 0.154
|
||||
bum en JW300.bum.en 4.6 0.182
|
||||
bum es JW300.bum.es 3.2 0.162
|
||||
bum fi JW300.bum.fi 2.2 0.161
|
||||
bum fr JW300.bum.fr 4.0 0.173
|
||||
bum sv JW300.bum.sv 3.5 0.163
|
||||
bzs de JW300.bzs.de 19.3 0.385
|
||||
bzs en JW300.bzs.en 44.5 0.605
|
||||
bzs es JW300.bzs.es 28.1 0.464
|
||||
bzs fi JW300.bzs.fi 24.7 0.464
|
||||
bzs fr JW300.bzs.fr 30.0 0.479
|
||||
bzs sv JW300.bzs.sv 30.7 0.489
|
||||
cab de JW300.cab.de 1.8 0.134
|
||||
cab en JW300.cab.en 3.0 0.154
|
||||
cab es JW300.cab.es 5.1 0.225
|
||||
cab fi JW300.cab.fi 1.7 0.150
|
||||
cab fr JW300.cab.fr 3.1 0.153
|
||||
cab sv JW300.cab.sv 2.6 0.152
|
||||
ca de Tatoeba.ca.de 36.2 0.569
|
||||
ca en Tatoeba.ca.en 51.4 0.678
|
||||
ca es Tatoeba.ca.es 74.9 0.863
|
||||
ca fr Tatoeba.ca.fr 50.4 0.672
|
||||
cak de JW300.cak.de 0.7 0.077
|
||||
cak en JW300.cak.en 2.6 0.140
|
||||
cak fi JW300.cak.fi 0.6 0.109
|
||||
cak fr JW300.cak.fr 2.2 0.132
|
||||
cak sv JW300.cak.sv 0.6 0.084
|
||||
ca sv GlobalVoices.ca.sv 11.2 0.366
|
||||
cat de JW300.cat.de 1.4 0.143
|
||||
cat en JW300.cat.en 3.3 0.171
|
||||
cat fi JW300.cat.fi 1.6 0.155
|
||||
cat fr JW300.cat.fr 3.5 0.163
|
||||
cat sv JW300.cat.sv 2.5 0.154
|
||||
ceb de Tatoeba.ceb.de 9.7 0.312
|
||||
ceb en JW300.ceb.en 52.6 0.670
|
||||
ceb en Tatoeba.ceb.en 59.5 0.704
|
||||
ceb es JW300.ceb.es 31.6 0.508
|
||||
ceb fi JW300.ceb.fi 27.4 0.525
|
||||
ceb fr JW300.ceb.fr 30.0 0.491
|
||||
ceb sv JW300.ceb.sv 35.5 0.552
|
||||
chk de JW300.chk.de 17.0 0.350
|
||||
chk en JW300.chk.en 31.2 0.465
|
||||
chk es JW300.chk.es 20.8 0.374
|
||||
chk fi JW300.chk.fi 19.4 0.395
|
||||
@ -145,20 +171,23 @@ chk sv JW300.chk.sv 23.6 0.406
|
||||
cjk en JW300.cjk.en 6.8 0.226
|
||||
cjk es JW300.cjk.es 3.8 0.169
|
||||
cjk fr JW300.cjk.fr 4.3 0.174
|
||||
cmn+cn+yue+ze_zh+zh_cn+zh_CN+zh_HK+zh_tw+zh_TW+zh_yue+zhs+zht+zh de Tatoeba.cmn.de 33.1 0.530
|
||||
cmn+cn+yue+ze_zh+zh_cn+zh_CN+zh_HK+zh_tw+zh_TW+zh_yue+zhs+zht+zh de Tatoeba.cmn.de 33.4 0.534
|
||||
cmn+cn+yue+ze_zh+zh_cn+zh_CN+zh_HK+zh_tw+zh_TW+zh_yue+zhs+zht+zh fi bible-uedin.cmn.fi 21.6 0.497
|
||||
cmn+cn+yue+ze_zh+zh_cn+zh_CN+zh_HK+zh_tw+zh_TW+zh_yue+zhs+zht+zh sv Tatoeba.cmn.sv 46.6 0.620
|
||||
cnh en JW300.cnh.en 6.9 0.240
|
||||
crp de bible-uedin.crp.de 2.5 0.190
|
||||
crp es bible-uedin.crp.es 2.8 0.187
|
||||
crp fi bible-uedin.crp.fi 2.0 0.181
|
||||
crp fr bible-uedin.crp.fr 2.9 0.190
|
||||
crp sv bible-uedin.crp.sv 3.1 0.184
|
||||
crs de JW300.crs.de 20.4 0.397
|
||||
crs en JW300.crs.en 42.9 0.589
|
||||
crs es JW300.crs.es 26.1 0.445
|
||||
crs fi JW300.crs.fi 25.6 0.479
|
||||
crs fr JW300.crs.fr 29.4 0.475
|
||||
crs sv JW300.crs.sv 29.3 0.480
|
||||
csb en Tatoeba.csb.en 0.1 0.049
|
||||
cs de Tatoeba.cs.de 51.6 0.687
|
||||
cs en newstest2014-csen.cs.en 34.1 0.612
|
||||
cs en newstest2015-encs.cs.en 30.4 0.565
|
||||
cs en newstest2016-encs.cs.en 31.8 0.584
|
||||
@ -167,16 +196,19 @@ cs en newstest2018-encs.cs.en 30.3 0.566
|
||||
cs en Tatoeba.cs.en 58.0 0.721
|
||||
cs fi JW300.cs.fi 25.5 0.523
|
||||
cs fr GlobalVoices.cs.fr 21.0 0.488
|
||||
csg de JW300.csg.de 2.8 0.162
|
||||
csg en JW300.csg.en 4.6 0.183
|
||||
csg es JW300.csg.es 93.1 0.952
|
||||
csg fi JW300.csg.fi 2.3 0.160
|
||||
csg fr JW300.csg.fr 4.7 0.184
|
||||
csg sv JW300.csg.sv 4.5 0.176
|
||||
csl de JW300.csl.de 1.7 0.147
|
||||
csl en JW300.csl.en 4.1 0.162
|
||||
csl es JW300.csl.es 3.1 0.141
|
||||
csl fi JW300.csl.fi 2.5 0.152
|
||||
csl fr JW300.csl.fr 3.0 0.156
|
||||
csl sv JW300.csl.sv 3.3 0.142
|
||||
csn de JW300.csn.de 1.9 0.145
|
||||
csn en JW300.csn.en 3.8 0.172
|
||||
csn es JW300.csn.es 87.4 0.899
|
||||
csn fi JW300.csn.fi 2.0 0.162
|
||||
@ -185,26 +217,68 @@ csn sv JW300.csn.sv 3.8 0.173
|
||||
cs sv JW300.cs.sv 30.6 0.527
|
||||
ctu en JW300.ctu.en 2.9 0.157
|
||||
ctu fr JW300.ctu.fr 3.3 0.166
|
||||
cv de JW300.cv.de 1.4 0.148
|
||||
cv en JW300.cv.en 2.6 0.151
|
||||
cv en Tatoeba.cv.en 0.3 0.102
|
||||
cv es JW300.cv.es 2.0 0.152
|
||||
cv fi JW300.cv.fi 1.2 0.148
|
||||
cv fr JW300.cv.fr 2.6 0.154
|
||||
cv sv JW300.cv.sv 2.1 0.144
|
||||
cy de JW300.cy.de 4.7 0.200
|
||||
cy en Tatoeba.cy.en 33.0 0.525
|
||||
cy es JW300.cy.es 0.0 0.025
|
||||
cy fi JW300.cy.fi 0.3 0.067
|
||||
cy fr JW300.cy.fr 8.7 0.266
|
||||
cy sv JW300.cy.sv 6.6 0.218
|
||||
da de Tatoeba.da.de 57.4 0.741
|
||||
da en Tatoeba.da.en 63.6 0.769
|
||||
da es Tatoeba.da.es 53.7 0.715
|
||||
da fi Tatoeba.da.fi 39.0 0.629
|
||||
da+fo+is+no+nb+nn+sv da+fo+is+no+nb+nn+sv Tatoeba.da.sv 69.2 0.811
|
||||
da fr Tatoeba.da.fr 62.2 0.751
|
||||
de ab JW300.de.ab 1.0 0.124
|
||||
de ach JW300.de.ach 3.6 0.173
|
||||
de ada JW300.de.ada 6.5 0.196
|
||||
de aed JW300.de.aed 3.1 0.150
|
||||
de af Tatoeba.de.af 49.9 0.703
|
||||
de am JW300.de.am 12.2 0.252
|
||||
de ar Tatoeba.de.ar 14.7 0.456
|
||||
de ase JW300.de.ase 30.4 0.483
|
||||
de as JW300.de.as 1.4 0.122
|
||||
de ay JW300.de.ay 5.2 0.239
|
||||
de az_IR+az JW300.de.az 13.4 0.342
|
||||
de ba JW300.de.ba 1.9 0.132
|
||||
de bas JW300.de.bas 4.2 0.167
|
||||
de bci JW300.de.bci 9.4 0.248
|
||||
de bcl JW300.de.bcl 34.6 0.563
|
||||
de bem JW300.de.bem 19.2 0.434
|
||||
de be_tarask+be Tatoeba.de.be 3.1 0.106
|
||||
de cmn+cn+yue+ze_zh+zh_cn+zh_CN+zh_HK+zh_tw+zh_TW+zh_yue+zhs+zht+zh bible-uedin.de.zh 21.9 0.293
|
||||
de de Tatoeba.de.de 40.8 0.617
|
||||
de bg GlobalVoices.de.bg 19.4 0.463
|
||||
de bi JW300.de.bi 25.7 0.450
|
||||
de bn GlobalVoices.de.bn 1.3 0.182
|
||||
de bum JW300.de.bum 5.0 0.182
|
||||
de bzs JW300.de.bzs 21.0 0.389
|
||||
de cab JW300.de.cab 2.7 0.176
|
||||
de cak JW300.de.cak 0.8 0.116
|
||||
de ca Tatoeba.de.ca 34.0 0.552
|
||||
de cat JW300.de.cat 3.0 0.157
|
||||
de ceb Tatoeba.de.ceb 8.9 0.412
|
||||
de chk JW300.de.chk 15.9 0.364
|
||||
de cmn+cn+yue+ze_zh+zh_cn+zh_CN+zh_HK+zh_tw+zh_TW+zh_yue+zhs+zht+zh bible-uedin.de.zh 24.4 0.335
|
||||
de crp bible-uedin.de.crp 4.4 0.202
|
||||
de crs JW300.de.crs 24.1 0.429
|
||||
de csg JW300.de.csg 3.7 0.169
|
||||
de csl JW300.de.csl 2.2 0.044
|
||||
de csn JW300.de.csn 2.7 0.154
|
||||
de cs Tatoeba.de.cs 42.2 0.625
|
||||
de cv JW300.de.cv 2.0 0.144
|
||||
de cy JW300.de.cy 4.1 0.187
|
||||
de de Tatoeba.de.de 40.7 0.616
|
||||
de dhv JW300.de.dhv 5.6 0.241
|
||||
de dje bible-uedin.de.dje 4.9 0.223
|
||||
de ee JW300.de.ee 24.6 0.463
|
||||
de efi JW300.de.efi 24.2 0.451
|
||||
de el Tatoeba.de.el 45.7 0.649
|
||||
de en newssyscomb2009.de.en 28.6 0.553
|
||||
de en news-test2008.de.en 27.6 0.547
|
||||
de en newstest2009.de.en 26.9 0.544
|
||||
@ -219,10 +293,16 @@ de en newstest2017-ende.de.en 35.6 0.609
|
||||
de en newstest2018-ende.de.en 43.8 0.667
|
||||
de en newstest2019-deen.de.en 39.6 0.637
|
||||
de en Tatoeba.de.en 55.1 0.704
|
||||
de eo Tatoeba.de.eo 48.6 0.673
|
||||
de es Tatoeba.de.es 48.5 0.676
|
||||
de et JW300.de.et 20.2 0.465
|
||||
de eu bible-uedin.de.eu 0.3 0.132
|
||||
de fa GlobalVoices.de.fa 4.8 0.262
|
||||
de fi goethe-institute-test1.de.fi 18.3 0.493
|
||||
de fi goethe-institute-test2.de.fi 18.0 0.494
|
||||
de fi Tatoeba.de.fi 40.0 0.628
|
||||
de fj JW300.de.fj 24.6 0.470
|
||||
de fon JW300.de.fon 4.5 0.156
|
||||
de fr euelections_dev2019.transformer-align.de 32.2 0.590
|
||||
de fr newssyscomb2009.de.fr 26.8 0.553
|
||||
de fr news-test2008.de.fr 26.4 0.548
|
||||
@ -233,6 +313,25 @@ de fr newstest2012.de.fr 27.7 0.554
|
||||
de fr newstest2013.de.fr 29.5 0.560
|
||||
de fr newstest2019-defr.de.fr 36.6 0.625
|
||||
de fr Tatoeba.de.fr 49.2 0.664
|
||||
de fse JW300.de.fse 3.2 0.180
|
||||
de gaa JW300.de.gaa 26.3 0.471
|
||||
de gd bible-uedin.de.gd 0.0 0.095
|
||||
de gil JW300.de.gil 24.0 0.472
|
||||
de guc JW300.de.guc 2.1 0.194
|
||||
de gug JW300.de.gug 7.2 0.241
|
||||
de gu JW300.de.gu 2.7 0.129
|
||||
de guw JW300.de.guw 27.1 0.472
|
||||
de gv bible-uedin.de.gv 0.0 0.028
|
||||
de gym JW300.de.gym 3.4 0.218
|
||||
de ha JW300.de.ha 20.7 0.417
|
||||
de hi JW300.de.hi 4.2 0.162
|
||||
de hil JW300.de.hil 33.9 0.563
|
||||
de ho JW300.de.ho 22.6 0.461
|
||||
de hsb Tatoeba.de.hsb 0.1 0.042
|
||||
de ht JW300.de.ht 21.8 0.390
|
||||
de hu Tatoeba.de.hu 34.3 0.588
|
||||
de hy JW300.de.hy 9.9 0.274
|
||||
de ia Tatoeba.de.ia 0.2 0.088
|
||||
de+nl+fy+af+da+fo+is+no+nb+nn+sv de+nl+fy+af+da+fo+is+no+nb+nn+sv Tatoeba.de.sv 48.1 0.663
|
||||
de pt_br+pt_BR+pt_PT+pt Tatoeba.de.pt 35.2 0.577
|
||||
dhv en JW300.dhv.en 4.7 0.190
|
||||
|
Loading…
Reference in New Issue
Block a user