train with backtranslations

Joerg Tiedemann 2020-01-18 20:37:01 +02:00
parent 0185534823
commit 596cae8922
10 changed files with 448 additions and 218 deletions

Makefile

@ -91,6 +91,7 @@ include Makefile.dist
include Makefile.tasks
include Makefile.data
include Makefile.doclevel
include Makefile.generic
include Makefile.slurm
@ -133,17 +134,6 @@ translate-ensemble: ${WORKDIR}/${TESTSET}.${MODEL}${NR}.${MODELTYPE}.ensemble.${
eval-ensemble: ${WORKDIR}/${TESTSET}.${MODEL}${NR}.${MODELTYPE}.ensemble.${SRC}.${TRG}.eval
## resume training on an existing model
resume:
if [ -e ${WORKDIR}/${MODEL}.${MODELTYPE}.model${NR}.npz.best-perplexity.npz ]; then \
cp ${WORKDIR}/${MODEL}.${MODELTYPE}.model${NR}.npz.best-perplexity.npz \
${WORKDIR}/${MODEL}.${MODELTYPE}.model${NR}.npz; \
fi
sleep 1
rm -f ${WORKDIR}/${MODEL}.${MODELTYPE}.model${NR}.done
${MAKE} train
#------------------------------------------------------------------------
# translate and evaluate all test sets in testsets/
#------------------------------------------------------------------------
@ -152,7 +142,6 @@ resume:
## and all tokenized test sets that can be found in that directory
TESTSET_HOME = ${PWD}/testsets
TESTSET_DIR = ${TESTSET_HOME}/${SRC}-${TRG}
# TESTSETS = $(patsubst ${TESTSET_DIR}/%.${SRC}.tok.gz,%,${wildcard ${TESTSET_DIR}/*.${SRC}.tok.gz})
TESTSETS = $(patsubst ${TESTSET_DIR}/%.${SRC}.gz,%,${wildcard ${TESTSET_DIR}/*.${SRC}.gz})
TESTSETS_PRESRC = $(patsubst %.gz,%.${PRE}.gz,${sort $(subst .${PRE},,${wildcard ${TESTSET_DIR}/*.${SRC}.gz})})
TESTSETS_PRETRG = $(patsubst %.gz,%.${PRE}.gz,${sort $(subst .${PRE},,${wildcard ${TESTSET_DIR}/*.${TRG}.gz})})
@ -190,197 +179,17 @@ finished:
fi
## extension -all: run something over all language pairs, e.g.
## make wordalign-all
## this goes sequentially over all language pairs
## for the parallelizable version of this: look at %-all-parallel
%-all:
for l in ${ALL_LANG_PAIRS}; do \
${MAKE} SRCLANGS="`echo $$l | cut -f1 -d'-' | sed 's/\\+/ /g'`" \
TRGLANGS="`echo $$l | cut -f2 -d'-' | sed 's/\\+/ /g'`" ${@:-all=}; \
done
# run something over all language pairs that have trained models
## - make eval-allmodels
## - make dist-allmodels
%-allmodels:
for l in ${ALL_LANG_PAIRS}; do \
if [ `find ${WORKHOME}/$$l -name '*.${PRE_SRC}-${PRE_TRG}.*.npz' | wc -l` -gt 0 ]; then \
${MAKE} SRCLANGS="`echo $$l | cut -f1 -d'-' | sed 's/\\+/ /g'`" \
TRGLANGS="`echo $$l | cut -f2 -d'-' | sed 's/\\+/ /g'`" ${@:-allmodels=}; \
fi \
done
## only bilingual models
%-allbilingual:
for l in ${ALL_BILINGUAL_MODELS}; do \
if [ `find ${WORKHOME}/$$l -name '*.${PRE_SRC}-${PRE_TRG}.*.npz' | wc -l` -gt 0 ]; then \
${MAKE} SRCLANGS="`echo $$l | cut -f1 -d'-' | sed 's/\\+/ /g'`" \
TRGLANGS="`echo $$l | cut -f2 -d'-' | sed 's/\\+/ /g'`" ${@:-allbilingual=}; \
fi \
done
## only multilingual models
%-allmultilingual:
for l in ${ALL_MULTILINGUAL_MODELS}; do \
if [ `find ${WORKHOME}/$$l -name '*.${PRE_SRC}-${PRE_TRG}.*.npz' | wc -l` -gt 0 ]; then \
${MAKE} SRCLANGS="`echo $$l | cut -f1 -d'-' | sed 's/\\+/ /g'`" \
TRGLANGS="`echo $$l | cut -f2 -d'-' | sed 's/\\+/ /g'`" ${@:-allmultilingual=}; \
fi \
done
## run something over all language pairs but make it possible to do it in parallel, for example
## - make dist-all-parallel
%-all-parallel:
${MAKE} $(subst -all-parallel,,${patsubst %,$@__%-run-for-langpair,${ALL_LANG_PAIRS}})
## run a command that includes the langpair, for example
## make wordalign__en-da+sv-run-for-langpair ...... runs wordalign with SRCLANGS="en" TRGLANGS="da sv"
## What is this good for?
## ---> can run many lang-pairs in parallel instead of a for loop that runs sequentially
%-run-for-langpair:
${MAKE} SRCLANGS='$(subst +, ,$(firstword $(subst -, ,${lastword ${subst __, ,${@:-run-for-langpair=}}})))' \
TRGLANGS='$(subst +, ,$(lastword $(subst -, ,${lastword ${subst __, ,${@:-run-for-langpair=}}})))' \
${shell echo $@ | sed 's/__.*$$//'}
## right-to-left model
%-RL:
${MAKE} MODEL=${MODEL}-RL \
MARIAN_EXTRA="${MARIAN_EXTRA} --right-left" \
${@:-RL=}
## run a multigpu job (2 or 4 GPUs)
%-multigpu %-gpu0123:
${MAKE} NR_GPUS=4 MARIAN_GPUS='0 1 2 3' $(subst -gpu0123,,${@:-multigpu=})
%-twogpu %-gpu01:
${MAKE} NR_GPUS=2 MARIAN_GPUS='0 1' $(subst -gpu01,,${@:-twogpu=})
%-gpu23:
${MAKE} NR_GPUS=2 MARIAN_GPUS='2 3' ${@:-gpu23=}
## run on CPUs (translate-cpu, eval-cpu, translate-ensemble-cpu, ...)
%-cpu:
${MAKE} MARIAN=${MARIANCPU} \
LOADMODS='${LOADCPU}' \
MARIAN_DECODER_FLAGS="${MARIAN_DECODER_CPU}" \
${@:-cpu=}
## document level models
%-doc:
${MAKE} WORKHOME=${shell realpath ${PWD}/work-spm} \
PRE=norm \
PRE_SRC=spm${SRCBPESIZE:000=}k.doc${CONTEXT_SIZE} \
PRE_TRG=spm${TRGBPESIZE:000=}k.doc${CONTEXT_SIZE} \
${@:-doc=}
## sentence-piece models
%-spm:
${MAKE} WORKHOME=${shell realpath ${PWD}/work-spm} \
PRE=norm \
PRE_SRC=spm${SRCBPESIZE:000=}k \
PRE_TRG=spm${TRGBPESIZE:000=}k \
${@:-spm=}
%-spm-noalign:
${MAKE} WORKHOME=${shell realpath ${PWD}/work-spm-noalign} \
MODELTYPE=transformer \
PRE=norm \
PRE_SRC=spm${SRCBPESIZE:000=}k \
PRE_TRG=spm${TRGBPESIZE:000=}k \
${@:-spm-noalign=}
## BPE models
%-bpe:
${MAKE} WORKHOME=${shell realpath ${PWD}/work-bpe} \
PRE=tok \
MODELTYPE=transformer \
PRE_SRC=bpe${SRCBPESIZE:000=}k \
PRE_TRG=bpe${TRGBPESIZE:000=}k \
${@:-bpe=}
%-bpe-align:
${MAKE} WORKHOME=${shell realpath ${PWD}/work-bpe-align} \
PRE=tok \
PRE_SRC=bpe${SRCBPESIZE:000=}k \
PRE_TRG=bpe${TRGBPESIZE:000=}k \
${@:-bpe-align=}
%-bpe-memad:
${MAKE} WORKHOME=${shell realpath ${PWD}/work-bpe-memad} \
PRE=tok \
MODELTYPE=transformer \
PRE_SRC=bpe${SRCBPESIZE:000=}k \
PRE_TRG=bpe${TRGBPESIZE:000=}k \
${@:-bpe-memad=}
%-bpe-old:
${MAKE} WORKHOME=${shell realpath ${PWD}/work-bpe-old} \
PRE=tok \
MODELTYPE=transformer \
PRE_SRC=bpe${SRCBPESIZE:000=}k \
PRE_TRG=bpe${TRGBPESIZE:000=}k \
${@:-bpe-old=}
## for the inbuilt sentence-piece segmentation:
# PRE_SRC=txt PRE_TRG=txt
# MARIAN=${MARIAN}-spm
# MODEL_VOCABTYPE=spm
## continue document-level training with a new context size
ifndef NEW_CONTEXT
NEW_CONTEXT = $$(($(CONTEXT_SIZE) + $(CONTEXT_SIZE)))
endif
continue-doctrain:
mkdir -p ${WORKDIR}/${MODEL}
cp ${MODEL_VOCAB} ${WORKDIR}/${MODEL}/$(subst .doc${CONTEXT_SIZE},.doc${NEW_CONTEXT},${notdir ${MODEL_VOCAB}})
cp ${MODEL_FINAL} ${WORKDIR}/${MODEL}/$(subst .doc${CONTEXT_SIZE},.doc${NEW_CONTEXT},$(notdir ${MODEL_BASENAME})).npz
${MAKE} MODEL_SUBDIR=${MODEL}/ CONTEXT_SIZE=$(NEW_CONTEXT) train-doc
## continue training with a new dataset
ifndef NEW_DATASET
NEW_DATASET = OpenSubtitles
endif
continue-datatrain:
mkdir -p ${WORKDIR}/${MODEL}
cp ${MODEL_VOCAB} ${WORKDIR}/${MODEL}/$(patsubst ${DATASET}%,${NEW_DATASET}%,${notdir ${MODEL_VOCAB}})
cp ${MODEL_FINAL} ${WORKDIR}/${MODEL}/$(patsubst ${DATASET}%,${NEW_DATASET}%,${MODEL_BASENAME}).npz
if [ -e ${BPESRCMODEL} ]; then \
cp ${BPESRCMODEL} $(patsubst ${WORKDIR}/train/${DATASET}%,${WORKDIR}/train/${NEW_DATASET}%,${BPESRCMODEL}); \
cp ${BPETRGMODEL} $(patsubst ${WORKDIR}/train/${DATASET}%,${WORKDIR}/train/${NEW_DATASET}%,${BPETRGMODEL}); \
fi
if [ -e ${SPMSRCMODEL} ]; then \
cp ${SPMSRCMODEL} $(patsubst ${WORKDIR}/train/${DATASET}%,${WORKDIR}/train/${NEW_DATASET}%,${SPMSRCMODEL}); \
cp ${SPMTRGMODEL} $(patsubst ${WORKDIR}/train/${DATASET}%,${WORKDIR}/train/${NEW_DATASET}%,${SPMTRGMODEL}); \
fi
${MAKE} MODEL_SUBDIR=${MODEL}/ DATASET=$(NEW_DATASET) train
# MARIAN_EXTRA="${MARIAN_EXTRA} --no-restore-corpus"
## resume training on an existing model
resume:
if [ -e ${WORKDIR}/${MODEL}.${MODELTYPE}.model${NR}.npz.best-perplexity.npz ]; then \
cp ${WORKDIR}/${MODEL}.${MODELTYPE}.model${NR}.npz.best-perplexity.npz \
${WORKDIR}/${MODEL}.${MODELTYPE}.model${NR}.npz; \
fi
sleep 1
rm -f ${WORKDIR}/${MODEL}.${MODELTYPE}.model${NR}.done
${MAKE} train
@ -548,13 +357,6 @@ endif
rm -f $@.input $@.output
# %.eval: % ${TEST_TRG}
# grep . ${TEST_TRG} > $@.ref
# grep . $< > $@.sys
# cat $@.sys | sacrebleu $@.ref > $@
# cat $@.sys | sacrebleu --metrics=chrf --width=3 $@.ref >> $@
# rm -f $@.ref $@.sys
%.eval: % ${TEST_TRG}
paste ${TEST_SRC}.${PRE_SRC} ${TEST_TRG} | grep $$'.\t' | cut -f2 > $@.ref
@ -575,5 +377,3 @@ endif
-e 's/&amp;/&/g' |\
sed 'n;n;G;' > $@
rm -f $@.1 $@.2 $@.3
# paste -d "\n" ${TEST_SRC} ${TEST_TRG} ${<:.eval=} |\


@ -215,6 +215,11 @@ MODEL_VOCABTYPE = yml
MODEL_VOCAB = ${WORKDIR}/${MODEL}.vocab.${MODEL_VOCABTYPE}
MODEL_DECODER = ${MODEL_FINAL}.decoder.yml
## OPUS model (in case we want to continue training with other data)
OPUSMODEL = ${MODEL_SUBDIR}opus${TRAINSIZE}.${PRE_SRC}-${PRE_TRG}
OPUSMODEL_BASE = ${OPUSMODEL}.${MODELTYPE}.model${NR}
OPUSMODEL_FINAL = ${WORKDIR}/${OPUSMODEL_BASE}.npz.best-perplexity.npz
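For orientation, a hypothetical expansion of these variables (assuming an empty MODEL_SUBDIR and TRAINSIZE, PRE_SRC=PRE_TRG=spm32k, MODELTYPE=transformer-align, NR=1):
# OPUSMODEL       = opus.spm32k-spm32k
# OPUSMODEL_BASE  = opus.spm32k-spm32k.transformer-align.model1
# OPUSMODEL_FINAL = ${WORKDIR}/opus.spm32k-spm32k.transformer-align.model1.npz.best-perplexity.npz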
## test set translation and scores


@ -31,6 +31,10 @@ DATA_TRG := ${sort ${CLEAN_TRAIN_TRG} ${CLEAN_TUNE_TRG} ${CLEAN_DEV_TRG} ${CLEAN
BACKTRANS_DIR = backtranslate/${TRG}-${SRC}
BACKTRANS_SRC = ${sort ${wildcard ${BACKTRANS_DIR}/*.${SRC}.gz}}
BACKTRANS_TRG = ${patsubst %.${SRC}.gz,%.${TRG}.gz,${BACKTRANS_SRC}}
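A sketch with made-up file names: for SRC=de and TRG=en, the wildcard searches backtranslate/en-de and pairs every synthetic source file with its monolingual target side:
# BACKTRANS_DIR = backtranslate/en-de
# BACKTRANS_SRC = backtranslate/en-de/wiki.de.gz   (hypothetical back-translated German)
# BACKTRANS_TRG = backtranslate/en-de/wiki.en.gz   (original English)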
## make data in reverse direction without re-doing word alignment etc ...
## ---> this is dangerous when things run in parallel


@ -125,12 +125,6 @@ endif
endif
ttt:
@echo ${PREPROCESS_SRCMODEL}
@echo ${PREPROCESS_TRGMODEL}
@echo ${PREPROCESS_SCRIPT}
@echo ${POSTPROCESS_SCRIPT}
${DIST_PACKAGE}: ${MODEL_FINAL}
ifneq (${SKIP_DIST_EVAL},1)


@ -4,6 +4,49 @@
DOCLEVEL_BENCHMARK_DATA = https://zenodo.org/record/3525366/files/doclevel-MT-benchmark-discomt2019.zip
## continue document-level training with a new context size
ifndef NEW_CONTEXT
NEW_CONTEXT = $$(($(CONTEXT_SIZE) + $(CONTEXT_SIZE)))
endif
continue-doctrain:
mkdir -p ${WORKDIR}/${MODEL}
cp ${MODEL_VOCAB} ${WORKDIR}/${MODEL}/$(subst .doc${CONTEXT_SIZE},.doc${NEW_CONTEXT},${notdir ${MODEL_VOCAB}})
cp ${MODEL_FINAL} ${WORKDIR}/${MODEL}/$(subst .doc${CONTEXT_SIZE},.doc${NEW_CONTEXT},$(notdir ${MODEL_BASENAME})).npz
${MAKE} MODEL_SUBDIR=${MODEL}/ CONTEXT_SIZE=$(NEW_CONTEXT) train-doc
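Usage sketch: with CONTEXT_SIZE=4, NEW_CONTEXT defaults to the shell sum $$((4 + 4)) = 8, so vocabulary and weights are copied from *.doc4* to *.doc8* names before training resumes:
# make CONTEXT_SIZE=4 continue-doctrain                  # continues as a .doc8 model
# make CONTEXT_SIZE=4 NEW_CONTEXT=16 continue-doctrain   # set the new size explicitly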
## continue training with a new dataset
ifndef NEW_DATASET
NEW_DATASET = OpenSubtitles
endif
continue-datatrain:
mkdir -p ${WORKDIR}/${MODEL}
cp ${MODEL_VOCAB} ${WORKDIR}/${MODEL}/$(patsubst ${DATASET}%,${NEW_DATASET}%,${notdir ${MODEL_VOCAB}})
cp ${MODEL_FINAL} ${WORKDIR}/${MODEL}/$(patsubst ${DATASET}%,${NEW_DATASET}%,${MODEL_BASENAME}).npz
if [ -e ${BPESRCMODEL} ]; then \
cp ${BPESRCMODEL} $(patsubst ${WORKDIR}/train/${DATASET}%,${WORKDIR}/train/${NEW_DATASET}%,${BPESRCMODEL}); \
cp ${BPETRGMODEL} $(patsubst ${WORKDIR}/train/${DATASET}%,${WORKDIR}/train/${NEW_DATASET}%,${BPETRGMODEL}); \
fi
if [ -e ${SPMSRCMODEL} ]; then \
cp ${SPMSRCMODEL} $(patsubst ${WORKDIR}/train/${DATASET}%,${WORKDIR}/train/${NEW_DATASET}%,${SPMSRCMODEL}); \
cp ${SPMTRGMODEL} $(patsubst ${WORKDIR}/train/${DATASET}%,${WORKDIR}/train/${NEW_DATASET}%,${SPMTRGMODEL}); \
fi
${MAKE} MODEL_SUBDIR=${MODEL}/ DATASET=$(NEW_DATASET) train
# MARIAN_EXTRA="${MARIAN_EXTRA} --no-restore-corpus"
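Usage sketch (NEW_DATASET defaults to OpenSubtitles; other corpus names are hypothetical substitutions):
# make continue-datatrain                       # continue training on OpenSubtitles
# make NEW_DATASET=Tatoeba continue-datatrain   # hypothetical alternative corpus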
## use the doclevel benchmark data sets
%-ost:
${MAKE} ost-datasets

Makefile.generic Normal file

@ -0,0 +1,174 @@
# -*-makefile-*-
#
# generic implicit targets that make our life a bit easier
## extension -all: run something over all language pairs, e.g.
## make wordalign-all
## this goes sequentially over all language pairs
## for the parallelizable version of this: look at %-all-parallel
%-all:
for l in ${ALL_LANG_PAIRS}; do \
${MAKE} SRCLANGS="`echo $$l | cut -f1 -d'-' | sed 's/\\+/ /g'`" \
TRGLANGS="`echo $$l | cut -f2 -d'-' | sed 's/\\+/ /g'`" ${@:-all=}; \
done
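To make the suffix stripping concrete: with a hypothetical ALL_LANG_PAIRS of "de-en fr+fr_CA-en", `make eval-all` expands into two sequential sub-makes:
# make SRCLANGS="de" TRGLANGS="en" eval
# make SRCLANGS="fr fr_CA" TRGLANGS="en" eval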
# run something over all language pairs that have trained models
## - make eval-allmodels
## - make dist-allmodels
%-allmodels:
for l in ${ALL_LANG_PAIRS}; do \
if [ `find ${WORKHOME}/$$l -name '*.${PRE_SRC}-${PRE_TRG}.*.npz' | wc -l` -gt 0 ]; then \
${MAKE} SRCLANGS="`echo $$l | cut -f1 -d'-' | sed 's/\\+/ /g'`" \
TRGLANGS="`echo $$l | cut -f2 -d'-' | sed 's/\\+/ /g'`" ${@:-allmodels=}; \
fi \
done
## only bilingual models
%-allbilingual:
for l in ${ALL_BILINGUAL_MODELS}; do \
if [ `find ${WORKHOME}/$$l -name '*.${PRE_SRC}-${PRE_TRG}.*.npz' | wc -l` -gt 0 ]; then \
${MAKE} SRCLANGS="`echo $$l | cut -f1 -d'-' | sed 's/\\+/ /g'`" \
TRGLANGS="`echo $$l | cut -f2 -d'-' | sed 's/\\+/ /g'`" ${@:-allbilingual=}; \
fi \
done
## only multilingual models
%-allmultilingual:
for l in ${ALL_MULTILINGUAL_MODELS}; do \
if [ `find ${WORKHOME}/$$l -name '*.${PRE_SRC}-${PRE_TRG}.*.npz' | wc -l` -gt 0 ]; then \
${MAKE} SRCLANGS="`echo $$l | cut -f1 -d'-' | sed 's/\\+/ /g'`" \
TRGLANGS="`echo $$l | cut -f2 -d'-' | sed 's/\\+/ /g'`" ${@:-allmultilingual=}; \
fi \
done
## run something over all language pairs but make it possible to do it in parallel, for example
## - make dist-all-parallel
%-all-parallel:
${MAKE} $(subst -all-parallel,,${patsubst %,$@__%-run-for-langpair,${ALL_LANG_PAIRS}})
## run a command that includes the langpair, for example
## make wordalign__en-da+sv-run-for-langpair ...... runs wordalign with SRCLANGS="en" TRGLANGS="da sv"
## What is this good for?
## ---> can run many lang-pairs in parallel instead of a for loop that runs sequentially
%-run-for-langpair:
${MAKE} SRCLANGS='$(subst +, ,$(firstword $(subst -, ,${lastword ${subst __, ,${@:-run-for-langpair=}}})))' \
TRGLANGS='$(subst +, ,$(lastword $(subst -, ,${lastword ${subst __, ,${@:-run-for-langpair=}}})))' \
${shell echo $@ | sed 's/__.*$$//'}
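Walking through the example above: for the target wordalign__en-da+sv-run-for-langpair, the `__` split keeps en-da+sv, the firstword/lastword of its `-` split give en and da+sv, each `+` becomes a space, and the sed call strips everything from `__` onwards to recover the command, so the recipe reduces to:
# make SRCLANGS='en' TRGLANGS='da sv' wordalign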
## right-to-left model
%-RL:
${MAKE} MODEL=${MODEL}-RL \
MARIAN_EXTRA="${MARIAN_EXTRA} --right-left" \
${@:-RL=}
## include all backtranslation data in training as well
## start from the pre-trained opus model if it exists
%-add-backtranslations:
ifneq (${wildcard ${OPUSMODEL_FINAL}},)
cp ${OPUSMODEL_FINAL} ${MODEL_BASENAME}.gz
endif
${MAKE} DATASET=opus+bt \
CLEAN_TRAIN_SRC="${CLEAN_TRAIN_SRC} ${BACKTRANS_SRC}" \
CLEAN_TRAIN_TRG="${CLEAN_TRAIN_TRG} ${BACKTRANS_TRG}" \
${@:-add-backtranslations=}
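Usage sketch: the suffix attaches to an existing target, so `make train-add-backtranslations` re-runs train as DATASET=opus+bt with the back-translated corpora appended to the clean training data, seeding from the pre-trained opus model when one exists:
# make train-add-backtranslations   # train on OPUS data plus backtranslations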
## run a multigpu job (2 or 4 GPUs)
%-multigpu %-gpu0123:
${MAKE} NR_GPUS=4 MARIAN_GPUS='0 1 2 3' $(subst -gpu0123,,${@:-multigpu=})
%-twogpu %-gpu01:
${MAKE} NR_GPUS=2 MARIAN_GPUS='0 1' $(subst -gpu01,,${@:-twogpu=})
%-gpu23:
${MAKE} NR_GPUS=2 MARIAN_GPUS='2 3' ${@:-gpu23=}
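Usage sketch for the GPU selectors:
# make train-multigpu   # NR_GPUS=4, MARIAN_GPUS='0 1 2 3'
# make train-gpu01      # NR_GPUS=2, MARIAN_GPUS='0 1'
# make train-gpu23      # NR_GPUS=2, MARIAN_GPUS='2 3'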
## run on CPUs (translate-cpu, eval-cpu, translate-ensemble-cpu, ...)
%-cpu:
${MAKE} MARIAN=${MARIANCPU} \
LOADMODS='${LOADCPU}' \
MARIAN_DECODER_FLAGS="${MARIAN_DECODER_CPU}" \
${@:-cpu=}
## document level models
%-doc:
${MAKE} WORKHOME=${shell realpath ${PWD}/work-spm} \
PRE=norm \
PRE_SRC=spm${SRCBPESIZE:000=}k.doc${CONTEXT_SIZE} \
PRE_TRG=spm${TRGBPESIZE:000=}k.doc${CONTEXT_SIZE} \
${@:-doc=}
## sentence-piece models
%-spm:
${MAKE} WORKHOME=${shell realpath ${PWD}/work-spm} \
PRE=norm \
PRE_SRC=spm${SRCBPESIZE:000=}k \
PRE_TRG=spm${TRGBPESIZE:000=}k \
${@:-spm=}
%-spm-noalign:
${MAKE} WORKHOME=${shell realpath ${PWD}/work-spm-noalign} \
MODELTYPE=transformer \
PRE=norm \
PRE_SRC=spm${SRCBPESIZE:000=}k \
PRE_TRG=spm${TRGBPESIZE:000=}k \
${@:-spm-noalign=}
## BPE models
%-bpe:
${MAKE} WORKHOME=${shell realpath ${PWD}/work-bpe} \
PRE=tok \
MODELTYPE=transformer \
PRE_SRC=bpe${SRCBPESIZE:000=}k \
PRE_TRG=bpe${TRGBPESIZE:000=}k \
${@:-bpe=}
%-bpe-align:
${MAKE} WORKHOME=${shell realpath ${PWD}/work-bpe-align} \
PRE=tok \
PRE_SRC=bpe${SRCBPESIZE:000=}k \
PRE_TRG=bpe${TRGBPESIZE:000=}k \
${@:-bpe-align=}
%-bpe-memad:
${MAKE} WORKHOME=${shell realpath ${PWD}/work-bpe-memad} \
PRE=tok \
MODELTYPE=transformer \
PRE_SRC=bpe${SRCBPESIZE:000=}k \
PRE_TRG=bpe${TRGBPESIZE:000=}k \
${@:-bpe-memad=}
%-bpe-old:
${MAKE} WORKHOME=${shell realpath ${PWD}/work-bpe-old} \
PRE=tok \
MODELTYPE=transformer \
PRE_SRC=bpe${SRCBPESIZE:000=}k \
PRE_TRG=bpe${TRGBPESIZE:000=}k \
${@:-bpe-old=}
## for the inbuilt sentence-piece segmentation:
# PRE_SRC=txt PRE_TRG=txt
# MARIAN=${MARIAN}-spm
# MODEL_VOCABTYPE=spm

html/index.php Normal file

@ -0,0 +1,111 @@
<?php
$lines = file("https://object.pouta.csc.fi/OPUS-MT-models/index.txt");
// initialise lookup tables and counters used in the parsing loop below
$models = array(); $bilingual = array(); $multilingual = array();
$srclangs = array(); $trglangs = array(); $languages = array();
$nrmodels = 0; $nrlangpairs = 0; $nrmultipairs = 0;
foreach ($lines as $line){
$line = rtrim($line);
if (substr($line, -4) === '.zip'){
$parts = explode('/',$line);
$langs = explode('-',$parts[0]);
if (strpos($langs[0],'+') !== false || strpos($langs[1],'+') !== false){
$multilingual["$langs[0]-$langs[1]"]=1;
$src = explode('+',$langs[0]);
$trg = explode('+',$langs[1]);
foreach ($src as $s){
foreach ($trg as $t){
if (!array_key_exists("$s$t",$models)){
$models["$s$t"] = "$langs[0]-$langs[1]";
$nrlangpairs++;
}
$nrmultipairs++;
$srclangs[$s]=1;
$trglangs[$t]=1;
$languages[$s]=1;
$languages[$t]=1;
}
}
}
else{
if (!array_key_exists("$langs[0]$langs[1]",$models)){
$nrlangpairs++;
}
$bilingual["$langs[0]-$langs[1]"]=1;
$models["$langs[0]$langs[1]"] = "$langs[0]-$langs[1]";
// $models["$langs[0]$langs[1]"] = $line;
$srclangs[$langs[0]]=1;
$trglangs[$langs[1]]=1;
$languages[$langs[0]]=1;
$languages[$langs[1]]=1;
$nrmodels++;
}
}
}
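// Worked example with hypothetical index.txt entries: a line like
// "de-en/opus-2020-01-18.zip" registers $bilingual["de-en"] and
// $models["deen"] = "de-en", while "fr+fr_CA-en/opus.zip" registers
// $multilingual["fr+fr_CA-en"] plus one $models entry per expanded
// pair, e.g. $models["fren"] = "fr+fr_CA-en".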
// ksort($languages);
ksort($srclangs);
ksort($trglangs);
ksort($multilingual);
echo("<html><head></head><body>");
echo("<h1>Pre-trained Opus-MT Models</h1><ul>");
// echo("<li>Number of bilingual models: $nrmodels</li>");
echo("<li>Number of bilingual models: ");
echo(count($bilingual));
echo("</li>");
echo("<li>Number of multilingual models: ");
echo(count($multilingual));
echo("</li>");
echo("<li>Number of supported source languages: ");
echo(count($srclangs));
echo("</li>");
echo("<li>Number of supported target languages: ");
echo(count($trglangs));
echo("</li>");
echo("<li>Number of supported language pairs: $nrlangpairs</li>");
echo("<li>Language pairs supported by multilingual models: $nrmultipairs</li>");
echo('</ul>');
echo("<h2>Multilingual models</h2><ul>");
foreach ($multilingual as $model => $c){
echo("<li><a href=\"https://github.com/Helsinki-NLP/OPUS-MT-train/tree/master/models/$model\">$model</a></li>");
}
echo('</ul>');
echo("<h2>Language pairs</h2><ul>");
echo('<table><tr><th></th>');
foreach ($trglangs as $language => $count){
echo '<th>';
echo $language;
echo '</th>';
}
echo('</tr>');
foreach ($srclangs as $src => $count){
echo "<tr><td>$src</td>";
foreach ($trglangs as $trg => $count){
if (array_key_exists("$src$trg",$models)){
echo("<td><a href=\"https://github.com/Helsinki-NLP/OPUS-MT-train/tree/master/models/");
echo($models["$src$trg"]);
if ($models["$src$trg"] != "$src-$trg"){
echo("\">multi</a></td>\n");
}
else{
echo("\">$src$trg</a></td>\n");
}
}
else{
echo("<td>-</td>");
}
}
echo('</tr>');
}
echo('</table>');
echo('</body></html>');
?>

preprocess-bpe-multi-target.sh Normal file → Executable file

preprocess-spm-multi-target.sh Normal file → Executable file


@ -1,40 +1,49 @@
ab de JW300.ab.de 1.4 0.148
ab en JW300.ab.en 2.9 0.144
ab en Tatoeba.ab.en 2.3 0.097
ab fi JW300.ab.fi 1.5 0.147
ab fr JW300.ab.fr 1.8 0.129
ab sv JW300.ab.sv 2.4 0.147
ach de JW300.ach.de 1.8 0.142
ach en JW300.ach.en 5.4 0.207
ach es JW300.ach.es 2.6 0.153
ach fi JW300.ach.fi 1.7 0.163
ach fr JW300.ach.fr 3.5 0.159
ach sv JW300.ach.sv 2.7 0.160
acu en bible-uedin.acu.en 3.8 0.202
ada de JW300.ada.de 1.6 0.139
ada en JW300.ada.en 4.3 0.182
ada es JW300.ada.es 2.7 0.153
ada fi JW300.ada.fi 1.7 0.154
ada fr JW300.ada.fr 3.1 0.152
ada sv JW300.ada.sv 2.1 0.146
aed de JW300.aed.de 2.1 0.149
aed en JW300.aed.en 4.0 0.177
aed es JW300.aed.es 89.1 0.915
aed fi JW300.aed.fi 2.2 0.163
aed fr JW300.aed.fr 3.5 0.165
aed sv JW300.aed.sv 3.3 0.170
af de Tatoeba.af.de 48.6 0.681
af en Tatoeba.af.en 60.8 0.736
af es JW300.af.es 35.7 0.554
af fi JW300.af.fi 32.3 0.576
af fr JW300.af.fr 35.3 0.543
af sv JW300.af.sv 40.4 0.599
agr en bible-uedin.agr.en 4.5 0.222
am de JW300.am.de 15.1 0.339
am en GlobalVoices.am.en 6.1 0.286
am en Tatoeba.am.en 63.8 0.744
am es GlobalVoices.am.es 3.9 0.251
am fi JW300.am.fi 18.1 0.394
am fr GlobalVoices.am.fr 3.4 0.233
am sv JW300.am.sv 21.0 0.377
ar de Tatoeba.ar.de 43.0 0.614
ar en Tatoeba.ar.en 49.4 0.661
ar fi JW300.ar.fi 18.4 0.415
ar fr Tatoeba.ar.fr 43.2 0.600
ar sv GlobalVoices.ar.sv 12.9 0.386
as de JW300.as.de 1.1 0.176
ase de JW300.ase.de 27.2 0.478
ase en JW300.ase.en 99.5 0.997
ase fr JW300.ase.fr 37.8 0.553
as en JW300.as.en 1.7 0.137
@ -45,16 +54,19 @@ as fi JW300.as.fi 1.1 0.167
as fr JW300.as.fr 1.4 0.154
as sv JW300.as.sv 1.0 0.148
ast en Tatoeba.ast.en 81.4 0.858
ay de JW300.ay.de 5.0 0.191
ay en JW300.ay.en 7.2 0.202
ay es JW300.ay.es 11.3 0.265
ay fi JW300.ay.fi 6.5 0.222
ay fr JW300.ay.fr 6.4 0.203
ay sv JW300.ay.sv 6.8 0.212
ba de JW300.ba.de 1.4 0.146
ba en JW300.ba.en 2.8 0.144
ba en Tatoeba.ba.en 0.8 0.134
ba es JW300.ba.es 2.0 0.141
ba fi JW300.ba.fi 1.7 0.164
ba fr JW300.ba.fr 3.1 0.150
bas de JW300.bas.de 2.3 0.161
bas en JW300.bas.en 5.8 0.207
bas es JW300.bas.es 4.0 0.175
bas fi JW300.bas.fi 2.4 0.174
@ -64,16 +76,19 @@ ba sv JW300.ba.sv 2.2 0.139
bbc en JW300.bbc.en 6.7 0.204
bbc es JW300.bbc.es 4.4 0.178
bbc fr JW300.bbc.fr 4.4 0.172
bci de JW300.bci.de 5.0 0.215
bci en JW300.bci.en 13.9 0.269
bci es JW300.bci.es 5.9 0.223
bci fi JW300.bci.fi 5.8 0.242
bci fr JW300.bci.fr 6.9 0.216
bci sv JW300.bci.sv 7.6 0.235
bcl de JW300.bcl.de 30.3 0.510
bcl en JW300.bcl.en 56.8 0.705
bcl es JW300.bcl.es 37.0 0.551
bcl fi JW300.bcl.fi 33.3 0.573
bcl fr JW300.bcl.fr 35.0 0.527
bcl sv JW300.bcl.sv 38.0 0.565
bem de JW300.bem.de 18.9 0.379
bem en JW300.bem.en 33.4 0.491
bem es JW300.bem.es 22.8 0.403
bem fi JW300.bem.fi 22.8 0.439
@ -83,16 +98,19 @@ ber en Tatoeba.ber.en 37.3 0.566
ber es Tatoeba.ber.es 33.8 0.487
ber fr Tatoeba.ber.fr 60.2 0.754
bfi en JW300.bfi.en 20.0 0.423
bg de GlobalVoices.bg.de 19.9 0.484
bg en Tatoeba.bg.en 59.4 0.727
bg es GlobalVoices.bg.es 24.5 0.526
bg fi JW300.bg.fi 23.7 0.505
bg fr GlobalVoices.bg.fr 20.9 0.480
bg sv JW300.bg.sv 29.1 0.494
bhw en JW300.bhw.en 7.7 0.235
bi de JW300.bi.de 15.9 0.355
bi en JW300.bi.en 30.3 0.458
bi fi JW300.bi.fi 0.6 0.124
bi fr JW300.bi.fr 21.5 0.382
bi sv JW300.bi.sv 22.7 0.403
bn de GlobalVoices.bn.de 3.4 0.228
bn en Tatoeba.bn.en 49.8 0.644
bn es GlobalVoices.bn.es 12.7 0.372
bn fi JW300.bn.fi 5.5 0.214
@ -104,39 +122,47 @@ bs en GNOME.bs.en 71.9 0.789
bs en Tatoeba.bs.en 64.9 0.784
bsn en bible-uedin.bsn.en 1.2 0.117
btx en JW300.btx.en 7.0 0.236
bum de JW300.bum.de 1.9 0.154
bum en JW300.bum.en 4.6 0.182
bum es JW300.bum.es 3.2 0.162
bum fi JW300.bum.fi 2.2 0.161
bum fr JW300.bum.fr 4.0 0.173
bum sv JW300.bum.sv 3.5 0.163
bzs de JW300.bzs.de 19.3 0.385
bzs en JW300.bzs.en 44.5 0.605
bzs es JW300.bzs.es 28.1 0.464
bzs fi JW300.bzs.fi 24.7 0.464
bzs fr JW300.bzs.fr 30.0 0.479
bzs sv JW300.bzs.sv 30.7 0.489
cab de JW300.cab.de 1.8 0.134
cab en JW300.cab.en 3.0 0.154
cab es JW300.cab.es 5.1 0.225
cab fi JW300.cab.fi 1.7 0.150
cab fr JW300.cab.fr 3.1 0.153
cab sv JW300.cab.sv 2.6 0.152
ca de Tatoeba.ca.de 36.2 0.569
ca en Tatoeba.ca.en 51.4 0.678
ca es Tatoeba.ca.es 74.9 0.863
ca fr Tatoeba.ca.fr 50.4 0.672
cak de JW300.cak.de 0.7 0.077
cak en JW300.cak.en 2.6 0.140
cak fi JW300.cak.fi 0.6 0.109
cak fr JW300.cak.fr 2.2 0.132
cak sv JW300.cak.sv 0.6 0.084
ca sv GlobalVoices.ca.sv 11.2 0.366
cat de JW300.cat.de 1.4 0.143
cat en JW300.cat.en 3.3 0.171
cat fi JW300.cat.fi 1.6 0.155
cat fr JW300.cat.fr 3.5 0.163
cat sv JW300.cat.sv 2.5 0.154
ceb de Tatoeba.ceb.de 9.7 0.312
ceb en JW300.ceb.en 52.6 0.670
ceb en Tatoeba.ceb.en 59.5 0.704
ceb es JW300.ceb.es 31.6 0.508
ceb fi JW300.ceb.fi 27.4 0.525
ceb fr JW300.ceb.fr 30.0 0.491
ceb sv JW300.ceb.sv 35.5 0.552
chk de JW300.chk.de 17.0 0.350
chk en JW300.chk.en 31.2 0.465
chk es JW300.chk.es 20.8 0.374
chk fi JW300.chk.fi 19.4 0.395
@ -145,20 +171,23 @@ chk sv JW300.chk.sv 23.6 0.406
cjk en JW300.cjk.en 6.8 0.226
cjk es JW300.cjk.es 3.8 0.169
cjk fr JW300.cjk.fr 4.3 0.174
cmn+cn+yue+ze_zh+zh_cn+zh_CN+zh_HK+zh_tw+zh_TW+zh_yue+zhs+zht+zh de Tatoeba.cmn.de 33.1 0.530
cmn+cn+yue+ze_zh+zh_cn+zh_CN+zh_HK+zh_tw+zh_TW+zh_yue+zhs+zht+zh de Tatoeba.cmn.de 33.4 0.534
cmn+cn+yue+ze_zh+zh_cn+zh_CN+zh_HK+zh_tw+zh_TW+zh_yue+zhs+zht+zh fi bible-uedin.cmn.fi 21.6 0.497
cmn+cn+yue+ze_zh+zh_cn+zh_CN+zh_HK+zh_tw+zh_TW+zh_yue+zhs+zht+zh sv Tatoeba.cmn.sv 46.6 0.620
cnh en JW300.cnh.en 6.9 0.240
crp de bible-uedin.crp.de 2.5 0.190
crp es bible-uedin.crp.es 2.8 0.187
crp fi bible-uedin.crp.fi 2.0 0.181
crp fr bible-uedin.crp.fr 2.9 0.190
crp sv bible-uedin.crp.sv 3.1 0.184
crs de JW300.crs.de 20.4 0.397
crs en JW300.crs.en 42.9 0.589
crs es JW300.crs.es 26.1 0.445
crs fi JW300.crs.fi 25.6 0.479
crs fr JW300.crs.fr 29.4 0.475
crs sv JW300.crs.sv 29.3 0.480
csb en Tatoeba.csb.en 0.1 0.049
cs de Tatoeba.cs.de 51.6 0.687
cs en newstest2014-csen.cs.en 34.1 0.612
cs en newstest2015-encs.cs.en 30.4 0.565
cs en newstest2016-encs.cs.en 31.8 0.584
@ -167,16 +196,19 @@ cs en newstest2018-encs.cs.en 30.3 0.566
cs en Tatoeba.cs.en 58.0 0.721
cs fi JW300.cs.fi 25.5 0.523
cs fr GlobalVoices.cs.fr 21.0 0.488
csg de JW300.csg.de 2.8 0.162
csg en JW300.csg.en 4.6 0.183
csg es JW300.csg.es 93.1 0.952
csg fi JW300.csg.fi 2.3 0.160
csg fr JW300.csg.fr 4.7 0.184
csg sv JW300.csg.sv 4.5 0.176
csl de JW300.csl.de 1.7 0.147
csl en JW300.csl.en 4.1 0.162
csl es JW300.csl.es 3.1 0.141
csl fi JW300.csl.fi 2.5 0.152
csl fr JW300.csl.fr 3.0 0.156
csl sv JW300.csl.sv 3.3 0.142
csn de JW300.csn.de 1.9 0.145
csn en JW300.csn.en 3.8 0.172
csn es JW300.csn.es 87.4 0.899
csn fi JW300.csn.fi 2.0 0.162
@ -185,26 +217,68 @@ csn sv JW300.csn.sv 3.8 0.173
cs sv JW300.cs.sv 30.6 0.527
ctu en JW300.ctu.en 2.9 0.157
ctu fr JW300.ctu.fr 3.3 0.166
cv de JW300.cv.de 1.4 0.148
cv en JW300.cv.en 2.6 0.151
cv en Tatoeba.cv.en 0.3 0.102
cv es JW300.cv.es 2.0 0.152
cv fi JW300.cv.fi 1.2 0.148
cv fr JW300.cv.fr 2.6 0.154
cv sv JW300.cv.sv 2.1 0.144
cy de JW300.cy.de 4.7 0.200
cy en Tatoeba.cy.en 33.0 0.525
cy es JW300.cy.es 0.0 0.025
cy fi JW300.cy.fi 0.3 0.067
cy fr JW300.cy.fr 8.7 0.266
cy sv JW300.cy.sv 6.6 0.218
da de Tatoeba.da.de 57.4 0.741
da en Tatoeba.da.en 63.6 0.769
da es Tatoeba.da.es 53.7 0.715
da fi Tatoeba.da.fi 39.0 0.629
da+fo+is+no+nb+nn+sv da+fo+is+no+nb+nn+sv Tatoeba.da.sv 69.2 0.811
da fr Tatoeba.da.fr 62.2 0.751
de ab JW300.de.ab 1.0 0.124
de ach JW300.de.ach 3.6 0.173
de ada JW300.de.ada 6.5 0.196
de aed JW300.de.aed 3.1 0.150
de af Tatoeba.de.af 49.9 0.703
de am JW300.de.am 12.2 0.252
de ar Tatoeba.de.ar 14.7 0.456
de ase JW300.de.ase 30.4 0.483
de as JW300.de.as 1.4 0.122
de ay JW300.de.ay 5.2 0.239
de az_IR+az JW300.de.az 13.4 0.342
de ba JW300.de.ba 1.9 0.132
de bas JW300.de.bas 4.2 0.167
de bci JW300.de.bci 9.4 0.248
de bcl JW300.de.bcl 34.6 0.563
de bem JW300.de.bem 19.2 0.434
de be_tarask+be Tatoeba.de.be 3.1 0.106
de cmn+cn+yue+ze_zh+zh_cn+zh_CN+zh_HK+zh_tw+zh_TW+zh_yue+zhs+zht+zh bible-uedin.de.zh 21.9 0.293
de de Tatoeba.de.de 40.8 0.617
de bg GlobalVoices.de.bg 19.4 0.463
de bi JW300.de.bi 25.7 0.450
de bn GlobalVoices.de.bn 1.3 0.182
de bum JW300.de.bum 5.0 0.182
de bzs JW300.de.bzs 21.0 0.389
de cab JW300.de.cab 2.7 0.176
de cak JW300.de.cak 0.8 0.116
de ca Tatoeba.de.ca 34.0 0.552
de cat JW300.de.cat 3.0 0.157
de ceb Tatoeba.de.ceb 8.9 0.412
de chk JW300.de.chk 15.9 0.364
de cmn+cn+yue+ze_zh+zh_cn+zh_CN+zh_HK+zh_tw+zh_TW+zh_yue+zhs+zht+zh bible-uedin.de.zh 24.4 0.335
de crp bible-uedin.de.crp 4.4 0.202
de crs JW300.de.crs 24.1 0.429
de csg JW300.de.csg 3.7 0.169
de csl JW300.de.csl 2.2 0.044
de csn JW300.de.csn 2.7 0.154
de cs Tatoeba.de.cs 42.2 0.625
de cv JW300.de.cv 2.0 0.144
de cy JW300.de.cy 4.1 0.187
de de Tatoeba.de.de 40.7 0.616
de dhv JW300.de.dhv 5.6 0.241
de dje bible-uedin.de.dje 4.9 0.223
de ee JW300.de.ee 24.6 0.463
de efi JW300.de.efi 24.2 0.451
de el Tatoeba.de.el 45.7 0.649
de en newssyscomb2009.de.en 28.6 0.553
de en news-test2008.de.en 27.6 0.547
de en newstest2009.de.en 26.9 0.544
@ -219,10 +293,16 @@ de en newstest2017-ende.de.en 35.6 0.609
de en newstest2018-ende.de.en 43.8 0.667
de en newstest2019-deen.de.en 39.6 0.637
de en Tatoeba.de.en 55.1 0.704
de eo Tatoeba.de.eo 48.6 0.673
de es Tatoeba.de.es 48.5 0.676
de et JW300.de.et 20.2 0.465
de eu bible-uedin.de.eu 0.3 0.132
de fa GlobalVoices.de.fa 4.8 0.262
de fi goethe-institute-test1.de.fi 18.3 0.493
de fi goethe-institute-test2.de.fi 18.0 0.494
de fi Tatoeba.de.fi 40.0 0.628
de fj JW300.de.fj 24.6 0.470
de fon JW300.de.fon 4.5 0.156
de fr euelections_dev2019.transformer-align.de 32.2 0.590
de fr newssyscomb2009.de.fr 26.8 0.553
de fr news-test2008.de.fr 26.4 0.548
@ -233,6 +313,25 @@ de fr newstest2012.de.fr 27.7 0.554
de fr newstest2013.de.fr 29.5 0.560
de fr newstest2019-defr.de.fr 36.6 0.625
de fr Tatoeba.de.fr 49.2 0.664
de fse JW300.de.fse 3.2 0.180
de gaa JW300.de.gaa 26.3 0.471
de gd bible-uedin.de.gd 0.0 0.095
de gil JW300.de.gil 24.0 0.472
de guc JW300.de.guc 2.1 0.194
de gug JW300.de.gug 7.2 0.241
de gu JW300.de.gu 2.7 0.129
de guw JW300.de.guw 27.1 0.472
de gv bible-uedin.de.gv 0.0 0.028
de gym JW300.de.gym 3.4 0.218
de ha JW300.de.ha 20.7 0.417
de hi JW300.de.hi 4.2 0.162
de hil JW300.de.hil 33.9 0.563
de ho JW300.de.ho 22.6 0.461
de hsb Tatoeba.de.hsb 0.1 0.042
de ht JW300.de.ht 21.8 0.390
de hu Tatoeba.de.hu 34.3 0.588
de hy JW300.de.hy 9.9 0.274
de ia Tatoeba.de.ia 0.2 0.088
de+nl+fy+af+da+fo+is+no+nb+nn+sv de+nl+fy+af+da+fo+is+no+nb+nn+sv Tatoeba.de.sv 48.1 0.663
de pt_br+pt_BR+pt_PT+pt Tatoeba.de.pt 35.2 0.577
dhv en JW300.dhv.en 4.7 0.190