mirror of https://github.com/Helsinki-NLP/OPUS-MT-train.git
synced 2024-11-30 22:14:14 +03:00
sami language task added
This commit is contained in:
parent 14f6ef808a
commit f4fdb304a5
@@ -13,6 +13,18 @@ ifndef THREADS
endif

## SKIP_LANGPAIRS can be used to skip certain language pairs
## in data preparation for multilingual models
## ---> this can be good to skip BIG language pairs
## that would very much dominate all the data
## must be a pattern that can be matched by egrep
## e.g. en-de|en-fr

ifndef SKIP_LANGPAIRS
  SKIP_LANGPAIRS = "nothing"
endif
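For illustration only (this sketch is not part of the commit): the pattern is applied by echoing both orderings of a language pair through egrep, and any match means the pair is skipped. A minimal standalone shell sketch of that check, using the example pattern above:

SKIP_LANGPAIRS='en-de|en-fr'
s=en; t=de
# both directions of the pair are matched against the pattern
if [ `echo "$s-$t $t-$s" | egrep "$SKIP_LANGPAIRS" | wc -l` -gt 0 ]; then
  echo "skip $s-$t"
fi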

## look for cleanup scripts and put them into a pipe
## they should be executable and should basically read STDIN and print to STDOUT
## no further arguments are supported
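As an illustration of this contract (a hypothetical script, not part of the commit), a cleanup script only needs to be an executable STDIN-to-STDOUT filter, for example:

#!/bin/sh
# hypothetical cleanup filter: collapse runs of whitespace and drop empty lines
sed 's/[[:space:]][[:space:]]*/ /g' | grep -v '^ *$'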
@@ -66,9 +78,6 @@ REV_LANGSTR = ${subst ${SPACE},+,$(TRGLANGS)}-${subst ${SPACE},+,$(SRCLANGS)}
REV_WORKDIR = ${WORKHOME}/${REV_LANGSTR}

ttt:
echo ${CLEAN_TRAIN_SRC}
echo ${CLEAN_TRAIN_TRG}

reverse-data:
ifeq (${PRE_SRC},${PRE_TRG})
@@ -339,7 +348,6 @@ ${TRAIN_ALG}: ${TRAIN_SRC}.clean.${PRE_SRC}${TRAINSIZE}.gz \
@echo "done!"

## add training data for each language combination
## and put it together in local space
${LOCAL_TRAIN_SRC}: ${DEV_SRC} ${DEV_TRG}
@@ -347,12 +355,16 @@ ${LOCAL_TRAIN_SRC}: ${DEV_SRC} ${DEV_TRG}
rm -f ${LOCAL_TRAIN_SRC} ${LOCAL_TRAIN_TRG}
-for s in ${SRCLANGS}; do \
for t in ${TRGLANGS}; do \
if [ ${HELDOUTSIZE} -gt 0 ]; then \
${MAKE} DATASET=${DATASET} SRC:=$$s TRG:=$$t \
add-to-local-train-and-heldout-data; \
if [ ! `echo "$$s-$$t $$t-$$s" | egrep '${SKIP_LANGPAIRS}' | wc -l` -gt 0 ]; then \
if [ ${HELDOUTSIZE} -gt 0 ]; then \
${MAKE} DATASET=${DATASET} SRC:=$$s TRG:=$$t \
add-to-local-train-and-heldout-data; \
else \
${MAKE} DATASET=${DATASET} SRC:=$$s TRG:=$$t \
add-to-local-train-data; \
fi \
else \
${MAKE} DATASET=${DATASET} SRC:=$$s TRG:=$$t \
add-to-local-train-data; \
echo "!!!!!!!!!!! skip language pair $$s-$$t !!!!!!!!!!!!!!!!"; \
fi \
done \
done
@@ -443,7 +455,11 @@ ${DEV_SRC}.shuffled.gz:
rm -f ${DEV_SRC} ${DEV_TRG}
-for s in ${SRCLANGS}; do \
for t in ${TRGLANGS}; do \
${MAKE} SRC=$$s TRG=$$t add-to-dev-data; \
if [ ! `echo "$$s-$$t $$t-$$s" | egrep '${SKIP_LANGPAIRS}' | wc -l` -gt 0 ]; then \
${MAKE} SRC=$$s TRG=$$t add-to-dev-data; \
else \
echo "!!!!!!!!!!! skip language pair $$s-$$t !!!!!!!!!!!!!!!!"; \
fi \
done \
done
paste ${DEV_SRC} ${DEV_TRG} | shuf | gzip -c > $@
@@ -585,7 +601,11 @@ ifneq (${TESTSET},${DEVSET})
else \
for s in ${SRCLANGS}; do \
for t in ${TRGLANGS}; do \
${MAKE} SRC=$$s TRG=$$t add-to-test-data; \
if [ ! `echo "$$s-$$t $$t-$$s" | egrep '${SKIP_LANGPAIRS}' | wc -l` -gt 0 ]; then \
${MAKE} SRC=$$s TRG=$$t add-to-test-data; \
else \
echo "!!!!!!!!!!! skip language pair $$s-$$t !!!!!!!!!!!!!!!!"; \
fi \
done \
done; \
if [ ${TESTSIZE} -lt `cat $@ | wc -l` ]; then \
Makefile.tasks
@@ -398,6 +398,129 @@ ${DATADIR}/${PRE}/dic.cy-en.clean.cy.gz:

GIELLATEKNO_HOME = https://victorio.uit.no/biggies/trunk
GIELLATEKNO_TM_HOME = ${GIELLATEKNO_HOME}/mt/omegat

GIELLATEKNO_SAMI_TM = fin-smn/tm/finsmn.tmx \
fin-sme/tm/finsme.tmx \
fin-sms/tm/finsms.tmx \
sme-smn/tm/smesmn.tmx \
sme-smj/tm/smesmj.tmx \
sme-nob/tm/smenob.tmx \
sme-sma/tm/smesma.tmx \
nob-smj/tm/nobsmj.tmx \
nob-sme/tm/nobsme-2012.tmx \
nob-sme/tm/nobsme-admin.tmx \
nob-sme/tm/nobsme-bible.tmx \
nob-sme/tm/nobsme-facta.tmx \
nob-sme/tm/nobsme-laws.tmx \
nob-sme/tm/nobsme-science.tmx \
nob-sma/tm/nobsma.tmx \
sma-nob/tm/smanob.tmx

## glossaries

convert-sami-gloss:
wget ${GIELLATEKNO_TM_HOME}/fin-smn/glossary/finsmn.utf8
cut -f1 finsmn.utf8 | gzip -c > ${DATADIR}/${PRE}/glossary.fi-smn.clean.fi.gz
cut -f2 finsmn.utf8 | gzip -c > ${DATADIR}/${PRE}/glossary.fi-smn.clean.smn.gz
rm -f finsmn.utf8
wget ${GIELLATEKNO_TM_HOME}/fin-sme/glossary/finsme.utf8
cut -f1 finsme.utf8 | gzip -c > ${DATADIR}/${PRE}/glossary.fi-se.clean.fi.gz
cut -f2 finsme.utf8 | gzip -c > ${DATADIR}/${PRE}/glossary.fi-se.clean.se.gz
rm -f finsme.utf8
wget ${GIELLATEKNO_TM_HOME}/fin-sms/glossary/finsms.utf8
cut -f1 finsms.utf8 | gzip -c > ${DATADIR}/${PRE}/glossary.fi-sms.clean.fi.gz
cut -f2 finsms.utf8 | gzip -c > ${DATADIR}/${PRE}/glossary.fi-sms.clean.sms.gz
rm -f finsms.utf8
wget ${GIELLATEKNO_TM_HOME}/sme-smn/glossary/smesmn.utf8
cut -f1 smesmn.utf8 | gzip -c > ${DATADIR}/${PRE}/glossary.sme-smn.clean.se.gz
cut -f2 smesmn.utf8 | gzip -c > ${DATADIR}/${PRE}/glossary.sme-smn.clean.smn.gz
rm -f smesmn.utf8
wget ${GIELLATEKNO_TM_HOME}/sme-smj/glossary/glossary.utf8
cut -f1 glossary.utf8 | gzip -c > ${DATADIR}/${PRE}/glossary.sme-smj.clean.se.gz
cut -f2 glossary.utf8 | gzip -c > ${DATADIR}/${PRE}/glossary.sme-smj.clean.smj.gz
rm -f glossary.utf8
wget ${GIELLATEKNO_TM_HOME}/sme-nob/glossary/smenob.utf8
wget ${GIELLATEKNO_TM_HOME}/sme-nob/glossary/termwiki.utf8
cut -f1 smenob.utf8 > ${DATADIR}/${PRE}/glossary.nb-sme.clean.se
cut -f2 smenob.utf8 > ${DATADIR}/${PRE}/glossary.nb-sme.clean.nb
cut -f1 termwiki.utf8 >> ${DATADIR}/${PRE}/glossary.nb-sme.clean.se
cut -f2 termwiki.utf8 >> ${DATADIR}/${PRE}/glossary.nb-sme.clean.nb
gzip -f ${DATADIR}/${PRE}/glossary.nb-sme.clean.se
gzip -f ${DATADIR}/${PRE}/glossary.nb-sme.clean.nb
rm -f smenob.utf8 termwiki.utf8
wget ${GIELLATEKNO_TM_HOME}/sme-sma/glossary/glossary.utf8
cut -f1 glossary.utf8 | gzip -c > ${DATADIR}/${PRE}/glossary.sma-sme.clean.se.gz
cut -f2 glossary.utf8 | gzip -c > ${DATADIR}/${PRE}/glossary.sma-sme.clean.sma.gz
rm -f glossary.utf8
wget ${GIELLATEKNO_TM_HOME}/nob-smj/glossary/nobsmj.utf8
cut -f1 nobsmj.utf8 | gzip -c > ${DATADIR}/${PRE}/glossary.nb-smj.clean.nb.gz
cut -f2 nobsmj.utf8 | gzip -c > ${DATADIR}/${PRE}/glossary.nb-smj.clean.smj.gz
rm -f nobsmj.utf8
wget ${GIELLATEKNO_TM_HOME}/nob-sme/glossary/nobsme.utf8
cut -f1 nobsme.utf8 | gzip -c > ${DATADIR}/${PRE}/glossary.nb-sme.clean.nb.gz
cut -f2 nobsme.utf8 | gzip -c > ${DATADIR}/${PRE}/glossary.nb-sme.clean.se.gz
rm -f nobsme.utf8
wget ${GIELLATEKNO_TM_HOME}/nob-sma/glossary/nobsma.utf8
wget ${GIELLATEKNO_TM_HOME}/sma-nob/glossary/termwiki.utf8
cut -f1 nobsma.utf8 > ${DATADIR}/${PRE}/glossary.nb-sma.clean.nb
cut -f2 nobsma.utf8 > ${DATADIR}/${PRE}/glossary.nb-sma.clean.sma
cut -f1 termwiki.utf8 >> ${DATADIR}/${PRE}/glossary.nb-sma.clean.sma
cut -f2 termwiki.utf8 >> ${DATADIR}/${PRE}/glossary.nb-sma.clean.nb
gzip -f ${DATADIR}/${PRE}/glossary.nb-sma.clean.sma
gzip -f ${DATADIR}/${PRE}/glossary.nb-sma.clean.nb
rm -f nobsma.utf8 termwiki.utf8

fetch-sami-tmx: ${GIELLATEKNO_SAMI_TM}
convert-sami-tmx:
for t in ${GIELLATEKNO_SAMI_TM}; do \
mkdir -p ${DATADIR}/sami; \
tmx2moses -r -o ${DATADIR}/sami/`echo -n $$t | xargs basename | sed 's/.tmx//'` $$t; \
done
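Worked example (not in the commit): the backquoted expression derives the tmx2moses output prefix from each TMX path, e.g. for fin-sme/tm/finsme.tmx:

echo -n fin-sme/tm/finsme.tmx | xargs basename | sed 's/.tmx//'   # prints: finsme

so the converted files for that TM are written under ${DATADIR}/sami/ with the prefix finsme.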

move-sami-data:
for f in `ls ${DATADIR}/sami`; do \
gzip -c < ${DATADIR}/sami/$$f \
> ${DATADIR}/${PRE}/`echo -n $$f | sed 's/\.\([^.]*\)$$/.clean.\1.gz/'`; \
done
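To see what the rename does (illustration only, assuming tmx2moses wrote a file such as finsme.fi): the sed expression inserts .clean before the language extension and appends .gz, and the doubled $$ in the recipe is only Make's escaping of the shell $.

echo -n finsme.fi | sed 's/\.\([^.]*\)$/.clean.\1.gz/'   # prints: finsme.clean.fi.gz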

${GIELLATEKNO_SAMI_TM}:
mkdir -p ${dir $@}
wget -O $@ ${GIELLATEKNO_TM_HOME}/$@
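Expanded for one target (illustration only, using the variables defined above), e.g. fin-sme/tm/finsme.tmx, the rule runs roughly:

mkdir -p fin-sme/tm
wget -O fin-sme/tm/finsme.tmx https://victorio.uit.no/biggies/trunk/mt/omegat/fin-sme/tm/finsme.tmx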

## name of the data sets
SAMI_EXTRA = ${patsubst %.tmx,%,${notdir ${GIELLATEKNO_SAMI_TM}}} glossary
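Expanded (illustration only), with the TM list above this evaluates to the TMX basenames without their suffix plus the glossary set:

SAMI_EXTRA = finsmn finsme finsms smesmn smesmj smenob smesma nobsmj nobsme-2012 nobsme-admin nobsme-bible nobsme-facta nobsme-laws nobsme-science nobsma smanob glossary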

sami-multilingual:
${MAKE} DATASET=opus+giella \
HELDOUTSIZE=0 \
DEVSET=Tatoeba \
TESTSET=Tatoeba \
DEVMINSIZE=200 \
EXTRA_TRAINSET="${SAMI_EXTRA}" \
SRCLANGS="se sma smn sms smj et fi kv krl nb no nn ru sv en" \
TRGLANGS="se sma smn sms smj et fi kv krl nb no nn ru sv en" \
SKIP_LANGPAIRS="en-en|en-et|en-fi|en-nb|en-no|en-nn|en-ru|en-sv|et-et|et-fi|et-nb|et-no|et-nn|et-ru|et-sv|fi-fi|fi-nb|fi-no|fi-nn|fi-ru|fi-sv|nb-nb|nb-no|nb-nn|nb-ru|nb-sv|no-no|no-nn|no-ru|no-sv|nn-nn|nn-ru|nn-sv|ru-ru|ru-sv|sv-sv" \
train-dynamic

test-skip:
for s in ${SRCLANGS}; do \
for t in ${TRGLANGS}; do \
if [ `echo "$$s-$$t $$t-$$s" | egrep '${SKIP_LANGPAIRS}' | wc -l` -gt 0 ]; then \
echo "skip $$s-$$t"; \
fi \
done \
done
@@ -410,6 +533,12 @@ romance-english:
${MAKE} HPC_DISK=1000 HELDOUTSIZE=0 MODELTYPE=transformer SRCLANGS="${LANGS_ROMANCE}" TRGLANGS=en train-dynamic
${MAKE} HPC_DISK=1000 HELDOUTSIZE=0 MODELTYPE=transformer TRGLANGS="${LANGS_ROMANCE}" SRCLANGS=en train-dynamic

romance-english-eval:
${MAKE} HPC_DISK=1000 HELDOUTSIZE=0 MODELTYPE=transformer SRCLANGS="${LANGS_ROMANCE}" TRGLANGS=en eval

romance-english-dist:
${MAKE} HPC_DISK=1000 HELDOUTSIZE=0 MODELTYPE=transformer SRCLANGS="${LANGS_ROMANCE}" TRGLANGS=en best_dist

## germanic to germanic
@@ -87,7 +87,7 @@ all-wikis: ${LANGPAIR}/${MODELNAME}/decoder.yml
echo "find ${WIKI_DIR} -name '$$w.${LANGID}.${PART}.gz'"; \
if [ `find ${WIKI_DIR} -name "$$w.${LANGID}.${PART}.gz" | wc -l` -gt 0 ]; then \
echo "${MAKE} WIKISOURCE=$$w HPC_CORES=1 WALLTIME=72 translate.submit"; \
${MAKE} WIKISOURCE=$$w HPC_MEM=4g HPC_CORES=1 WALLTIME=72 translate.submit; \
${MAKE} WIKISOURCE=$$w HPC_MEM=4g HPC_CORES=1 WALLTIME=72 submit-translate-all-parts; \
fi \
done
@@ -113,6 +113,15 @@ celtic:
${MAKE} SRC=$$l TRG=en MODELHOME=../models/ga+cy+br+gd+kv+gv-en all-wikis; \
done

small-romance:
for l in wa frp oc ca rm lld fur lij lmo gl lad an mwl co nap scn vec sc la; do \
${MAKE} SRC=$$l TRG=en MODELHOME=../models/fr+fr_BE+fr_CA+fr_FR+wa+frp+oc+ca+rm+lld+fur+lij+lmo+es+es_AR+es_CL+es_CO+es_CR+es_DO+es_EC+es_ES+es_GT+es_HN+es_MX+es_NI+es_PA+es_PE+es_PR+es_SV+es_UY+es_VE+pt+pt_br+pt_BR+pt_PT+gl+lad+an+mwl+it+it_IT+co+nap+scn+vec+sc+ro+la-en all-wikis; \
done

wiki-gl-en:
${MAKE} SRC=gl TRG=en MODELHOME=../models/fr+fr_BE+fr_CA+fr_FR+wa+frp+oc+ca+rm+lld+fur+lij+lmo+es+es_AR+es_CL+es_CO+es_CR+es_DO+es_EC+es_ES+es_GT+es_HN+es_MX+es_NI+es_PA+es_PE+es_PR+es_SV+es_UY+es_VE+pt+pt_br+pt_BR+pt_PT+gl+lad+an+mwl+it+it_IT+co+nap+scn+vec+sc+ro+la-en translate

# Tagalog (tl, tgl), Central Bikol (bcl), Malayalam (ml, mal), Bengali (bn, ben), and Mongolian (mn, mon)
@@ -1,2 +0,0 @@
testset = top 250 lines of ../val/GNOME.src.shuffled!
testset = top 250 lines of ../val/bible-uedin.src.shuffled!
@@ -1,2 +0,0 @@
devset = top 250 lines of GNOME.src.shuffled!
devset = top 250 lines of bible-uedin.src.shuffled!
@@ -1 +0,0 @@
testset = top 250 lines of ../val/bible-uedin.src.shuffled!
@@ -1 +0,0 @@
devset = top 250 lines of bible-uedin.src.shuffled!
@@ -1,15 +0,0 @@
# opus-2019-12-18.zip

* dataset: opus
* model: transformer-align
* pre-processing: normalization + SentencePiece
* download: [opus-2019-12-18.zip](https://object.pouta.csc.fi/OPUS-MT-models/ab-en/opus-2019-12-18.zip)
* test set translations: [opus-2019-12-18.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/ab-en/opus-2019-12-18.test.txt)
* test set scores: [opus-2019-12-18.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/ab-en/opus-2019-12-18.eval.txt)

## Benchmarks

| testset | BLEU | chr-F |
|-----------------------|-------|-------|
| JW300.ab.en | 2.9 | 0.144 |
| Tatoeba.ab.en | 2.3 | 0.097 |
@@ -1 +0,0 @@
testset = second top 5000 lines of ../val/JW300.src.shuffled!
@@ -1,27 +0,0 @@
Сара азба сакуеит.
Акәашара бзиa бoба?
Cаҟара cыцлеит.
Маҷк cааҧсеит.
Абиблиотека абаҟоу?
Сыцәар сҭахыуп.
Иҭабуп!
Англыз бызшәа жәдыруама?
Асааҭ шаҟоузеи?
Сара сцоит.
Иахьа шәахьоуп.
Бурӡ Халифа иахьатәила зегь реицкис иреиҳау хыб ҳәа иашьҭоуп.
Маҷк cааҧсеит.
Сара азба сакуеит.
Иуфазеи?
Шьыжьбзиа!
Сара ашә сфоит.
Сара акәац сфоит.
Сыӡбалқәас рхы надырхәозеи аҧсуаа?
Сара Лори сыхьӡуп.
Сара цәыкьа сышьҭалоит.
Aиашьа дyмоумаҵ?
Cара yаҵәы аpахь caaуеит.
Уи cыҧcықәаpа ауп.
Сара исҳәоит.
Дaба сыҧшаауeиҵ?
Бзиала шәаабеит Авикипедиахь.
@@ -1,27 +0,0 @@
I'm thirsty.
Do you like dancing?
I'm very happy.
I'm a little tired.
Where is the library?
I want to sleep.
Thank you!
Do you speak English?
What time is it?
I'm going.
Today is Monday.
Burj Khalifa is currently the tallest skyscraper in the world.
I'm a little bit tired.
I am thirsty.
What did you eat?
Good morning.
I eat cheese.
I eat meat.
Which sauces do the Abkhazians use?
My name is Laurie.
I'll go to bed early.
Do you have a brother?
I'll be here tomorrow.
I really like it.
I will say.
Where can I find it?
Welcome to Wikipedia.
@@ -1 +0,0 @@
devset = top 5000 lines of JW300.src.shuffled!
@@ -1 +0,0 @@
testset = second top 5000 lines of ../val/JW300.src.shuffled!
@@ -1 +0,0 @@
devset = top 5000 lines of JW300.src.shuffled!
@@ -1,2 +0,0 @@
testset = top 250 lines of ../val/GNOME.src.shuffled!
testset = top 250 lines of ../val/bible-uedin.src.shuffled!
@@ -1,2 +0,0 @@
devset = top 250 lines of GNOME.src.shuffled!
devset = top 250 lines of bible-uedin.src.shuffled!
@@ -1 +0,0 @@
testset = top 250 lines of ../val/bible-uedin.src.shuffled!
@@ -1 +0,0 @@
devset = top 250 lines of bible-uedin.src.shuffled!
@@ -1,14 +0,0 @@
# opus-2019-12-18.zip

* dataset: opus
* model: transformer-align
* pre-processing: normalization + SentencePiece
* download: [opus-2019-12-18.zip](https://object.pouta.csc.fi/OPUS-MT-models/ach-en/opus-2019-12-18.zip)
* test set translations: [opus-2019-12-18.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/ach-en/opus-2019-12-18.test.txt)
* test set scores: [opus-2019-12-18.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/ach-en/opus-2019-12-18.eval.txt)

## Benchmarks

| testset | BLEU | chr-F |
|-----------------------|-------|-------|
| JW300.ach.en | 5.4 | 0.207 |
@@ -1 +0,0 @@
testset = second top 5000 lines of ../val/JW300.src.shuffled!
@@ -1 +0,0 @@
devset = top 5000 lines of JW300.src.shuffled!
@@ -1 +0,0 @@
testset = second top 5000 lines of ../val/JW300.src.shuffled!
@@ -1 +0,0 @@
devset = top 5000 lines of JW300.src.shuffled!
@@ -1,2 +0,0 @@
testset = top 250 lines of ../val/GNOME.src.shuffled!
testset = top 250 lines of ../val/bible-uedin.src.shuffled!
@@ -1,2 +0,0 @@
devset = top 250 lines of GNOME.src.shuffled!
devset = top 250 lines of bible-uedin.src.shuffled!
@@ -1 +0,0 @@
testset = top 250 lines of ../val/bible-uedin.src.shuffled!
@@ -1 +0,0 @@
devset = top 250 lines of bible-uedin.src.shuffled!
@@ -1,14 +0,0 @@
# opus-2019-12-18.zip

* dataset: opus
* model: transformer-align
* pre-processing: normalization + SentencePiece
* download: [opus-2019-12-18.zip](https://object.pouta.csc.fi/OPUS-MT-models/acu-en/opus-2019-12-18.zip)
* test set translations: [opus-2019-12-18.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/acu-en/opus-2019-12-18.test.txt)
* test set scores: [opus-2019-12-18.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/acu-en/opus-2019-12-18.eval.txt)

## Benchmarks

| testset | BLEU | chr-F |
|-----------------------|-------|-------|
| bible-uedin.acu.en | 3.8 | 0.202 |
@@ -1,2 +0,0 @@
testset = top 250 lines of ../val/GNOME.src.shuffled!
testset = second top 5000 lines of ../val/bible-uedin.src.shuffled!
@@ -1,2 +0,0 @@
devset = top 250 lines of GNOME.src.shuffled!
devset = top 5000 lines of bible-uedin.src.shuffled!
@@ -1 +0,0 @@
testset = top 1000 lines of ../val/bible-uedin.src.shuffled!
@@ -1 +0,0 @@
devset = top 1000 lines of bible-uedin.src.shuffled!
@@ -1,14 +0,0 @@
# opus-2019-12-18.zip

* dataset: opus
* model: transformer-align
* pre-processing: normalization + SentencePiece
* download: [opus-2019-12-18.zip](https://object.pouta.csc.fi/OPUS-MT-models/ada-en/opus-2019-12-18.zip)
* test set translations: [opus-2019-12-18.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/ada-en/opus-2019-12-18.test.txt)
* test set scores: [opus-2019-12-18.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/ada-en/opus-2019-12-18.eval.txt)

## Benchmarks

| testset | BLEU | chr-F |
|-----------------------|-------|-------|
| JW300.ada.en | 4.3 | 0.182 |
@@ -1 +0,0 @@
testset = second top 5000 lines of ../val/JW300.src.shuffled!
@@ -1 +0,0 @@
devset = top 5000 lines of JW300.src.shuffled!
@@ -1 +0,0 @@
testset = second top 5000 lines of ../val/JW300.src.shuffled!
@@ -1 +0,0 @@
devset = top 5000 lines of JW300.src.shuffled!
@@ -1,2 +0,0 @@
testset = top 250 lines of ../val/GNOME.src.shuffled!
testset = top 250 lines of ../val/bible-uedin.src.shuffled!
@@ -1,2 +0,0 @@
devset = top 250 lines of GNOME.src.shuffled!
devset = top 250 lines of bible-uedin.src.shuffled!
@@ -1 +0,0 @@
testset = top 250 lines of ../val/bible-uedin.src.shuffled!
@@ -1 +0,0 @@
devset = top 250 lines of bible-uedin.src.shuffled!
@@ -1,2 +0,0 @@
testset = top 250 lines of ../val/GNOME.src.shuffled!
testset = top 250 lines of ../val/bible-uedin.src.shuffled!
@@ -1,2 +0,0 @@
devset = top 250 lines of GNOME.src.shuffled!
devset = top 250 lines of bible-uedin.src.shuffled!
@@ -1 +0,0 @@
testset = top 250 lines of ../val/bible-uedin.src.shuffled!
@@ -1 +0,0 @@
devset = top 250 lines of bible-uedin.src.shuffled!
@@ -1,2 +0,0 @@
testset = top 250 lines of ../val/GNOME.src.shuffled!
testset = top 250 lines of ../val/bible-uedin.src.shuffled!
@@ -1,2 +0,0 @@
devset = top 250 lines of GNOME.src.shuffled!
devset = top 250 lines of bible-uedin.src.shuffled!
@@ -1 +0,0 @@
testset = top 250 lines of ../val/bible-uedin.src.shuffled!
@@ -1 +0,0 @@
devset = top 250 lines of bible-uedin.src.shuffled!
@@ -1,14 +0,0 @@
# opus-2019-12-18.zip

* dataset: opus
* model: transformer-align
* pre-processing: normalization + SentencePiece
* download: [opus-2019-12-18.zip](https://object.pouta.csc.fi/OPUS-MT-models/aed-en/opus-2019-12-18.zip)
* test set translations: [opus-2019-12-18.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/aed-en/opus-2019-12-18.test.txt)
* test set scores: [opus-2019-12-18.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/aed-en/opus-2019-12-18.eval.txt)

## Benchmarks

| testset | BLEU | chr-F |
|-----------------------|-------|-------|
| JW300.aed.en | 4.0 | 0.177 |
@@ -1 +0,0 @@
testset = second top 5000 lines of ../val/JW300.src.shuffled!
@@ -1 +0,0 @@
devset = top 5000 lines of JW300.src.shuffled!
@@ -1 +0,0 @@
testset = second top 5000 lines of ../val/JW300.src.shuffled!
@@ -1 +0,0 @@
devset = top 5000 lines of JW300.src.shuffled!
@@ -1 +0,0 @@
/scratch/project_2001194/Opus-MT-train/work-spm/de-af/test/Tatoeba.trg
@@ -1 +0,0 @@
/scratch/project_2001194/Opus-MT-train/work-spm/de-af/test/Tatoeba.src
@@ -1 +0,0 @@
/scratch/project_2001194/Opus-MT-train/work-spm/de-af/val/Tatoeba.trg
@@ -1 +0,0 @@
/scratch/project_2001194/Opus-MT-train/work-spm/de-af/val/Tatoeba.src.shuffled.gz
@@ -1 +0,0 @@
/scratch/project_2001194/Opus-MT-train/work-spm/de-af/val/Tatoeba.src
@@ -1,14 +0,0 @@
# opus-2019-12-18.zip

* dataset: opus
* model: transformer-align
* pre-processing: normalization + SentencePiece
* download: [opus-2019-12-18.zip](https://object.pouta.csc.fi/OPUS-MT-models/af-en/opus-2019-12-18.zip)
* test set translations: [opus-2019-12-18.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/af-en/opus-2019-12-18.test.txt)
* test set scores: [opus-2019-12-18.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/af-en/opus-2019-12-18.eval.txt)

## Benchmarks

| testset | BLEU | chr-F |
|-----------------------|-------|-------|
| Tatoeba.af.en | 60.8 | 0.736 |
@@ -1 +0,0 @@
testset = top 1000 lines of ../val/Tatoeba.src.shuffled!
@@ -1 +0,0 @@
devset = top 1000 lines of Tatoeba.src.shuffled!
@@ -1,14 +0,0 @@
# opus-2020-01-08.zip

* dataset: opus
* model: transformer-align
* pre-processing: normalization + SentencePiece
* download: [opus-2020-01-08.zip](https://object.pouta.csc.fi/OPUS-MT-models/af-fi/opus-2020-01-08.zip)
* test set translations: [opus-2020-01-08.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/af-fi/opus-2020-01-08.test.txt)
* test set scores: [opus-2020-01-08.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/af-fi/opus-2020-01-08.eval.txt)

## Benchmarks

| testset | BLEU | chr-F |
|-----------------------|-------|-------|
| JW300.af.fi | 32.3 | 0.576 |
@@ -1 +0,0 @@
testset = second top 5000 lines of ../val/JW300.src.shuffled!
@@ -1 +0,0 @@
devset = top 5000 lines of JW300.src.shuffled!
@@ -1,14 +0,0 @@
# opus-2020-01-08.zip

* dataset: opus
* model: transformer-align
* pre-processing: normalization + SentencePiece
* download: [opus-2020-01-08.zip](https://object.pouta.csc.fi/OPUS-MT-models/af-fr/opus-2020-01-08.zip)
* test set translations: [opus-2020-01-08.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/af-fr/opus-2020-01-08.test.txt)
* test set scores: [opus-2020-01-08.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/af-fr/opus-2020-01-08.eval.txt)

## Benchmarks

| testset | BLEU | chr-F |
|-----------------------|-------|-------|
| JW300.af.fr | 35.3 | 0.543 |
@@ -1,14 +0,0 @@
# opus-2020-01-08.zip

* dataset: opus
* model: transformer-align
* pre-processing: normalization + SentencePiece
* download: [opus-2020-01-08.zip](https://object.pouta.csc.fi/OPUS-MT-models/af-sv/opus-2020-01-08.zip)
* test set translations: [opus-2020-01-08.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/af-sv/opus-2020-01-08.test.txt)
* test set scores: [opus-2020-01-08.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/af-sv/opus-2020-01-08.eval.txt)

## Benchmarks

| testset | BLEU | chr-F |
|-----------------------|-------|-------|
| JW300.af.sv | 40.4 | 0.599 |
@@ -1,2 +0,0 @@
testset = top 250 lines of ../val/GNOME.src.shuffled!
testset = top 250 lines of ../val/bible-uedin.src.shuffled!
@@ -1,2 +0,0 @@
devset = top 250 lines of GNOME.src.shuffled!
devset = top 250 lines of bible-uedin.src.shuffled!
@@ -1 +0,0 @@
testset = top 250 lines of ../val/bible-uedin.src.shuffled!
@@ -1 +0,0 @@
devset = top 250 lines of bible-uedin.src.shuffled!
@@ -1,2 +0,0 @@
testset = top 250 lines of ../val/GNOME.src.shuffled!
testset = top 250 lines of ../val/bible-uedin.src.shuffled!
@@ -1,2 +0,0 @@
devset = top 250 lines of GNOME.src.shuffled!
devset = top 250 lines of bible-uedin.src.shuffled!
@@ -1 +0,0 @@
testset = top 250 lines of ../val/bible-uedin.src.shuffled!
@@ -1 +0,0 @@
devset = top 250 lines of bible-uedin.src.shuffled!
@@ -1,14 +0,0 @@
# opus-2019-12-18.zip

* dataset: opus
* model: transformer-align
* pre-processing: normalization + SentencePiece
* download: [opus-2019-12-18.zip](https://object.pouta.csc.fi/OPUS-MT-models/agr-en/opus-2019-12-18.zip)
* test set translations: [opus-2019-12-18.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/agr-en/opus-2019-12-18.test.txt)
* test set scores: [opus-2019-12-18.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/agr-en/opus-2019-12-18.eval.txt)

## Benchmarks

| testset | BLEU | chr-F |
|-----------------------|-------|-------|
| bible-uedin.agr.en | 4.5 | 0.222 |
@@ -1,2 +0,0 @@
testset = top 250 lines of ../val/GNOME.src.shuffled!
testset = second top 5000 lines of ../val/bible-uedin.src.shuffled!
@@ -1,2 +0,0 @@
devset = top 250 lines of GNOME.src.shuffled!
devset = top 5000 lines of bible-uedin.src.shuffled!
@@ -1 +0,0 @@
testset = top 1000 lines of ../val/bible-uedin.src.shuffled!
@@ -1 +0,0 @@
devset = top 1000 lines of bible-uedin.src.shuffled!
@@ -1,2 +0,0 @@
testset = top 250 lines of ../val/GNOME.src.shuffled!
testset = top 250 lines of ../val/bible-uedin.src.shuffled!
@@ -1,2 +0,0 @@
devset = top 250 lines of GNOME.src.shuffled!
devset = top 250 lines of bible-uedin.src.shuffled!
@@ -1 +0,0 @@
testset = top 250 lines of ../val/bible-uedin.src.shuffled!
@@ -1 +0,0 @@
devset = top 250 lines of bible-uedin.src.shuffled!
@@ -1,2 +0,0 @@
testset = top 250 lines of ../val/GNOME.src.shuffled!
testset = top 250 lines of ../val/bible-uedin.src.shuffled!