sami language task added

This commit is contained in:
Joerg Tiedemann 2020-03-26 22:50:21 +02:00
parent 14f6ef808a
commit f4fdb304a5
2056 changed files with 170 additions and 483767 deletions

View File

@ -13,6 +13,18 @@ ifndef THREADS
endif
## SKIP_LANGPAIRS can be used to skip certain language pairs
## in data preparation for multilingual models
## ---> this can be good to skip BIG language pairs
## that would very much dominate all the data
## must be a pattern that can be matched by egrep
## e.g. en-de|en-fr
ifndef SKIP_LANGPAIRS
SKIP_LANGPAIRS = "nothing"
endif
## look for cleanup scripts and put them into a pipe
## they should be executable and should basically read STDIN and print to STDOUT
## no further arguments are supported
@ -66,9 +78,6 @@ REV_LANGSTR = ${subst ${SPACE},+,$(TRGLANGS)}-${subst ${SPACE},+,$(SRCLANGS)}
REV_WORKDIR = ${WORKHOME}/${REV_LANGSTR}
ttt:
echo ${CLEAN_TRAIN_SRC}
echo ${CLEAN_TRAIN_TRG}
reverse-data:
ifeq (${PRE_SRC},${PRE_TRG})
@ -339,7 +348,6 @@ ${TRAIN_ALG}: ${TRAIN_SRC}.clean.${PRE_SRC}${TRAINSIZE}.gz \
@echo "done!"
## add training data for each language combination
## and put it together in local space
${LOCAL_TRAIN_SRC}: ${DEV_SRC} ${DEV_TRG}
@ -347,12 +355,16 @@ ${LOCAL_TRAIN_SRC}: ${DEV_SRC} ${DEV_TRG}
rm -f ${LOCAL_TRAIN_SRC} ${LOCAL_TRAIN_TRG}
-for s in ${SRCLANGS}; do \
for t in ${TRGLANGS}; do \
if [ ${HELDOUTSIZE} -gt 0 ]; then \
${MAKE} DATASET=${DATASET} SRC:=$$s TRG:=$$t \
add-to-local-train-and-heldout-data; \
if [ ! `echo "$$s-$$t $$t-$$s" | egrep '${SKIP_LANGPAIRS}' | wc -l` -gt 0 ]; then \
if [ ${HELDOUTSIZE} -gt 0 ]; then \
${MAKE} DATASET=${DATASET} SRC:=$$s TRG:=$$t \
add-to-local-train-and-heldout-data; \
else \
${MAKE} DATASET=${DATASET} SRC:=$$s TRG:=$$t \
add-to-local-train-data; \
fi \
else \
${MAKE} DATASET=${DATASET} SRC:=$$s TRG:=$$t \
add-to-local-train-data; \
echo "!!!!!!!!!!! skip language pair $$s-$$t !!!!!!!!!!!!!!!!"; \
fi \
done \
done
@ -443,7 +455,11 @@ ${DEV_SRC}.shuffled.gz:
rm -f ${DEV_SRC} ${DEV_TRG}
-for s in ${SRCLANGS}; do \
for t in ${TRGLANGS}; do \
${MAKE} SRC=$$s TRG=$$t add-to-dev-data; \
if [ ! `echo "$$s-$$t $$t-$$s" | egrep '${SKIP_LANGPAIRS}' | wc -l` -gt 0 ]; then \
${MAKE} SRC=$$s TRG=$$t add-to-dev-data; \
else \
echo "!!!!!!!!!!! skip language pair $$s-$$t !!!!!!!!!!!!!!!!"; \
fi \
done \
done
paste ${DEV_SRC} ${DEV_TRG} | shuf | gzip -c > $@
@ -585,7 +601,11 @@ ifneq (${TESTSET},${DEVSET})
else \
for s in ${SRCLANGS}; do \
for t in ${TRGLANGS}; do \
${MAKE} SRC=$$s TRG=$$t add-to-test-data; \
if [ ! `echo "$$s-$$t $$t-$$s" | egrep '${SKIP_LANGPAIRS}' | wc -l` -gt 0 ]; then \
${MAKE} SRC=$$s TRG=$$t add-to-test-data; \
else \
echo "!!!!!!!!!!! skip language pair $$s-$$t !!!!!!!!!!!!!!!!"; \
fi \
done \
done; \
if [ ${TESTSIZE} -lt `cat $@ | wc -l` ]; then \

View File

@ -398,6 +398,129 @@ ${DATADIR}/${PRE}/dic.cy-en.clean.cy.gz:
## root of the Giellatekno/Divvun resource tree (UiT) for Sámi languages
GIELLATEKNO_HOME = https://victorio.uit.no/biggies/trunk
## OmegaT translation memories (TMX) below the Giellatekno tree
GIELLATEKNO_TM_HOME = ${GIELLATEKNO_HOME}/mt/omegat
## relative paths (below GIELLATEKNO_TM_HOME) of the TMX translation
## memories to fetch; directory names encode the language pair in
## ISO639-3 codes (fin, sme, smn, sms, smj, sma, nob)
GIELLATEKNO_SAMI_TM = fin-smn/tm/finsmn.tmx \
fin-sme/tm/finsme.tmx \
fin-sms/tm/finsms.tmx \
sme-smn/tm/smesmn.tmx \
sme-smj/tm/smesmj.tmx \
sme-nob/tm/smenob.tmx \
sme-sma/tm/smesma.tmx \
nob-smj/tm/nobsmj.tmx \
nob-sme/tm/nobsme-2012.tmx \
nob-sme/tm/nobsme-admin.tmx \
nob-sme/tm/nobsme-bible.tmx \
nob-sme/tm/nobsme-facta.tmx \
nob-sme/tm/nobsme-laws.tmx \
nob-sme/tm/nobsme-science.tmx \
nob-sma/tm/nobsma.tmx \
sma-nob/tm/smanob.tmx
## glossaries
## download Giellatekno bilingual glossary files (tab-separated:
## column 1 = first language of the source directory name, column 2 =
## second language) and store each column as a gzipped "clean" corpus
## file in ${DATADIR}/${PRE}.
## File names use 2-letter ISO639-1 codes where they exist
## (fin->fi, nob->nb, sme->se); smn/sms/smj/sma have no 2-letter code.
## Fixes vs. the previous version:
##  - fin-sme output was misnamed glossary.fi-smn.* and its .fi.gz file
##    clobbered the fin-smn glossary written just above; now fi-se
##  - fin-sms target column was mislabeled .se; it is Skolt Saami (.sms)
##  - the nob-sme glossary used to overwrite the merged
##    smenob+termwiki files with ">"; it is now appended before gzip
convert-sami-gloss:
	wget ${GIELLATEKNO_TM_HOME}/fin-smn/glossary/finsmn.utf8
	cut -f1 finsmn.utf8 | gzip -c > ${DATADIR}/${PRE}/glossary.fi-smn.clean.fi.gz
	cut -f2 finsmn.utf8 | gzip -c > ${DATADIR}/${PRE}/glossary.fi-smn.clean.smn.gz
	rm -f finsmn.utf8
	wget ${GIELLATEKNO_TM_HOME}/fin-sme/glossary/finsme.utf8
	cut -f1 finsme.utf8 | gzip -c > ${DATADIR}/${PRE}/glossary.fi-se.clean.fi.gz
	cut -f2 finsme.utf8 | gzip -c > ${DATADIR}/${PRE}/glossary.fi-se.clean.se.gz
	rm -f finsme.utf8
	wget ${GIELLATEKNO_TM_HOME}/fin-sms/glossary/finsms.utf8
	cut -f1 finsms.utf8 | gzip -c > ${DATADIR}/${PRE}/glossary.fi-sms.clean.fi.gz
	cut -f2 finsms.utf8 | gzip -c > ${DATADIR}/${PRE}/glossary.fi-sms.clean.sms.gz
	rm -f finsms.utf8
	wget ${GIELLATEKNO_TM_HOME}/sme-smn/glossary/smesmn.utf8
	cut -f1 smesmn.utf8 | gzip -c > ${DATADIR}/${PRE}/glossary.sme-smn.clean.se.gz
	cut -f2 smesmn.utf8 | gzip -c > ${DATADIR}/${PRE}/glossary.sme-smn.clean.smn.gz
	rm -f smesmn.utf8
	wget ${GIELLATEKNO_TM_HOME}/sme-smj/glossary/glossary.utf8
	cut -f1 glossary.utf8 | gzip -c > ${DATADIR}/${PRE}/glossary.sme-smj.clean.se.gz
	cut -f2 glossary.utf8 | gzip -c > ${DATADIR}/${PRE}/glossary.sme-smj.clean.smj.gz
	rm -f glossary.utf8
	wget ${GIELLATEKNO_TM_HOME}/sme-nob/glossary/smenob.utf8
	wget ${GIELLATEKNO_TM_HOME}/sme-nob/glossary/termwiki.utf8
	wget ${GIELLATEKNO_TM_HOME}/nob-sme/glossary/nobsme.utf8
	cut -f1 smenob.utf8 > ${DATADIR}/${PRE}/glossary.nb-sme.clean.se
	cut -f2 smenob.utf8 > ${DATADIR}/${PRE}/glossary.nb-sme.clean.nb
	cut -f1 termwiki.utf8 >> ${DATADIR}/${PRE}/glossary.nb-sme.clean.se
	cut -f2 termwiki.utf8 >> ${DATADIR}/${PRE}/glossary.nb-sme.clean.nb
	cut -f1 nobsme.utf8 >> ${DATADIR}/${PRE}/glossary.nb-sme.clean.nb
	cut -f2 nobsme.utf8 >> ${DATADIR}/${PRE}/glossary.nb-sme.clean.se
	gzip -f ${DATADIR}/${PRE}/glossary.nb-sme.clean.se
	gzip -f ${DATADIR}/${PRE}/glossary.nb-sme.clean.nb
	rm -f smenob.utf8 termwiki.utf8 nobsme.utf8
	wget ${GIELLATEKNO_TM_HOME}/sme-sma/glossary/glossary.utf8
	cut -f1 glossary.utf8 | gzip -c > ${DATADIR}/${PRE}/glossary.sma-sme.clean.se.gz
	cut -f2 glossary.utf8 | gzip -c > ${DATADIR}/${PRE}/glossary.sma-sme.clean.sma.gz
	rm -f glossary.utf8
	wget ${GIELLATEKNO_TM_HOME}/nob-smj/glossary/nobsmj.utf8
	cut -f1 nobsmj.utf8 | gzip -c > ${DATADIR}/${PRE}/glossary.nb-smj.clean.nb.gz
	cut -f2 nobsmj.utf8 | gzip -c > ${DATADIR}/${PRE}/glossary.nb-smj.clean.smj.gz
	rm -f nobsmj.utf8
	wget ${GIELLATEKNO_TM_HOME}/nob-sma/glossary/nobsma.utf8
	wget ${GIELLATEKNO_TM_HOME}/sma-nob/glossary/termwiki.utf8
	cut -f1 nobsma.utf8 > ${DATADIR}/${PRE}/glossary.nb-sma.clean.nb
	cut -f2 nobsma.utf8 > ${DATADIR}/${PRE}/glossary.nb-sma.clean.sma
	cut -f1 termwiki.utf8 >> ${DATADIR}/${PRE}/glossary.nb-sma.clean.sma
	cut -f2 termwiki.utf8 >> ${DATADIR}/${PRE}/glossary.nb-sma.clean.nb
	gzip -f ${DATADIR}/${PRE}/glossary.nb-sma.clean.sma
	gzip -f ${DATADIR}/${PRE}/glossary.nb-sma.clean.nb
	rm -f nobsma.utf8 termwiki.utf8
## fetch all Giellatekno translation memories listed in
## GIELLATEKNO_SAMI_TM (see the download rule for each TMX file)
fetch-sami-tmx: ${GIELLATEKNO_SAMI_TM}
## convert the downloaded TMX files to Moses-style plain text in
## ${DATADIR}/sami, one output per language pair named after the TMX
## basename with the .tmx suffix stripped
## NOTE(review): the dot in sed 's/.tmx//' is unescaped, so it matches
## any character before "tmx" — harmless for these file names, but
## worth confirming if the list ever changes
convert-sami-tmx:
for t in ${GIELLATEKNO_SAMI_TM}; do \
mkdir -p ${DATADIR}/sami; \
tmx2moses -r -o ${DATADIR}/sami/`echo -n $$t | xargs basename | sed 's/.tmx//'` $$t; \
done
## gzip-copy the converted Sámi data into the common data dir,
## inserting the ".clean" infix before the language-code extension
## (foo.xx -> ${DATADIR}/${PRE}/foo.clean.xx.gz); originals are kept
move-sami-data:
for f in `ls ${DATADIR}/sami`; do \
gzip -c < ${DATADIR}/sami/$$f \
> ${DATADIR}/${PRE}/`echo -n $$f | sed 's/\.\([^.]*\)$$/.clean.\1.gz/'`; \
done
## download rule for each TMX file listed in GIELLATEKNO_SAMI_TM;
## the local target path mirrors the path below GIELLATEKNO_TM_HOME
${GIELLATEKNO_SAMI_TM}:
mkdir -p ${dir $@}
wget -O $@ ${GIELLATEKNO_TM_HOME}/$@
## name of the data sets
## (TMX basenames without the .tmx extension, plus the glossaries
## created by convert-sami-gloss)
SAMI_EXTRA = ${patsubst %.tmx,%,${notdir ${GIELLATEKNO_SAMI_TM}}} glossary
## train a multilingual model over the Sámi languages and their
## neighbouring majority languages via the generic train-dynamic goal:
##  - DATASET=opus+giella: OPUS data plus the Giellatekno extras above
##  - EXTRA_TRAINSET: the converted TMX/glossary sets (SAMI_EXTRA)
##  - SKIP_LANGPAIRS: egrep pattern excluding all majority-majority
##    pairs (en/et/fi/nb/no/nn/ru/sv with each other) so the big
##    non-Sámi corpora do not dominate the training data
##  - HELDOUTSIZE=0, Tatoeba dev/test, DEVMINSIZE=200
sami-multilingual:
	${MAKE} DATASET=opus+giella \
		HELDOUTSIZE=0 \
		DEVSET=Tatoeba \
		TESTSET=Tatoeba \
		DEVMINSIZE=200 \
		EXTRA_TRAINSET="${SAMI_EXTRA}" \
		SRCLANGS="se sma smn sms smj et fi kv krl nb no nn ru sv en" \
		TRGLANGS="se sma smn sms smj et fi kv krl nb no nn ru sv en" \
		SKIP_LANGPAIRS="en-en|en-et|en-fi|en-nb|en-no|en-nn|en-ru|en-sv|et-et|et-fi|et-nb|et-no|et-nn|et-ru|et-sv|fi-fi|fi-nb|fi-no|fi-nn|fi-ru|fi-sv|nb-nb|nb-no|nb-nn|nb-ru|nb-sv|no-no|no-nn|no-ru|no-sv|nn-nn|nn-ru|nn-sv|ru-ru|ru-sv|sv-sv" \
		train-dynamic
## debugging helper: print every source/target language pair that the
## SKIP_LANGPAIRS pattern would exclude (checks both s-t and t-s, the
## same test used by the data-preparation targets)
test-skip:
	for src in ${SRCLANGS}; do \
	  for trg in ${TRGLANGS}; do \
	    if echo "$$src-$$trg $$trg-$$src" | egrep '${SKIP_LANGPAIRS}' > /dev/null; then \
	      echo "skip $$src-$$trg"; \
	    fi \
	  done \
	done
@ -410,6 +533,12 @@ romance-english:
${MAKE} HPC_DISK=1000 HELDOUTSIZE=0 MODELTYPE=transformer SRCLANGS="${LANGS_ROMANCE}" TRGLANGS=en train-dynamic
${MAKE} HPC_DISK=1000 HELDOUTSIZE=0 MODELTYPE=transformer TRGLANGS="${LANGS_ROMANCE}" SRCLANGS=en train-dynamic
romance-english-eval:
${MAKE} HPC_DISK=1000 HELDOUTSIZE=0 MODELTYPE=transformer SRCLANGS="${LANGS_ROMANCE}" TRGLANGS=en eval
romance-english-dist:
${MAKE} HPC_DISK=1000 HELDOUTSIZE=0 MODELTYPE=transformer SRCLANGS="${LANGS_ROMANCE}" TRGLANGS=en best_dist
## germanic to germanic

View File

@ -87,7 +87,7 @@ all-wikis: ${LANGPAIR}/${MODELNAME}/decoder.yml
echo "find ${WIKI_DIR} -name '$$w.${LANGID}.${PART}.gz'"; \
if [ `find ${WIKI_DIR} -name "$$w.${LANGID}.${PART}.gz" | wc -l` -gt 0 ]; then \
echo "${MAKE} WIKISOURCE=$$w HPC_CORES=1 WALLTIME=72 translate.submit"; \
${MAKE} WIKISOURCE=$$w HPC_MEM=4g HPC_CORES=1 WALLTIME=72 translate.submit; \
${MAKE} WIKISOURCE=$$w HPC_MEM=4g HPC_CORES=1 WALLTIME=72 submit-translate-all-parts; \
fi \
done
@ -113,6 +113,15 @@ celtic:
${MAKE} SRC=$$l TRG=en MODELHOME=../models/ga+cy+br+gd+kv+gv-en all-wikis; \
done
## translate wikis for small Romance languages (plus Latin) into
## English with the big multilingual Romance->English model;
## MODELHOME encodes all source variants the model was trained on
small-romance:
for l in wa frp oc ca rm lld fur lij lmo gl lad an mwl co nap scn vec sc la; do \
${MAKE} SRC=$$l TRG=en MODELHOME=../models/fr+fr_BE+fr_CA+fr_FR+wa+frp+oc+ca+rm+lld+fur+lij+lmo+es+es_AR+es_CL+es_CO+es_CR+es_DO+es_EC+es_ES+es_GT+es_HN+es_MX+es_NI+es_PA+es_PE+es_PR+es_SV+es_UY+es_VE+pt+pt_br+pt_BR+pt_PT+gl+lad+an+mwl+it+it_IT+co+nap+scn+vec+sc+ro+la-en all-wikis; \
done
## single-run variant: translate the Galician wiki with the same model
wiki-gl-en:
${MAKE} SRC=gl TRG=en MODELHOME=../models/fr+fr_BE+fr_CA+fr_FR+wa+frp+oc+ca+rm+lld+fur+lij+lmo+es+es_AR+es_CL+es_CO+es_CR+es_DO+es_EC+es_ES+es_GT+es_HN+es_MX+es_NI+es_PA+es_PE+es_PR+es_SV+es_UY+es_VE+pt+pt_br+pt_BR+pt_PT+gl+lad+an+mwl+it+it_IT+co+nap+scn+vec+sc+ro+la-en translate
# Tagalog (tl, tgl), Central Bikol (bcl), Malayalam (ml, mal), Bengali (bn, ben), and Mongolian (mn, mon)

View File

@ -1,2 +0,0 @@
testset = top 250 lines of ../val/GNOME.src.shuffled!
testset = top 250 lines of ../val/bible-uedin.src.shuffled!

View File

@ -1,2 +0,0 @@
devset = top 250 lines of GNOME.src.shuffled!
devset = top 250 lines of bible-uedin.src.shuffled!

View File

@ -1 +0,0 @@
testset = top 250 lines of ../val/bible-uedin.src.shuffled!

View File

@ -1 +0,0 @@
devset = top 250 lines of bible-uedin.src.shuffled!

View File

@ -1,15 +0,0 @@
# opus-2019-12-18.zip
* dataset: opus
* model: transformer-align
* pre-processing: normalization + SentencePiece
* download: [opus-2019-12-18.zip](https://object.pouta.csc.fi/OPUS-MT-models/ab-en/opus-2019-12-18.zip)
* test set translations: [opus-2019-12-18.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/ab-en/opus-2019-12-18.test.txt)
* test set scores: [opus-2019-12-18.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/ab-en/opus-2019-12-18.eval.txt)
## Benchmarks
| testset | BLEU | chr-F |
|-----------------------|-------|-------|
| JW300.ab.en | 2.9 | 0.144 |
| Tatoeba.ab.en | 2.3 | 0.097 |

View File

@ -1 +0,0 @@
testset = second top 5000 lines of ../val/JW300.src.shuffled!

View File

@ -1,27 +0,0 @@
Сара азба сакуеит.
Акәашара бзиa бoба?
Cаҟара cыцлеит.
Маҷк cааҧсеит.
Абиблиотека абаҟоу?
Сыцәар сҭахыуп.
Иҭабуп!
Англыз бызшәа жәдыруама?
Асааҭ шаҟоузеи?
Сара сцоит.
Иахьа шәахьоуп.
Бурӡ Халифа иахьатәила зегь реицкис иреиҳау хыб ҳәа иашьҭоуп.
Маҷк cааҧсеит.
Сара азба сакуеит.
Иуфазеи?
Шьыжьбзиа!
Сара ашә сфоит.
Сара акәац сфоит.
Сыӡбалқәас рхы надырхәозеи аҧсуаа?
Сара Лори сыхьӡуп.
Сара цәыкьа сышьҭалоит.
ашьа дyмоумаҵ?
Cара yаҵәы аpахь caaуеит.
Уи cыҧcықәаpа ауп.
Сара исҳәоит.
Дaба сыҧшаауeиҵ?
Бзиала шәаабеит Авикипедиахь.

View File

@ -1,27 +0,0 @@
I'm thirsty.
Do you like dancing?
I'm very happy.
I'm a little tired.
Where is the library?
I want to sleep.
Thank you!
Do you speak English?
What time is it?
I'm going.
Today is Monday.
Burj Khalifa is currently the tallest skyscraper in the world.
I'm a little bit tired.
I am thirsty.
What did you eat?
Good morning.
I eat cheese.
I eat meat.
Which sauces do the Abkhazians use?
My name is Laurie.
I'll go to bed early.
Do you have a brother?
I'll be here tomorrow.
I really like it.
I will say.
Where can I find it?
Welcome to Wikipedia.

View File

@ -1 +0,0 @@
devset = top 5000 lines of JW300.src.shuffled!

View File

@ -1 +0,0 @@
testset = second top 5000 lines of ../val/JW300.src.shuffled!

View File

@ -1 +0,0 @@
devset = top 5000 lines of JW300.src.shuffled!

View File

@ -1,2 +0,0 @@
testset = top 250 lines of ../val/GNOME.src.shuffled!
testset = top 250 lines of ../val/bible-uedin.src.shuffled!

View File

@ -1,2 +0,0 @@
devset = top 250 lines of GNOME.src.shuffled!
devset = top 250 lines of bible-uedin.src.shuffled!

View File

@ -1 +0,0 @@
testset = top 250 lines of ../val/bible-uedin.src.shuffled!

View File

@ -1 +0,0 @@
devset = top 250 lines of bible-uedin.src.shuffled!

View File

@ -1,14 +0,0 @@
# opus-2019-12-18.zip
* dataset: opus
* model: transformer-align
* pre-processing: normalization + SentencePiece
* download: [opus-2019-12-18.zip](https://object.pouta.csc.fi/OPUS-MT-models/ach-en/opus-2019-12-18.zip)
* test set translations: [opus-2019-12-18.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/ach-en/opus-2019-12-18.test.txt)
* test set scores: [opus-2019-12-18.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/ach-en/opus-2019-12-18.eval.txt)
## Benchmarks
| testset | BLEU | chr-F |
|-----------------------|-------|-------|
| JW300.ach.en | 5.4 | 0.207 |

View File

@ -1 +0,0 @@
testset = second top 5000 lines of ../val/JW300.src.shuffled!

View File

@ -1 +0,0 @@
devset = top 5000 lines of JW300.src.shuffled!

View File

@ -1 +0,0 @@
testset = second top 5000 lines of ../val/JW300.src.shuffled!

View File

@ -1 +0,0 @@
devset = top 5000 lines of JW300.src.shuffled!

View File

@ -1,2 +0,0 @@
testset = top 250 lines of ../val/GNOME.src.shuffled!
testset = top 250 lines of ../val/bible-uedin.src.shuffled!

View File

@ -1,2 +0,0 @@
devset = top 250 lines of GNOME.src.shuffled!
devset = top 250 lines of bible-uedin.src.shuffled!

View File

@ -1 +0,0 @@
testset = top 250 lines of ../val/bible-uedin.src.shuffled!

View File

@ -1 +0,0 @@
devset = top 250 lines of bible-uedin.src.shuffled!

View File

@ -1,14 +0,0 @@
# opus-2019-12-18.zip
* dataset: opus
* model: transformer-align
* pre-processing: normalization + SentencePiece
* download: [opus-2019-12-18.zip](https://object.pouta.csc.fi/OPUS-MT-models/acu-en/opus-2019-12-18.zip)
* test set translations: [opus-2019-12-18.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/acu-en/opus-2019-12-18.test.txt)
* test set scores: [opus-2019-12-18.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/acu-en/opus-2019-12-18.eval.txt)
## Benchmarks
| testset | BLEU | chr-F |
|-----------------------|-------|-------|
| bible-uedin.acu.en | 3.8 | 0.202 |

View File

@ -1,2 +0,0 @@
testset = top 250 lines of ../val/GNOME.src.shuffled!
testset = second top 5000 lines of ../val/bible-uedin.src.shuffled!

View File

@ -1,2 +0,0 @@
devset = top 250 lines of GNOME.src.shuffled!
devset = top 5000 lines of bible-uedin.src.shuffled!

View File

@ -1 +0,0 @@
testset = top 1000 lines of ../val/bible-uedin.src.shuffled!

View File

@ -1 +0,0 @@
devset = top 1000 lines of bible-uedin.src.shuffled!

View File

@ -1,14 +0,0 @@
# opus-2019-12-18.zip
* dataset: opus
* model: transformer-align
* pre-processing: normalization + SentencePiece
* download: [opus-2019-12-18.zip](https://object.pouta.csc.fi/OPUS-MT-models/ada-en/opus-2019-12-18.zip)
* test set translations: [opus-2019-12-18.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/ada-en/opus-2019-12-18.test.txt)
* test set scores: [opus-2019-12-18.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/ada-en/opus-2019-12-18.eval.txt)
## Benchmarks
| testset | BLEU | chr-F |
|-----------------------|-------|-------|
| JW300.ada.en | 4.3 | 0.182 |

View File

@ -1 +0,0 @@
testset = second top 5000 lines of ../val/JW300.src.shuffled!

View File

@ -1 +0,0 @@
devset = top 5000 lines of JW300.src.shuffled!

View File

@ -1 +0,0 @@
testset = second top 5000 lines of ../val/JW300.src.shuffled!

View File

@ -1 +0,0 @@
devset = top 5000 lines of JW300.src.shuffled!

View File

@ -1,2 +0,0 @@
testset = top 250 lines of ../val/GNOME.src.shuffled!
testset = top 250 lines of ../val/bible-uedin.src.shuffled!

View File

@ -1,2 +0,0 @@
devset = top 250 lines of GNOME.src.shuffled!
devset = top 250 lines of bible-uedin.src.shuffled!

View File

@ -1 +0,0 @@
testset = top 250 lines of ../val/bible-uedin.src.shuffled!

View File

@ -1 +0,0 @@
devset = top 250 lines of bible-uedin.src.shuffled!

View File

@ -1,2 +0,0 @@
testset = top 250 lines of ../val/GNOME.src.shuffled!
testset = top 250 lines of ../val/bible-uedin.src.shuffled!

View File

@ -1,2 +0,0 @@
devset = top 250 lines of GNOME.src.shuffled!
devset = top 250 lines of bible-uedin.src.shuffled!

View File

@ -1 +0,0 @@
testset = top 250 lines of ../val/bible-uedin.src.shuffled!

View File

@ -1 +0,0 @@
devset = top 250 lines of bible-uedin.src.shuffled!

View File

@ -1,2 +0,0 @@
testset = top 250 lines of ../val/GNOME.src.shuffled!
testset = top 250 lines of ../val/bible-uedin.src.shuffled!

View File

@ -1,2 +0,0 @@
devset = top 250 lines of GNOME.src.shuffled!
devset = top 250 lines of bible-uedin.src.shuffled!

View File

@ -1 +0,0 @@
testset = top 250 lines of ../val/bible-uedin.src.shuffled!

View File

@ -1 +0,0 @@
devset = top 250 lines of bible-uedin.src.shuffled!

View File

@ -1,14 +0,0 @@
# opus-2019-12-18.zip
* dataset: opus
* model: transformer-align
* pre-processing: normalization + SentencePiece
* download: [opus-2019-12-18.zip](https://object.pouta.csc.fi/OPUS-MT-models/aed-en/opus-2019-12-18.zip)
* test set translations: [opus-2019-12-18.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/aed-en/opus-2019-12-18.test.txt)
* test set scores: [opus-2019-12-18.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/aed-en/opus-2019-12-18.eval.txt)
## Benchmarks
| testset | BLEU | chr-F |
|-----------------------|-------|-------|
| JW300.aed.en | 4.0 | 0.177 |

View File

@ -1 +0,0 @@
testset = second top 5000 lines of ../val/JW300.src.shuffled!

View File

@ -1 +0,0 @@
devset = top 5000 lines of JW300.src.shuffled!

View File

@ -1 +0,0 @@
testset = second top 5000 lines of ../val/JW300.src.shuffled!

View File

@ -1 +0,0 @@
devset = top 5000 lines of JW300.src.shuffled!

View File

@ -1 +0,0 @@
/scratch/project_2001194/Opus-MT-train/work-spm/de-af/test/Tatoeba.trg

View File

@ -1 +0,0 @@
/scratch/project_2001194/Opus-MT-train/work-spm/de-af/test/Tatoeba.src

View File

@ -1 +0,0 @@
/scratch/project_2001194/Opus-MT-train/work-spm/de-af/val/Tatoeba.trg

View File

@ -1 +0,0 @@
/scratch/project_2001194/Opus-MT-train/work-spm/de-af/val/Tatoeba.src.shuffled.gz

View File

@ -1 +0,0 @@
/scratch/project_2001194/Opus-MT-train/work-spm/de-af/val/Tatoeba.src

View File

@ -1,14 +0,0 @@
# opus-2019-12-18.zip
* dataset: opus
* model: transformer-align
* pre-processing: normalization + SentencePiece
* download: [opus-2019-12-18.zip](https://object.pouta.csc.fi/OPUS-MT-models/af-en/opus-2019-12-18.zip)
* test set translations: [opus-2019-12-18.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/af-en/opus-2019-12-18.test.txt)
* test set scores: [opus-2019-12-18.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/af-en/opus-2019-12-18.eval.txt)
## Benchmarks
| testset | BLEU | chr-F |
|-----------------------|-------|-------|
| Tatoeba.af.en | 60.8 | 0.736 |

View File

@ -1 +0,0 @@
testset = top 1000 lines of ../val/Tatoeba.src.shuffled!

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -1 +0,0 @@
devset = top 1000 lines of Tatoeba.src.shuffled!

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -1,14 +0,0 @@
# opus-2020-01-08.zip
* dataset: opus
* model: transformer-align
* pre-processing: normalization + SentencePiece
* download: [opus-2020-01-08.zip](https://object.pouta.csc.fi/OPUS-MT-models/af-fi/opus-2020-01-08.zip)
* test set translations: [opus-2020-01-08.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/af-fi/opus-2020-01-08.test.txt)
* test set scores: [opus-2020-01-08.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/af-fi/opus-2020-01-08.eval.txt)
## Benchmarks
| testset | BLEU | chr-F |
|-----------------------|-------|-------|
| JW300.af.fi | 32.3 | 0.576 |

View File

@ -1 +0,0 @@
testset = second top 5000 lines of ../val/JW300.src.shuffled!

View File

@ -1 +0,0 @@
devset = top 5000 lines of JW300.src.shuffled!

View File

@ -1,14 +0,0 @@
# opus-2020-01-08.zip
* dataset: opus
* model: transformer-align
* pre-processing: normalization + SentencePiece
* download: [opus-2020-01-08.zip](https://object.pouta.csc.fi/OPUS-MT-models/af-fr/opus-2020-01-08.zip)
* test set translations: [opus-2020-01-08.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/af-fr/opus-2020-01-08.test.txt)
* test set scores: [opus-2020-01-08.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/af-fr/opus-2020-01-08.eval.txt)
## Benchmarks
| testset | BLEU | chr-F |
|-----------------------|-------|-------|
| JW300.af.fr | 35.3 | 0.543 |

View File

@ -1,14 +0,0 @@
# opus-2020-01-08.zip
* dataset: opus
* model: transformer-align
* pre-processing: normalization + SentencePiece
* download: [opus-2020-01-08.zip](https://object.pouta.csc.fi/OPUS-MT-models/af-sv/opus-2020-01-08.zip)
* test set translations: [opus-2020-01-08.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/af-sv/opus-2020-01-08.test.txt)
* test set scores: [opus-2020-01-08.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/af-sv/opus-2020-01-08.eval.txt)
## Benchmarks
| testset | BLEU | chr-F |
|-----------------------|-------|-------|
| JW300.af.sv | 40.4 | 0.599 |

View File

@ -1,2 +0,0 @@
testset = top 250 lines of ../val/GNOME.src.shuffled!
testset = top 250 lines of ../val/bible-uedin.src.shuffled!

View File

@ -1,2 +0,0 @@
devset = top 250 lines of GNOME.src.shuffled!
devset = top 250 lines of bible-uedin.src.shuffled!

View File

@ -1 +0,0 @@
testset = top 250 lines of ../val/bible-uedin.src.shuffled!

View File

@ -1 +0,0 @@
devset = top 250 lines of bible-uedin.src.shuffled!

View File

@ -1,2 +0,0 @@
testset = top 250 lines of ../val/GNOME.src.shuffled!
testset = top 250 lines of ../val/bible-uedin.src.shuffled!

View File

@ -1,2 +0,0 @@
devset = top 250 lines of GNOME.src.shuffled!
devset = top 250 lines of bible-uedin.src.shuffled!

View File

@ -1 +0,0 @@
testset = top 250 lines of ../val/bible-uedin.src.shuffled!

View File

@ -1 +0,0 @@
devset = top 250 lines of bible-uedin.src.shuffled!

View File

@ -1,14 +0,0 @@
# opus-2019-12-18.zip
* dataset: opus
* model: transformer-align
* pre-processing: normalization + SentencePiece
* download: [opus-2019-12-18.zip](https://object.pouta.csc.fi/OPUS-MT-models/agr-en/opus-2019-12-18.zip)
* test set translations: [opus-2019-12-18.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/agr-en/opus-2019-12-18.test.txt)
* test set scores: [opus-2019-12-18.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/agr-en/opus-2019-12-18.eval.txt)
## Benchmarks
| testset | BLEU | chr-F |
|-----------------------|-------|-------|
| bible-uedin.agr.en | 4.5 | 0.222 |

View File

@ -1,2 +0,0 @@
testset = top 250 lines of ../val/GNOME.src.shuffled!
testset = second top 5000 lines of ../val/bible-uedin.src.shuffled!

View File

@ -1,2 +0,0 @@
devset = top 250 lines of GNOME.src.shuffled!
devset = top 5000 lines of bible-uedin.src.shuffled!

View File

@ -1 +0,0 @@
testset = top 1000 lines of ../val/bible-uedin.src.shuffled!

View File

@ -1 +0,0 @@
devset = top 1000 lines of bible-uedin.src.shuffled!

View File

@ -1,2 +0,0 @@
testset = top 250 lines of ../val/GNOME.src.shuffled!
testset = top 250 lines of ../val/bible-uedin.src.shuffled!

View File

@ -1,2 +0,0 @@
devset = top 250 lines of GNOME.src.shuffled!
devset = top 250 lines of bible-uedin.src.shuffled!

View File

@ -1 +0,0 @@
testset = top 250 lines of ../val/bible-uedin.src.shuffled!

View File

@ -1 +0,0 @@
devset = top 250 lines of bible-uedin.src.shuffled!

View File

@ -1,2 +0,0 @@
testset = top 250 lines of ../val/GNOME.src.shuffled!
testset = top 250 lines of ../val/bible-uedin.src.shuffled!

Some files were not shown because too many files have changed in this diff Show More