Tatoeba test sets added

This commit is contained in:
Joerg Tiedemann 2020-01-17 12:43:53 +02:00
parent b0a586cbd0
commit 831acb1ae7
11285 changed files with 2482131 additions and 72 deletions

View File

@ -162,13 +162,13 @@ endif
## add also vocab size to the name
##-------------------------------------
# ifndef DATASET
# ifeq (${words ${TRAINSET}},1)
# DATASET = ${TRAINSET}
# else
# DATASET = opus
# endif
# endif
## default value for OLDDATASET (only applied if the variable
## is not already set to a non-empty value, e.g. on the command line):
##   - TRAINSET contains exactly one word --> use that name
##   - otherwise                          --> use the generic label 'opus'
ifndef OLDDATASET
ifeq (${words ${TRAINSET}},1)
OLDDATASET = ${TRAINSET}
else
OLDDATASET = opus
endif
endif

View File

@ -363,40 +363,43 @@ ${DEV_SRC}: %: %.shuffled.gz
ifeq (${DEVSET},${TESTSET})
if (( `zcat $@.shuffled.gz | wc -l` < $$((${DEVSIZE} + ${TESTSIZE})) )); then \
if (( `zcat $@.shuffled.gz | wc -l` < $$((${DEVSMALLSIZE} + ${DEVMINSIZE})) )); then \
echo "devset = top ${DEVMINSIZE} lines of ${notdir $@}.shuffled!" >> ${dir $@}/README; \
zcat $@.shuffled.gz | cut -f1 | head -${DEVMINSIZE} > ${DEV_SRC}; \
zcat $@.shuffled.gz | cut -f2 | head -${DEVMINSIZE} > ${DEV_TRG}; \
mkdir -p ${dir ${TEST_SRC}}; \
echo "testset = top ${DEVMINSIZE} lines of ../val/${notdir $@}.shuffled!" >> ${dir ${TEST_SRC}}/README; \
zcat $@.shuffled.gz | cut -f1 | tail -n +$$((${DEVMINSIZE} + 1)) > ${TEST_SRC}; \
zcat $@.shuffled.gz | cut -f2 | tail -n +$$((${DEVMINSIZE} + 1)) > ${TEST_TRG}; \
else \
echo "devset = top ${DEVSMALLSIZE} lines of ${notdir $@}.shuffled!" >> ${dir $@}/README; \
zcat $@.shuffled.gz | cut -f1 | head -${DEVSMALLSIZE} > ${DEV_SRC}; \
zcat $@.shuffled.gz | cut -f2 | head -${DEVSMALLSIZE} > ${DEV_TRG}; \
mkdir -p ${dir ${TEST_SRC}}; \
echo "testset = top ${DEVSMALLSIZE} lines of ../val/${notdir $@}.shuffled!" >> ${dir ${TEST_SRC}}/README; \
zcat $@.shuffled.gz | cut -f1 | tail -n +$$((${DEVSMALLSIZE} + 1)) > ${TEST_SRC}; \
zcat $@.shuffled.gz | cut -f2 | tail -n +$$((${DEVSMALLSIZE} + 1)) > ${TEST_TRG}; \
fi; \
else \
echo "devset = top ${DEVSIZE} lines of ${notdir $@}.shuffled!" >> ${dir $@}/README; \
zcat $@.shuffled.gz | cut -f1 | head -${DEVSIZE} > ${DEV_SRC}; \
zcat $@.shuffled.gz | cut -f2 | head -${DEVSIZE} > ${DEV_TRG}; \
mkdir -p ${dir ${TEST_SRC}}; \
echo "testset = second top ${DEVSIZE} lines of ../val/${notdir $@}.shuffled!" >> ${dir ${TEST_SRC}}/README; \
zcat $@.shuffled.gz | cut -f1 | head -$$((${DEVSIZE} + ${TESTSIZE})) | tail -${TESTSIZE} > ${TEST_SRC}; \
zcat $@.shuffled.gz | cut -f2 | head -$$((${DEVSIZE} + ${TESTSIZE})) | tail -${TESTSIZE} > ${TEST_TRG}; \
zcat $@.shuffled.gz | cut -f1 | tail -n +$$((${DEVSIZE} + ${TESTSIZE})) | gzip -c > ${DEV_SRC}.notused.gz; \
zcat $@.shuffled.gz | cut -f2 | tail -n +$$((${DEVSIZE} + ${TESTSIZE})) | gzip -c > ${DEV_TRG}.notused.gz; \
fi
else
echo "devset = top ${DEVSIZE} lines of ${notdir $@}.shuffled!" >> ${dir $@}/README
zcat $@.shuffled.gz | cut -f1 | head -${DEVSIZE} > ${DEV_SRC}
zcat $@.shuffled.gz | cut -f2 | head -${DEVSIZE} > ${DEV_TRG}
zcat $@.shuffled.gz | cut -f1 | tail -n +$$((${DEVSIZE} + 1)) | gzip -c > ${DEV_SRC}.notused.gz
zcat $@.shuffled.gz | cut -f2 | tail -n +$$((${DEVSIZE} + 1)) | gzip -c > ${DEV_TRG}.notused.gz
endif
echo -n "devset = top " >> ${dir ${DEV_SRC}}/README
wc -l < ${DEV_SRC} | tr "\n" ' ' >> ${dir ${DEV_SRC}}/README
echo " lines of ${notdir $@}.shuffled!" >> ${dir ${DEV_SRC}}/README
ifeq (${DEVSET},${TESTSET})
echo -n "testset = last " >> ${dir ${TEST_SRC}}/README
wc -l < ${TEST_SRC} | tr "\n" ' ' >> ${dir ${TEST_SRC}}/README
echo " lines of ../val/${notdir $@}.shuffled!" >> ${dir ${TEST_SRC}}/README
endif
# zcat $@.shuffled.gz | cut -f1 | tail -${TESTSIZE} > ${TEST_SRC}; \
# zcat $@.shuffled.gz | cut -f2 | tail -${TESTSIZE} > ${TEST_TRG}; \

View File

@ -27,13 +27,25 @@ ALT_MODEL_DIR = spm
## best_dist_all: run the 'best_dist' target for every language pair
## that has model files in the work directories
##
##   - candidate names come from `ls work*`, keeping entries that contain
##     a '-' and dropping those that contain 'old' or 'work'
##   - a candidate is only processed if `find` reports at least one
##     matching .npz file below work*/<name>
##   - SRCLANGS / TRGLANGS are taken from the part before / after the '-'
##     in the name, with '+' replaced by a space
##   - DATASET is set from the basename (text before the first '.') of the
##     *.best-perplexity.npz file found in work-spm/<name>
##
## NOTE(review): this recipe appears to show both the previous and the
## updated variant of the `if` test and of the TRGLANGS line
## (with and without DATASET) - confirm which lines are the active ones.
best_dist_all:
for l in $(sort ${shell ls work* | grep -- '-' | grep -v old | grep -v work}); do \
if [ `find work*/$$l -name '${DATASET}${TRAINSIZE}.*.npz' | wc -l` -gt 0 ]; then \
if [ `find work*/$$l -name '*.npz' | wc -l` -gt 0 ]; then \
d=`find work-spm/$$l -name '*.best-perplexity.npz' -exec basename {} \; | cut -f1 -d.`; \
${MAKE} SRCLANGS="`echo $$l | cut -f1 -d'-' | sed 's/\\+/ /g'`" \
TRGLANGS="`echo $$l | cut -f2 -d'-' | sed 's/\\+/ /g'`" best_dist; \
TRGLANGS="`echo $$l | cut -f2 -d'-' | sed 's/\\+/ /g'`" \
DATASET=$$d best_dist; \
fi \
done
# best_dist_all:
# for l in $(sort ${shell ls work* | grep -- '-' | grep -v old | grep -v work}); do \
# if [ `find work*/$$l -name '${DATASET}${TRAINSIZE}.*.npz' | wc -l` -gt 0 ]; then \
# ${MAKE} SRCLANGS="`echo $$l | cut -f1 -d'-' | sed 's/\\+/ /g'`" \
# TRGLANGS="`echo $$l | cut -f2 -d'-' | sed 's/\\+/ /g'`" best_dist; \
# fi \
# done
## find the best model according to test set scores
## and make a distribution package from that model
## (BLEU needs to be above MIN_BLEU_SCORE)

View File

@ -0,0 +1,2 @@
testset = top 250 lines of ../val/GNOME.src.shuffled!
testset = top 250 lines of ../val/bible-uedin.src.shuffled!

View File

@ -0,0 +1,2 @@
devset = top 250 lines of GNOME.src.shuffled!
devset = top 250 lines of bible-uedin.src.shuffled!

View File

@ -0,0 +1 @@
testset = top 250 lines of ../val/bible-uedin.src.shuffled!

View File

@ -0,0 +1 @@
devset = top 250 lines of bible-uedin.src.shuffled!

View File

@ -0,0 +1 @@
testset = second top 5000 lines of ../val/JW300.src.shuffled!

View File

@ -0,0 +1,27 @@
Сара азба сакуеит.
Акәашара бзиa бoба?
Cаҟара cыцлеит.
Маҷк cааҧсеит.
Абиблиотека абаҟоу?
Сыцәар сҭахыуп.
Иҭабуп!
Англыз бызшәа жәдыруама?
Асааҭ шаҟоузеи?
Сара сцоит.
Иахьа шәахьоуп.
Бурӡ Халифа иахьатәила зегь реицкис иреиҳау хыб ҳәа иашьҭоуп.
Маҷк cааҧсеит.
Сара азба сакуеит.
Иуфазеи?
Шьыжьбзиа!
Сара ашә сфоит.
Сара акәац сфоит.
Сыӡбалқәас рхы надырхәозеи аҧсуаа?
Сара Лори сыхьӡуп.
Сара цәыкьа сышьҭалоит.
ашьа дyмоумаҵ?
Cара yаҵәы аpахь caaуеит.
Уи cыҧcықәаpа ауп.
Сара исҳәоит.
Дaба сыҧшаауeиҵ?
Бзиала шәаабеит Авикипедиахь.

View File

@ -0,0 +1,27 @@
I'm thirsty.
Do you like dancing?
I'm very happy.
I'm a little tired.
Where is the library?
I want to sleep.
Thank you!
Do you speak English?
What time is it?
I'm going.
Today is Monday.
Burj Khalifa is currently the tallest skyscraper in the world.
I'm a little bit tired.
I am thirsty.
What did you eat?
Good morning.
I eat cheese.
I eat meat.
Which sauces do the Abkhazians use?
My name is Laurie.
I'll go to bed early.
Do you have a brother?
I'll be here tomorrow.
I really like it.
I will say.
Where can I find it?
Welcome to Wikipedia.

View File

@ -0,0 +1 @@
devset = top 5000 lines of JW300.src.shuffled!

View File

@ -0,0 +1 @@
testset = second top 5000 lines of ../val/JW300.src.shuffled!

View File

@ -0,0 +1 @@
devset = top 5000 lines of JW300.src.shuffled!

View File

@ -0,0 +1,2 @@
testset = top 250 lines of ../val/GNOME.src.shuffled!
testset = top 250 lines of ../val/bible-uedin.src.shuffled!

View File

@ -0,0 +1,2 @@
devset = top 250 lines of GNOME.src.shuffled!
devset = top 250 lines of bible-uedin.src.shuffled!

View File

@ -0,0 +1 @@
testset = top 250 lines of ../val/bible-uedin.src.shuffled!

View File

@ -0,0 +1 @@
devset = top 250 lines of bible-uedin.src.shuffled!

View File

@ -0,0 +1 @@
testset = second top 5000 lines of ../val/JW300.src.shuffled!

View File

View File

View File

@ -0,0 +1 @@
devset = top 5000 lines of JW300.src.shuffled!

View File

@ -0,0 +1 @@
testset = second top 5000 lines of ../val/JW300.src.shuffled!

View File

@ -0,0 +1 @@
devset = top 5000 lines of JW300.src.shuffled!

View File

@ -0,0 +1,2 @@
testset = top 250 lines of ../val/GNOME.src.shuffled!
testset = top 250 lines of ../val/bible-uedin.src.shuffled!

View File

@ -0,0 +1,2 @@
devset = top 250 lines of GNOME.src.shuffled!
devset = top 250 lines of bible-uedin.src.shuffled!

View File

@ -0,0 +1 @@
testset = top 250 lines of ../val/bible-uedin.src.shuffled!

View File

@ -0,0 +1 @@
devset = top 250 lines of bible-uedin.src.shuffled!

View File

@ -0,0 +1,2 @@
testset = top 250 lines of ../val/GNOME.src.shuffled!
testset = second top 5000 lines of ../val/bible-uedin.src.shuffled!

View File

View File

View File

@ -0,0 +1,2 @@
devset = top 250 lines of GNOME.src.shuffled!
devset = top 5000 lines of bible-uedin.src.shuffled!

View File

@ -0,0 +1 @@
testset = top 1000 lines of ../val/bible-uedin.src.shuffled!

View File

@ -0,0 +1 @@
devset = top 1000 lines of bible-uedin.src.shuffled!

View File

@ -0,0 +1 @@
testset = second top 5000 lines of ../val/JW300.src.shuffled!

View File

View File

View File

@ -0,0 +1 @@
devset = top 5000 lines of JW300.src.shuffled!

View File

@ -0,0 +1 @@
testset = second top 5000 lines of ../val/JW300.src.shuffled!

View File

@ -0,0 +1 @@
devset = top 5000 lines of JW300.src.shuffled!

View File

@ -0,0 +1,2 @@
testset = top 250 lines of ../val/GNOME.src.shuffled!
testset = top 250 lines of ../val/bible-uedin.src.shuffled!

View File

@ -0,0 +1,2 @@
devset = top 250 lines of GNOME.src.shuffled!
devset = top 250 lines of bible-uedin.src.shuffled!

View File

@ -0,0 +1 @@
testset = top 250 lines of ../val/bible-uedin.src.shuffled!

View File

@ -0,0 +1 @@
devset = top 250 lines of bible-uedin.src.shuffled!

View File

@ -0,0 +1,2 @@
testset = top 250 lines of ../val/GNOME.src.shuffled!
testset = top 250 lines of ../val/bible-uedin.src.shuffled!

View File

@ -0,0 +1,2 @@
devset = top 250 lines of GNOME.src.shuffled!
devset = top 250 lines of bible-uedin.src.shuffled!

View File

@ -0,0 +1 @@
testset = top 250 lines of ../val/bible-uedin.src.shuffled!

View File

@ -0,0 +1 @@
devset = top 250 lines of bible-uedin.src.shuffled!

View File

@ -0,0 +1,2 @@
testset = top 250 lines of ../val/GNOME.src.shuffled!
testset = top 250 lines of ../val/bible-uedin.src.shuffled!

View File

@ -0,0 +1,2 @@
devset = top 250 lines of GNOME.src.shuffled!
devset = top 250 lines of bible-uedin.src.shuffled!

View File

@ -0,0 +1 @@
testset = top 250 lines of ../val/bible-uedin.src.shuffled!

View File

@ -0,0 +1 @@
devset = top 250 lines of bible-uedin.src.shuffled!

View File

@ -0,0 +1 @@
testset = second top 5000 lines of ../val/JW300.src.shuffled!

View File

View File

View File

@ -0,0 +1 @@
devset = top 5000 lines of JW300.src.shuffled!

View File

@ -0,0 +1 @@
testset = second top 5000 lines of ../val/JW300.src.shuffled!

View File

@ -0,0 +1 @@
devset = top 5000 lines of JW300.src.shuffled!

View File

@ -0,0 +1 @@
/scratch/project_2001194/Opus-MT-train/work-spm/de-af/test/Tatoeba.trg

View File

@ -0,0 +1 @@
/scratch/project_2001194/Opus-MT-train/work-spm/de-af/test/Tatoeba.src

View File

@ -0,0 +1 @@
/scratch/project_2001194/Opus-MT-train/work-spm/de-af/val/Tatoeba.trg

View File

@ -0,0 +1 @@
/scratch/project_2001194/Opus-MT-train/work-spm/de-af/val/Tatoeba.src.shuffled.gz

View File

@ -0,0 +1 @@
/scratch/project_2001194/Opus-MT-train/work-spm/de-af/val/Tatoeba.src

View File

@ -0,0 +1 @@
testset = top 1000 lines of ../val/Tatoeba.src.shuffled!

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1 @@
devset = top 1000 lines of Tatoeba.src.shuffled!

File diff suppressed because it is too large Load Diff

Binary file not shown.

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1 @@
testset = second top 5000 lines of ../val/JW300.src.shuffled!

View File

@ -0,0 +1 @@
devset = top 5000 lines of JW300.src.shuffled!

View File

@ -0,0 +1,2 @@
testset = top 250 lines of ../val/GNOME.src.shuffled!
testset = top 250 lines of ../val/bible-uedin.src.shuffled!

View File

@ -0,0 +1,2 @@
devset = top 250 lines of GNOME.src.shuffled!
devset = top 250 lines of bible-uedin.src.shuffled!

View File

@ -0,0 +1 @@
testset = top 250 lines of ../val/bible-uedin.src.shuffled!

View File

@ -0,0 +1 @@
devset = top 250 lines of bible-uedin.src.shuffled!

View File

@ -0,0 +1,2 @@
testset = top 250 lines of ../val/GNOME.src.shuffled!
testset = top 250 lines of ../val/bible-uedin.src.shuffled!

View File

@ -0,0 +1,2 @@
devset = top 250 lines of GNOME.src.shuffled!
devset = top 250 lines of bible-uedin.src.shuffled!

View File

@ -0,0 +1 @@
testset = top 250 lines of ../val/bible-uedin.src.shuffled!

View File

@ -0,0 +1 @@
devset = top 250 lines of bible-uedin.src.shuffled!

View File

@ -0,0 +1,2 @@
testset = top 250 lines of ../val/GNOME.src.shuffled!
testset = second top 5000 lines of ../val/bible-uedin.src.shuffled!

View File

View File

View File

@ -0,0 +1,2 @@
devset = top 250 lines of GNOME.src.shuffled!
devset = top 5000 lines of bible-uedin.src.shuffled!

View File

@ -0,0 +1 @@
testset = top 1000 lines of ../val/bible-uedin.src.shuffled!

View File

@ -0,0 +1 @@
devset = top 1000 lines of bible-uedin.src.shuffled!

View File

@ -0,0 +1,2 @@
testset = top 250 lines of ../val/GNOME.src.shuffled!
testset = top 250 lines of ../val/bible-uedin.src.shuffled!

View File

@ -0,0 +1,2 @@
devset = top 250 lines of GNOME.src.shuffled!
devset = top 250 lines of bible-uedin.src.shuffled!

View File

@ -0,0 +1 @@
testset = top 250 lines of ../val/bible-uedin.src.shuffled!

View File

@ -0,0 +1 @@
devset = top 250 lines of bible-uedin.src.shuffled!

View File

@ -0,0 +1,2 @@
testset = top 250 lines of ../val/GNOME.src.shuffled!
testset = top 250 lines of ../val/bible-uedin.src.shuffled!

View File

@ -0,0 +1,2 @@
devset = top 250 lines of GNOME.src.shuffled!
devset = top 250 lines of bible-uedin.src.shuffled!

View File

@ -0,0 +1 @@
testset = top 250 lines of ../val/bible-uedin.src.shuffled!

View File

@ -0,0 +1 @@
devset = top 250 lines of bible-uedin.src.shuffled!

View File

@ -0,0 +1,2 @@
testset = top 250 lines of ../val/GNOME.src.shuffled!
testset = top 250 lines of ../val/bible-uedin.src.shuffled!

View File

@ -0,0 +1,2 @@
devset = top 250 lines of GNOME.src.shuffled!
devset = top 250 lines of bible-uedin.src.shuffled!

View File

@ -0,0 +1 @@
testset = top 250 lines of ../val/bible-uedin.src.shuffled!

View File

@ -0,0 +1 @@
devset = top 250 lines of bible-uedin.src.shuffled!

View File

@ -0,0 +1,2 @@
testset = top 250 lines of ../val/GNOME.src.shuffled!
testset = top 250 lines of ../val/bible-uedin.src.shuffled!

View File

@ -0,0 +1,2 @@
devset = top 250 lines of GNOME.src.shuffled!
devset = top 250 lines of bible-uedin.src.shuffled!

View File

@ -0,0 +1 @@
testset = top 250 lines of ../val/bible-uedin.src.shuffled!

Some files were not shown because too many files have changed in this diff Show More