tico19 benchmark added

This commit is contained in:
Joerg Tiedemann 2020-10-27 23:48:09 +02:00
parent 40a6b5ab6b
commit 1186d9afd5
164 changed files with 138 additions and 10 deletions

View File

@ -172,3 +172,16 @@ Various lists and tables can be generated from the evaluated model files. Remove
rm -f tatoeba-results* results/*.md
make tatoeba-results-md
```
## Convert models to huggingface
See the [README](https://github.com/huggingface/transformers/tree/master/scripts/tatoeba) in the transformers library for details. For example:
```
python src/transformers/convert_marian_tatoeba_to_pytorch.py --models heb-eng eng-heb --save_dir converted
cd converted
transformers-cli login
for FILE in *; do transformers-cli upload --organization Helsinki-NLP $FILE; done
```

View File

@ -283,9 +283,9 @@ endif
.PHONY: upload
upload:
find ${MODELSHOME}/ -type l | tar -cf models-links.tar -T -
find ${MODELSHOME}/ -type l -delete
cd ${MODELSHOME} && swift upload ${MODEL_CONTAINER} --changed --skip-identical *
find ${RELEASEDIR}/ -type l | tar -cf models-links.tar -T -
find ${RELEASEDIR}/ -type l -delete
cd ${RELEASEDIR} && swift upload ${MODEL_CONTAINER} --changed --skip-identical *
tar -xf models-links.tar
rm -f models-links.tar
swift post ${MODEL_CONTAINER} --read-acl ".r:*"
@ -296,8 +296,8 @@ upload:
.PHONY: upload-models
upload-models:
find ${WORKHOME}/models -type l | tar -cf dev-models-links.tar -T -
find ${WORKHOME}/models -type l -delete
find ${MODELSHOME} -type l | tar -cf dev-models-links.tar -T -
find ${MODELSHOME} -type l -delete
cd ${WORKHOME} && swift upload ${DEV_MODEL_CONTAINER} --changed --skip-identical models
tar -xf dev-models-links.tar
rm -f dev-models-links.tar
@ -352,12 +352,12 @@ ${WORKHOME}/eval/scores.txt: ${EVALSCORES} ${EVALTRANSL}
rm -f $@.1 $@.2 $@.3 $@.4
# Copy one evaluation score file into ${WORKHOME}/eval/, creating the target
# directory first. The source path is derived from the target name by mapping
# ${WORKHOME}/eval/%.eval.txt back to ${WORKHOME}/%.eval. No prerequisite is
# declared here, so an existing target is not refreshed by this rule.
# NOTE(review): the header comment names ${MODELSHOME}/%.eval as the source,
# while the cp below still resolves the source under ${WORKHOME} -- confirm
# that both point to the same location.
${EVALSCORES}: # ${WORKHOME}/eval/%.eval.txt: ${WORKHOME}/models/%.eval
${EVALSCORES}: # ${WORKHOME}/eval/%.eval.txt: ${MODELSHOME}/%.eval
mkdir -p ${dir $@}
cp ${patsubst ${WORKHOME}/eval/%.eval.txt,${WORKHOME}/%.eval,$@} $@
# cp $< $@
# Copy one translation comparison file into ${WORKHOME}/eval/, creating the
# target directory first. The source path is derived from the target name by
# mapping ${WORKHOME}/eval/%.test.txt back to ${WORKHOME}/%.compare. No
# prerequisite is declared here, so an existing target is not refreshed by
# this rule.
# NOTE(review): the header comment names ${MODELSHOME}/%.compare as the
# source, while the cp below still resolves the source under ${WORKHOME} --
# confirm that both point to the same location.
${EVALTRANSL}: # ${WORKHOME}/eval/%.test.txt: ${WORKHOME}/models/%.compare
${EVALTRANSL}: # ${WORKHOME}/eval/%.test.txt: ${MODELSHOME}/%.compare
mkdir -p ${dir $@}
cp ${patsubst ${WORKHOME}/eval/%.test.txt,${WORKHOME}/%.compare,$@} $@
# cp $< $@
@ -429,7 +429,7 @@ old-models-dist:
${MAKE} SRCLANGS="`echo $$l | cut -f1 -d'-' | sed 's/\\+/ /g'`" \
TRGLANGS="`echo $$l | cut -f2 -d'-' | sed 's/\\+/ /g'`" \
WORKHOME=${WORKHOME}/old-models \
MODELSHOME=${WORKHOME}/models dist; \
MODELSHOME=${MODELSHOME} dist; \
done
@echo "trained double ${words ${TRAINED_DOUBLE_MODELS}}"
for l in ${TRAINED_DOUBLE_MODELS}; do \
@ -439,7 +439,7 @@ old-models-dist:
${MAKE} SRCLANGS="`echo $$l | cut -f1 -d'-' | sed 's/\\+/ /g'`" \
TRGLANGS="`echo $$l | cut -f2 -d'-' | sed 's/\\+/ /g'`" \
WORKHOME=${WORKHOME}/old-models \
MODELSHOME=${WORKHOME}/models dist; \
MODELSHOME=${MODELSHOME} dist; \
else \
echo "$$l: new better than old"; \
fi \

View File

@ -10,7 +10,8 @@ as-en:
${MAKE} train-dynamic-en-as
# ENAS_BPE = 4000
ENAS_BPE = 1000
ENAS_BPE = 1000
ENBCL_BPE = 1000
%-as-en:
${MAKE} HELDOUTSIZE=0 DEVSIZE=1000 TESTSIZE=1000 DEVMINSIZE=100 BPESIZE=${ENAS_BPE} \
@ -23,3 +24,9 @@ ENAS_BPE = 1000
${@:-en-as=}
# Pattern rule for en -> bcl jobs: a goal named <task>-en-bcl re-invokes make
# for <task> (the "-en-bcl" suffix is stripped from the target name by the
# substitution reference on the last line). The sub-make runs with
# SRCLANGS="en" and TRGLANGS="bcl", the fixed held-out/dev/test sizes below,
# BPESIZE taken from ENBCL_BPE, and the listed corpora passed as
# EXCLUDE_CORPORA.
%-en-bcl:
${MAKE} HELDOUTSIZE=0 DEVSIZE=1000 TESTSIZE=1000 DEVMINSIZE=100 BPESIZE=${ENBCL_BPE} \
SRCLANGS="en" TRGLANGS="bcl" EXCLUDE_CORPORA="WMT-News MPC1 wikimedia" \
${@:-en-bcl=}

View File

@ -7,6 +7,32 @@ TXT = $(patsubst %.sgm,%,${wildcard *.sgm})
all: ${TXT}
## TICO-19 translation benchmark
## from https://tico-19.github.io/index.html
##
## TICO19_TEST lists one output file per TSV found in tico19-testset/test/:
## tico19-testset/test/test.<pair>.tsv maps to <pair>/tico19-test.en.gz.
## The list comes from a wildcard match, so it is empty for as long as
## tico19-testset/test/ contains no *.tsv files.
TICO19_TEST = ${patsubst tico19-testset/test/test.%.tsv,%/tico19-test.en.gz,${wildcard tico19-testset/test/*.tsv}}
# Download and unpack the TICO-19 test set archive, then build the per-pair
# test files from it.
tico19-testset:
wget https://tico-19.github.io/data/tico19-testset.zip
unzip tico19-testset.zip
# Remove the downloaded archive and the __MACOSX directory
# (presumably archive metadata unpacked alongside the data -- TODO confirm).
rm -f tico19-testset.zip
rm -fr __MACOSX
# Run tico19-testdata in a fresh make process: its file list is computed from
# a wildcard over tico19-testset/test/, which only matches after unpacking.
${MAKE} tico19-testdata
# tico19-testdata builds every file listed in TICO19_TEST.
.PHONY: tico19-testdata
tico19-testdata: ${TICO19_TEST}
# Static pattern rule: <pair>/tico19-test.en.gz is built from
# tico19-testset/test/test.<pair>.tsv.
#  - column 3 of the TSV goes to the target itself ($@)
#  - column 4 goes to a sibling file in which "en.gz" is replaced by the
#    directory name with its "en-" prefix removed, plus ".gz"
#    (e.g. en-xx/tico19-test.xx.gz)
# In both pipelines `tail -n +2` drops the first line (presumably a header
# row -- TODO confirm) and sed trims leading and trailing spaces; `$$` passes
# a literal `$` to the shell.
# NOTE(review): the second output file is not a target of any rule shown
# here, so make will not rebuild it if it alone goes missing.
${TICO19_TEST}: %/tico19-test.en.gz: tico19-testset/test/test.%.tsv
mkdir -p ${dir $@}
cut -f3 $< | tail -n +2 | sed 's/^ *//;s/ *$$//' | gzip -c > $@
cut -f4 $< | tail -n +2 | sed 's/^ *//;s/ *$$//' | gzip -c > ${@:en.gz=${patsubst en-%/,%,$(dir $@)}}.gz
list-files:
@echo "${2LETTER_FILES}" | tr ' ' "\n"

Binary file not shown.

View File

@ -0,0 +1 @@
tico19-test.am.gz

Binary file not shown.

View File

@ -0,0 +1 @@
tico19-test.en.gz

Binary file not shown.

View File

@ -0,0 +1 @@
tico19-test.ar.gz

Binary file not shown.

View File

@ -0,0 +1 @@
tico19-test.en.gz

View File

@ -0,0 +1 @@
tico19-test.bn.gz

Binary file not shown.

Binary file not shown.

View File

@ -0,0 +1 @@
tico19-test.en.gz

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -0,0 +1 @@
tico19-test.en.gz

Binary file not shown.

View File

@ -0,0 +1 @@
tico19-test.fa.gz

Binary file not shown.

View File

@ -0,0 +1 @@
tico19-test.en.gz

Binary file not shown.

View File

@ -0,0 +1 @@
tico19-test.fr.gz

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -0,0 +1 @@
tico19-test.en.gz

Binary file not shown.

View File

@ -0,0 +1 @@
tico19-test.ha.gz

Binary file not shown.

View File

@ -0,0 +1 @@
tico19-test.en.gz

Binary file not shown.

View File

@ -0,0 +1 @@
tico19-test.hi.gz

Binary file not shown.

View File

@ -0,0 +1 @@
tico19-test.en.gz

Binary file not shown.

View File

@ -0,0 +1 @@
tico19-test.id.gz

Binary file not shown.

View File

@ -0,0 +1 @@
tico19-test.en.gz

View File

@ -0,0 +1 @@
tico19-test.km.gz

Binary file not shown.

Binary file not shown.

View File

@ -0,0 +1 @@
tico19-test.en.gz

View File

@ -0,0 +1 @@
tico19-test.kr.gz

Binary file not shown.

Binary file not shown.

View File

@ -0,0 +1 @@
tico19-test.en.gz

Binary file not shown.

View File

@ -0,0 +1 @@
tico19-test.ku.gz

Binary file not shown.

View File

@ -0,0 +1 @@
tico19-test.en.gz

Binary file not shown.

View File

@ -0,0 +1 @@
tico19-test.lg.gz

Binary file not shown.

View File

@ -0,0 +1 @@
tico19-test.en.gz

View File

@ -0,0 +1 @@
tico19-test.ln.gz

Binary file not shown.

Binary file not shown.

View File

@ -0,0 +1 @@
tico19-test.en.gz

View File

@ -0,0 +1 @@
tico19-test.mr.gz

Binary file not shown.

Binary file not shown.

View File

@ -0,0 +1 @@
tico19-test.en.gz

Binary file not shown.

View File

@ -0,0 +1 @@
tico19-test.ms.gz

Binary file not shown.

View File

@ -0,0 +1 @@
tico19-test.en.gz

Binary file not shown.

View File

@ -0,0 +1 @@
tico19-test.my.gz

Binary file not shown.

View File

@ -0,0 +1 @@
tico19-test.en.gz

Binary file not shown.

View File

@ -0,0 +1 @@
tico19-test.ne.gz

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -0,0 +1 @@
tico19-test.en.gz

Binary file not shown.

View File

@ -0,0 +1 @@
tico19-test.om.gz

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -0,0 +1 @@
tico19-test.en.gz

Binary file not shown.

View File

@ -0,0 +1 @@
tico19-test.ps.gz

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -0,0 +1 @@
tico19-test.en.gz

Binary file not shown.

View File

@ -0,0 +1 @@
tico19-test.ru.gz

Some files were not shown because too many files have changed in this diff Show More