mirror of
https://github.com/Helsinki-NLP/OPUS-MT-train.git
synced 2024-10-26 21:19:02 +03:00
new models
This commit is contained in:
parent
233083a8b8
commit
c573551713
@ -49,6 +49,9 @@ else
|
|||||||
TRGEXT = ${TRG}
|
TRGEXT = ${TRG}
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
## set additional argument options for opus_read (if it is used)
|
||||||
|
## e.g. OPUSREAD_ARGS = -a certainty -tr 0.3
|
||||||
|
OPUSREAD_ARGS =
|
||||||
|
|
||||||
## all of OPUS (NEW: don't require MOSES format)
|
## all of OPUS (NEW: don't require MOSES format)
|
||||||
# OPUSCORPORA = ${patsubst %/latest/moses/${LANGPAIR}.txt.zip,%,\
|
# OPUSCORPORA = ${patsubst %/latest/moses/${LANGPAIR}.txt.zip,%,\
|
||||||
|
@ -243,7 +243,7 @@ ${TRAIN_ALG}: ${TRAIN_SRC}.clean.${PRE_SRC}${TRAINSIZE}.gz \
|
|||||||
rm -f $@.zip ${@:.${SRCEXT}.raw=.xml} ${@:.${SRCEXT}.raw=.ids} ${dir $@}/README ${dir $@}/LICENSE; \
|
rm -f $@.zip ${@:.${SRCEXT}.raw=.xml} ${@:.${SRCEXT}.raw=.ids} ${dir $@}/README ${dir $@}/LICENSE; \
|
||||||
elif [ -e ${OPUSHOME}/$$c/latest/xml/${LANGPAIR}.xml.gz ]; then \
|
elif [ -e ${OPUSHOME}/$$c/latest/xml/${LANGPAIR}.xml.gz ]; then \
|
||||||
echo "extract $$c (${LANGPAIR}) from OPUS"; \
|
echo "extract $$c (${LANGPAIR}) from OPUS"; \
|
||||||
opus_read -rd ${OPUSHOME} -d $$c -s ${SRC} -t ${TRG} -wm moses -p raw > $@.tmp; \
|
opus_read ${OPUSREAD_ARGS} -rd ${OPUSHOME} -d $$c -s ${SRC} -t ${TRG} -wm moses -p raw > $@.tmp; \
|
||||||
cut -f1 $@.tmp > $@; \
|
cut -f1 $@.tmp > $@; \
|
||||||
cut -f2 $@.tmp > ${@:.${SRCEXT}.raw=.${TRGEXT}.raw}; \
|
cut -f2 $@.tmp > ${@:.${SRCEXT}.raw=.${TRGEXT}.raw}; \
|
||||||
rm -f $@.tmp; \
|
rm -f $@.tmp; \
|
||||||
|
@ -97,7 +97,7 @@ best_dist:
|
|||||||
echo "------------------------------------------------"; \
|
echo "------------------------------------------------"; \
|
||||||
echo "search best model for ${LANGPAIRSTR}"; \
|
echo "search best model for ${LANGPAIRSTR}"; \
|
||||||
for d in ${ALT_MODEL_DIR}; do \
|
for d in ${ALT_MODEL_DIR}; do \
|
||||||
e=`ls work-$$d/${LANGPAIRSTR}/val/*.trg | xargs basename | sed 's/\.trg//'`; \
|
e=`ls work-$$d/${LANGPAIRSTR}/test/*.trg | tail -1 | xargs basename | sed 's/\.trg//'`; \
|
||||||
echo "evaldata = $$e"; \
|
echo "evaldata = $$e"; \
|
||||||
if [ "$$e" != "GNOME" ]; then \
|
if [ "$$e" != "GNOME" ]; then \
|
||||||
I=`find work-$$d/${LANGPAIRSTR}/ -maxdepth 1 -name "$$e.*.eval" -printf "%f\n"`; \
|
I=`find work-$$d/${LANGPAIRSTR}/ -maxdepth 1 -name "$$e.*.eval" -printf "%f\n"`; \
|
||||||
|
@ -70,6 +70,7 @@ MARIAN_EARLY_STOPPING = 5
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
.PHONY: all
|
.PHONY: all
|
||||||
all: model-index.txt
|
all: model-index.txt
|
||||||
${MAKE} ${TEST_SRC}.${BASEMODELNAME}.${TRG}.compare
|
${MAKE} ${TEST_SRC}.${BASEMODELNAME}.${TRG}.compare
|
||||||
@ -142,6 +143,24 @@ tmx-tune:
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
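## awful hack to fix a problem with the pre-processing script for the target language:
## TRGPRE_PARA is expanded verbatim right after preprocess.sh in the target-side
## pipeline, so the "| sed ..." appended to its value below removes a leading
## ">>en<< " from each pre-processed target line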
|
||||||
|
|
||||||
|
.PHONY: en-simplify
|
||||||
|
en-simplify:
|
||||||
|
${MAKE} SRC=en1 TRG=en2 \
|
||||||
|
MODEL=simplewiki_v1 \
|
||||||
|
SRCPRE_PARA="en en en1-en2/opus-2020-03-02/source.spm" \
|
||||||
|
TRGPRE_PARA="en en en1-en2/opus-2020-03-02/target.spm | sed 's/^>>en<< //'" \
|
||||||
|
BASEMODELHOME=../models/en+el+es+fi-en+el+es+fi \
|
||||||
|
BASEMODELZIP=opus-2020-03-02.zip \
|
||||||
|
TRAIN_SRC=en1-en2/simplewiki_v1/train/simplewiki_v1-training.en-en.clean.en1 \
|
||||||
|
TRAIN_TRG=en1-en2/simplewiki_v1/train/simplewiki_v1-training.en-en.clean.en2 \
|
||||||
|
DEV_SRC=en1-en2/simplewiki_v1/dev/simplewiki_v1-tuning.en-en.clean.en1 \
|
||||||
|
DEV_TRG=en1-en2/simplewiki_v1/dev/simplewiki_v1-tuning.en-en.clean.en2 \
|
||||||
|
TEST_SRC=en1-en2/simplewiki_v1/test/simplewiki_v1-testing.en-en.clean.en1 \
|
||||||
|
TEST_TRG=en1-en2/simplewiki_v1/test/simplewiki_v1-testing.en-en.clean.en2 \
|
||||||
|
all
|
||||||
|
|
||||||
|
|
||||||
.PHONY: news-enfi
|
.PHONY: news-enfi
|
||||||
news-enfi:
|
news-enfi:
|
||||||
@ -256,9 +275,13 @@ endif
|
|||||||
.PHONY: data
|
.PHONY: data
|
||||||
data: ${TRAIN_SRC}.pre.gz ${TRAIN_TRG}.pre.gz ${DEV_SRC}.pre.gz ${DEV_TRG}.pre.gz
|
data: ${TRAIN_SRC}.pre.gz ${TRAIN_TRG}.pre.gz ${DEV_SRC}.pre.gz ${DEV_TRG}.pre.gz
|
||||||
|
|
||||||
|
.PHONY: basemodel
|
||||||
|
basemodel: ${LANGPAIR}/${BASEMODELNAME}/decoder.yml
|
||||||
${LANGPAIR}/${BASEMODELNAME}/decoder.yml:
|
${LANGPAIR}/${BASEMODELNAME}/decoder.yml:
|
||||||
mkdir -p ${dir $@}
|
mkdir -p ${dir $@}
|
||||||
ifneq (${BASEMODELZIP},)
|
ifneq (${wildcard ${BASEMODELHOME}/${BASEMODELZIP}},)
|
||||||
|
unzip -u -d ${dir $@} ${BASEMODELHOME}/${BASEMODELZIP}
|
||||||
|
else ifneq (${BASEMODELZIP},)
|
||||||
wget -nv -O ${dir $@}/${BASEMODELZIP} ${BASEMODELHOME}/${BASEMODELZIP}
|
wget -nv -O ${dir $@}/${BASEMODELZIP} ${BASEMODELHOME}/${BASEMODELZIP}
|
||||||
cd ${dir $@} && unzip -u ${BASEMODELZIP}
|
cd ${dir $@} && unzip -u ${BASEMODELZIP}
|
||||||
else
|
else
|
||||||
@ -266,16 +289,19 @@ else
|
|||||||
endif
|
endif
|
||||||
|
|
||||||
|
|
||||||
|
SRCPRE_PARA = ${SRC} ${LANGPAIR}/${BASEMODELNAME}/source.spm
|
||||||
|
TRGPRE_PARA = ${TRG} ${LANGPAIR}/${BASEMODELNAME}/target.spm
|
||||||
|
|
||||||
.INTERMEDIATE: ${TRAIN_SRC}.pre.gz ${DEV_SRC}.pre.gz ${TEST_SRC}.pre.gz
|
.INTERMEDIATE: ${TRAIN_SRC}.pre.gz ${DEV_SRC}.pre.gz ${TEST_SRC}.pre.gz
|
||||||
${TRAIN_SRC}.pre.gz ${DEV_SRC}.pre.gz ${TEST_SRC}.pre.gz: %.pre.gz: %.gz ${LANGPAIR}/${BASEMODELNAME}/decoder.yml
|
${TRAIN_SRC}.pre.gz ${DEV_SRC}.pre.gz ${TEST_SRC}.pre.gz: %.pre.gz: %.gz ${LANGPAIR}/${BASEMODELNAME}/decoder.yml
|
||||||
zcat $< |\
|
zcat $< |\
|
||||||
${LANGPAIR}/${BASEMODELNAME}/preprocess.sh ${SRC} ${LANGPAIR}/${BASEMODELNAME}/source.spm |\
|
${LANGPAIR}/${BASEMODELNAME}/preprocess.sh ${SRCPRE_PARA} |\
|
||||||
gzip -c > $@
|
gzip -c > $@
|
||||||
|
|
||||||
.INTERMEDIATE: ${TRAIN_TRG}.pre.gz ${DEV_TRG}.pre.gz
|
.INTERMEDIATE: ${TRAIN_TRG}.pre.gz ${DEV_TRG}.pre.gz
|
||||||
${TRAIN_TRG}.pre.gz ${DEV_TRG}.pre.gz: %.pre.gz: %.gz ${LANGPAIR}/${BASEMODELNAME}/decoder.yml
|
${TRAIN_TRG}.pre.gz ${DEV_TRG}.pre.gz: %.pre.gz: %.gz ${LANGPAIR}/${BASEMODELNAME}/decoder.yml
|
||||||
zcat $< |\
|
zcat $< |\
|
||||||
${LANGPAIR}/${BASEMODELNAME}/preprocess.sh ${SRC} ${LANGPAIR}/${BASEMODELNAME}/target.spm |\
|
${LANGPAIR}/${BASEMODELNAME}/preprocess.sh ${TRGPRE_PARA} |\
|
||||||
gzip -c > $@
|
gzip -c > $@
|
||||||
|
|
||||||
|
|
||||||
|
36
models/en+el+es+fi-en+el+es+fi/README.md
Normal file
36
models/en+el+es+fi-en+el+es+fi/README.md
Normal file
@ -0,0 +1,36 @@
|
|||||||
|
# opus-2020-03-02.zip
|
||||||
|
|
||||||
|
* dataset: opus
|
||||||
|
* model: transformer
|
||||||
|
* pre-processing: normalization + SentencePiece
|
||||||
|
* a sentence initial language token is required in the form of `>>id<<` (id = valid target language ID)
|
||||||
|
* download: [opus-2020-03-02.zip](https://object.pouta.csc.fi/OPUS-MT-models/en+el+es+fi-en+el+es+fi/opus-2020-03-02.zip)
|
||||||
|
* test set translations: [opus-2020-03-02.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/en+el+es+fi-en+el+es+fi/opus-2020-03-02.test.txt)
|
||||||
|
* test set scores: [opus-2020-03-02.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/en+el+es+fi-en+el+es+fi/opus-2020-03-02.eval.txt)
|
||||||
|
|
||||||
|
## Benchmarks
|
||||||
|
|
||||||
|
| testset | BLEU | chr-F |
|
||||||
|
|-----------------------|-------|-------|
|
||||||
|
| newsdev2015-enfi.en.fi | 16.0 | 0.498 |
|
||||||
|
| newssyscomb2009.en.es | 29.9 | 0.570 |
|
||||||
|
| newssyscomb2009.es.en | 29.7 | 0.569 |
|
||||||
|
| news-test2008.en.es | 27.3 | 0.549 |
|
||||||
|
| news-test2008.es.en | 27.3 | 0.548 |
|
||||||
|
| newstest2009.en.es | 28.4 | 0.564 |
|
||||||
|
| newstest2009.es.en | 28.4 | 0.564 |
|
||||||
|
| newstest2010.en.es | 34.0 | 0.599 |
|
||||||
|
| newstest2010.es.en | 34.0 | 0.599 |
|
||||||
|
| newstest2011.en.es | 35.1 | 0.600 |
|
||||||
|
| newstest2012.en.es | 35.4 | 0.602 |
|
||||||
|
| newstest2013.en.es | 31.9 | 0.576 |
|
||||||
|
| newstest2015-enfi.en.fi | 17.8 | 0.509 |
|
||||||
|
| newstest2016-enfi.en.fi | 19.0 | 0.521 |
|
||||||
|
| newstest2017-enfi.en.fi | 21.2 | 0.539 |
|
||||||
|
| newstest2018-enfi.en.fi | 13.9 | 0.478 |
|
||||||
|
| newstest2019-enfi.en.fi | 18.8 | 0.503 |
|
||||||
|
| newstestB2016-enfi.en.fi | 14.9 | 0.491 |
|
||||||
|
| newstestB2017-enfi.en.fi | 16.9 | 0.503 |
|
||||||
|
| simplification.en.en | 63.0 | 0.798 |
|
||||||
|
| Tatoeba.en.fi | 56.7 | 0.719 |
|
||||||
|
|
@ -11,5 +11,20 @@
|
|||||||
|
|
||||||
| testset | BLEU | chr-F |
|
| testset | BLEU | chr-F |
|
||||||
|-----------------------|-------|-------|
|
|-----------------------|-------|-------|
|
||||||
| Tatoeba.en.ml | 50.4 | 0.598 |
|
| Bible.en.ml | 50.4 | 0.598 |
|
||||||
|
|
||||||
|
# opus+bt-2020-03-02.zip
|
||||||
|
|
||||||
|
* dataset: opus+bt
|
||||||
|
* model: transformer-align
|
||||||
|
* pre-processing: normalization + SentencePiece
|
||||||
|
* download: [opus+bt-2020-03-02.zip](https://object.pouta.csc.fi/OPUS-MT-models/en-ml/opus+bt-2020-03-02.zip)
|
||||||
|
* test set translations: [opus+bt-2020-03-02.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-ml/opus+bt-2020-03-02.test.txt)
|
||||||
|
* test set scores: [opus+bt-2020-03-02.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-ml/opus+bt-2020-03-02.eval.txt)
|
||||||
|
|
||||||
|
## Benchmarks
|
||||||
|
|
||||||
|
| testset | BLEU | chr-F |
|
||||||
|
|-----------------------|-------|-------|
|
||||||
|
| Tatoeba.en.ml | 17.0 | 0.507 |
|
||||||
|
|
||||||
|
@ -43,3 +43,18 @@
|
|||||||
|-----------------------|-------|-------|
|
|-----------------------|-------|-------|
|
||||||
| Tatoeba.ml.en | 42.6 | 0.591 |
|
| Tatoeba.ml.en | 42.6 | 0.591 |
|
||||||
|
|
||||||
|
# opus-2020-03-01.zip
|
||||||
|
|
||||||
|
* dataset: opus
|
||||||
|
* model: transformer-align
|
||||||
|
* pre-processing: normalization + SentencePiece
|
||||||
|
* download: [opus-2020-03-01.zip](https://object.pouta.csc.fi/OPUS-MT-models/ml-en/opus-2020-03-01.zip)
|
||||||
|
* test set translations: [opus-2020-03-01.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/ml-en/opus-2020-03-01.test.txt)
|
||||||
|
* test set scores: [opus-2020-03-01.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/ml-en/opus-2020-03-01.eval.txt)
|
||||||
|
|
||||||
|
## Benchmarks
|
||||||
|
|
||||||
|
| testset | BLEU | chr-F |
|
||||||
|
|-----------------------|-------|-------|
|
||||||
|
| Tatoeba.ml.en | 40.5 | 0.576 |
|
||||||
|
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
#
|
#
|
||||||
# USAGE preprocess.sh source-langid target-langid bpecodes < input > output
|
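# USAGE preprocess.sh source-langid target-langid bpecodes [noflags] < input > output
#
# the optional 4th argument "noflags" skips the final step that prefixes
# every output line with the target language token >>target-langid<<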
|
||||||
#
|
#
|
||||||
#
|
#
|
||||||
# replace MOSESHOME and SNMTPATH with your own setup!
|
# replace MOSESHOME and SNMTPATH with your own setup!
|
||||||
@ -25,11 +25,20 @@ TOKENIZER=${MOSESSCRIPTS}/tokenizer
|
|||||||
|
|
||||||
THREADS=4
|
THREADS=4
|
||||||
|
|
||||||
${TOKENIZER}/replace-unicode-punctuation.perl |
|
# The 4th argument is optional. It must be quoted: unquoted and absent it
# expands to nothing, the test becomes `[ == "noflags" ]`, and `[` prints
# "unary operator expected" on every run before falling through to `else`.
if [ "$4" == "noflags" ]; then
|
||||||
${TOKENIZER}/remove-non-printing-char.perl |
|
${TOKENIZER}/replace-unicode-punctuation.perl |
|
||||||
${TOKENIZER}/normalize-punctuation.perl -l $1 |
|
${TOKENIZER}/remove-non-printing-char.perl |
|
||||||
${TOKENIZER}/tokenizer.perl -a -threads ${THREADS} -l $1 |
|
${TOKENIZER}/normalize-punctuation.perl -l $1 |
|
||||||
sed 's/ */ /g;s/^ *//g;s/ *$//g' |
|
${TOKENIZER}/tokenizer.perl -a -threads ${THREADS} -l $1 |
|
||||||
python3 ${SNMTPATH}/apply_bpe.py -c $3 |
|
sed 's/ */ /g;s/^ *//g;s/ *$//g' |
|
||||||
sed "s/^/>>$2<< /"
|
python3 ${SNMTPATH}/apply_bpe.py -c $3
|
||||||
|
else
|
||||||
|
${TOKENIZER}/replace-unicode-punctuation.perl |
|
||||||
|
${TOKENIZER}/remove-non-printing-char.perl |
|
||||||
|
${TOKENIZER}/normalize-punctuation.perl -l $1 |
|
||||||
|
${TOKENIZER}/tokenizer.perl -a -threads ${THREADS} -l $1 |
|
||||||
|
sed 's/ */ /g;s/^ *//g;s/ *$//g' |
|
||||||
|
python3 ${SNMTPATH}/apply_bpe.py -c $3 |
|
||||||
|
sed "s/^/>>$2<< /"
|
||||||
|
fi
|
||||||
|
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
#
|
#
|
||||||
# USAGE preprocess.sh source-langid target-langid bpecodes < input > output
|
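# USAGE preprocess.sh source-langid target-langid bpecodes [noflags] < input > output
#
# the 3rd argument is the model file handed to ${SPMENCODE} --model;
# the optional 4th argument "noflags" skips the final step that prefixes
# every output line with the target language token >>target-langid<<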
|
||||||
#
|
#
|
||||||
#
|
#
|
||||||
# replace MOSESHOME and SPMENCODE with your own setup!
|
# replace MOSESHOME and SPMENCODE with your own setup!
|
||||||
@ -21,11 +21,17 @@ fi
|
|||||||
MOSESSCRIPTS=${MOSESHOME}/scripts
|
MOSESSCRIPTS=${MOSESHOME}/scripts
|
||||||
TOKENIZER=${MOSESSCRIPTS}/tokenizer
|
TOKENIZER=${MOSESSCRIPTS}/tokenizer
|
||||||
|
|
||||||
|
# The 4th argument is optional. It must be quoted: unquoted and absent it
# expands to nothing, the test becomes `[ == "noflags" ]`, and `[` prints
# "unary operator expected" on every run before falling through to `else`.
if [ "$4" == "noflags" ]; then
|
||||||
${TOKENIZER}/replace-unicode-punctuation.perl |
|
${TOKENIZER}/replace-unicode-punctuation.perl |
|
||||||
${TOKENIZER}/remove-non-printing-char.perl |
|
${TOKENIZER}/remove-non-printing-char.perl |
|
||||||
sed 's/ */ /g;s/^ *//g;s/ *$//g' |
|
sed 's/ */ /g;s/^ *//g;s/ *$//g' |
|
||||||
${SPMENCODE} --model $3 |
|
${SPMENCODE} --model $3
|
||||||
sed "s/^/>>$2<< /"
|
else
|
||||||
|
${TOKENIZER}/replace-unicode-punctuation.perl |
|
||||||
|
${TOKENIZER}/remove-non-printing-char.perl |
|
||||||
|
sed 's/ */ /g;s/^ *//g;s/ *$//g' |
|
||||||
|
${SPMENCODE} --model $3 |
|
||||||
|
sed "s/^/>>$2<< /"
|
||||||
|
fi
|
||||||
|
|
||||||
# ${TOKENIZER}/normalize-punctuation.perl -l $1 |
|
# ${TOKENIZER}/normalize-punctuation.perl -l $1 |
|
||||||
|
Loading…
Reference in New Issue
Block a user