updates celtic model

This commit is contained in:
Joerg Tiedemann 2020-04-24 13:30:16 +03:00
parent ea2b283ad4
commit 9ba784419e
10 changed files with 131 additions and 12 deletions

View File

@ -118,7 +118,7 @@ best_dist:
done; \
fi \
done; \
echo "------------------------------------------------"; \
echo "--------------- best = $$m ($$s/$$E) ---------------------------------"; \
if [ "$$s" != "" ]; then \
if (( $$(echo "$$m > ${MIN_BLEU_SCORE}" |bc -l) )); then \
x=`echo $$E | cut -f3 -d. | cut -f1 -d-`; \

View File

@ -183,8 +183,6 @@ endif
${MAKE} WORKHOME=${shell realpath ${PWD}/work-nospace} \
PRE=simple \
SPMEXTRA=--split_by_whitespace=false \
PRE_SRC=spm${SRCBPESIZE:000=}k \
PRE_TRG=spm${TRGBPESIZE:000=}k \
${@:-nospace=}
@ -211,17 +209,13 @@ endif
## Generic wrapper: for a target "<goal>-langid", re-invoke make on <goal>
## with WORKHOME set to the resolved path of work-langid under $(PWD),
## PRE=simple, and PRE_SRC/PRE_TRG labels of the form spm<N>k, where a
## trailing "000" is stripped from SRCBPESIZE/TRGBPESIZE.
%-langid:
	$(MAKE) WORKHOME=$(shell realpath $(PWD)/work-langid) \
		PRE=simple \
		PRE_SRC=spm$(SRCBPESIZE:000=)k \
		PRE_TRG=spm$(TRGBPESIZE:000=)k \
		$(patsubst %-langid,%,$@)
## sentence-piece models with langid-filtering, MODELTYPE=transformer
##
## For a target "<goal>-langid-noalign", re-invoke make on <goal> with
## MODELTYPE=transformer, PRE=simple and spm<N>k pre-processing labels
## (a trailing "000" is stripped from SRCBPESIZE/TRGBPESIZE).
## WORKHOME is work-langid-noalign, matching the target suffix; there must be
## exactly one ${MAKE} command here, since a second "${MAKE} WORKHOME=..."
## line joined by a backslash would be passed to the first as arguments.
%-langid-noalign:
	${MAKE} WORKHOME=${shell realpath ${PWD}/work-langid-noalign} \
		MODELTYPE=transformer \
		PRE=simple \
		PRE_SRC=spm${SRCBPESIZE:000=}k \
		PRE_TRG=spm${TRGBPESIZE:000=}k \
		${@:-langid-noalign=}

View File

@ -359,23 +359,26 @@ fiskmo-svfi-%:
## only OPUS data
# Value passed as BPESIZE to the sub-make by the celtic targets
# (presumably the subword vocabulary size -- confirm against the BPESIZE
# consumer). The commented-out line keeps an alternative setting.
# CELTIC_BPESIZE = 12000
CELTIC_BPESIZE = 4000
## Celtic languages (ga cy br gd kw gv) -> English.
## For a target "<goal>-celtic-english-opus", re-invoke make on <goal> with
## HELDOUTSIZE=0 and BPESIZE taken from CELTIC_BPESIZE.
## Only one sub-make is run: an additional invocation without BPESIZE would
## repeat the whole goal with the default BPESIZE first.
%-celtic-english-opus:
	${MAKE} HELDOUTSIZE=0 BPESIZE=${CELTIC_BPESIZE} \
		SRCLANGS="ga cy br gd kw gv" TRGLANGS=en \
		${@:-celtic-english-opus=}
## English -> Celtic languages (ga cy br gd kw gv).
## For a target "<goal>-english-celtic-opus", re-invoke make on <goal> with
## HELDOUTSIZE=0, BPESIZE taken from CELTIC_BPESIZE, and SRC=en / TRG=ga.
## Only one sub-make is run: an additional invocation without BPESIZE would
## repeat the whole goal with the default BPESIZE first.
%-english-celtic-opus:
	${MAKE} HELDOUTSIZE=0 BPESIZE=${CELTIC_BPESIZE} \
		TRGLANGS="ga cy br gd kw gv" SRCLANGS=en TRG=ga SRC=en \
		${@:-english-celtic-opus=}
# more data for cy-en
#
# Celtic languages (ga cy br gd kw gv) -> English with DATASET=opus+techiaith
# and the EXTRA_TRAINSET corpora listed below.
# For a target "<goal>-celtic-english", re-invoke make on <goal>; the cy-en
# dictionary file is a prerequisite, so it is brought up to date first.
# The recipe is a single ${MAKE} command: a second "${MAKE} ..." line joined
# by a backslash would be passed to the first as arguments.
%-celtic-english: ${DATADIR}/${PRE}/dic.cy-en.clean.cy.gz
	${MAKE} DATASET=opus+techiaith HELDOUTSIZE=0 BPESIZE=${CELTIC_BPESIZE} \
		EXTRA_TRAINSET="CofnodYCynulliad Deddfwriaeth Meddalwedd rhestr_geiriau dic" \
		SRCLANGS="ga cy br gd kw gv" TRGLANGS=en \
		${@:-celtic-english=}
# English -> Celtic languages (ga cy br gd kw gv) with DATASET=opus+techiaith
# and the EXTRA_TRAINSET corpora listed below; SRC=en / TRG=ga.
# For a target "<goal>-english-celtic", re-invoke make on <goal>; the cy-en
# dictionary file is a prerequisite, so it is brought up to date first.
# The recipe is a single ${MAKE} command: a second "${MAKE} ..." line joined
# by a backslash would be passed to the first as arguments.
%-english-celtic: ${DATADIR}/${PRE}/dic.cy-en.clean.cy.gz
	${MAKE} DATASET=opus+techiaith HELDOUTSIZE=0 BPESIZE=${CELTIC_BPESIZE} \
		EXTRA_TRAINSET="CofnodYCynulliad Deddfwriaeth Meddalwedd rhestr_geiriau dic" \
		TRGLANGS="ga cy br gd kw gv" SRCLANGS=en TRG=ga SRC=en \
		${@:-english-celtic=}

View File

@ -142,6 +142,35 @@ all-nordic-wikidocs:
done \
done
# Language codes iterated over by xnli-wikidocs:
#   ar: Arabic
#   bg: Bulgarian
#   de: German
#   el: Greek
#   en: English
#   es: Spanish
#   fr: French
#   hi: Hindi
#   ru: Russian
#   sw: Swahili
#   th: Thai
#   tr: Turkish
#   ur: Urdu
#   vi: Vietnamese
#   zh: Chinese (Simplified)
#
# For every language above and every entry of WIKISOURCES, run the
# extract-doc goal in a sub-make with SRC and WIKISOURCE set accordingly.
xnli-wikidocs:
	for lc in ar bg de el en es fr hi ru sw th tr ur vi zh; do \
	  for ws in $(WIKISOURCES); do \
	    $(MAKE) SRC=$$lc WIKISOURCE=$$ws extract-doc; \
	  done; \
	done
## en and es are too big to run through udpipe: run extract-doc for them
## (WIKISOURCE=wiki) with SENTSPLITTER set to the Moses split-sentences.perl
## script, called with "-l <language>".
big-wikidocs:
	for lc in en es; do \
	  $(MAKE) SRC=$$lc WIKISOURCE=wiki \
	    SENTSPLITTER="$(MOSESSCRIPTS)/ems/support/split-sentences.perl -l $$lc" \
	    extract-doc; \
	done

View File

@ -14,3 +14,19 @@
|-----------------------|-------|-------|
| Tatoeba.en.la | 49.9 | 0.691 |
# opus-2020-04-21.zip
* dataset: opus
* model: transformer
* pre-processing: normalization + SentencePiece
* a sentence-initial language token is required in the form of `>>id<<` (id = valid target language ID)
* download: [opus-2020-04-21.zip](https://object.pouta.csc.fi/OPUS-MT-models/en-fr+fr_BE+fr_CA+fr_FR+wa+frp+oc+ca+rm+lld+fur+lij+lmo+es+es_AR+es_CL+es_CO+es_CR+es_DO+es_EC+es_ES+es_GT+es_HN+es_MX+es_NI+es_PA+es_PE+es_PR+es_SV+es_UY+es_VE+pt+pt_br+pt_BR+pt_PT+gl+lad+an+mwl+it+it_IT+co+nap+scn+vec+sc+ro+la/opus-2020-04-21.zip)
* test set translations: [opus-2020-04-21.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-fr+fr_BE+fr_CA+fr_FR+wa+frp+oc+ca+rm+lld+fur+lij+lmo+es+es_AR+es_CL+es_CO+es_CR+es_DO+es_EC+es_ES+es_GT+es_HN+es_MX+es_NI+es_PA+es_PE+es_PR+es_SV+es_UY+es_VE+pt+pt_br+pt_BR+pt_PT+gl+lad+an+mwl+it+it_IT+co+nap+scn+vec+sc+ro+la/opus-2020-04-21.test.txt)
* test set scores: [opus-2020-04-21.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-fr+fr_BE+fr_CA+fr_FR+wa+frp+oc+ca+rm+lld+fur+lij+lmo+es+es_AR+es_CL+es_CO+es_CR+es_DO+es_EC+es_ES+es_GT+es_HN+es_MX+es_NI+es_PA+es_PE+es_PR+es_SV+es_UY+es_VE+pt+pt_br+pt_BR+pt_PT+gl+lad+an+mwl+it+it_IT+co+nap+scn+vec+sc+ro+la/opus-2020-04-21.eval.txt)
## Benchmarks
| testset | BLEU | chr-F |
|-----------------------|-------|-------|
| Tatoeba.en.la | 50.1 | 0.693 |

View File

@ -46,3 +46,19 @@
|-----------------------|-------|-------|
| Tatoeba.en.ga | 22.1 | 0.385 |
# opus+techiaith+bt-2020-04-24.zip
* dataset: opus+techiaith+bt
* model: transformer-align
* pre-processing: normalization + SentencePiece
* a sentence-initial language token is required in the form of `>>id<<` (id = valid target language ID)
* download: [opus+techiaith+bt-2020-04-24.zip](https://object.pouta.csc.fi/OPUS-MT-models/en-ga+cy+br+gd+kw+gv/opus+techiaith+bt-2020-04-24.zip)
* test set translations: [opus+techiaith+bt-2020-04-24.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-ga+cy+br+gd+kw+gv/opus+techiaith+bt-2020-04-24.test.txt)
* test set scores: [opus+techiaith+bt-2020-04-24.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-ga+cy+br+gd+kw+gv/opus+techiaith+bt-2020-04-24.eval.txt)
## Benchmarks
| testset | BLEU | chr-F |
|-----------------------|-------|-------|
| Tatoeba.en.ga | 22.8 | 0.404 |

View File

@ -28,3 +28,18 @@
|-----------------------|-------|-------|
| Tatoeba.en.ml | 17.0 | 0.507 |
# opus-2020-04-20.zip
* dataset: opus
* model: transformer-align
* pre-processing: normalization + SentencePiece
* download: [opus-2020-04-20.zip](https://object.pouta.csc.fi/OPUS-MT-models/en-ml/opus-2020-04-20.zip)
* test set translations: [opus-2020-04-20.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-ml/opus-2020-04-20.test.txt)
* test set scores: [opus-2020-04-20.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-ml/opus-2020-04-20.eval.txt)
## Benchmarks
| testset | BLEU | chr-F |
|-----------------------|-------|-------|
| Tatoeba.en.ml | 18.3 | 0.531 |

View File

@ -28,3 +28,18 @@
|-----------------------|-------|-------|
| Tatoeba.ga.en | 25.9 | 0.417 |
# opus+techiaith-2020-04-21.zip
* dataset: opus+techiaith
* model: transformer-align
* pre-processing: normalization + SentencePiece
* download: [opus+techiaith-2020-04-21.zip](https://object.pouta.csc.fi/OPUS-MT-models/ga+cy+br+gd+kw+gv-en/opus+techiaith-2020-04-21.zip)
* test set translations: [opus+techiaith-2020-04-21.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/ga+cy+br+gd+kw+gv-en/opus+techiaith-2020-04-21.test.txt)
* test set scores: [opus+techiaith-2020-04-21.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/ga+cy+br+gd+kw+gv-en/opus+techiaith-2020-04-21.eval.txt)
## Benchmarks
| testset | BLEU | chr-F |
|-----------------------|-------|-------|
| Tatoeba.ga.en | 26.8 | 0.413 |

View File

@ -58,3 +58,18 @@
|-----------------------|-------|-------|
| Tatoeba.ml.en | 40.5 | 0.576 |
# opus-2020-04-20.zip
* dataset: opus
* model: transformer-align
* pre-processing: normalization + SentencePiece
* download: [opus-2020-04-20.zip](https://object.pouta.csc.fi/OPUS-MT-models/ml-en/opus-2020-04-20.zip)
* test set translations: [opus-2020-04-20.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/ml-en/opus-2020-04-20.test.txt)
* test set scores: [opus-2020-04-20.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/ml-en/opus-2020-04-20.eval.txt)
## Benchmarks
| testset | BLEU | chr-F |
|-----------------------|-------|-------|
| Tatoeba.ml.en | 42.7 | 0.605 |

View File

@ -14,3 +14,19 @@
|-----------------------|-------|-------|
| giella.se.en | 56.0 | 0.768 |
# opus+giella-2020-04-21.zip
* dataset: opus+giella
* model: transformer-align
* pre-processing: normalization + SentencePiece
* a sentence-initial language token is required in the form of `>>id<<` (id = valid target language ID)
* download: [opus+giella-2020-04-21.zip](https://object.pouta.csc.fi/OPUS-MT-models/se+sma+smj+smn+sms+vep+et+fi+kv+krl+nb+no+nn+ru+sv+en-se+sma+smj+smn+sms+vep+et+fi+kv+krl+nb+no+nn+ru+sv+en/opus+giella-2020-04-21.zip)
* test set translations: [opus+giella-2020-04-21.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/se+sma+smj+smn+sms+vep+et+fi+kv+krl+nb+no+nn+ru+sv+en-se+sma+smj+smn+sms+vep+et+fi+kv+krl+nb+no+nn+ru+sv+en/opus+giella-2020-04-21.test.txt)
* test set scores: [opus+giella-2020-04-21.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/se+sma+smj+smn+sms+vep+et+fi+kv+krl+nb+no+nn+ru+sv+en-se+sma+smj+smn+sms+vep+et+fi+kv+krl+nb+no+nn+ru+sv+en/opus+giella-2020-04-21.eval.txt)
## Benchmarks
| testset | BLEU | chr-F |
|-----------------------|-------|-------|
| giella.se.en | 56.6 | 0.771 |