New Sami model

This commit is contained in:
Joerg Tiedemann 2020-04-19 19:48:01 +03:00
parent e5e58d1a37
commit ea2b283ad4
10 changed files with 133 additions and 3 deletions

View File

@ -193,13 +193,16 @@ as-en:
${MAKE} reverse-data-as-en
${MAKE} train-dynamic-en-as
# ENAS_BPE = 4000
ENAS_BPE = 1000
%-as-en:
${MAKE} HELDOUTSIZE=0 DEVSIZE=1000 TESTSIZE=1000 DEVMINSIZE=100 BPESIZE=4000 \
${MAKE} HELDOUTSIZE=0 DEVSIZE=1000 TESTSIZE=1000 DEVMINSIZE=100 BPESIZE=${ENAS_BPE} \
SRCLANGS="as" TRGLANGS="en" \
${@:-as-en=}
%-en-as:
${MAKE} HELDOUTSIZE=0 DEVSIZE=1000 TESTSIZE=1000 DEVMINSIZE=100 BPESIZE=4000 \
${MAKE} HELDOUTSIZE=0 DEVSIZE=1000 TESTSIZE=1000 DEVMINSIZE=100 BPESIZE=${ENAS_BPE} \
SRCLANGS="en" TRGLANGS="as" \
${@:-en-as=}

View File

@ -193,9 +193,12 @@ breton:
${MAKE} SRC=br TRG=en MODELHOME=../models/ga+cy+br+gd+kw+gv-en all-wikis
assamese:
assamese-english:
${MAKE} SRC=as TRG=en MODELHOME=${HOME}/research/Opus-MT-train/work/models/as-en all-wikis
english-assamese:
${MAKE} SRC=en TRG=as MODELHOME=${HOME}/research/Opus-MT-train/work/models/en-as translate.submit

View File

@ -0,0 +1,16 @@
# opus+bt-2020-04-17.zip
* dataset: opus+bt
* model: transformer
* pre-processing: normalization + SentencePiece
* a sentence-initial language token is required in the form of `>>id<<` (id = valid target language ID)
* download: [opus+bt-2020-04-17.zip](https://object.pouta.csc.fi/OPUS-MT-models/en-fr+fr_BE+fr_CA+fr_FR+wa+frp+oc+ca+rm+lld+fur+lij+lmo+es+es_AR+es_CL+es_CO+es_CR+es_DO+es_EC+es_ES+es_GT+es_HN+es_MX+es_NI+es_PA+es_PE+es_PR+es_SV+es_UY+es_VE+pt+pt_br+pt_BR+pt_PT+gl+lad+an+mwl+it+it_IT+co+nap+scn+vec+sc+ro+la/opus+bt-2020-04-17.zip)
* test set translations: [opus+bt-2020-04-17.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-fr+fr_BE+fr_CA+fr_FR+wa+frp+oc+ca+rm+lld+fur+lij+lmo+es+es_AR+es_CL+es_CO+es_CR+es_DO+es_EC+es_ES+es_GT+es_HN+es_MX+es_NI+es_PA+es_PE+es_PR+es_SV+es_UY+es_VE+pt+pt_br+pt_BR+pt_PT+gl+lad+an+mwl+it+it_IT+co+nap+scn+vec+sc+ro+la/opus+bt-2020-04-17.test.txt)
* test set scores: [opus+bt-2020-04-17.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-fr+fr_BE+fr_CA+fr_FR+wa+frp+oc+ca+rm+lld+fur+lij+lmo+es+es_AR+es_CL+es_CO+es_CR+es_DO+es_EC+es_ES+es_GT+es_HN+es_MX+es_NI+es_PA+es_PE+es_PR+es_SV+es_UY+es_VE+pt+pt_br+pt_BR+pt_PT+gl+lad+an+mwl+it+it_IT+co+nap+scn+vec+sc+ro+la/opus+bt-2020-04-17.eval.txt)
## Benchmarks
| testset | BLEU | chr-F |
|-----------------------|-------|-------|
| Tatoeba.en.la | 49.9 | 0.691 |

View File

@ -0,0 +1,16 @@
# opus+giella-2020-04-18.zip
* dataset: opus+giella
* model: transformer-align
* pre-processing: normalization + SentencePiece
* a sentence-initial language token is required in the form of `>>id<<` (id = valid target language ID)
* download: [opus+giella-2020-04-18.zip](https://object.pouta.csc.fi/OPUS-MT-models/fi+nb+no+nn+ru+sv+en-se+sma+smj+smn+sms/opus+giella-2020-04-18.zip)
* test set translations: [opus+giella-2020-04-18.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/fi+nb+no+nn+ru+sv+en-se+sma+smj+smn+sms/opus+giella-2020-04-18.test.txt)
* test set scores: [opus+giella-2020-04-18.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/fi+nb+no+nn+ru+sv+en-se+sma+smj+smn+sms/opus+giella-2020-04-18.eval.txt)
## Benchmarks
| testset | BLEU | chr-F |
|-----------------------|-------|-------|
| giella.fi.sms | 58.4 | 0.776 |

15
models/fi-es/README.md Normal file
View File

@ -0,0 +1,15 @@
# opus-2020-04-12.zip
* dataset: opus
* model: transformer-align
* pre-processing: normalization + SentencePiece
* download: [opus-2020-04-12.zip](https://object.pouta.csc.fi/OPUS-MT-models/fi-es/opus-2020-04-12.zip)
* test set translations: [opus-2020-04-12.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/fi-es/opus-2020-04-12.test.txt)
* test set scores: [opus-2020-04-12.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/fi-es/opus-2020-04-12.eval.txt)
## Benchmarks
| testset | BLEU | chr-F |
|-----------------------|-------|-------|
| Tatoeba.fi.es | 51.5 | 0.700 |

15
models/fi-vi/README.md Normal file
View File

@ -0,0 +1,15 @@
# opus-2020-04-12.zip
* dataset: opus
* model: transformer-align
* pre-processing: normalization + SentencePiece
* download: [opus-2020-04-12.zip](https://object.pouta.csc.fi/OPUS-MT-models/fi-vi/opus-2020-04-12.zip)
* test set translations: [opus-2020-04-12.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/fi-vi/opus-2020-04-12.test.txt)
* test set scores: [opus-2020-04-12.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/fi-vi/opus-2020-04-12.eval.txt)
## Benchmarks
| testset | BLEU | chr-F |
|-----------------------|-------|-------|
| JW300.fi.vi | 38.8 | 0.522 |

View File

@ -0,0 +1,16 @@
# opus+giella-2020-04-19.zip
* dataset: opus+giella
* model: transformer-align
* pre-processing: normalization + SentencePiece
* a sentence-initial language token is required in the form of `>>id<<` (id = valid target language ID)
* download: [opus+giella-2020-04-19.zip](https://object.pouta.csc.fi/OPUS-MT-models/se+sma+smj+smn+sms+vep+et+fi+kv+krl+nb+no+nn+ru+sv+en-se+sma+smj+smn+sms+vep+et+fi+kv+krl+nb+no+nn+ru+sv+en/opus+giella-2020-04-19.zip)
* test set translations: [opus+giella-2020-04-19.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/se+sma+smj+smn+sms+vep+et+fi+kv+krl+nb+no+nn+ru+sv+en-se+sma+smj+smn+sms+vep+et+fi+kv+krl+nb+no+nn+ru+sv+en/opus+giella-2020-04-19.test.txt)
* test set scores: [opus+giella-2020-04-19.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/se+sma+smj+smn+sms+vep+et+fi+kv+krl+nb+no+nn+ru+sv+en-se+sma+smj+smn+sms+vep+et+fi+kv+krl+nb+no+nn+ru+sv+en/opus+giella-2020-04-19.eval.txt)
## Benchmarks
| testset | BLEU | chr-F |
|-----------------------|-------|-------|
| giella.se.en | 56.0 | 0.768 |

View File

@ -0,0 +1,16 @@
# opus+giella-2020-04-18.zip
* dataset: opus+giella
* model: transformer-align
* pre-processing: normalization + SentencePiece
* a sentence-initial language token is required in the form of `>>id<<` (id = valid target language ID)
* download: [opus+giella-2020-04-18.zip](https://object.pouta.csc.fi/OPUS-MT-models/se+sma+smj+smn+sms-fi+nb+no+nn+ru+sv+en/opus+giella-2020-04-18.zip)
* test set translations: [opus+giella-2020-04-18.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/se+sma+smj+smn+sms-fi+nb+no+nn+ru+sv+en/opus+giella-2020-04-18.test.txt)
* test set scores: [opus+giella-2020-04-18.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/se+sma+smj+smn+sms-fi+nb+no+nn+ru+sv+en/opus+giella-2020-04-18.eval.txt)
## Benchmarks
| testset | BLEU | chr-F |
|-----------------------|-------|-------|
| giella.se.en | 59.9 | 0.789 |

15
models/tr-fi/README.md Normal file
View File

@ -0,0 +1,15 @@
# opus-2020-04-12.zip
* dataset: opus
* model: transformer-align
* pre-processing: normalization + SentencePiece
* download: [opus-2020-04-12.zip](https://object.pouta.csc.fi/OPUS-MT-models/tr-fi/opus-2020-04-12.zip)
* test set translations: [opus-2020-04-12.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/tr-fi/opus-2020-04-12.test.txt)
* test set scores: [opus-2020-04-12.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/tr-fi/opus-2020-04-12.eval.txt)
## Benchmarks
| testset | BLEU | chr-F |
|-----------------------|-------|-------|
| Tatoeba.tr.fi | 34.9 | 0.585 |

15
models/vi-fi/README.md Normal file
View File

@ -0,0 +1,15 @@
# opus-2020-04-12.zip
* dataset: opus
* model: transformer-align
* pre-processing: normalization + SentencePiece
* download: [opus-2020-04-12.zip](https://object.pouta.csc.fi/OPUS-MT-models/vi-fi/opus-2020-04-12.zip)
* test set translations: [opus-2020-04-12.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/vi-fi/opus-2020-04-12.test.txt)
* test set scores: [opus-2020-04-12.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/vi-fi/opus-2020-04-12.eval.txt)
## Benchmarks
| testset | BLEU | chr-F |
|-----------------------|-------|-------|
| JW300.vi.fi | 27.9 | 0.529 |