tokenizer

This commit is contained in:
Joerg Tiedemann 2022-05-16 20:44:19 +03:00
commit 84bb830773
10 changed files with 62 additions and 11 deletions

View File

@ -253,6 +253,9 @@ install install-prerequisites install-prereq install-requirements:
${LOAD_BUILD_ENV} && ${PIP} install --user -r requirements.txt
${LOAD_BUILD_ENV} && ${MAKE} install-perl-modules
${LOAD_BUILD_ENV} && ${MAKE} ${PREREQ_TOOLS}
if [ ! -e scores ]; then \
ln -s OPUS-MT-leaderboard/scores scores; \
fi
## install-all names a command, not a file to be built. The special target
## must be spelled with its leading dot: a bare `PHONY:` only declares an
## ordinary target called PHONY and leaves install-all non-phony.
.PHONY: install-all
install-all: install

View File

@ -153,6 +153,13 @@ elg-gmq2ukr-tiny11:
STUDENT_DATA=pft-pbt-bt SRCLANGS="dan isl nno nob nor swe" TRGLANGS=ukr \
LANGPAIRSTR="gmq-ukr" train-tiny11-student
## elg-gmq2ukr-small: re-invoke make on the train-small-student goal with
## source languages dan/nob/swe, target language ukr and the language-pair
## label gmq-ukr. MARIAN_EXTRA, MARIAN_EARLY_STOPPING and STUDENT_DATA are
## passed down as command-line overrides.
elg-gmq2ukr-small:
	${MAKE} SRCLANGS="dan nob swe" TRGLANGS=ukr LANGPAIRSTR="gmq-ukr" \
		STUDENT_DATA=ftbest-ftmono-nopar \
		MARIAN_EARLY_STOPPING=15 MARIAN_EXTRA=--no-restore-corpus \
		train-small-student
## tiny11 transformer model for finnish with pivot data (reuse student recipes)
@ -240,6 +247,32 @@ elg-ukr2deu-student3:
${MAKE} MODELZIP=https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-deu/opusTCv20210807_transformer-big_2022-03-14.zip \ SRCLANGS=ukr TRGLANGS=deu STUDENT_DATA=ftbest-bt-nopar train-tiny11-student
## elg-deu2ukr-student4: re-invoke make on the train-small-student goal
## for SRCLANGS=deu / TRGLANGS=ukr.
elg-deu2ukr-student4:
	${MAKE} SRCLANGS=deu TRGLANGS=ukr \
		STUDENT_DATA=ftbest-ftmono-nopar \
		MARIAN_EARLY_STOPPING=15 \
		train-small-student
## elg-ukr2deu-student4: re-invoke make on the train-small-student goal
## for SRCLANGS=ukr / TRGLANGS=deu.
elg-ukr2deu-student4:
	${MAKE} SRCLANGS=ukr TRGLANGS=deu \
		STUDENT_DATA=ftbest-ftmono-nopar \
		MARIAN_EARLY_STOPPING=15 \
		train-small-student
## elg-ukr2gmq-small: three consecutive sub-make runs of the
## train-small-student goal with SRCLANGS=ukr; only TRGLANGS changes
## between them (swe, then dan, then nob).
elg-ukr2gmq-small:
	${MAKE} SRCLANGS=ukr TRGLANGS=swe \
		STUDENT_DATA=ftbest-ftmono-nopar MARIAN_EARLY_STOPPING=15 \
		train-small-student
	${MAKE} SRCLANGS=ukr TRGLANGS=dan \
		STUDENT_DATA=ftbest-ftmono-nopar MARIAN_EARLY_STOPPING=15 \
		train-small-student
	${MAKE} SRCLANGS=ukr TRGLANGS=nob \
		STUDENT_DATA=ftbest-ftmono-nopar MARIAN_EARLY_STOPPING=15 \
		train-small-student
## elg-dan2ukr-small: re-invoke make on the train-small-student goal
## for SRCLANGS=dan / TRGLANGS=ukr.
elg-dan2ukr-small:
	${MAKE} SRCLANGS=dan TRGLANGS=ukr \
		STUDENT_DATA=ftbest-ftmono-nopar \
		MARIAN_EARLY_STOPPING=15 \
		train-small-student
## elg-swe2ukr-small: re-invoke make on the train-small-student goal
## for SRCLANGS=swe / TRGLANGS=ukr.
elg-swe2ukr-small:
	${MAKE} SRCLANGS=swe TRGLANGS=ukr \
		STUDENT_DATA=ftbest-ftmono-nopar \
		MARIAN_EARLY_STOPPING=15 \
		train-small-student
## elg-nob2ukr-small: re-invoke make on the train-small-student goal
## for SRCLANGS=nob / TRGLANGS=ukr.
elg-nob2ukr-small:
	${MAKE} SRCLANGS=nob TRGLANGS=ukr \
		STUDENT_DATA=ftbest-ftmono-nopar \
		MARIAN_EARLY_STOPPING=15 \
		train-small-student
## elg-fin2ukr-student2: re-invoke make on the train-tiny11-student goal
## for SRCLANGS=fin / TRGLANGS=ukr, with SUBWORD_VOCAB_SIZE=16000 and both
## CHECK_TRAINDATA_SIZE and CLEAN_CORPUS_TRAINING_DATA set to 1.
elg-fin2ukr-student2:
	${MAKE} SRCLANGS=fin TRGLANGS=ukr \
		STUDENT_DATA=ftbest-ftmono-nopar \
		SUBWORD_VOCAB_SIZE=16000 \
		CHECK_TRAINDATA_SIZE=1 CLEAN_CORPUS_TRAINING_DATA=1 \
		MARIAN_EARLY_STOPPING=15 \
		train-tiny11-student
@ -290,13 +323,21 @@ elg-ukr2eng-student:
## elg-eng2ukr-student2: two sub-make runs of the train-tiny11-student goal
## for SRCLANGS=eng / TRGLANGS=ukr. The runs differ only in STUDENT_DATA
## (ftbest-bt-nopar first, then ftbest-ftmono-nopar).
## NOTE(review): confirm both runs are intended and that the second line is
## not meant to replace the first.
elg-eng2ukr-student2:
${MAKE} MARIAN_EARLY_STOPPING=15 STUDENT_DATA=ftbest-bt-nopar SRCLANGS=eng TRGLANGS=ukr train-tiny11-student
${MAKE} MARIAN_EARLY_STOPPING=15 STUDENT_DATA=ftbest-ftmono-nopar SRCLANGS=eng TRGLANGS=ukr train-tiny11-student
## elg-ukr2eng-student2: re-invoke make on the train-tiny11-student goal
## for SRCLANGS=ukr / TRGLANGS=eng with CONTINUE_EXISTING=1.
elg-ukr2eng-student2:
	${MAKE} SRCLANGS=ukr TRGLANGS=eng \
		STUDENT_DATA=ftbest-ftmono-nopar \
		MARIAN_EARLY_STOPPING=15 \
		CONTINUE_EXISTING=1 \
		train-tiny11-student
## elg-eng2ukr-student3: re-invoke make on the train-small-student goal
## for SRCLANGS=eng / TRGLANGS=ukr.
elg-eng2ukr-student3:
	${MAKE} SRCLANGS=eng TRGLANGS=ukr \
		STUDENT_DATA=ftbest-ftmono-nopar \
		MARIAN_EARLY_STOPPING=15 \
		train-small-student
## elg-ukr2eng-student3: re-invoke make on the train-small-student goal
## for SRCLANGS=ukr / TRGLANGS=eng.
elg-ukr2eng-student3:
	${MAKE} SRCLANGS=ukr TRGLANGS=eng \
		STUDENT_DATA=ftbest-ftmono-nopar \
		MARIAN_EARLY_STOPPING=15 \
		train-small-student
## missing evaluations and dist packages
## TODO: should probably also restart them!
## (also zls-zle and zle-zls)
@ -358,6 +399,8 @@ elg-new-bigmodels:
${MAKE} MODELTYPE=transformer-big tatoeba-vie2deu-trainjob
## elg-zho: re-invoke make on the tatoeba-zho2eng-trainjob goal with
## MODELTYPE=transformer-big.
elg-zho:
	${MAKE} \
		MODELTYPE=transformer-big \
		tatoeba-zho2eng-trainjob
elg-continue-missing:
@ -464,6 +507,12 @@ elg-zle2eng-xb:
${MAKE} MARIAN_EARLY_STOPPING=25 MARIAN_EXTRA=--no-restore-corpus MODELTYPE=transformer-big CONTINUE_EXISTING=1 tatoeba-zle2eng-trainjob-bt-xb
## elg-fin2zle-xb: two sub-make runs with MODELTYPE=transformer-big,
## CONTINUE_EXISTING=1 and MARIAN_EXTRA=--no-restore-corpus: first the
## fin2zle goal, then the reverse-direction zle2fin goal.
elg-fin2zle-xb:
	${MAKE} MODELTYPE=transformer-big CONTINUE_EXISTING=1 \
		MARIAN_EXTRA=--no-restore-corpus \
		tatoeba-fin2zle-trainjob-pbt-pft-bt-xb
	${MAKE} MODELTYPE=transformer-big CONTINUE_EXISTING=1 \
		MARIAN_EXTRA=--no-restore-corpus \
		tatoeba-zle2fin-trainjob-pbt-pft-bt-xb
elg-pivot-eval:
for l in dan swe fin deu ron tur; do \
if [ -e work/$$l-ukr ]; then \
@ -906,7 +955,8 @@ ukr-model-table2:
## SCORE_BASE_URL: only the OPUS-MT-leaderboard location is effective.
## The previous values are kept commented out for reference. An active
## assignment that is overridden on the very next line is dead code.
# SCORE_BASE_URL = https://github.com/Helsinki-NLP/OPUS-MT-train/blob/master
# SCORE_BASE_URL = https://github.com/Helsinki-NLP/OPUS-MT-train/blob/puhti
SCORE_BASE_URL = https://github.com/Helsinki-NLP/OPUS-MT-leaderboard/blob/master
print-best-eng:

View File

@ -150,8 +150,8 @@ translate: ${BITEXT_LATEST_README} ${BITEXT_LATEST_TRG}
${MAKE} ${BITEXT_LATEST_SRC}
## translate all parts
## translate-all and translate-all-parts are two names for the same goal:
## bring every file in ${ALL_BITEXT_LATEST_TRG} up to date, then re-invoke
## make on source-all-parts. Both names are declared phony once; a separate
## recipe-less rule for translate-all-parts would only repeat this one.
.PHONY: translate-all-parts translate-all
translate-all translate-all-parts: ${ALL_BITEXT_LATEST_TRG}
	${MAKE} source-all-parts
.PHONY: source-all-parts

View File

@ -1 +0,0 @@
../flores101/nor.devtest.gz

View File

@ -1 +0,0 @@
../flores101/nor.devtest.gz

@ -1 +1 @@
Subproject commit 303f8f4f44fb1681b9edc7527d1715ac12b71a68
Subproject commit aa89dfa68089d46a06888d19e3d03ba11786210d

@ -1 +1 @@
Subproject commit 6575f72aac445e42ff490852161e066588208bc3
Subproject commit 199201eb89b2941afdadb14164e936d412f897ad

@ -1 +1 @@
Subproject commit a9f97e9e61a910a374a5d768244e8ad63f407d3e
Subproject commit f9afa950e26f5d548d955f92e83e6b8e10cc8438

@ -1 +1 @@
Subproject commit 601c9ac9807b5ffcbed298952435d9a17d954575
Subproject commit 23c36ec1a3c71cc75bc49fd3e39a4b1d8636589d

@ -1 +1 @@
Subproject commit 7bae758b2eac35168790ad9b617b668d541b44f2
Subproject commit 2598310dbedabebb582336d06dd91a5f60f33daa