diff --git a/backtranslate/Makefile b/backtranslate/Makefile index 37140b8f..89ace144 100644 --- a/backtranslate/Makefile +++ b/backtranslate/Makefile @@ -149,13 +149,13 @@ store-wiki: fetch-wiki fetch: mkdir -p wiki - wget -O wiki/${SRC}.tar https://object.pouta.csc.fi/OPUS-MT-bt-wiki/${SRC}.tar + ${WGET} -O wiki/${SRC}.tar https://object.pouta.csc.fi/OPUS-MT-bt-wiki/${SRC}.tar tar -C wiki -xf wiki/${SRC}.tar rm -f wiki/${SRC}.tar fetch-wikidoc: mkdir -p wikidoc - wget -O wikidoc/${SRC}.tar https://object.pouta.csc.fi/OPUS-MT-bt-wikidoc/${SRC}.tar + ${WGET} -O wikidoc/${SRC}.tar https://object.pouta.csc.fi/OPUS-MT-bt-wikidoc/${SRC}.tar tar -C wikidoc -xf wikidoc/${SRC}.tar rm -f wikidoc/${SRC}.tar @@ -463,7 +463,7 @@ giellatekno/${SRC}/corp.${SRC}.aa.gz: gzip -f giellatekno/${SRC}/corp.${SRC}.* victorio.uit.no/biggies/trunk/langs/${SRC}: - wget -r -np https://victorio.uit.no/biggies/trunk/langs/${SRC}/corp + ${WGET} -r -np https://victorio.uit.no/biggies/trunk/langs/${SRC}/corp giellatekno/se: giellatekno/sme -cd giellatekno && ln -s sme se @@ -689,11 +689,11 @@ endif ## index of all downloadable files index.html: - wget -nv -O $@ https://dumps.wikimedia.org/other/cirrussearch/current + ${WGET} -nv -O $@ https://dumps.wikimedia.org/other/cirrussearch/current ## wiki in json format ${WIKI_JSON}: - wget -nv https://dumps.wikimedia.org/other/cirrussearch/current/${WIKI_JSON} + ${WGET} -nv https://dumps.wikimedia.org/other/cirrussearch/current/${WIKI_JSON} diff --git a/lib/allas.mk b/lib/allas.mk index ccf864d1..a3071573 100644 --- a/lib/allas.mk +++ b/lib/allas.mk @@ -76,7 +76,7 @@ fetch-data: %.fetched: if [ "$(firstword $(subst -, ,$(subst /, ,$@)))" == "work" ]; then \ cd $(dir $@); \ - wget ${ALLAS_STORAGE_URL}OPUS-MT-train_$(subst /,-,$(dir $@))${WHOAMI}/$(notdir $(@:.fetched=.tar)); \ + ${WGET} ${ALLAS_STORAGE_URL}OPUS-MT-train_$(subst /,-,$(dir $@))${WHOAMI}/$(notdir $(@:.fetched=.tar)); \ tar -xf $(notdir $(@:.fetched=.tar)); \ rm -f $(notdir 
$(@:.fetched=.tar)); \ touch $(notdir $@); \ @@ -93,7 +93,7 @@ fetch-data: work-%/${LANGPAIRSTR}: mkdir -p $(dir $@) cd $(dir $@) && \ - wget ${ALLAS_STORAGE_URL}OPUS-MT-train_$(subst /,-,$(dir $@))${WHOAMI}/${LANGPAIRSTR}.tar + ${WGET} ${ALLAS_STORAGE_URL}OPUS-MT-train_$(subst /,-,$(dir $@))${WHOAMI}/${LANGPAIRSTR}.tar tar -C $(dir $@) -xf $(dir $@)${LANGPAIRSTR}.tar rm -f $(dir $@)${LANGPAIRSTR}.tar touch $@.fetched diff --git a/lib/config.mk b/lib/config.mk index 9b6aa8b6..48db8578 100644 --- a/lib/config.mk +++ b/lib/config.mk @@ -73,6 +73,25 @@ MODELTYPES = transformer \ + +## clean-corpus script parameters +## (for filtering subword-segmented bitexts) +## +## (TODO: should MIN_NTOKENS be 1?) +# MIN_NR_TOKENS = 0 +# MAX_NR_TOKENS = 250 +MIN_NR_TOKENS = 1 +MAX_NR_TOKENS = 500 +NR_TOKEN_RATIO = 2 +MAX_TOKEN_LENGTH = 100 + +## default values in the original script: +## +# MAX_TOKEN_LENGTH = 1000 +# NR_TOKEN_RATIO = 9 + + + ## name of the model-specific configuration file ## NEW: make it more model specific # @@ -234,7 +253,7 @@ OPUSREAD_ARGS = ## get available data from the OPUS-API OPUSAPI = http://opus.nlpl.eu/opusapi/ -OPUSAPI_WGET = wget -qq --no-check-certificate -O - ${OPUSAPI}? +OPUSAPI_WGET = ${WGET} -qq --no-check-certificate -O - ${OPUSAPI}? get-opus-mono = ${shell ${OPUSAPI_WGET}source=${1}\&corpora=True | ${JQ} '.corpora[]' | tr '"' ' '} get-opus-bitexts = ${shell ${OPUSAPI_WGET}source=${1}\&target=${2}\&corpora=True | ${JQ} '.corpora[]' | tr '"' ' '} @@ -816,7 +835,7 @@ endif ## TODO: do we still need this? ## --> see OPUSLANGS which is directly taken from the API opus-langs.txt: - wget -O $@.tmp ${OPUSAPI}?languages=true + ${WGET} -O $@.tmp ${OPUSAPI}?languages=true grep '",' $@.tmp | tr '",' ' ' | sort | tr "\n" ' ' | sed 's/ */ /g' > $@ rm -f $@.tmp @@ -824,7 +843,7 @@ opus-langs.txt: ## TODO: do we need this file? 
opus-langpairs.txt: for l in ${OPUS_LANGS}; do \ - wget -O $@.tmp ${OPUSAPI}?source=$$l\&languages=true; \ + ${WGET} -O $@.tmp ${OPUSAPI}?source=$$l\&languages=true; \ grep '",' $@.tmp | tr '",' ' ' | sort | tr "\n" ' ' | sed 's/ */ /g' > $@.tmp2; \ for t in `cat $@.tmp2`; do \ if [ $$t \< $$l ]; then \ diff --git a/lib/config/tatoeba.mk b/lib/config/tatoeba.mk index 4203daf1..177e467b 100644 --- a/lib/config/tatoeba.mk +++ b/lib/config/tatoeba.mk @@ -97,6 +97,8 @@ TATOEBA_LANGIDS_TRAINONLY = tatoeba/langids-train-only-${TATOEBA_VERSION}.txt # TATOEBA_RAWGIT := https://raw.githubusercontent.com/Helsinki-NLP/Tatoeba-Challenge/master TATOEBA_RAWGIT_MASTER := https://raw.githubusercontent.com/Helsinki-NLP/Tatoeba-Challenge/master TATOEBA_RAWGIT_RELEASE := https://raw.githubusercontent.com/Helsinki-NLP/Tatoeba-Challenge/${TATOEBA_VERSION} +# TATOEBA_RAWGIT_MASTER := https://github.com/Helsinki-NLP/Tatoeba-Challenge/raw/master +# TATOEBA_RAWGIT_RELEASE := https://github.com/Helsinki-NLP/Tatoeba-Challenge/raw/${TATOEBA_VERSION} ## data count files (file basename) @@ -107,13 +109,13 @@ RELEASED_TATOEBA_DATA_FILE = tatoeba/released-bitexts-${TATOEBA_VERSION}.txt ## all released language pairs with test sets > 200 test pairs ## also extract all source languages that are available for a give target language ## and vice versa -TATOEBA_RELEASED_DATA := $(shell wget -qq -O - ${TATOEBA_DATA_COUNT_BASE}-min200.txt | cut -f1) +TATOEBA_RELEASED_DATA := $(shell ${WGET} -qq -O - ${TATOEBA_DATA_COUNT_BASE}-min200.txt | cut -f1) TATOEBA_AVAILABLE_TRG = ${sort ${filter-out ${SRC},${subst -, ,${filter %-${SRC} ${SRC}-%,${TATOEBA_RELEASED_DATA}}}}} TATOEBA_AVAILABLE_SRC = ${sort ${filter-out ${TRG},${subst -, ,${filter %-${TRG} ${TRG}-%,${TATOEBA_RELEASED_DATA}}}}} ## extract language pairs for a specific subset TATOEBA_SUBSET := lower -TATOEBA_RELEASED_SUBSET := $(shell wget -qq -O - ${TATOEBA_DATA_COUNT_BASE}-${TATOEBA_SUBSET}.txt | cut -f1) +TATOEBA_RELEASED_SUBSET := $(shell ${WGET} 
-qq -O - ${TATOEBA_DATA_COUNT_BASE}-${TATOEBA_SUBSET}.txt | cut -f1) TATOEBA_AVAILABLE_SUBSET_TRG = ${sort ${filter-out ${SRC},${subst -, ,${filter %-${SRC} ${SRC}-%,${TATOEBA_RELEASED_SUBSET}}}}} TATOEBA_AVAILABLE_SUBSET_SRC = ${sort ${filter-out ${TRG},${subst -, ,${filter %-${TRG} ${TRG}-%,${TATOEBA_RELEASED_SUBSET}}}}} @@ -123,7 +125,7 @@ TATOEBA_AVAILABLE_SUBSET_SRC = ${sort ${filter-out ${TRG},${subst -, ,${filter ## all available language pairs ## (download the file once and keep it here to get the language pairs in the release) TATOEBA_LANGPAIRS := ${shell if [ ! -e ${RELEASED_TATOEBA_DATA_FILE} ]; then \ - wget -q -O ${RELEASED_TATOEBA_DATA_FILE} ${RELEASED_TATOEBA_DATA_URL}; \ + ${WGET} -q -O ${RELEASED_TATOEBA_DATA_FILE} ${RELEASED_TATOEBA_DATA_URL}; \ fi; \ tail -n +2 ${RELEASED_TATOEBA_DATA_FILE} | cut -f1 } diff --git a/lib/data.mk b/lib/data.mk index 9842f1bb..1bcb650b 100644 --- a/lib/data.mk +++ b/lib/data.mk @@ -407,7 +407,7 @@ $(LOCAL_TRAIN_SRC).algtmp.d/%.alg: $(LOCAL_TRAIN_SRC).algtmp.d/% $(LOCAL_TRAIN_T echo "============================================"; \ echo "fetch moses data from $$l"; \ echo "============================================"; \ - wget -qq -O $@-$$c-${LANGPAIR}.zip $$l; \ + ${WGET} -qq -O $@-$$c-${LANGPAIR}.zip $$l; \ unzip -d ${dir $@} -n $@-$$c-${LANGPAIR}.zip; \ mv ${dir $@}$$c*.${LANGPAIR}.${SRCEXT} $@; \ mv ${dir $@}$$c*.${LANGPAIR}.${TRGEXT} ${@:.${SRCEXT}.raw=.${TRGEXT}.raw}; \ @@ -486,6 +486,27 @@ ifeq (${USE_REST_DEVDATA},1) ${GZIP} -cd < ${DEV_TRG}.notused.gz >> ${LOCAL_TRAIN_TRG}; \ fi endif +###################################### +# run another round of cleaning if +# CLEAN_CORPUS_TRAINING_DATA is set +# --> could be useful if there is +# noisy data in back-translations etc +###################################### +ifeq (${CLEAN_CORPUS_TRAINING_DATA},1) + @echo ".... 
another cleanup of local training data" + @ln -s ${LOCAL_TRAIN_SRC} ${LOCAL_TRAIN_SRC}.${SRCEXT} + @ln -s ${LOCAL_TRAIN_TRG} ${LOCAL_TRAIN_SRC}.${TRGEXT} + @$(MOSESSCRIPTS)/training/clean-corpus-n.perl \ + -ratio ${NR_TOKEN_RATIO} \ + -max-word-length ${MAX_TOKEN_LENGTH} \ + ${LOCAL_TRAIN_SRC} $(SRCEXT) $(TRGEXT) \ + ${LOCAL_TRAIN_SRC}.clean \ + ${MIN_NR_TOKENS} ${MAX_NR_TOKENS} + @mv -f ${LOCAL_TRAIN_SRC}.clean.${SRCEXT} ${LOCAL_TRAIN_SRC} + @mv -f ${LOCAL_TRAIN_SRC}.clean.${TRGEXT} ${LOCAL_TRAIN_TRG} + @rm -f ${LOCAL_TRAIN_SRC}.${SRCEXT} + @rm -f ${LOCAL_TRAIN_SRC}.${TRGEXT} +endif ifeq (${SHUFFLE_TRAINING_DATA},1) @echo ".... shuffle complete training data" @paste ${LOCAL_TRAIN_SRC} ${LOCAL_TRAIN_TRG} | ${SHUFFLE} > ${LOCAL_TRAIN_SRC}.shuffled @@ -495,6 +516,7 @@ ifeq (${SHUFFLE_TRAINING_DATA},1) endif + ## everything is done in the target above ${LOCAL_TRAIN_TRG}: ${LOCAL_TRAIN_SRC} @echo "done!" diff --git a/lib/dist.mk b/lib/dist.mk index eaa372f2..37edaf48 100644 --- a/lib/dist.mk +++ b/lib/dist.mk @@ -44,7 +44,7 @@ MODEL_YML = ${patsubst %.npz,%.yml,${MODEL_FINAL}} -get-model-release = ${shell wget -qq -O - ${MODELINDEX} | grep '^${1}/.*-.*\.zip' | LANG=en_US.UTF-8 sort -r} +get-model-release = ${shell ${WGET} -qq -O - ${MODELINDEX} | grep '^${1}/.*-.*\.zip' | LANG=en_US.UTF-8 sort -r} get-model-distro = ${shell echo ${wildcard ${1}/${2}/*.zip} | tr ' ' "\n" | LANG=en_US.UTF-8 sort -r} @@ -627,9 +627,9 @@ upload-models: fetch-model: mkdir -p ${RELEASEDIR}/${LANGPAIRSTR} cd ${RELEASEDIR}/${LANGPAIRSTR} && \ - wget ${OBJECTSTORAGE}/${MODEL_CONTAINER}/${firstword ${call get-model-release,${LANGPAIRSTR}}} + ${WGET} ${OBJECTSTORAGE}/${MODEL_CONTAINER}/${firstword ${call get-model-release,${LANGPAIRSTR}}} -# wget -O ${RELEASEDIR}/${LANGPAIRSTR}/${LANGPAIRSTR}.zip \ +# ${WGET} -O ${RELEASEDIR}/${LANGPAIRSTR}/${LANGPAIRSTR}.zip \ # ${OBJECTSTORAGE}/${MODEL_CONTAINER}/${firstword ${call get-model-dist,${LANGPAIRSTR}}} # cd
${RELEASEDIR}/${LANGPAIRSTR} && unzip -n ${LANGPAIRSTR}.zip # rm -f ${RELEASEDIR}/${LANGPAIRSTR}/${LANGPAIRSTR}.zip diff --git a/lib/env.mk b/lib/env.mk index ec22f061..4435ee02 100644 --- a/lib/env.mk +++ b/lib/env.mk @@ -170,7 +170,7 @@ GZIP := ${shell which ${PIGZ} 2>/dev/null || echo gzip} GZCAT := ${GZIP} -cd ZCAT := gzip -cd UNIQ := ${SORT} -u - +WGET := wget -T 1 ## check that we have a GPU available diff --git a/lib/env/mahti.mk b/lib/env/mahti.mk index fcdc72d6..98ecb8d2 100644 --- a/lib/env/mahti.mk +++ b/lib/env/mahti.mk @@ -8,8 +8,8 @@ DATA_PREPARE_HPCPARAMS = CPUJOB_HPC_CORES=2 CPUJOB_HPC_MEM=16g DATA_ALIGN_HPCPARAMS = CPUJOB_HPC_CORES=128 CPUJOB_HPC_JOBS=20 CPUJOB_HPC_MEM=128g -# CSCPROJECT = project_2002688 -CSCPROJECT = project_2005625 +CSCPROJECT = project_2002688 +# CSCPROJECT = project_2005625 WORKHOME = ${shell realpath ${PWD}/work} OPUSHOME = /projappl/nlpl/data/OPUS HPC_QUEUE = medium diff --git a/lib/preprocess.mk b/lib/preprocess.mk index 8e7d0525..a744e755 100644 --- a/lib/preprocess.mk +++ b/lib/preprocess.mk @@ -1,21 +1,23 @@ # -*-makefile-*- -## clean-corpus script parameters -## (for filtering subword-segmented bitexts) -## -## (TODO: should MIN_NTOKENS be 1?) -# MIN_NR_TOKENS = 0 -# MAX_NR_TOKENS = 250 -MIN_NR_TOKENS = 1 -MAX_NR_TOKENS = 500 -NR_TOKEN_RATIO = 2 -MAX_TOKEN_LENGTH = 100 +## moved to config.mk +## +# ## clean-corpus script parameters +# ## (for filtering subword-segmented bitexts) +# ## +# ## (TODO: should MIN_NTOKENS be 1?) 
+# # MIN_NR_TOKENS = 0 +# # MAX_NR_TOKENS = 250 +# MIN_NR_TOKENS = 1 +# MAX_NR_TOKENS = 500 +# NR_TOKEN_RATIO = 2 +# MAX_TOKEN_LENGTH = 100 -## default values in the original script: -## -# MAX_TOKEN_LENGTH = 1000 -# NR_TOKEN_RATIO = 9 +# ## default values in the original script: +# ## +# # MAX_TOKEN_LENGTH = 1000 +# # NR_TOKEN_RATIO = 9 ## compute some ratios and thresholds that could be useful for filtering training data diff --git a/lib/projects/celtic.mk b/lib/projects/celtic.mk index b1afd6d3..ae433ee6 100644 --- a/lib/projects/celtic.mk +++ b/lib/projects/celtic.mk @@ -62,7 +62,7 @@ welsh-data: ${DATADIR}/${PRE}/dic.cy-en.clean.cy.gz ${DATADIR}/${PRE}/dic.cy-en.clean.cy.gz: for c in CofnodYCynulliad Deddfwriaeth Meddalwedd; do \ - wget http://techiaith.cymru/corpws/Moses/$$c/$$c.tar.gz; \ + ${WGET} http://techiaith.cymru/corpws/Moses/$$c/$$c.tar.gz; \ tar -xzf $$c.tar.gz; \ $(TOKENIZER)/detokenizer.perl -l cy < $$c.cy |\ $(MOSESSCRIPTS)/recaser/detruecase.perl | gzip -c > ${DATADIR}/${PRE}/$$c.cy-en.clean.cy.gz; \ @@ -70,11 +70,11 @@ ${DATADIR}/${PRE}/dic.cy-en.clean.cy.gz: $(MOSESSCRIPTS)/recaser/detruecase.perl | gzip -c > ${DATADIR}/${PRE}/$$c.cy-en.clean.en.gz; \ rm -f $$c.tar.gz; \ done - wget http://techiaith.cymru/alinio/rhestr_geiriau.tsv + ${WGET} http://techiaith.cymru/alinio/rhestr_geiriau.tsv tail -n +16 rhestr_geiriau.tsv | cut -f1 | gzip -c > ${DATADIR}/${PRE}/rhestr_geiriau.cy-en.clean.en.gz tail -n +16 rhestr_geiriau.tsv | cut -f2 | gzip -c > ${DATADIR}/${PRE}/rhestr_geiriau.cy-en.clean.cy.gz rm -f rhestr_geiriau.tsv - wget http://techiaith.cymru/alinio/hunalign/cy-en.dic + ${WGET} http://techiaith.cymru/alinio/hunalign/cy-en.dic cut -f1 -d '@' < cy-en.dic | sed 's/ $$*//' | gzip -c > ${DATADIR}/${PRE}/dic.cy-en.clean.en.gz cut -f2 -d '@' < cy-en.dic | sed 's/^ *//' | gzip -c > ${DATADIR}/${PRE}/dic.cy-en.clean.cy.gz @@ -84,7 +84,7 @@ CYMRU_BITEXTS = ${DATADIR}/${PRE}/CofnodYCynulliad.cy-en.clean.cy.gz \ 
${DATADIR}/${PRE}/Meddalwedd.cy-en.clean.cy.gz ${CYMRU_BITEXTS}: ${DATADIR}/${PRE}/%.cy-en.clean.cy.gz: - wget http://techiaith.cymru/corpws/Moses/$(patsubst %.cy-en.clean.cy.gz,%.tar.gz,${notdir $@}) + ${WGET} http://techiaith.cymru/corpws/Moses/$(patsubst %.cy-en.clean.cy.gz,%.tar.gz,${notdir $@}) tar -xzf $(patsubst %.cy-en.clean.cy.gz,%.tar.gz,${notdir $@}) $(TOKENIZER)/detokenizer.perl -l cy < $(patsubst %.cy-en.clean.cy.gz,%.cy,${notdir $@}) |\ $(MOSESSCRIPTS)/recaser/detruecase.perl | gzip -c > $@ diff --git a/lib/projects/doclevel.mk b/lib/projects/doclevel.mk index 9c48bedb..3db83b29 100644 --- a/lib/projects/doclevel.mk +++ b/lib/projects/doclevel.mk @@ -89,7 +89,7 @@ ost-datasets: ${DATADIR}/${PRE}/ost-train.de-en.clean.de.gz \ ## download the doc-level data set ${WORKHOME}/doclevel-MT-benchmark: - wget -O $@.zip DOCLEVEL_BENCHMARK_DATA?download=1 + ${WGET} -O $@.zip ${DOCLEVEL_BENCHMARK_DATA}?download=1 unzip -d ${dir $@} $@.zip rm -f $@.zip diff --git a/lib/projects/elg.mk b/lib/projects/elg.mk index eb84b3e5..4a51b13d 100644 --- a/lib/projects/elg.mk +++ b/lib/projects/elg.mk @@ -73,14 +73,50 @@ elg-ukr-students: done +elg-test-tiny: + ${MAKE} EMAIL= STUDENT_DATA=pft-pbt SRCLANGS=fin TRGLANGS=ukr test-tiny11-student + ${MAKE} EMAIL= STUDENT_DATA=pft-pbt SRCLANGS=ukr TRGLANGS=fin test-tiny11-student + ${MAKE} EMAIL= STUDENT_DATA=pft-pbt-bt SRCLANGS=hun TRGLANGS=ukr test-tiny11-student + ${MAKE} EMAIL= STUDENT_DATA=pft-pbt-bt SRCLANGS=ukr TRGLANGS=hun test-tiny11-student + ${MAKE} EMAIL= STUDENT_DATA=pft-pbt-bt SRCLANGS=ron TRGLANGS=ukr test-tiny11-student + ${MAKE} EMAIL= STUDENT_DATA=pft-pbt-bt SRCLANGS=ukr TRGLANGS=ron test-tiny11-student + ${MAKE} EMAIL= STUDENT_DATA=pft-pbt-bt SRCLANGS=swe TRGLANGS=ukr test-tiny11-student + ${MAKE} EMAIL= STUDENT_DATA=pft-pbt-bt SRCLANGS=ukr TRGLANGS=swe test-tiny11-student + ${MAKE} EMAIL= STUDENT_DATA=pft-pbt-bt SRCLANGS=pol TRGLANGS=ukr test-tiny11-student + ${MAKE} EMAIL= STUDENT_DATA=pft-pbt-bt SRCLANGS=ukr
TRGLANGS=pol test-tiny11-student + ${MAKE} EMAIL= STUDENT_DATA=pft-pbt-bt SRCLANGS=lit TRGLANGS=ukr test-tiny11-student + ${MAKE} EMAIL= STUDENT_DATA=pft-pbt-bt SRCLANGS=ukr TRGLANGS=lit test-tiny11-student + +elg-dist-tiny: + ${MAKE} STUDENT_DATA=pft-pbt SRCLANGS=fin TRGLANGS=ukr release-tiny11-student + ${MAKE} STUDENT_DATA=pft-pbt SRCLANGS=ukr TRGLANGS=fin release-tiny11-student + ${MAKE} STUDENT_DATA=pft-pbt-bt SRCLANGS=hun TRGLANGS=ukr release-tiny11-student + ${MAKE} STUDENT_DATA=pft-pbt-bt SRCLANGS=ukr TRGLANGS=hun release-tiny11-student + ${MAKE} STUDENT_DATA=pft-pbt-bt SRCLANGS=ron TRGLANGS=ukr release-tiny11-student + ${MAKE} STUDENT_DATA=pft-pbt-bt SRCLANGS=ukr TRGLANGS=ron release-tiny11-student + ${MAKE} STUDENT_DATA=pft-pbt-bt SRCLANGS=swe TRGLANGS=ukr release-tiny11-student + ${MAKE} STUDENT_DATA=pft-pbt-bt SRCLANGS=ukr TRGLANGS=swe release-tiny11-student + +# ${MAKE} STUDENT_DATA=pft-pbt-bt SRCLANGS=pol TRGLANGS=ukr release-tiny11-student +# ${MAKE} STUDENT_DATA=pft-pbt-bt SRCLANGS=ukr TRGLANGS=pol release-tiny11-student +# ${MAKE} STUDENT_DATA=pft-pbt-bt SRCLANGS=lit TRGLANGS=ukr release-tiny11-student +# ${MAKE} STUDENT_DATA=pft-pbt-bt SRCLANGS=ukr TRGLANGS=lit release-tiny11-student + + ## tiny11 transformer model for finnish with pivot data (reuse student recipes) elg-fin2ukr-tiny11: - ${MAKE} STUDENT_DATA=pft-pbt SRCLANGS=fin TRGLANGS=ukr MARIAN_EXTRA=--no-restore-corpus train-tiny11-student + ${MAKE} STUDENT_DATA=pft-pbt-bt SRCLANGS=fin TRGLANGS=ukr train-tiny11-student elg-ukr2fin-tiny11: - ${MAKE} STUDENT_DATA=pft-pbt SRCLANGS=ukr TRGLANGS=fin train-tiny11-student + ${MAKE} STUDENT_DATA=pft-pbt-bt SRCLANGS=ukr TRGLANGS=fin train-tiny11-student + + + +elg-gmq2ukr-tiny11: + ${MAKE} STUDENT_DATA=pft-pbt-bt SRCLANGS="dan isl nno nob nor swe" TRGLANGS=ukr LANGPAIRSTR="gmq-ukr" train-tiny11-student + ## tiny11 transformer model for finnish with pivot data (reuse student recipes) @@ -104,6 +140,31 @@ elg-ukr2swe-tiny11: ${MAKE} 
STUDENT_DATA=pft-pbt-bt SRCLANGS=ukr TRGLANGS=swe train-tiny11-student +elg-pol2ukr-tiny11: + ${MAKE} MARIAN_EARLY_STOPPING=20 CHECK_TRAINDATA_SIZE=1 CLEAN_CORPUS_TRAINING_DATA=1 STUDENT_DATA=pft-pbt-bt SRCLANGS=pol TRGLANGS=ukr train-tiny11-student + +elg-ukr2pol-tiny11: + ${MAKE} CHECK_TRAINDATA_SIZE=1 CLEAN_CORPUS_TRAINING_DATA=1 STUDENT_DATA=pft-pbt-bt SRCLANGS=ukr TRGLANGS=pol train-tiny11-student + + +elg-lit2ukr-tiny11: + ${MAKE} STUDENT_DATA=pft-pbt-bt SRCLANGS=lit TRGLANGS=ukr train-tiny11-student + +elg-ukr2lit-tiny11: + ${MAKE} STUDENT_DATA=pft-pbt-bt SRCLANGS=ukr TRGLANGS=lit train-tiny11-student + + + +elg-deu2ukr-tiny11: + ${MAKE} MARIAN_EXTRA=--no-restore-corpus STUDENT_DATA=pft-pbt-bt SRCLANGS=deu TRGLANGS=ukr train-tiny11-student + +elg-ukr2deu-tiny11: + ${MAKE} STUDENT_DATA=pft-pbt-bt SRCLANGS=ukr TRGLANGS=deu train-tiny11-student + + + + + diff --git a/lib/projects/sami.mk b/lib/projects/sami.mk index 30086789..db2f407f 100644 --- a/lib/projects/sami.mk +++ b/lib/projects/sami.mk @@ -38,28 +38,28 @@ GIELLATEKNO_SAMI_TM = fin-smn/tm/finsmn.tmx \ convert-sami-gloss: mkdir -p ${DATADIR}/${PRE} - wget ${GIELLATEKNO_TM_HOME}/fin-smn/glossary/finsmn.utf8 + ${WGET} ${GIELLATEKNO_TM_HOME}/fin-smn/glossary/finsmn.utf8 cut -f1 finsmn.utf8 | gzip -c > ${DATADIR}/${PRE}/glossary.fi-smn.clean.fi.gz cut -f2 finsmn.utf8 | gzip -c > ${DATADIR}/${PRE}/glossary.fi-smn.clean.smn.gz rm -f finsmn.utf8 - wget ${GIELLATEKNO_TM_HOME}/fin-sme/glossary/finsme.utf8 + ${WGET} ${GIELLATEKNO_TM_HOME}/fin-sme/glossary/finsme.utf8 cut -f1 finsme.utf8 | gzip -c > ${DATADIR}/${PRE}/glossary.fi-se.clean.fi.gz cut -f2 finsme.utf8 | gzip -c > ${DATADIR}/${PRE}/glossary.fi-se.clean.se.gz rm -f finsme.utf8 - wget ${GIELLATEKNO_TM_HOME}/fin-sms/glossary/finsms.utf8 + ${WGET} ${GIELLATEKNO_TM_HOME}/fin-sms/glossary/finsms.utf8 cut -f1 finsms.utf8 | gzip -c > ${DATADIR}/${PRE}/glossary.fi-sms.clean.fi.gz cut -f2 finsms.utf8 | gzip -c > ${DATADIR}/${PRE}/glossary.fi-sms.clean.sms.gz rm -f 
finsms.utf8 - wget ${GIELLATEKNO_TM_HOME}/sme-smn/glossary/smesmn.utf8 + ${WGET} ${GIELLATEKNO_TM_HOME}/sme-smn/glossary/smesmn.utf8 cut -f1 smesmn.utf8 | gzip -c > ${DATADIR}/${PRE}/glossary.se-smn.clean.se.gz cut -f2 smesmn.utf8 | gzip -c > ${DATADIR}/${PRE}/glossary.se-smn.clean.smn.gz rm -f smesmn.utf8 - wget ${GIELLATEKNO_TM_HOME}/sme-smj/glossary/glossary.utf8 + ${WGET} ${GIELLATEKNO_TM_HOME}/sme-smj/glossary/glossary.utf8 cut -f1 glossary.utf8 | gzip -c > ${DATADIR}/${PRE}/glossary.se-smj.clean.se.gz cut -f2 glossary.utf8 | gzip -c > ${DATADIR}/${PRE}/glossary.se-smj.clean.smj.gz rm -f glossary.utf8 - wget ${GIELLATEKNO_TM_HOME}/sme-nob/glossary/smenob.utf8 - wget ${GIELLATEKNO_TM_HOME}/sme-nob/glossary/termwiki.utf8 + ${WGET} ${GIELLATEKNO_TM_HOME}/sme-nob/glossary/smenob.utf8 + ${WGET} ${GIELLATEKNO_TM_HOME}/sme-nob/glossary/termwiki.utf8 cut -f1 smenob.utf8 > ${DATADIR}/${PRE}/glossary.nb-se.clean.se cut -f2 smenob.utf8 > ${DATADIR}/${PRE}/glossary.nb-se.clean.nb cut -f1 termwiki.utf8 >> ${DATADIR}/${PRE}/glossary.nb-se.clean.se @@ -67,20 +67,20 @@ convert-sami-gloss: gzip -f ${DATADIR}/${PRE}/glossary.nb-se.clean.se gzip -f ${DATADIR}/${PRE}/glossary.nb-se.clean.nb rm -f smenob.utf8 termwiki.utf8 - wget ${GIELLATEKNO_TM_HOME}/sme-sma/glossary/glossary.utf8 + ${WGET} ${GIELLATEKNO_TM_HOME}/sme-sma/glossary/glossary.utf8 cut -f1 glossary.utf8 | gzip -c > ${DATADIR}/${PRE}/glossary.se-sma.clean.se.gz cut -f2 glossary.utf8 | gzip -c > ${DATADIR}/${PRE}/glossary.se-sma.clean.sma.gz rm -f glossary.utf8 - wget ${GIELLATEKNO_TM_HOME}/nob-smj/glossary/nobsmj.utf8 + ${WGET} ${GIELLATEKNO_TM_HOME}/nob-smj/glossary/nobsmj.utf8 cut -f1 nobsmj.utf8 | gzip -c > ${DATADIR}/${PRE}/glossary.nb-smj.clean.nb.gz cut -f2 nobsmj.utf8 | gzip -c > ${DATADIR}/${PRE}/glossary.nb-smj.clean.smj.gz rm -f nobsmj.utf8 - wget ${GIELLATEKNO_TM_HOME}/nob-sme/glossary/nobsme.utf8 + ${WGET} ${GIELLATEKNO_TM_HOME}/nob-sme/glossary/nobsme.utf8 cut -f1 nobsme.utf8 | gzip -c > 
${DATADIR}/${PRE}/glossary.nb-se.clean.nb.gz cut -f2 nobsme.utf8 | gzip -c > ${DATADIR}/${PRE}/glossary.nb-se.clean.se.gz rm -f nobsme.utf8 - wget ${GIELLATEKNO_TM_HOME}/nob-sma/glossary/nobsma.utf8 - wget ${GIELLATEKNO_TM_HOME}/sma-nob/glossary/termwiki.utf8 + ${WGET} ${GIELLATEKNO_TM_HOME}/nob-sma/glossary/nobsma.utf8 + ${WGET} ${GIELLATEKNO_TM_HOME}/sma-nob/glossary/termwiki.utf8 cut -f1 nobsma.utf8 > ${DATADIR}/${PRE}/glossary.nb-sma.clean.nb cut -f2 nobsma.utf8 > ${DATADIR}/${PRE}/glossary.nb-sma.clean.sma cut -f1 termwiki.utf8 >>${DATADIR}/${PRE}/glossary.nb-sma.clean.sma @@ -136,7 +136,7 @@ merge-sami-data: ${GIELLATEKNO_SAMI_TM}: mkdir -p ${dir $@} - wget -O $@ ${GIELLATEKNO_TM_HOME}/$@ + ${WGET} -O $@ ${GIELLATEKNO_TM_HOME}/$@ ## name of the sami data sets diff --git a/lib/projects/simplify.mk b/lib/projects/simplify.mk index ec886b11..e080d8ef 100644 --- a/lib/projects/simplify.mk +++ b/lib/projects/simplify.mk @@ -97,7 +97,7 @@ SIMPLEWIKI_DATA2_DOC = document-aligned.v2 ${WORKHOME}/simplewiki/${SIMPLEWIKI_DATA1}: mkdir -p ${dir $@} - wget -O $@.tar.gz ${SIMPLEWIKI_DATA1_URL}/${SIMPLEWIKI_DATA1}.tar.gz + ${WGET} -O $@.tar.gz ${SIMPLEWIKI_DATA1_URL}/${SIMPLEWIKI_DATA1}.tar.gz tar -C ${dir $@} -xzf $@.tar.gz rm -f $@.tar.gz ${TOKENIZER}/detokenizer.perl -l en < $@/normal.training.txt > ${DATADIR}/${PRE}/simplewiki_v1-training.en-en.en1.raw @@ -112,7 +112,7 @@ ${WORKHOME}/simplewiki/${SIMPLEWIKI_DATA1}: ${WORKHOME}/simplewiki/${SIMPLEWIKI_DATA2_SENT}: mkdir -p ${dir $@} - wget -O $@.tar.gz ${SIMPLEWIKI_DATA2_URL}/${SIMPLEWIKI_DATA2_SENT}.tar.gz + ${WGET} -O $@.tar.gz ${SIMPLEWIKI_DATA2_URL}/${SIMPLEWIKI_DATA2_SENT}.tar.gz tar -C ${dir $@} -xzf $@.tar.gz rm -f $@.tar.gz cut -f3 $@/normal.aligned | tail -n +10001 |\ @@ -203,7 +203,7 @@ SIMPLEWIKI_LARGE = data-simplification/wikilarge ${WORKHOME}/simplewiki/${SIMPLEWIKI_LARGE}: mkdir -p ${dir $@} - wget -O $@.tar.bz2 ${SIMPLEWIKI_LARGE_URL} + ${WGET} -O $@.tar.bz2 ${SIMPLEWIKI_LARGE_URL} tar -C ${dir $@} -xf 
$@.tar.bz2 rm -f $@.tar.bz2 ${TOKENIZER}/detokenizer.perl -l en < $@/wiki.full.aner.train.src > ${DATADIR}/${PRE}/simplewiki_large-train.en-en.en1.raw diff --git a/lib/projects/tatoeba.mk b/lib/projects/tatoeba.mk index 72d18a83..6419e3d6 100644 --- a/lib/projects/tatoeba.mk +++ b/lib/projects/tatoeba.mk @@ -103,9 +103,11 @@ TATOEBA_MONO ?= ${TATOEBA_WORK}/data/mono ## (fetched from Tatoeba github) TATOEBA_LANGIDS_TRAINONLY = tatoeba/langids-train-only-${TATOEBA_VERSION}.txt -# TATOEBA_RAWGIT := https://raw.githubusercontent.com/Helsinki-NLP/Tatoeba-Challenge/master +# TATOEBA_RAWGIT := https://raw.githubusercontent.com/Helsinki-NLP/Tatoeba-Challenge/master TATOEBA_RAWGIT_MASTER := https://raw.githubusercontent.com/Helsinki-NLP/Tatoeba-Challenge/master TATOEBA_RAWGIT_RELEASE := https://raw.githubusercontent.com/Helsinki-NLP/Tatoeba-Challenge/${TATOEBA_VERSION} +# TATOEBA_RAWGIT_MASTER := https://github.com/Helsinki-NLP/Tatoeba-Challenge/raw/master +# TATOEBA_RAWGIT_RELEASE := https://github.com/Helsinki-NLP/Tatoeba-Challenge/raw/${TATOEBA_VERSION} ## data count files (file basename) @@ -114,13 +116,13 @@ TATOEBA_DATA_COUNT_BASE = ${TATOEBA_RAWGIT_MASTER}/data/release/${TATOEBA_VERSIO ## all released language pairs with test sets > 200 test pairs ## also extract all source languages that are available for a give target language ## and vice versa -TATOEBA_RELEASED_DATA = $(shell wget -qq -O - ${TATOEBA_DATA_COUNT_BASE}-min200.txt | cut -f1) +TATOEBA_RELEASED_DATA = $(shell ${WGET} -qq -O - ${TATOEBA_DATA_COUNT_BASE}-min200.txt | cut -f1) TATOEBA_AVAILABLE_TRG = ${sort ${filter-out ${SRC},${subst -, ,${filter %-${SRC} ${SRC}-%,${TATOEBA_RELEASED_DATA}}}}} TATOEBA_AVAILABLE_SRC = ${sort ${filter-out ${TRG},${subst -, ,${filter %-${TRG} ${TRG}-%,${TATOEBA_RELEASED_DATA}}}}} ## extract language pairs for a specific subset TATOEBA_SUBSET = lower -TATOEBA_RELEASED_SUBSET = $(shell wget -qq -O - ${TATOEBA_DATA_COUNT_BASE}-${TATOEBA_SUBSET}.txt | cut -f1) 
+TATOEBA_RELEASED_SUBSET = $(shell ${WGET} -qq -O - ${TATOEBA_DATA_COUNT_BASE}-${TATOEBA_SUBSET}.txt | cut -f1) TATOEBA_AVAILABLE_SUBSET_TRG = ${sort ${filter-out ${SRC},${subst -, ,${filter %-${SRC} ${SRC}-%,${TATOEBA_RELEASED_SUBSET}}}}} TATOEBA_AVAILABLE_SUBSET_SRC = ${sort ${filter-out ${TRG},${subst -, ,${filter %-${TRG} ${TRG}-%,${TATOEBA_RELEASED_SUBSET}}}}} @@ -770,7 +772,7 @@ all-tatoeba-langgroup-dist: TATOEBA_RELEASED_BT = https://object.pouta.csc.fi/Tatoeba-MT-bt/released-data.txt tatoeba-all-bt: - for b in ${shell wget -qq -O - ${TATOEBA_RELEASED_BT} | grep -v '.txt' | cut -f1 -d'/' | sort -u}; do \ + for b in ${shell ${WGET} -qq -O - ${TATOEBA_RELEASED_BT} | grep -v '.txt' | cut -f1 -d'/' | sort -u}; do \ s=`echo $$b | cut -f1 -d'-'`; \ t=`echo $$b | cut -f2 -d'-'`; \ echo "${MAKE} -C bt-tatoeba SRC=$$s TRG=$$t fetch-bt"; \ @@ -1256,7 +1258,7 @@ tatoeba-%-langtunealljobs: ## get the markdown page for a specific subset tatoeba-%.md: - wget -O $@ ${TATOEBA_RAWGIT_MASTER}/subsets/${TATOEBA_VERSION}/${patsubst tatoeba-%,%,$@} + ${WGET} -O $@ ${TATOEBA_RAWGIT_MASTER}/subsets/${TATOEBA_VERSION}/${patsubst tatoeba-%,%,$@} ## run all language pairs for a given subset @@ -1413,7 +1415,7 @@ tatoeba-multilingual-testsets: ${TATOEBA_WORK}/${LANGPAIRSTR}/test/Tatoeba${TATO # @for s in ${SRCLANGS}; do \ # for t in ${TRGLANGS}; do \ # if [ ! 
-e ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.src ]; then \ -# wget -q -O ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.txt \ +# ${WGET} -q -O ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.txt \ # ${TATOEBA_RAWGIT}/data/test/$$s-$$t/test.txt; \ # if [ -s ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.txt ]; then \ # echo "make ${TATOEBA_TESTSET}.$$s-$$t"; \ @@ -1428,7 +1430,7 @@ tatoeba-multilingual-testsets: ${TATOEBA_WORK}/${LANGPAIRSTR}/test/Tatoeba${TATO # cut -f4 ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.txt \ # > ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.trg; \ # else \ -# wget -q -O ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.txt \ +# ${WGET} -q -O ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.txt \ # ${TATOEBA_RAWGIT}/data/test/$$t-$$s/test.txt; \ # if [ -s ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.txt ]; then \ # echo "make ${TATOEBA_TESTSET}.$$s-$$t"; \ @@ -1467,7 +1469,7 @@ ${TATOEBA_WORK}/${LANGPAIRSTR}/test/Tatoeba${TATOEBA_VERSION_NOHYPHEN}-testsets. @mkdir -p ${TATOEBA_WORK}/${LANGPAIRSTR}/test @for s in ${SRCLANGS}; do \ for t in ${TRGLANGS}; do \ - wget -q -O ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.tmp \ + ${WGET} -q -O ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.tmp \ ${TATOEBA_RAWGIT_RELEASE}/data/test/$$s-$$t/test.txt; \ if [ -s ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.tmp ]; then \ cat ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.tmp $(FIXLANGIDS) \ @@ -1514,7 +1516,7 @@ ${TATOEBA_WORK}/${LANGPAIRSTR}/test/Tatoeba${TATOEBA_VERSION_NOHYPHEN}-testsets. 
done \ fi; \ else \ - wget -q -O ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.tmp \ + ${WGET} -q -O ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.tmp \ ${TATOEBA_RAWGIT_RELEASE}/data/test/$$t-$$s/test.txt; \ if [ -s ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.tmp ]; then \ cat ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.tmp $(FIXLANGIDS) \ @@ -1577,7 +1579,7 @@ ${TATOEBA_WORK}/${LANGPAIRSTR}/test/Tatoeba${TATOEBA_VERSION_NOHYPHEN}-testsets. # ${TATOEBA_WORK}/${LANGPAIRSTR}/test/Tatoeba-testsets-with-subsets.done: # @for s in ${SRCLANGS}; do \ # for t in ${TRGLANGS}; do \ -# wget -q -O ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.tmp \ +# ${WGET} -q -O ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.tmp \ # ${TATOEBA_RAWGIT}/data/test/$$s-$$t/test.txt; \ # if [ -s ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.tmp ]; then \ # echo "make ${TATOEBA_TESTSET}.$$s-$$t"; \ @@ -1619,7 +1621,7 @@ ${TATOEBA_WORK}/${LANGPAIRSTR}/test/Tatoeba${TATOEBA_VERSION_NOHYPHEN}-testsets. 
# done \ # fi; \ # else \ -# wget -q -O ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.tmp \ +# ${WGET} -q -O ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.tmp \ # ${TATOEBA_RAWGIT}/data/test/$$t-$$s/test.txt; \ # if [ -s ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.tmp ]; then \ # echo "make ${TATOEBA_TESTSET}.$$s-$$t"; \ @@ -1701,7 +1703,7 @@ EVAL_TATOEBA_WORKDIR = ${EVAL_TATOEBA_WORKHOME}/$(dir ${RELEASED_TATOEBA_MODEL evaluate-released-tatoeba-model: mkdir -p ${EVAL_TATOEBA_WORKDIR} - wget -O ${EVAL_TATOEBA_WORKHOME}/${RELEASED_TATOEBA_MODEL} ${RELEASED_TATOEBA_MODEL_URL} + ${WGET} -O ${EVAL_TATOEBA_WORKHOME}/${RELEASED_TATOEBA_MODEL} ${RELEASED_TATOEBA_MODEL_URL} cd ${EVAL_TATOEBA_WORKDIR} && unzip -o $(notdir ${RELEASED_TATOEBA_MODEL}) ${MAKE} TATOEBA_WORK=${EVAL_TATOEBA_WORKHOME} \ DECODER_CONFIG=${EVAL_TATOEBA_WORKDIR}decoder.yml \ @@ -1889,18 +1891,18 @@ print-skiplangids: tatoeba/langids-train-only-${TATOEBA_VERSION}.txt: mkdir -p ${dir $@} - wget -O $@ ${TATOEBA_RAWGIT_MASTER}/data/release/${TATOEBA_VERSION}/langids-train-only.txt + ${WGET} -O $@ ${TATOEBA_RAWGIT_MASTER}/data/release/${TATOEBA_VERSION}/langids-train-only.txt ## monolingual data from Tatoeba challenge (wiki data) ${TATOEBA_MONO}/%.labels: mkdir -p $@.d # the old URL without versioning: - -wget -q -O $@.d/mono.tar ${TATOEBA_DATAURL}/$(patsubst %.labels,%,$(notdir $@)).tar + -${WGET} -q -O $@.d/mono.tar ${TATOEBA_DATAURL}/$(patsubst %.labels,%,$(notdir $@)).tar -tar -C $@.d -xf $@.d/mono.tar rm -f $@.d/mono.tar # the new URLs with versioning: - -wget -q -O $@.d/mono.tar ${TATOEBA_MONO_URL}/$(patsubst %.labels,%,$(notdir $@)).tar + -${WGET} -q -O $@.d/mono.tar ${TATOEBA_MONO_URL}/$(patsubst %.labels,%,$(notdir $@)).tar -tar -C $@.d -xf $@.d/mono.tar rm -f $@.d/mono.tar find $@.d -name '*.id.gz' | xargs ${ZCAT} | sort -u | tr "\n" ' ' | sed 's/ $$//' > $@ @@ -1933,7 +1935,7 @@ TATOEBA_TMPDATADIR = data/release/${TATOEBA_VERSION}/${LANGPAIR} 
%/${TATOEBA_TRAINSET}.${LANGPAIR}.clean.${SRCEXT}.gz: @mkdir -p $@.d - -wget -q -O $@.d/train.tar ${TATOEBA_TRAIN_URL}/${LANGPAIR}.tar + -${WGET} -q -O $@.d/train.tar ${TATOEBA_TRAIN_URL}/${LANGPAIR}.tar -tar -C $@.d -xf $@.d/train.tar @rm -f $@.d/train.tar @if [ -e $@.d/${TATOEBA_TMPDATADIR}/test.src ]; then \ diff --git a/lib/tasks/tatoeba/data.mk b/lib/tasks/tatoeba/data.mk index 7c6c1c53..5beb57c7 100644 --- a/lib/tasks/tatoeba/data.mk +++ b/lib/tasks/tatoeba/data.mk @@ -17,7 +17,7 @@ ${WORKHOME}/${LANGPAIRSTR}/${DATASET}-languages.%: ${WORKHOME}/${LANGPAIRSTR}/${ ## a file with all released data sets in the current Tatoeba TC release ${RELEASED_TATOEBA_DATA_FILE}: - wget -O $@ ${RELEASED_TATOEBA_DATA_URL} + ${WGET} -O $@ ${RELEASED_TATOEBA_DATA_URL} ## don't delete intermediate label files @@ -168,18 +168,18 @@ print-skiplangids: tatoeba/langids-train-only-${TATOEBA_VERSION}.txt: mkdir -p ${dir $@} - wget -O $@ ${TATOEBA_RAWGIT_MASTER}/data/release/${TATOEBA_VERSION}/langids-train-only.txt + ${WGET} -O $@ ${TATOEBA_RAWGIT_MASTER}/data/release/${TATOEBA_VERSION}/langids-train-only.txt ## monolingual data from Tatoeba challenge (wiki data) ${TATOEBA_MONO}/%.labels: mkdir -p $@.d # the old URL without versioning: - -wget -q -O $@.d/mono.tar ${TATOEBA_DATAURL}/$(patsubst %.labels,%,$(notdir $@)).tar + -${WGET} -q -O $@.d/mono.tar ${TATOEBA_DATAURL}/$(patsubst %.labels,%,$(notdir $@)).tar -tar -C $@.d -xf $@.d/mono.tar rm -f $@.d/mono.tar # the new URLs with versioning: - -wget -q -O $@.d/mono.tar ${TATOEBA_MONO_URL}/$(patsubst %.labels,%,$(notdir $@)).tar + -${WGET} -q -O $@.d/mono.tar ${TATOEBA_MONO_URL}/$(patsubst %.labels,%,$(notdir $@)).tar -tar -C $@.d -xf $@.d/mono.tar rm -f $@.d/mono.tar find $@.d -name '*.id.gz' | xargs ${ZCAT} | sort -u | tr "\n" ' ' | sed 's/ $$//' > $@ @@ -295,7 +295,7 @@ endif %.gz.d/data.fetched: @echo ".... 
fetch data (${LANGPAIR}.tar)" @mkdir -p ${dir $@} - -wget -q -O ${dir $@}train.tar ${TATOEBA_TRAIN_URL}/${LANGPAIR}.tar + -${WGET} -q -O ${dir $@}train.tar ${TATOEBA_TRAIN_URL}/${LANGPAIR}.tar @if [ -e ${dir $@}train.tar ]; then \ tar -C ${dir $@} -xf ${dir $@}train.tar; \ rm -f ${dir $@}train.tar; \ @@ -428,7 +428,7 @@ ${MULTILING_TESTSETS_DONE}: @mkdir -p ${WORKHOME}/${LANGPAIRSTR}/test @for s in ${SRCLANGS}; do \ for t in ${TRGLANGS}; do \ - wget -q -O ${WORKHOME}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.tmp \ + ${WGET} -q -O ${WORKHOME}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.tmp \ ${TATOEBA_RAWGIT_RELEASE}/data/test/$$s-$$t/test.txt; \ if [ -s ${WORKHOME}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.tmp ]; then \ cat ${WORKHOME}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.tmp $(FIXLANGIDS) \ @@ -475,7 +475,7 @@ ${MULTILING_TESTSETS_DONE}: done \ fi; \ else \ - wget -q -O ${WORKHOME}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.tmp \ + ${WGET} -q -O ${WORKHOME}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.tmp \ ${TATOEBA_RAWGIT_RELEASE}/data/test/$$t-$$s/test.txt; \ if [ -s ${WORKHOME}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.tmp ]; then \ cat ${WORKHOME}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.tmp $(FIXLANGIDS) \ diff --git a/lib/tasks/tatoeba/misc.mk b/lib/tasks/tatoeba/misc.mk index 3a950a8b..96a91086 100644 --- a/lib/tasks/tatoeba/misc.mk +++ b/lib/tasks/tatoeba/misc.mk @@ -88,7 +88,7 @@ EVAL_WORKHOMEDIR = ${EVAL_WORKHOMEHOME}/$(dir ${RELEASED_TATOEBA_MODEL}) evaluate-released-tatoeba-model: mkdir -p ${EVAL_WORKHOMEDIR} - wget -O ${EVAL_WORKHOMEHOME}/${RELEASED_TATOEBA_MODEL} ${RELEASED_TATOEBA_MODEL_URL} + ${WGET} -O ${EVAL_WORKHOMEHOME}/${RELEASED_TATOEBA_MODEL} ${RELEASED_TATOEBA_MODEL_URL} cd ${EVAL_WORKHOMEDIR} && unzip -o $(notdir ${RELEASED_TATOEBA_MODEL}) ${MAKE} WORKHOME=${EVAL_WORKHOMEHOME} \ DECODER_CONFIG=${EVAL_WORKHOMEDIR}decoder.yml \ diff --git a/pivoting/Makefile b/pivoting/Makefile index 
d2800b64..5c0ee3ec 100644 --- a/pivoting/Makefile +++ b/pivoting/Makefile @@ -246,7 +246,7 @@ print-excludes: ${OUTPUT_DIR}/${PIVOT_MODEL_NAME}/decoder.yml: ifneq (${PIVOT_MODEL_ZIP},) mkdir -p ${dir $@} -# wget -O ${dir $@}${PIVOT_MODEL_NAME}.zip ${OBJECTSTORAGE}/${MODEL_CONTAINER}/${PIVOT_MODEL_ZIP} +# ${WGET} -O ${dir $@}${PIVOT_MODEL_NAME}.zip ${OBJECTSTORAGE}/${MODEL_CONTAINER}/${PIVOT_MODEL_ZIP} cp ${PIVOT_MODEL_ZIP} ${dir $@} cd ${dir $@} && unzip *.zip rm -f ${dir $@}*.zip diff --git a/scores/fin-ukr/flores101-dev/bleu-scores.txt b/scores/fin-ukr/flores101-dev/bleu-scores.txt index 4072af91..713e6bcd 100644 --- a/scores/fin-ukr/flores101-dev/bleu-scores.txt +++ b/scores/fin-ukr/flores101-dev/bleu-scores.txt @@ -1,5 +1,6 @@ 19.0 https://object.pouta.csc.fi/Tatoeba-MT-models/fin-ukr/opusTCv20210807+pbt_transformer-align_2022-03-07.zip 17.1 https://object.pouta.csc.fi/Tatoeba-MT-models/fin-zle/opus4m+btTCv20210807-2022-01-19.zip +16.6 https://object.pouta.csc.fi/Tatoeba-MT-models/fin-ukr/opusTCv20210807+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-16.zip 11.3 https://object.pouta.csc.fi/Tatoeba-MT-models/fiu-zle/opus-2021-02-11.zip 10.8 https://object.pouta.csc.fi/Tatoeba-MT-models/fiu-sla/opus-2021-02-16.zip 6.1 https://object.pouta.csc.fi/Tatoeba-MT-models/tatoeba-zero/opus-2020-06-19.zip diff --git a/scores/fin-ukr/flores101-dev/chrf-scores.txt b/scores/fin-ukr/flores101-dev/chrf-scores.txt index b663f2e1..8f393baf 100644 --- a/scores/fin-ukr/flores101-dev/chrf-scores.txt +++ b/scores/fin-ukr/flores101-dev/chrf-scores.txt @@ -1,4 +1,5 @@ 0.48750 https://object.pouta.csc.fi/Tatoeba-MT-models/fin-ukr/opusTCv20210807+pbt_transformer-align_2022-03-07.zip +0.46455 https://object.pouta.csc.fi/Tatoeba-MT-models/fin-ukr/opusTCv20210807+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-16.zip 0.46435 https://object.pouta.csc.fi/Tatoeba-MT-models/fin-zle/opus4m+btTCv20210807-2022-01-19.zip 0.388 
https://object.pouta.csc.fi/Tatoeba-MT-models/fiu-zle/opus-2021-02-11.zip 0.387 https://object.pouta.csc.fi/Tatoeba-MT-models/fiu-sla/opus-2021-02-16.zip diff --git a/scores/fin-ukr/flores101-devtest/bleu-scores.txt b/scores/fin-ukr/flores101-devtest/bleu-scores.txt index f4291df0..9d520342 100644 --- a/scores/fin-ukr/flores101-devtest/bleu-scores.txt +++ b/scores/fin-ukr/flores101-devtest/bleu-scores.txt @@ -1,5 +1,6 @@ 19.7 https://object.pouta.csc.fi/Tatoeba-MT-models/fin-ukr/opusTCv20210807+pbt_transformer-align_2022-03-07.zip 17.7 https://object.pouta.csc.fi/Tatoeba-MT-models/fin-zle/opus4m+btTCv20210807-2022-01-19.zip +17.1 https://object.pouta.csc.fi/Tatoeba-MT-models/fin-ukr/opusTCv20210807+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-16.zip 11.7 https://object.pouta.csc.fi/Tatoeba-MT-models/fiu-zle/opus-2021-02-11.zip 11.5 https://object.pouta.csc.fi/Tatoeba-MT-models/fiu-sla/opus-2021-02-16.zip 5.9 https://object.pouta.csc.fi/Tatoeba-MT-models/tatoeba-zero/opus-2020-06-19.zip diff --git a/scores/fin-ukr/flores101-devtest/chrf-scores.txt b/scores/fin-ukr/flores101-devtest/chrf-scores.txt index b5a13249..b9c47906 100644 --- a/scores/fin-ukr/flores101-devtest/chrf-scores.txt +++ b/scores/fin-ukr/flores101-devtest/chrf-scores.txt @@ -1,4 +1,5 @@ 0.49562 https://object.pouta.csc.fi/Tatoeba-MT-models/fin-ukr/opusTCv20210807+pbt_transformer-align_2022-03-07.zip +0.47210 https://object.pouta.csc.fi/Tatoeba-MT-models/fin-ukr/opusTCv20210807+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-16.zip 0.46954 https://object.pouta.csc.fi/Tatoeba-MT-models/fin-zle/opus4m+btTCv20210807-2022-01-19.zip 0.390 https://object.pouta.csc.fi/Tatoeba-MT-models/fiu-sla/opus-2021-02-16.zip 0.386 https://object.pouta.csc.fi/Tatoeba-MT-models/fiu-zle/opus-2021-02-11.zip diff --git a/scores/hun-ukr/flores101-dev/bleu-scores.txt b/scores/hun-ukr/flores101-dev/bleu-scores.txt index 52df1c6a..d0f729e5 100644 --- a/scores/hun-ukr/flores101-dev/bleu-scores.txt +++ 
b/scores/hun-ukr/flores101-dev/bleu-scores.txt @@ -1,4 +1,5 @@ 19.8 https://object.pouta.csc.fi/Tatoeba-MT-models/hun-ukr/opusTCv20210807+pbt_transformer-align_2022-03-08.zip +17.7 https://object.pouta.csc.fi/Tatoeba-MT-models/hun-ukr/opusTCv20210807+bt+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-16.zip 12.0 https://object.pouta.csc.fi/Tatoeba-MT-models/hun-ukr/opus-2021-02-19.zip 11.6 https://object.pouta.csc.fi/Tatoeba-MT-models/fiu-zle/opus-2021-02-11.zip 10.9 https://object.pouta.csc.fi/Tatoeba-MT-models/fiu-sla/opus-2021-02-16.zip diff --git a/scores/hun-ukr/flores101-dev/chrf-scores.txt b/scores/hun-ukr/flores101-dev/chrf-scores.txt index 08bbf2b8..230ac5fb 100644 --- a/scores/hun-ukr/flores101-dev/chrf-scores.txt +++ b/scores/hun-ukr/flores101-dev/chrf-scores.txt @@ -1,3 +1,4 @@ 0.48918 https://object.pouta.csc.fi/Tatoeba-MT-models/hun-ukr/opusTCv20210807+pbt_transformer-align_2022-03-08.zip +0.47575 https://object.pouta.csc.fi/Tatoeba-MT-models/hun-ukr/opusTCv20210807+bt+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-16.zip 0.397 https://object.pouta.csc.fi/Tatoeba-MT-models/hun-ukr/opus-2021-02-19.zip 0.394 https://object.pouta.csc.fi/Tatoeba-MT-models/fiu-sla/opus-2021-02-16.zip diff --git a/scores/hun-ukr/flores101-devtest/bleu-scores.txt b/scores/hun-ukr/flores101-devtest/bleu-scores.txt index b0784c07..2fdc4916 100644 --- a/scores/hun-ukr/flores101-devtest/bleu-scores.txt +++ b/scores/hun-ukr/flores101-devtest/bleu-scores.txt @@ -1,4 +1,5 @@ 19.8 https://object.pouta.csc.fi/Tatoeba-MT-models/hun-ukr/opusTCv20210807+pbt_transformer-align_2022-03-08.zip +18.3 https://object.pouta.csc.fi/Tatoeba-MT-models/hun-ukr/opusTCv20210807+bt+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-16.zip 11.6 https://object.pouta.csc.fi/Tatoeba-MT-models/fiu-zle/opus-2021-02-11.zip 11.2 https://object.pouta.csc.fi/Tatoeba-MT-models/hun-ukr/opus-2021-02-19.zip 11.0 https://object.pouta.csc.fi/Tatoeba-MT-models/fiu-sla/opus-2021-02-16.zip diff --git 
a/scores/hun-ukr/flores101-devtest/chrf-scores.txt b/scores/hun-ukr/flores101-devtest/chrf-scores.txt index 54c81949..e49aad4b 100644 --- a/scores/hun-ukr/flores101-devtest/chrf-scores.txt +++ b/scores/hun-ukr/flores101-devtest/chrf-scores.txt @@ -1,2 +1,3 @@ 0.49490 https://object.pouta.csc.fi/Tatoeba-MT-models/hun-ukr/opusTCv20210807+pbt_transformer-align_2022-03-08.zip +0.48393 https://object.pouta.csc.fi/Tatoeba-MT-models/hun-ukr/opusTCv20210807+bt+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-16.zip 0.396 https://object.pouta.csc.fi/Tatoeba-MT-models/fiu-sla/opus-2021-02-16.zip diff --git a/scores/hun-ukr/tatoeba-test-v2020-07-28/bleu-scores.txt b/scores/hun-ukr/tatoeba-test-v2020-07-28/bleu-scores.txt index dd891aa0..45c3b82c 100644 --- a/scores/hun-ukr/tatoeba-test-v2020-07-28/bleu-scores.txt +++ b/scores/hun-ukr/tatoeba-test-v2020-07-28/bleu-scores.txt @@ -1,4 +1,5 @@ 40.7 https://object.pouta.csc.fi/Tatoeba-MT-models/hun-ukr/opus-2021-02-19.zip +40.3 https://object.pouta.csc.fi/Tatoeba-MT-models/hun-ukr/opusTCv20210807+bt+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-16.zip 38.2 https://object.pouta.csc.fi/Tatoeba-MT-models/hun-ukr/opusTCv20210807+pbt_transformer-align_2022-03-08.zip 38.0 https://object.pouta.csc.fi/Tatoeba-MT-models/fiu-sla/opus-2021-02-16.zip 37.7 https://object.pouta.csc.fi/Tatoeba-MT-models/fiu-zle/opus-2021-02-11.zip diff --git a/scores/hun-ukr/tatoeba-test-v2020-07-28/chrf-scores.txt b/scores/hun-ukr/tatoeba-test-v2020-07-28/chrf-scores.txt index 35774d0a..66025e46 100644 --- a/scores/hun-ukr/tatoeba-test-v2020-07-28/chrf-scores.txt +++ b/scores/hun-ukr/tatoeba-test-v2020-07-28/chrf-scores.txt @@ -1,3 +1,4 @@ +0.61575 https://object.pouta.csc.fi/Tatoeba-MT-models/hun-ukr/opusTCv20210807+bt+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-16.zip 0.61129 https://object.pouta.csc.fi/Tatoeba-MT-models/hun-ukr/opusTCv20210807+pbt_transformer-align_2022-03-08.zip 0.611 
https://object.pouta.csc.fi/Tatoeba-MT-models/hun-ukr/opus-2021-02-19.zip 0.589 https://object.pouta.csc.fi/Tatoeba-MT-models/fiu-sla/opus-2021-02-16.zip diff --git a/scores/hun-ukr/tatoeba-test-v2021-03-30/bleu-scores.txt b/scores/hun-ukr/tatoeba-test-v2021-03-30/bleu-scores.txt index daf19759..1200d158 100644 --- a/scores/hun-ukr/tatoeba-test-v2021-03-30/bleu-scores.txt +++ b/scores/hun-ukr/tatoeba-test-v2021-03-30/bleu-scores.txt @@ -1,4 +1,5 @@ 40.5 https://object.pouta.csc.fi/Tatoeba-MT-models/hun-ukr/opus-2021-02-19.zip +39.8 https://object.pouta.csc.fi/Tatoeba-MT-models/hun-ukr/opusTCv20210807+bt+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-16.zip 37.9 https://object.pouta.csc.fi/Tatoeba-MT-models/hun-ukr/opusTCv20210807+pbt_transformer-align_2022-03-08.zip 37.7 https://object.pouta.csc.fi/Tatoeba-MT-models/fiu-sla/opus-2021-02-16.zip 37.1 https://object.pouta.csc.fi/Tatoeba-MT-models/fiu-zle/opus-2021-02-11.zip diff --git a/scores/hun-ukr/tatoeba-test-v2021-03-30/chrf-scores.txt b/scores/hun-ukr/tatoeba-test-v2021-03-30/chrf-scores.txt index 8e5a97c0..0f88c77c 100644 --- a/scores/hun-ukr/tatoeba-test-v2021-03-30/chrf-scores.txt +++ b/scores/hun-ukr/tatoeba-test-v2021-03-30/chrf-scores.txt @@ -1,3 +1,4 @@ +0.61193 https://object.pouta.csc.fi/Tatoeba-MT-models/hun-ukr/opusTCv20210807+bt+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-16.zip 0.60904 https://object.pouta.csc.fi/Tatoeba-MT-models/hun-ukr/opusTCv20210807+pbt_transformer-align_2022-03-08.zip 0.609 https://object.pouta.csc.fi/Tatoeba-MT-models/hun-ukr/opus-2021-02-19.zip 0.585 https://object.pouta.csc.fi/Tatoeba-MT-models/fiu-sla/opus-2021-02-16.zip diff --git a/scores/hun-ukr/tatoeba-test-v2021-08-07/bleu-scores.txt b/scores/hun-ukr/tatoeba-test-v2021-08-07/bleu-scores.txt index de6425af..660598f0 100644 --- a/scores/hun-ukr/tatoeba-test-v2021-08-07/bleu-scores.txt +++ b/scores/hun-ukr/tatoeba-test-v2021-08-07/bleu-scores.txt @@ -1,4 +1,5 @@ 40.9 
https://object.pouta.csc.fi/Tatoeba-MT-models/hun-ukr/opus-2021-02-19.zip +40.4 https://object.pouta.csc.fi/Tatoeba-MT-models/hun-ukr/opusTCv20210807+bt+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-16.zip 38.2 https://object.pouta.csc.fi/Tatoeba-MT-models/fiu-sla/opus-2021-02-16.zip 38.1 https://object.pouta.csc.fi/Tatoeba-MT-models/hun-ukr/opusTCv20210807+pbt_transformer-align_2022-03-08.zip 37.8 https://object.pouta.csc.fi/Tatoeba-MT-models/fiu-zle/opus-2021-02-11.zip diff --git a/scores/hun-ukr/tatoeba-test-v2021-08-07/chrf-scores.txt b/scores/hun-ukr/tatoeba-test-v2021-08-07/chrf-scores.txt index fae218d8..146846a6 100644 --- a/scores/hun-ukr/tatoeba-test-v2021-08-07/chrf-scores.txt +++ b/scores/hun-ukr/tatoeba-test-v2021-08-07/chrf-scores.txt @@ -1,3 +1,4 @@ +0.61451 https://object.pouta.csc.fi/Tatoeba-MT-models/hun-ukr/opusTCv20210807+bt+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-16.zip 0.612 https://object.pouta.csc.fi/Tatoeba-MT-models/hun-ukr/opus-2021-02-19.zip 0.61006 https://object.pouta.csc.fi/Tatoeba-MT-models/hun-ukr/opusTCv20210807+pbt_transformer-align_2022-03-08.zip 0.589 https://object.pouta.csc.fi/Tatoeba-MT-models/fiu-sla/opus-2021-02-16.zip diff --git a/scores/lit-ukr/flores101-dev/bleu-scores.txt b/scores/lit-ukr/flores101-dev/bleu-scores.txt index eee79e59..f122ea6f 100644 --- a/scores/lit-ukr/flores101-dev/bleu-scores.txt +++ b/scores/lit-ukr/flores101-dev/bleu-scores.txt @@ -1,2 +1,3 @@ +18.2 https://object.pouta.csc.fi/Tatoeba-MT-models/lit-ukr/opusTCv20210807+bt+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-17.zip 7.6 https://object.pouta.csc.fi/Tatoeba-MT-models/ine-ine/opus-2020-07-27.zip 7.1 https://object.pouta.csc.fi/Tatoeba-MT-models/ine-ine/opus-2020-07-21.zip diff --git a/scores/lit-ukr/flores101-dev/chrf-scores.txt b/scores/lit-ukr/flores101-dev/chrf-scores.txt index 8cd53b03..aa9a49fa 100644 --- a/scores/lit-ukr/flores101-dev/chrf-scores.txt +++ b/scores/lit-ukr/flores101-dev/chrf-scores.txt @@ -1,2 +1,3 @@ +0.48070 
https://object.pouta.csc.fi/Tatoeba-MT-models/lit-ukr/opusTCv20210807+bt+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-17.zip 0.334 https://object.pouta.csc.fi/Tatoeba-MT-models/ine-ine/opus-2020-07-27.zip 0.324 https://object.pouta.csc.fi/Tatoeba-MT-models/ine-ine/opus-2020-07-21.zip diff --git a/scores/lit-ukr/flores101-devtest/bleu-scores.txt b/scores/lit-ukr/flores101-devtest/bleu-scores.txt index 0f1205f0..dc13d2d5 100644 --- a/scores/lit-ukr/flores101-devtest/bleu-scores.txt +++ b/scores/lit-ukr/flores101-devtest/bleu-scores.txt @@ -1,2 +1,3 @@ +18.5 https://object.pouta.csc.fi/Tatoeba-MT-models/lit-ukr/opusTCv20210807+bt+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-17.zip 7.2 https://object.pouta.csc.fi/Tatoeba-MT-models/ine-ine/opus-2020-07-27.zip 6.8 https://object.pouta.csc.fi/Tatoeba-MT-models/ine-ine/opus-2020-07-21.zip diff --git a/scores/lit-ukr/flores101-devtest/chrf-scores.txt b/scores/lit-ukr/flores101-devtest/chrf-scores.txt index 02ef2fe0..7571e036 100644 --- a/scores/lit-ukr/flores101-devtest/chrf-scores.txt +++ b/scores/lit-ukr/flores101-devtest/chrf-scores.txt @@ -1,2 +1,3 @@ +0.48759 https://object.pouta.csc.fi/Tatoeba-MT-models/lit-ukr/opusTCv20210807+bt+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-17.zip 0.332 https://object.pouta.csc.fi/Tatoeba-MT-models/ine-ine/opus-2020-07-27.zip 0.322 https://object.pouta.csc.fi/Tatoeba-MT-models/ine-ine/opus-2020-07-21.zip diff --git a/scores/pol-ukr/flores101-dev/bleu-scores.txt b/scores/pol-ukr/flores101-dev/bleu-scores.txt index 70706e9f..ff2e2d33 100644 --- a/scores/pol-ukr/flores101-dev/bleu-scores.txt +++ b/scores/pol-ukr/flores101-dev/bleu-scores.txt @@ -1,4 +1,5 @@ 17.3 https://object.pouta.csc.fi/Tatoeba-MT-models/zlw-zle/opusTCv20210807+bt_transformer-big_2022-03-07.zip +15.6 https://object.pouta.csc.fi/Tatoeba-MT-models/pol-ukr/opusTCv20210807+bt+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-16.zip 11.9 https://object.pouta.csc.fi/Tatoeba-MT-models/pol-ukr/opus-2021-02-18.zip 11.7 
https://object.pouta.csc.fi/Tatoeba-MT-models/sla-sla/opus-2020-07-27.zip 11.6 https://object.pouta.csc.fi/Tatoeba-MT-models/sla-sla/opus-2020-10-04.zip diff --git a/scores/pol-ukr/flores101-dev/chrf-scores.txt b/scores/pol-ukr/flores101-dev/chrf-scores.txt index 0d84ecb2..9ab69f14 100644 --- a/scores/pol-ukr/flores101-dev/chrf-scores.txt +++ b/scores/pol-ukr/flores101-dev/chrf-scores.txt @@ -1,4 +1,5 @@ 0.46995 https://object.pouta.csc.fi/Tatoeba-MT-models/zlw-zle/opusTCv20210807+bt_transformer-big_2022-03-07.zip +0.46068 https://object.pouta.csc.fi/Tatoeba-MT-models/pol-ukr/opusTCv20210807+bt+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-16.zip 0.405 https://object.pouta.csc.fi/Tatoeba-MT-models/sla-sla/opus-2020-07-27.zip 0.404 https://object.pouta.csc.fi/Tatoeba-MT-models/sla-sla/opus-2020-09-26.zip 0.402 https://object.pouta.csc.fi/Tatoeba-MT-models/sla-sla/opus-2020-10-04.zip diff --git a/scores/pol-ukr/flores101-devtest/bleu-scores.txt b/scores/pol-ukr/flores101-devtest/bleu-scores.txt index 1766f478..1aedad75 100644 --- a/scores/pol-ukr/flores101-devtest/bleu-scores.txt +++ b/scores/pol-ukr/flores101-devtest/bleu-scores.txt @@ -1,4 +1,5 @@ 17.5 https://object.pouta.csc.fi/Tatoeba-MT-models/zlw-zle/opusTCv20210807+bt_transformer-big_2022-03-07.zip +15.7 https://object.pouta.csc.fi/Tatoeba-MT-models/pol-ukr/opusTCv20210807+bt+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-16.zip 12.3 https://object.pouta.csc.fi/Tatoeba-MT-models/sla-sla/opus-2020-07-27.zip 12.0 https://object.pouta.csc.fi/Tatoeba-MT-models/pol-ukr/opus-2021-02-18.zip 11.8 https://object.pouta.csc.fi/Tatoeba-MT-models/sla-sla/opus-2020-10-04.zip diff --git a/scores/pol-ukr/flores101-devtest/chrf-scores.txt b/scores/pol-ukr/flores101-devtest/chrf-scores.txt index 3f5452cc..cad19d1c 100644 --- a/scores/pol-ukr/flores101-devtest/chrf-scores.txt +++ b/scores/pol-ukr/flores101-devtest/chrf-scores.txt @@ -1,4 +1,5 @@ 0.47503 
https://object.pouta.csc.fi/Tatoeba-MT-models/zlw-zle/opusTCv20210807+bt_transformer-big_2022-03-07.zip +0.46388 https://object.pouta.csc.fi/Tatoeba-MT-models/pol-ukr/opusTCv20210807+bt+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-16.zip 0.411 https://object.pouta.csc.fi/Tatoeba-MT-models/sla-sla/opus-2020-07-27.zip 0.408 https://object.pouta.csc.fi/Tatoeba-MT-models/sla-sla/opus-2020-09-26.zip 0.407 https://object.pouta.csc.fi/Tatoeba-MT-models/sla-sla/opus-2020-07-21.zip diff --git a/scores/pol-ukr/tatoeba-test-v2020-07-28/bleu-scores.txt b/scores/pol-ukr/tatoeba-test-v2020-07-28/bleu-scores.txt index 9733a2e7..b8ffb9f8 100644 --- a/scores/pol-ukr/tatoeba-test-v2020-07-28/bleu-scores.txt +++ b/scores/pol-ukr/tatoeba-test-v2020-07-28/bleu-scores.txt @@ -1,4 +1,5 @@ 48.3 https://object.pouta.csc.fi/Tatoeba-MT-models/zlw-zle/opusTCv20210807+bt_transformer-big_2022-03-07.zip +48.1 https://object.pouta.csc.fi/Tatoeba-MT-models/pol-ukr/opusTCv20210807+bt+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-16.zip 47.1 https://object.pouta.csc.fi/Tatoeba-MT-models/pol-ukr/opus-2021-02-18.zip 46.7 https://object.pouta.csc.fi/Tatoeba-MT-models/sla-sla/opus-2020-07-27.zip 46.6 https://object.pouta.csc.fi/Tatoeba-MT-models/sla-sla/opus-2020-07-21.zip diff --git a/scores/pol-ukr/tatoeba-test-v2020-07-28/chrf-scores.txt b/scores/pol-ukr/tatoeba-test-v2020-07-28/chrf-scores.txt index c75e07c4..175c123b 100644 --- a/scores/pol-ukr/tatoeba-test-v2020-07-28/chrf-scores.txt +++ b/scores/pol-ukr/tatoeba-test-v2020-07-28/chrf-scores.txt @@ -1,3 +1,4 @@ +0.68443 https://object.pouta.csc.fi/Tatoeba-MT-models/pol-ukr/opusTCv20210807+bt+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-16.zip 0.67941 https://object.pouta.csc.fi/Tatoeba-MT-models/zlw-zle/opusTCv20210807+bt_transformer-big_2022-03-07.zip 0.664 https://object.pouta.csc.fi/Tatoeba-MT-models/sla-sla/opus-2020-07-21.zip 0.663 https://object.pouta.csc.fi/Tatoeba-MT-models/pol-ukr/opus-2021-02-18.zip diff --git 
a/scores/pol-ukr/tatoeba-test-v2021-03-30/bleu-scores.txt b/scores/pol-ukr/tatoeba-test-v2021-03-30/bleu-scores.txt index 5458f4d4..8d3ceb34 100644 --- a/scores/pol-ukr/tatoeba-test-v2021-03-30/bleu-scores.txt +++ b/scores/pol-ukr/tatoeba-test-v2021-03-30/bleu-scores.txt @@ -1,4 +1,5 @@ 48.3 https://object.pouta.csc.fi/Tatoeba-MT-models/zlw-zle/opusTCv20210807+bt_transformer-big_2022-03-07.zip +48.1 https://object.pouta.csc.fi/Tatoeba-MT-models/pol-ukr/opusTCv20210807+bt+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-16.zip 47.1 https://object.pouta.csc.fi/Tatoeba-MT-models/pol-ukr/opus-2021-02-18.zip 46.9 https://object.pouta.csc.fi/Tatoeba-MT-models/sla-sla/opus-2020-07-27.zip 46.6 https://object.pouta.csc.fi/Tatoeba-MT-models/sla-sla/opus-2020-07-21.zip diff --git a/scores/pol-ukr/tatoeba-test-v2021-03-30/chrf-scores.txt b/scores/pol-ukr/tatoeba-test-v2021-03-30/chrf-scores.txt index 120e8309..f4c8374a 100644 --- a/scores/pol-ukr/tatoeba-test-v2021-03-30/chrf-scores.txt +++ b/scores/pol-ukr/tatoeba-test-v2021-03-30/chrf-scores.txt @@ -1,3 +1,4 @@ +0.68463 https://object.pouta.csc.fi/Tatoeba-MT-models/pol-ukr/opusTCv20210807+bt+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-16.zip 0.67911 https://object.pouta.csc.fi/Tatoeba-MT-models/zlw-zle/opusTCv20210807+bt_transformer-big_2022-03-07.zip 0.664 https://object.pouta.csc.fi/Tatoeba-MT-models/sla-sla/opus-2020-07-27.zip 0.663 https://object.pouta.csc.fi/Tatoeba-MT-models/pol-ukr/opus-2021-02-18.zip diff --git a/scores/pol-ukr/tatoeba-test-v2021-08-07/bleu-scores.txt b/scores/pol-ukr/tatoeba-test-v2021-08-07/bleu-scores.txt index 081e4e40..05a249d7 100644 --- a/scores/pol-ukr/tatoeba-test-v2021-08-07/bleu-scores.txt +++ b/scores/pol-ukr/tatoeba-test-v2021-08-07/bleu-scores.txt @@ -1,4 +1,5 @@ 48.4 https://object.pouta.csc.fi/Tatoeba-MT-models/zlw-zle/opusTCv20210807+bt_transformer-big_2022-03-07.zip +48.2 
https://object.pouta.csc.fi/Tatoeba-MT-models/pol-ukr/opusTCv20210807+bt+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-16.zip 47.0 https://object.pouta.csc.fi/Tatoeba-MT-models/pol-ukr/opus-2021-02-18.zip 46.8 https://object.pouta.csc.fi/Tatoeba-MT-models/sla-sla/opus-2020-07-27.zip 46.6 https://object.pouta.csc.fi/Tatoeba-MT-models/sla-sla/opus-2020-07-21.zip diff --git a/scores/pol-ukr/tatoeba-test-v2021-08-07/chrf-scores.txt b/scores/pol-ukr/tatoeba-test-v2021-08-07/chrf-scores.txt index 1f2b9232..f1a3794b 100644 --- a/scores/pol-ukr/tatoeba-test-v2021-08-07/chrf-scores.txt +++ b/scores/pol-ukr/tatoeba-test-v2021-08-07/chrf-scores.txt @@ -1,3 +1,4 @@ +0.68493 https://object.pouta.csc.fi/Tatoeba-MT-models/pol-ukr/opusTCv20210807+bt+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-16.zip 0.67968 https://object.pouta.csc.fi/Tatoeba-MT-models/zlw-zle/opusTCv20210807+bt_transformer-big_2022-03-07.zip 0.664 https://object.pouta.csc.fi/Tatoeba-MT-models/sla-sla/opus-2020-07-21.zip 0.662 https://object.pouta.csc.fi/Tatoeba-MT-models/pol-ukr/opus-2021-02-18.zip diff --git a/scores/ron-ukr/flores101-dev/bleu-scores.txt b/scores/ron-ukr/flores101-dev/bleu-scores.txt index d9a1ee20..351c4ba0 100644 --- a/scores/ron-ukr/flores101-dev/bleu-scores.txt +++ b/scores/ron-ukr/flores101-dev/bleu-scores.txt @@ -1,4 +1,5 @@ 23.4 https://object.pouta.csc.fi/Tatoeba-MT-models/ron-ukr/opusTCv20210807+pbt_transformer-align_2022-03-08.zip +21.6 https://object.pouta.csc.fi/Tatoeba-MT-models/ron-ukr/opusTCv20210807+bt+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-16.zip 13.5 https://object.pouta.csc.fi/Tatoeba-MT-models/roa-zle/opus1m-2021-02-18.zip 9.2 https://object.pouta.csc.fi/Tatoeba-MT-models/ine-ine/opus-2020-07-27.zip 8.9 https://object.pouta.csc.fi/Tatoeba-MT-models/ine-ine/opus-2020-07-21.zip diff --git a/scores/ron-ukr/flores101-dev/chrf-scores.txt b/scores/ron-ukr/flores101-dev/chrf-scores.txt index 468883a0..d5d9c1a1 100644 --- a/scores/ron-ukr/flores101-dev/chrf-scores.txt 
+++ b/scores/ron-ukr/flores101-dev/chrf-scores.txt @@ -1,4 +1,5 @@ 0.52131 https://object.pouta.csc.fi/Tatoeba-MT-models/ron-ukr/opusTCv20210807+pbt_transformer-align_2022-03-08.zip +0.50866 https://object.pouta.csc.fi/Tatoeba-MT-models/ron-ukr/opusTCv20210807+bt+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-16.zip 0.423 https://object.pouta.csc.fi/Tatoeba-MT-models/roa-zle/opus1m-2021-02-18.zip 0.365 https://object.pouta.csc.fi/Tatoeba-MT-models/ine-ine/opus-2020-07-27.zip 0.359 https://object.pouta.csc.fi/Tatoeba-MT-models/ine-ine/opus-2020-07-21.zip diff --git a/scores/ron-ukr/flores101-devtest/bleu-scores.txt b/scores/ron-ukr/flores101-devtest/bleu-scores.txt index 804782ac..a21bb3a7 100644 --- a/scores/ron-ukr/flores101-devtest/bleu-scores.txt +++ b/scores/ron-ukr/flores101-devtest/bleu-scores.txt @@ -1,4 +1,5 @@ 22.3 https://object.pouta.csc.fi/Tatoeba-MT-models/ron-ukr/opusTCv20210807+pbt_transformer-align_2022-03-08.zip +21.7 https://object.pouta.csc.fi/Tatoeba-MT-models/ron-ukr/opusTCv20210807+bt+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-16.zip 13.8 https://object.pouta.csc.fi/Tatoeba-MT-models/roa-zle/opus1m-2021-02-18.zip 9.4 https://object.pouta.csc.fi/Tatoeba-MT-models/ine-ine/opus-2020-07-27.zip 8.6 https://object.pouta.csc.fi/Tatoeba-MT-models/ine-ine/opus-2020-07-21.zip diff --git a/scores/ron-ukr/flores101-devtest/chrf-scores.txt b/scores/ron-ukr/flores101-devtest/chrf-scores.txt index d0c4939e..c2f1ceb7 100644 --- a/scores/ron-ukr/flores101-devtest/chrf-scores.txt +++ b/scores/ron-ukr/flores101-devtest/chrf-scores.txt @@ -1,4 +1,5 @@ 0.52391 https://object.pouta.csc.fi/Tatoeba-MT-models/ron-ukr/opusTCv20210807+pbt_transformer-align_2022-03-08.zip +0.51692 https://object.pouta.csc.fi/Tatoeba-MT-models/ron-ukr/opusTCv20210807+bt+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-16.zip 0.427 https://object.pouta.csc.fi/Tatoeba-MT-models/roa-zle/opus1m-2021-02-18.zip 0.368 https://object.pouta.csc.fi/Tatoeba-MT-models/ine-ine/opus-2020-07-27.zip 
0.359 https://object.pouta.csc.fi/Tatoeba-MT-models/ine-ine/opus-2020-07-21.zip diff --git a/scores/swe-ukr/flores101-dev/bleu-scores.txt b/scores/swe-ukr/flores101-dev/bleu-scores.txt index 1f130ac6..5b44203e 100644 --- a/scores/swe-ukr/flores101-dev/bleu-scores.txt +++ b/scores/swe-ukr/flores101-dev/bleu-scores.txt @@ -1,3 +1,4 @@ 23.5 https://object.pouta.csc.fi/Tatoeba-MT-models/swe-ukr/opusTCv20210807+pbt_transformer-align_2022-03-07.zip +21.2 https://object.pouta.csc.fi/Tatoeba-MT-models/swe-ukr/opusTCv20210807+bt+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-16.zip 9.6 https://object.pouta.csc.fi/Tatoeba-MT-models/ine-ine/opus-2020-07-27.zip 8.9 https://object.pouta.csc.fi/Tatoeba-MT-models/ine-ine/opus-2020-07-21.zip diff --git a/scores/swe-ukr/flores101-dev/chrf-scores.txt b/scores/swe-ukr/flores101-dev/chrf-scores.txt index 1466b91f..b021d1b0 100644 --- a/scores/swe-ukr/flores101-dev/chrf-scores.txt +++ b/scores/swe-ukr/flores101-dev/chrf-scores.txt @@ -1,3 +1,4 @@ 0.52149 https://object.pouta.csc.fi/Tatoeba-MT-models/swe-ukr/opusTCv20210807+pbt_transformer-align_2022-03-07.zip +0.50707 https://object.pouta.csc.fi/Tatoeba-MT-models/swe-ukr/opusTCv20210807+bt+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-16.zip 0.359 https://object.pouta.csc.fi/Tatoeba-MT-models/ine-ine/opus-2020-07-27.zip 0.348 https://object.pouta.csc.fi/Tatoeba-MT-models/ine-ine/opus-2020-07-21.zip diff --git a/scores/swe-ukr/flores101-devtest/bleu-scores.txt b/scores/swe-ukr/flores101-devtest/bleu-scores.txt index bef97e74..d39c1f18 100644 --- a/scores/swe-ukr/flores101-devtest/bleu-scores.txt +++ b/scores/swe-ukr/flores101-devtest/bleu-scores.txt @@ -1,3 +1,4 @@ 24.8 https://object.pouta.csc.fi/Tatoeba-MT-models/swe-ukr/opusTCv20210807+pbt_transformer-align_2022-03-07.zip +21.7 https://object.pouta.csc.fi/Tatoeba-MT-models/swe-ukr/opusTCv20210807+bt+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-16.zip 9.1 https://object.pouta.csc.fi/Tatoeba-MT-models/ine-ine/opus-2020-07-27.zip 
8.6 https://object.pouta.csc.fi/Tatoeba-MT-models/ine-ine/opus-2020-07-21.zip diff --git a/scores/swe-ukr/flores101-devtest/chrf-scores.txt b/scores/swe-ukr/flores101-devtest/chrf-scores.txt index cb2efb24..0b2330bb 100644 --- a/scores/swe-ukr/flores101-devtest/chrf-scores.txt +++ b/scores/swe-ukr/flores101-devtest/chrf-scores.txt @@ -1,3 +1,4 @@ 0.53176 https://object.pouta.csc.fi/Tatoeba-MT-models/swe-ukr/opusTCv20210807+pbt_transformer-align_2022-03-07.zip +0.50968 https://object.pouta.csc.fi/Tatoeba-MT-models/swe-ukr/opusTCv20210807+bt+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-16.zip 0.363 https://object.pouta.csc.fi/Tatoeba-MT-models/ine-ine/opus-2020-07-27.zip 0.351 https://object.pouta.csc.fi/Tatoeba-MT-models/ine-ine/opus-2020-07-21.zip diff --git a/scores/ukr-fin/flores101-dev/bleu-scores.txt b/scores/ukr-fin/flores101-dev/bleu-scores.txt index 1f5f5e90..720a221d 100644 --- a/scores/ukr-fin/flores101-dev/bleu-scores.txt +++ b/scores/ukr-fin/flores101-dev/bleu-scores.txt @@ -1,5 +1,6 @@ 18.9 https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-fin/opusTCv20210807+pft_transformer-align_2022-03-07.zip 18.7 https://object.pouta.csc.fi/Tatoeba-MT-models/zle-fin/opusTCv20210807+bt_transformer-big_2022-03-07.zip +16.8 https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-fin/opusTCv20210807+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-16.zip 11.9 https://object.pouta.csc.fi/Tatoeba-MT-models/sla-fiu/opus-2021-02-10.zip 11.6 https://object.pouta.csc.fi/Tatoeba-MT-models/zle-fiu/opus-2021-02-10.zip 6.0 https://object.pouta.csc.fi/Tatoeba-MT-models/tatoeba-zero/opus-2020-06-21.zip diff --git a/scores/ukr-fin/flores101-dev/chrf-scores.txt b/scores/ukr-fin/flores101-dev/chrf-scores.txt index 7a3044f4..5423db02 100644 --- a/scores/ukr-fin/flores101-dev/chrf-scores.txt +++ b/scores/ukr-fin/flores101-dev/chrf-scores.txt @@ -1,5 +1,6 @@ 0.53196 https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-fin/opusTCv20210807+pft_transformer-align_2022-03-07.zip 0.53075 
https://object.pouta.csc.fi/Tatoeba-MT-models/zle-fin/opusTCv20210807+bt_transformer-big_2022-03-07.zip +0.51350 https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-fin/opusTCv20210807+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-16.zip 0.448 https://object.pouta.csc.fi/Tatoeba-MT-models/sla-fiu/opus-2021-02-10.zip 0.439 https://object.pouta.csc.fi/Tatoeba-MT-models/zle-fiu/opus-2021-02-10.zip 0.354 https://object.pouta.csc.fi/Tatoeba-MT-models/tatoeba-zero/opus-2020-06-21.zip diff --git a/scores/ukr-fin/flores101-devtest/bleu-scores.txt b/scores/ukr-fin/flores101-devtest/bleu-scores.txt index ba073203..7f42641f 100644 --- a/scores/ukr-fin/flores101-devtest/bleu-scores.txt +++ b/scores/ukr-fin/flores101-devtest/bleu-scores.txt @@ -1,5 +1,6 @@ 18.7 https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-fin/opusTCv20210807+pft_transformer-align_2022-03-07.zip 18.0 https://object.pouta.csc.fi/Tatoeba-MT-models/zle-fin/opusTCv20210807+bt_transformer-big_2022-03-07.zip +16.7 https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-fin/opusTCv20210807+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-16.zip 11.6 https://object.pouta.csc.fi/Tatoeba-MT-models/sla-fiu/opus-2021-02-10.zip 11.5 https://object.pouta.csc.fi/Tatoeba-MT-models/zle-fiu/opus-2021-02-10.zip 5.6 https://object.pouta.csc.fi/Tatoeba-MT-models/tatoeba-zero/opus-2020-06-19.zip diff --git a/scores/ukr-fin/flores101-devtest/chrf-scores.txt b/scores/ukr-fin/flores101-devtest/chrf-scores.txt index 0057d219..4d17379a 100644 --- a/scores/ukr-fin/flores101-devtest/chrf-scores.txt +++ b/scores/ukr-fin/flores101-devtest/chrf-scores.txt @@ -1,5 +1,6 @@ 0.54119 https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-fin/opusTCv20210807+pft_transformer-align_2022-03-07.zip 0.53440 https://object.pouta.csc.fi/Tatoeba-MT-models/zle-fin/opusTCv20210807+bt_transformer-big_2022-03-07.zip +0.52174 https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-fin/opusTCv20210807+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-16.zip 0.456 
https://object.pouta.csc.fi/Tatoeba-MT-models/sla-fiu/opus-2021-02-10.zip 0.447 https://object.pouta.csc.fi/Tatoeba-MT-models/zle-fiu/opus-2021-02-10.zip 0.357 https://object.pouta.csc.fi/Tatoeba-MT-models/tatoeba-zero/opus-2020-06-19.zip diff --git a/scores/ukr-hun/flores101-dev/bleu-scores.txt b/scores/ukr-hun/flores101-dev/bleu-scores.txt index 13793354..952c632e 100644 --- a/scores/ukr-hun/flores101-dev/bleu-scores.txt +++ b/scores/ukr-hun/flores101-dev/bleu-scores.txt @@ -1,3 +1,4 @@ 21.2 https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-hun/opusTCv20210807+pft_transformer-align_2022-03-08.zip +19.6 https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-hun/opusTCv20210807+bt+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-16.zip 14.2 https://object.pouta.csc.fi/Tatoeba-MT-models/sla-fiu/opus-2021-02-10.zip 13.5 https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-hun/opus-2021-02-18.zip diff --git a/scores/ukr-hun/flores101-dev/chrf-scores.txt b/scores/ukr-hun/flores101-dev/chrf-scores.txt index 840f5370..974580b0 100644 --- a/scores/ukr-hun/flores101-dev/chrf-scores.txt +++ b/scores/ukr-hun/flores101-dev/chrf-scores.txt @@ -1,4 +1,5 @@ 0.52022 https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-hun/opusTCv20210807+pft_transformer-align_2022-03-08.zip +0.50315 https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-hun/opusTCv20210807+bt+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-16.zip 0.443 https://object.pouta.csc.fi/Tatoeba-MT-models/sla-fiu/opus-2021-02-10.zip 0.433 https://object.pouta.csc.fi/Tatoeba-MT-models/zle-fiu/opus-2021-02-10.zip 0.432 https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-hun/opus-2021-02-18.zip diff --git a/scores/ukr-hun/flores101-devtest/bleu-scores.txt b/scores/ukr-hun/flores101-devtest/bleu-scores.txt index 70ce3e71..016247cd 100644 --- a/scores/ukr-hun/flores101-devtest/bleu-scores.txt +++ b/scores/ukr-hun/flores101-devtest/bleu-scores.txt @@ -1,4 +1,5 @@ 20.2 
https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-hun/opusTCv20210807+pft_transformer-align_2022-03-08.zip +18.8 https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-hun/opusTCv20210807+bt+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-16.zip 14.3 https://object.pouta.csc.fi/Tatoeba-MT-models/sla-fiu/opus-2021-02-10.zip 13.3 https://object.pouta.csc.fi/Tatoeba-MT-models/zle-fiu/opus-2021-02-10.zip 13.1 https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-hun/opus-2021-02-18.zip diff --git a/scores/ukr-hun/flores101-devtest/chrf-scores.txt b/scores/ukr-hun/flores101-devtest/chrf-scores.txt index 43eebf37..b7a0c543 100644 --- a/scores/ukr-hun/flores101-devtest/chrf-scores.txt +++ b/scores/ukr-hun/flores101-devtest/chrf-scores.txt @@ -1,4 +1,5 @@ 0.51953 https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-hun/opusTCv20210807+pft_transformer-align_2022-03-08.zip +0.50442 https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-hun/opusTCv20210807+bt+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-16.zip 0.452 https://object.pouta.csc.fi/Tatoeba-MT-models/sla-fiu/opus-2021-02-10.zip 0.441 https://object.pouta.csc.fi/Tatoeba-MT-models/zle-fiu/opus-2021-02-10.zip 0.438 https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-hun/opus-2021-02-18.zip diff --git a/scores/ukr-hun/tatoeba-test-v2020-07-28/bleu-scores.txt b/scores/ukr-hun/tatoeba-test-v2020-07-28/bleu-scores.txt index dcee9121..1db774a8 100644 --- a/scores/ukr-hun/tatoeba-test-v2020-07-28/bleu-scores.txt +++ b/scores/ukr-hun/tatoeba-test-v2020-07-28/bleu-scores.txt @@ -1,4 +1,5 @@ 43.9 https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-hun/opusTCv20210807+pft_transformer-align_2022-03-08.zip +42.4 https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-hun/opusTCv20210807+bt+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-16.zip 41.1 https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-hun/opus-2021-02-18.zip 40.0 https://object.pouta.csc.fi/Tatoeba-MT-models/sla-fiu/opus-2021-02-10.zip 39.6 
https://object.pouta.csc.fi/Tatoeba-MT-models/zle-fiu/opus-2021-02-10.zip diff --git a/scores/ukr-hun/tatoeba-test-v2020-07-28/chrf-scores.txt b/scores/ukr-hun/tatoeba-test-v2020-07-28/chrf-scores.txt index 971665a1..10fa1a07 100644 --- a/scores/ukr-hun/tatoeba-test-v2020-07-28/chrf-scores.txt +++ b/scores/ukr-hun/tatoeba-test-v2020-07-28/chrf-scores.txt @@ -1,3 +1,4 @@ 0.67495 https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-hun/opusTCv20210807+pft_transformer-align_2022-03-08.zip +0.66780 https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-hun/opusTCv20210807+bt+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-16.zip 0.646 https://object.pouta.csc.fi/Tatoeba-MT-models/sla-fiu/opus-2021-02-10.zip 0.645 https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-hun/opus-2021-02-18.zip diff --git a/scores/ukr-hun/tatoeba-test-v2021-03-30/bleu-scores.txt b/scores/ukr-hun/tatoeba-test-v2021-03-30/bleu-scores.txt index 72d58eeb..ff242dae 100644 --- a/scores/ukr-hun/tatoeba-test-v2021-03-30/bleu-scores.txt +++ b/scores/ukr-hun/tatoeba-test-v2021-03-30/bleu-scores.txt @@ -1,4 +1,5 @@ 43.8 https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-hun/opusTCv20210807+pft_transformer-align_2022-03-08.zip +42.2 https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-hun/opusTCv20210807+bt+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-16.zip 40.7 https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-hun/opus-2021-02-18.zip 39.9 https://object.pouta.csc.fi/Tatoeba-MT-models/sla-fiu/opus-2021-02-10.zip 39.3 https://object.pouta.csc.fi/Tatoeba-MT-models/zle-fiu/opus-2021-02-10.zip diff --git a/scores/ukr-hun/tatoeba-test-v2021-03-30/chrf-scores.txt b/scores/ukr-hun/tatoeba-test-v2021-03-30/chrf-scores.txt index 4d64fa31..5e359095 100644 --- a/scores/ukr-hun/tatoeba-test-v2021-03-30/chrf-scores.txt +++ b/scores/ukr-hun/tatoeba-test-v2021-03-30/chrf-scores.txt @@ -1,4 +1,5 @@ 0.67383 https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-hun/opusTCv20210807+pft_transformer-align_2022-03-08.zip +0.66714 
https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-hun/opusTCv20210807+bt+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-16.zip 0.645 https://object.pouta.csc.fi/Tatoeba-MT-models/sla-fiu/opus-2021-02-10.zip 0.643 https://object.pouta.csc.fi/Tatoeba-MT-models/zle-fiu/opus-2021-02-10.zip 0.642 https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-hun/opus-2021-02-18.zip diff --git a/scores/ukr-hun/tatoeba-test-v2021-08-07/bleu-scores.txt b/scores/ukr-hun/tatoeba-test-v2021-08-07/bleu-scores.txt index 4202bdbc..6fafe041 100644 --- a/scores/ukr-hun/tatoeba-test-v2021-08-07/bleu-scores.txt +++ b/scores/ukr-hun/tatoeba-test-v2021-08-07/bleu-scores.txt @@ -1,4 +1,5 @@ 44.0 https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-hun/opusTCv20210807+pft_transformer-align_2022-03-08.zip +42.5 https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-hun/opusTCv20210807+bt+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-16.zip 41.4 https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-hun/opus-2021-02-18.zip 40.1 https://object.pouta.csc.fi/Tatoeba-MT-models/sla-fiu/opus-2021-02-10.zip 39.8 https://object.pouta.csc.fi/Tatoeba-MT-models/zle-fiu/opus-2021-02-10.zip diff --git a/scores/ukr-hun/tatoeba-test-v2021-08-07/chrf-scores.txt b/scores/ukr-hun/tatoeba-test-v2021-08-07/chrf-scores.txt index 96ba4557..9f51b5c5 100644 --- a/scores/ukr-hun/tatoeba-test-v2021-08-07/chrf-scores.txt +++ b/scores/ukr-hun/tatoeba-test-v2021-08-07/chrf-scores.txt @@ -1,3 +1,4 @@ 0.67544 https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-hun/opusTCv20210807+pft_transformer-align_2022-03-08.zip +0.66840 https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-hun/opusTCv20210807+bt+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-16.zip 0.647 https://object.pouta.csc.fi/Tatoeba-MT-models/sla-fiu/opus-2021-02-10.zip 0.646 https://object.pouta.csc.fi/Tatoeba-MT-models/zle-fiu/opus-2021-02-10.zip diff --git a/scores/ukr-lit/flores101-dev/bleu-scores.txt b/scores/ukr-lit/flores101-dev/bleu-scores.txt index 095f9629..c8d51805 100644 
--- a/scores/ukr-lit/flores101-dev/bleu-scores.txt +++ b/scores/ukr-lit/flores101-dev/bleu-scores.txt @@ -1,2 +1,3 @@ +20.6 https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-lit/opusTCv20210807+bt+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-17.zip 7.8 https://object.pouta.csc.fi/Tatoeba-MT-models/ine-ine/opus-2020-07-27.zip 7.6 https://object.pouta.csc.fi/Tatoeba-MT-models/ine-ine/opus-2020-07-21.zip diff --git a/scores/ukr-lit/flores101-dev/chrf-scores.txt b/scores/ukr-lit/flores101-dev/chrf-scores.txt index bf6a5ba6..134e953d 100644 --- a/scores/ukr-lit/flores101-dev/chrf-scores.txt +++ b/scores/ukr-lit/flores101-dev/chrf-scores.txt @@ -1,2 +1,3 @@ +0.52788 https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-lit/opusTCv20210807+bt+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-17.zip 0.367 https://object.pouta.csc.fi/Tatoeba-MT-models/ine-ine/opus-2020-07-27.zip 0.358 https://object.pouta.csc.fi/Tatoeba-MT-models/ine-ine/opus-2020-07-21.zip diff --git a/scores/ukr-lit/flores101-devtest/bleu-scores.txt b/scores/ukr-lit/flores101-devtest/bleu-scores.txt index 8f826ef6..b367d260 100644 --- a/scores/ukr-lit/flores101-devtest/bleu-scores.txt +++ b/scores/ukr-lit/flores101-devtest/bleu-scores.txt @@ -1,2 +1,3 @@ +21.0 https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-lit/opusTCv20210807+bt+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-17.zip 8.0 https://object.pouta.csc.fi/Tatoeba-MT-models/ine-ine/opus-2020-07-27.zip 7.6 https://object.pouta.csc.fi/Tatoeba-MT-models/ine-ine/opus-2020-07-21.zip diff --git a/scores/ukr-lit/flores101-devtest/chrf-scores.txt b/scores/ukr-lit/flores101-devtest/chrf-scores.txt index c039338d..143040e8 100644 --- a/scores/ukr-lit/flores101-devtest/chrf-scores.txt +++ b/scores/ukr-lit/flores101-devtest/chrf-scores.txt @@ -1,2 +1,3 @@ +0.53907 https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-lit/opusTCv20210807+bt+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-17.zip 0.372 
https://object.pouta.csc.fi/Tatoeba-MT-models/ine-ine/opus-2020-07-27.zip 0.362 https://object.pouta.csc.fi/Tatoeba-MT-models/ine-ine/opus-2020-07-21.zip diff --git a/scores/ukr-ron/flores101-dev/bleu-scores.txt b/scores/ukr-ron/flores101-dev/bleu-scores.txt index b8a1b3a7..1ef7552e 100644 --- a/scores/ukr-ron/flores101-dev/bleu-scores.txt +++ b/scores/ukr-ron/flores101-dev/bleu-scores.txt @@ -1,4 +1,5 @@ 27.8 https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-ron/opusTCv20210807+pft_transformer-align_2022-03-08.zip +27.6 https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-ron/opusTCv20210807+bt+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-16.zip 15.3 https://object.pouta.csc.fi/Tatoeba-MT-models/zle-roa/opus1m-2021-02-18.zip 9.8 https://object.pouta.csc.fi/Tatoeba-MT-models/ine-ine/opus-2020-07-27.zip 9.1 https://object.pouta.csc.fi/Tatoeba-MT-models/ine-ine/opus-2020-07-21.zip diff --git a/scores/ukr-ron/flores101-dev/chrf-scores.txt b/scores/ukr-ron/flores101-dev/chrf-scores.txt index 19989f05..8166f836 100644 --- a/scores/ukr-ron/flores101-dev/chrf-scores.txt +++ b/scores/ukr-ron/flores101-dev/chrf-scores.txt @@ -1,4 +1,5 @@ 0.55091 https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-ron/opusTCv20210807+pft_transformer-align_2022-03-08.zip +0.54743 https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-ron/opusTCv20210807+bt+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-16.zip 0.448 https://object.pouta.csc.fi/Tatoeba-MT-models/zle-roa/opus1m-2021-02-18.zip 0.380 https://object.pouta.csc.fi/Tatoeba-MT-models/ine-ine/opus-2020-07-27.zip 0.371 https://object.pouta.csc.fi/Tatoeba-MT-models/ine-ine/opus-2020-07-21.zip diff --git a/scores/ukr-ron/flores101-devtest/bleu-scores.txt b/scores/ukr-ron/flores101-devtest/bleu-scores.txt index 46bfba42..3229171d 100644 --- a/scores/ukr-ron/flores101-devtest/bleu-scores.txt +++ b/scores/ukr-ron/flores101-devtest/bleu-scores.txt @@ -1,4 +1,5 @@ 27.7 
https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-ron/opusTCv20210807+pft_transformer-align_2022-03-08.zip +26.8 https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-ron/opusTCv20210807+bt+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-16.zip 15.3 https://object.pouta.csc.fi/Tatoeba-MT-models/zle-roa/opus1m-2021-02-18.zip 9.9 https://object.pouta.csc.fi/Tatoeba-MT-models/ine-ine/opus-2020-07-27.zip 9.3 https://object.pouta.csc.fi/Tatoeba-MT-models/ine-ine/opus-2020-07-21.zip diff --git a/scores/ukr-ron/flores101-devtest/chrf-scores.txt b/scores/ukr-ron/flores101-devtest/chrf-scores.txt index 5ca0e7bc..7a3509a3 100644 --- a/scores/ukr-ron/flores101-devtest/chrf-scores.txt +++ b/scores/ukr-ron/flores101-devtest/chrf-scores.txt @@ -1,4 +1,5 @@ 0.55343 https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-ron/opusTCv20210807+pft_transformer-align_2022-03-08.zip +0.54673 https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-ron/opusTCv20210807+bt+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-16.zip 0.449 https://object.pouta.csc.fi/Tatoeba-MT-models/zle-roa/opus1m-2021-02-18.zip 0.379 https://object.pouta.csc.fi/Tatoeba-MT-models/ine-ine/opus-2020-07-27.zip 0.369 https://object.pouta.csc.fi/Tatoeba-MT-models/ine-ine/opus-2020-07-21.zip diff --git a/scores/ukr-swe/flores101-dev/bleu-scores.txt b/scores/ukr-swe/flores101-dev/bleu-scores.txt index c25b749b..47cd6b72 100644 --- a/scores/ukr-swe/flores101-dev/bleu-scores.txt +++ b/scores/ukr-swe/flores101-dev/bleu-scores.txt @@ -1,3 +1,4 @@ 28.6 https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-swe/opusTCv20210807+pft_transformer-align_2022-03-07.zip +25.9 https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-swe/opusTCv20210807+bt+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-16.zip 13.2 https://object.pouta.csc.fi/Tatoeba-MT-models/ine-ine/opus-2020-07-27.zip 12.4 https://object.pouta.csc.fi/Tatoeba-MT-models/ine-ine/opus-2020-07-21.zip diff --git a/scores/ukr-swe/flores101-dev/chrf-scores.txt 
b/scores/ukr-swe/flores101-dev/chrf-scores.txt index e904636a..10fc2c7e 100644 --- a/scores/ukr-swe/flores101-dev/chrf-scores.txt +++ b/scores/ukr-swe/flores101-dev/chrf-scores.txt @@ -1,3 +1,4 @@ 0.57272 https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-swe/opusTCv20210807+pft_transformer-align_2022-03-07.zip +0.55350 https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-swe/opusTCv20210807+bt+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-16.zip 0.411 https://object.pouta.csc.fi/Tatoeba-MT-models/ine-ine/opus-2020-07-27.zip 0.401 https://object.pouta.csc.fi/Tatoeba-MT-models/ine-ine/opus-2020-07-21.zip diff --git a/scores/ukr-swe/flores101-devtest/bleu-scores.txt b/scores/ukr-swe/flores101-devtest/bleu-scores.txt index 1837318a..8ed244b9 100644 --- a/scores/ukr-swe/flores101-devtest/bleu-scores.txt +++ b/scores/ukr-swe/flores101-devtest/bleu-scores.txt @@ -1,3 +1,4 @@ 28.2 https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-swe/opusTCv20210807+pft_transformer-align_2022-03-07.zip +25.5 https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-swe/opusTCv20210807+bt+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-16.zip 12.3 https://object.pouta.csc.fi/Tatoeba-MT-models/ine-ine/opus-2020-07-27.zip 11.6 https://object.pouta.csc.fi/Tatoeba-MT-models/ine-ine/opus-2020-07-21.zip diff --git a/scores/ukr-swe/flores101-devtest/chrf-scores.txt b/scores/ukr-swe/flores101-devtest/chrf-scores.txt index ad310c75..a011dcd9 100644 --- a/scores/ukr-swe/flores101-devtest/chrf-scores.txt +++ b/scores/ukr-swe/flores101-devtest/chrf-scores.txt @@ -1,3 +1,4 @@ 0.57231 https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-swe/opusTCv20210807+pft_transformer-align_2022-03-07.zip +0.55204 https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-swe/opusTCv20210807+bt+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-16.zip 0.406 https://object.pouta.csc.fi/Tatoeba-MT-models/ine-ine/opus-2020-07-27.zip 0.396 https://object.pouta.csc.fi/Tatoeba-MT-models/ine-ine/opus-2020-07-21.zip diff --git a/tatoeba/Makefile 
b/tatoeba/Makefile index a577d500..9cad9232 100644 --- a/tatoeba/Makefile +++ b/tatoeba/Makefile @@ -121,7 +121,6 @@ include ${REPOHOME}lib/projects/distill.mk include ${REPOHOME}lib/projects/elg.mk - .PHONY: all all: ${MAKE} tatoeba-prepare diff --git a/tatoeba/back-translate/Makefile b/tatoeba/back-translate/Makefile index 55d12ba3..b1cfbd39 100644 --- a/tatoeba/back-translate/Makefile +++ b/tatoeba/back-translate/Makefile @@ -23,8 +23,8 @@ TRG = eng TATOEBA_RELEASE = v2020-07-28 TATOEBA_STORAGE = https://object.pouta.csc.fi/Tatoeba-Challenge-${TATOEBA_RELEASE} TATOEBA_WIKI_STORAGE = https://object.pouta.csc.fi/Tatoeba-Challenge-WikiShuffled -# TATOEBA_GITRAW = https://raw.githubusercontent.com/Helsinki-NLP/Tatoeba-Challenge/master -TATOEBA_GITRAW = https://github.com/Helsinki-NLP/Tatoeba-Challenge/raw +TATOEBA_GITRAW = https://raw.githubusercontent.com/Helsinki-NLP/Tatoeba-Challenge/master +# TATOEBA_GITRAW = https://github.com/Helsinki-NLP/Tatoeba-Challenge/raw TATOEBA_RELEASED = ${TATOEBA_GITRAW}/models/released-model-results.txt TATOEBA_RELEASED_ALL = ${TATOEBA_GITRAW}/models/released-model-results-all.txt TATOEBA_RELEASED_BT = https://object.pouta.csc.fi/Tatoeba-MT-bt/released-data.txt @@ -60,13 +60,13 @@ PWD := $(shell pwd) # MODELZIP = https://object.pouta.csc.fi/Tatoeba-Challenge/ang.tar -MODELZIP := ${shell wget -qq -O - ${TATOEBA_RELEASED_ALL} | grep '^${LANGPAIR}' | head -1 | cut -f4} +MODELZIP := ${shell ${WGET} -qq -O - ${TATOEBA_RELEASED_ALL} | grep '^${LANGPAIR}' | head -1 | cut -f4} MODELINFO = ${patsubst ${TATOEBA_MODEL_STORAGE}/%.zip,${TATOEBA_GITRAW}/models/%.yml,${MODELZIP}} MODELNAME = ${patsubst %.zip,%,${notdir ${MODELZIP}}} -MULTI_TARGET_MODEL := ${shell wget -qq -O - ${MODELINFO} | grep 'use-target-labels' | wc -l} +MULTI_TARGET_MODEL := ${shell ${WGET} -qq -O - ${MODELINFO} | grep 'use-target-labels' | wc -l} ifneq (${MULTI_TARGET_MODEL},0) - TARGET_LANG_LABEL := ${shell wget -qq -O - ${MODELINFO} | grep -o '>>${TRG}.*<<'} + 
TARGET_LANG_LABEL := ${shell ${WGET} -qq -O - ${MODELINFO} | grep -o '>>${TRG}.*<<'} endif @@ -78,11 +78,11 @@ TATOEBA_MACRO_LANGS = hbs nor msa ## target languages of reliable models for current source language ## reliable is defined as BLEU scores above 20.0 ## -TATOEBA_RELIABLE_TRG_BLEU := ${shell wget -qq -O - ${TATOEBA_RELEASED} | grep '^${SRC}-' | \ +TATOEBA_RELIABLE_TRG_BLEU := ${shell ${WGET} -qq -O - ${TATOEBA_RELEASED} | grep '^${SRC}-' | \ egrep '\s[2-9][0-9]\.' | cut -f1 | sort -u | cut -f2 -d-} ## alternative: chr-F2 >= 0.4 -TATOEBA_RELIABLE_TRG_CHRF := ${shell wget -qq -O - ${TATOEBA_RELEASED} | grep '^${SRC}-' | \ +TATOEBA_RELIABLE_TRG_CHRF := ${shell ${WGET} -qq -O - ${TATOEBA_RELEASED} | grep '^${SRC}-' | \ egrep '[a-z]\s0\.[4-9]' | cut -f1 | sort -u | cut -f2 -d-} ## accept both @@ -95,12 +95,12 @@ TATOEBA_RELIABLE_TRG = $(filter-out ${TATOEBA_MACRO_LANGS},$(sort ${TATOEBA_RELI ##################################################################################### ## all "reliable" released tanslation models -# TATOEBA_AVAILABLE_NMT := ${shell wget -qq -O - ${TATOEBA_RELEASED} | egrep '\s[2-9][0-9]\.' | cut -f1 | sort -u} +# TATOEBA_AVAILABLE_NMT := ${shell ${WGET} -qq -O - ${TATOEBA_RELEASED} | egrep '\s[2-9][0-9]\.' | cut -f1 | sort -u} -TATOEBA_RELIABLE_SRC_BLEU := ${shell wget -qq -O - ${TATOEBA_RELEASED} | grep -- '-${TRG} ' | \ +TATOEBA_RELIABLE_SRC_BLEU := ${shell ${WGET} -qq -O - ${TATOEBA_RELEASED} | grep -- '-${TRG} ' | \ egrep '\s[2-9][0-9]\.' 
| cut -f1 | sort -u | cut -f1 -d-} -TATOEBA_RELIABLE_SRC_CHRF := ${shell wget -qq -O - ${TATOEBA_RELEASED} | grep -- '-${TRG} ' | \ +TATOEBA_RELIABLE_SRC_CHRF := ${shell ${WGET} -qq -O - ${TATOEBA_RELEASED} | grep -- '-${TRG} ' | \ egrep '[a-z]\s0\.[4-9]' | cut -f1 | sort -u | cut -f1 -d-} TATOEBA_RELIABLE_SRC = $(sort ${TATOEBA_RELIABLE_SRC_BLEU} ${TATOEBA_RELIABLE_SRC_CHRF}) @@ -108,7 +108,7 @@ TATOEBA_RELIABLE_SRC = $(sort ${TATOEBA_RELIABLE_SRC_BLEU} ${TATOEBA_RELIABLE_SR ## TODO: is it OK to turn zho into cmn? ## NOTE: also needs to fix the grep pattern in recipe for ${WIKI_DIR}/${SRC} !!!! -TATOEBA_WIKILANGS := ${shell wget -qq -O - ${TATOEBA_GITRAW}/data/release/${TATOEBA_RELEASE}/wiki.langs.txt | \ +TATOEBA_WIKILANGS := ${shell ${WGET} -qq -O - ${TATOEBA_GITRAW}/data/release/${TATOEBA_RELEASE}/wiki.langs.txt | \ cut -f2 | sed 's/zho/cmn/' | sed 's/nor.*/nob/' | sort -u } TATOEBA_TRANSLATABLE_WIKILANGS := ${filter ${TATOEBA_RELIABLE_SRC},${TATOEBA_WIKILANGS}} @@ -129,7 +129,7 @@ print-wikilangs: ### OBSOLETE?? 
## languages of released wikis -RELEASED_WIKIS := $(patsubst %.tar,%,${shell wget -qq -O - ${TATOEBA_GITRAW}/Wiki.md | \ +RELEASED_WIKIS := $(patsubst %.tar,%,${shell ${WGET} -qq -O - ${TATOEBA_GITRAW}/Wiki.md | \ grep -o 'WikiShuffled/...\.tar' | cut -f2 -d'/'}) ## reverse list @@ -244,21 +244,21 @@ src2all: -RELEASED_BT_ALL := ${shell wget -qq -O - ${TATOEBA_RELEASED_BT}} -RELEASED_BT := ${shell wget -qq -O - ${TATOEBA_RELEASED_BT} | grep '^${LANGPAIR}/'} +RELEASED_BT_ALL := ${shell ${WGET} -qq -O - ${TATOEBA_RELEASED_BT}} +RELEASED_BT := ${shell ${WGET} -qq -O - ${TATOEBA_RELEASED_BT} | grep '^${LANGPAIR}/'} fetch-bt: for d in ${RELEASED_BT}; do \ echo "fetch $$d"; \ mkdir -p `dirname $$d`; \ - wget -qq -O $$d https://object.pouta.csc.fi/Tatoeba-MT-bt/$$d; \ + ${WGET} -qq -O $$d https://object.pouta.csc.fi/Tatoeba-MT-bt/$$d; \ done fetch-all-bt: for d in ${RELEASED_BT_ALL}; do \ echo "fetch $$d"; \ mkdir -p `dirname $$d`; \ - wget -qq -O $$d https://object.pouta.csc.fi/Tatoeba-MT-bt/$$d; \ + ${WGET} -qq -O $$d https://object.pouta.csc.fi/Tatoeba-MT-bt/$$d; \ done @@ -413,7 +413,7 @@ print-modelinfo: ${LANGPAIR}/${MODELNAME}/decoder.yml: ifneq (${MODELZIP},) mkdir -p ${dir $@} - wget -O ${dir $@}/model.zip ${MODELZIP} + ${WGET} -O ${dir $@}/model.zip ${MODELZIP} cd ${dir $@} && unzip model.zip rm -f ${dir $@}/model.zip mv ${dir $@}/preprocess.sh ${dir $@}/preprocess-old.sh @@ -451,7 +451,7 @@ ${WIKI_DIR}/${SRC}/%.txt.gz: ${WIKI_DIR}/${SRC}/.done # fetch ${WIKI_DIR}/${SRC}/data: mkdir -p ${dir $@} - wget -O $@.tar ${TATOEBA_STORAGE}/${shell iso639 -m -n ${SRC}}.tar + ${WGET} -O $@.tar ${TATOEBA_STORAGE}/${shell iso639 -m -n ${SRC}}.tar tar -C ${dir $@} -xf $@.tar rm -f $@.tar @@ -486,7 +486,7 @@ ${WIKI_DIR}/${SRC}/.done: # ${WIKI_DIR}/${SRC}: # mkdir -p $@ -# wget -O $@.tar ${TATOEBA_WIKI_STORAGE}/${SRC}.tar +# ${WGET} -O $@.tar ${TATOEBA_WIKI_STORAGE}/${SRC}.tar # tar -C ${dir $@} -xf $@.tar # if [ -d ${WIKI_DIR}/data/${SRC} ]; then \ # mv 
${WIKI_DIR}/data/${SRC}/*.txt.gz $@/;\ diff --git a/tatoeba/eval/Makefile b/tatoeba/eval/Makefile index 0b791878..e925541f 100644 --- a/tatoeba/eval/Makefile +++ b/tatoeba/eval/Makefile @@ -20,7 +20,7 @@ GPUJOB_HPC_MEM = 20g MODEL_STORAGE := https://object.pouta.csc.fi/Tatoeba-MT-models -MODEL_DISTS := ${shell wget -q -O - ${MODEL_STORAGE}/index.txt | grep '.zip$$' | grep -v '.eval.zip$$'} +MODEL_DISTS := ${shell ${WGET} -q -O - ${MODEL_STORAGE}/index.txt | grep '.zip$$' | grep -v '.eval.zip$$'} MODEL_DIST = ${firstword ${MODEL_DISTS}} MODEL = ${MODEL_DIST:.zip=} MODEL_LANGPAIR = ${firstword ${subst /, ,${MODEL_DIST}}} @@ -151,7 +151,7 @@ fetch: ${WORK_DIR}/model/decoder.yml ${WORK_DIR}/model/decoder.yml: mkdir -p ${dir $@} - wget -q -O ${dir $@}model.zip ${MODEL_URL} + ${WGET} -q -O ${dir $@}model.zip ${MODEL_URL} unzip -d ${dir $@} ${dir $@}model.zip ## fix an old problem with the pre-process script mv ${dir $@}preprocess.sh ${dir $@}preprocess-old.sh diff --git a/tatoeba/forward-translate/Makefile b/tatoeba/forward-translate/Makefile index a170238b..44fc4f34 100644 --- a/tatoeba/forward-translate/Makefile +++ b/tatoeba/forward-translate/Makefile @@ -70,16 +70,16 @@ PWD := $(shell pwd) # MODELZIP = https://object.pouta.csc.fi/Tatoeba-Challenge/ang.tar -MODELZIP := ${shell wget -qq -O - ${TATOEBA_RELEASED} | grep '^${LANGPAIR}' | head -1 | cut -f4} +MODELZIP := ${shell ${WGET} -qq -O - ${TATOEBA_RELEASED} | grep '^${LANGPAIR}' | head -1 | cut -f4} MODELINFO = ${patsubst ${TATOEBA_MODEL_STORAGE}/%.zip,${TATOEBA_GITRAW}/models/%.yml,${MODELZIP}} MODELNAME = ${patsubst %.zip,%,${notdir ${MODELZIP}}} -MULTI_TARGET_MODEL := ${shell wget -qq -O - ${MODELINFO} | grep 'use-target-labels' | wc -l} +MULTI_TARGET_MODEL := ${shell ${WGET} -qq -O - ${MODELINFO} | grep 'use-target-labels' | wc -l} ifneq (${MULTI_TARGET_MODEL},0) - TARGET_LANG_LABEL := ${shell wget -qq -O - ${MODELINFO} | grep -o '>>${TRG}.*<<' | head -1} + TARGET_LANG_LABEL := ${shell ${WGET} -qq -O - 
${MODELINFO} | grep -o '>>${TRG}.*<<' | head -1} endif -RELEASED_BITEXTS := $(patsubst %.tar,%,${shell wget -qq -O - ${TATOEBA_GITRAW}/Wiki.md | \ +RELEASED_BITEXTS := $(patsubst %.tar,%,${shell ${WGET} -qq -O - ${TATOEBA_GITRAW}/Wiki.md | \ grep -o 'WikiShuffled/...\.tar' | cut -f2 -d'/'}) RELEASED_BITEXTS_REV = ${shell (for d in ${RELEASED_BITEXTS}; do echo $$d; done) | tac} @@ -154,11 +154,11 @@ print-modelname: ##------------------------------------------- REV_LANGPAIR = ${TRG}-${SRC} -REV_MODELZIP := ${shell wget -qq -O - ${TATOEBA_RELEASED} | grep '^${REV_LANGPAIR}' | head -1 | cut -f4} +REV_MODELZIP := ${shell ${WGET} -qq -O - ${TATOEBA_RELEASED} | grep '^${REV_LANGPAIR}' | head -1 | cut -f4} REV_MODELINFO = ${patsubst ${TATOEBA_MODEL_STORAGE}/%.zip,${TATOEBA_GITRAW}/models/%.yml,${REV_MODELZIP}} REV_MODELNAME = ${patsubst %.zip,%,${notdir ${REV_MODELZIP}}} -REV_MULTI_TARGET_MODEL := ${shell wget -qq -O - ${REV_MODELINFO} | grep 'use-target-labels' | wc -l} +REV_MULTI_TARGET_MODEL := ${shell ${WGET} -qq -O - ${REV_MODELINFO} | grep 'use-target-labels' | wc -l} ifeq (${REV_MULTI_TARGET_MODEL},1) REV_SRC_PREPROCESS_ARGS = ${TRG} ${SRC} ${REV_LANGPAIR}/${REV_MODELNAME}/source.spm REV_TRG_PREPROCESS_ARGS = ${SRC} ${TRG} ${REV_LANGPAIR}/${REV_MODELNAME}/target.spm noflags @@ -267,7 +267,7 @@ extract-rawbest-translations: ${OUTPUT_DIR}/latest/Tatoeba-train.${SRC}.rawbest$ ${LANGPAIR}/${MODELNAME}/decoder.yml: ifneq (${MODELZIP},) mkdir -p ${dir $@} - wget -O ${dir $@}/model.zip ${MODELZIP} + ${WGET} -O ${dir $@}/model.zip ${MODELZIP} cd ${dir $@} && unzip model.zip rm -f ${dir $@}/model.zip mv ${dir $@}/preprocess.sh ${dir $@}/preprocess-old.sh diff --git a/tatoeba/pivoting/Makefile b/tatoeba/pivoting/Makefile index 274baba3..2daa70e8 100644 --- a/tatoeba/pivoting/Makefile +++ b/tatoeba/pivoting/Makefile @@ -30,8 +30,8 @@ MARIAN_WORKSPACE=12000 TATOEBA_VERSION ?= v2021-08-07 TATOEBA_VERSION_NOHYPHEN ?= $(subst -,,${TATOEBA_VERSION}) -# TATOEBA_GITRAW = 
https://raw.githubusercontent.com/Helsinki-NLP/Tatoeba-Challenge/master -TATOEBA_GITRAW = https://github.com/Helsinki-NLP/Tatoeba-Challenge/raw +# TATOEBA_GITRAW = https://github.com/Helsinki-NLP/Tatoeba-Challenge/raw +TATOEBA_GITRAW = https://raw.githubusercontent.com/Helsinki-NLP/Tatoeba-Challenge/master TATOEBA_RELEASED = ${TATOEBA_GITRAW}/models/released-model-results-all.txt TATOEBA_RELEASED_BT = https://object.pouta.csc.fi/Tatoeba-MT-bt/released-data.txt TATOEBA_MODEL_STORAGE = https://object.pouta.csc.fi/Tatoeba-MT-models @@ -61,14 +61,14 @@ OUTPUT_DIR ?= ${NEW_LANGPAIR} # MODELZIP = https://object.pouta.csc.fi/Tatoeba-Challenge/ang.tar -MODELZIP := ${shell wget -qq -O - ${TATOEBA_RELEASED} | grep '^${TRANSLATE_LANGPAIR}' | head -1 | cut -f4} +MODELZIP := ${shell ${WGET} -qq -O - ${TATOEBA_RELEASED} | grep '^${TRANSLATE_LANGPAIR}' | head -1 | cut -f4} MODELINFO = ${patsubst ${TATOEBA_MODEL_STORAGE}/%.zip,${TATOEBA_GITRAW}/models/%.yml,${MODELZIP}} MODELNAME = ${patsubst %.zip,%,${notdir ${MODELZIP}}} MODELDIR = ${OUTPUT_DIR}/${TRANSLATE_LANGPAIR}/${MODELNAME} -MULTI_TARGET_MODEL := ${shell wget -qq -O - ${MODELINFO} | grep 'use-target-labels' | wc -l} +MULTI_TARGET_MODEL := ${shell ${WGET} -qq -O - ${MODELINFO} | grep 'use-target-labels' | wc -l} ifneq (${MULTI_TARGET_MODEL},0) - TARGET_LANG_LABEL := ${shell wget -qq -O - ${MODELINFO} | grep -o '>>${TRG}.*<<'} + TARGET_LANG_LABEL := ${shell ${WGET} -qq -O - ${MODELINFO} | grep -o '>>${TRG}.*<<'} endif @@ -131,7 +131,7 @@ print-modelinfo: ${MODELDIR}/decoder.yml: ifneq (${MODELZIP},) mkdir -p ${dir $@} - wget -O ${dir $@}/model.zip ${MODELZIP} + ${WGET} -O ${dir $@}/model.zip ${MODELZIP} cd ${dir $@} && unzip model.zip rm -f ${dir $@}/model.zip mv ${dir $@}/preprocess.sh ${dir $@}/preprocess-old.sh @@ -203,32 +203,55 @@ ifneq (${MODELZIP},) gzip -c > ${PWD}/$@ endif + check-latest: @if [ -d ${OUTPUT_DIR}/latest ]; then \ + for T in `ls ${OUTPUT_DIR}/latest/*.${TRG}.gz`; do \ + S=`echo $$T | sed 
's/.${TRG}.gz/.${SRC}.gz/'`; \ + if [ ! -e $$S ]; then \ + echo "$$S does not exist!"; \ + fi \ + done; \ for S in `ls ${OUTPUT_DIR}/latest/*.${SRC}.gz`; do \ T=`echo $$S | sed 's/.${SRC}.gz/.${TRG}.gz/'`; \ - a=`${GZCAT} $$S | wc -l`; \ - b=`${GZCAT} $$T | wc -l`; \ - if [ $$a != $$b ]; then \ - echo "$$a != $$b $$S $$T"; \ + if [ ! -e $$T ]; then \ + echo "$$T does not exist!"; \ else \ - echo "$$a $$S $$T"; \ + a=`${GZCAT} $$S | wc -l`; \ + b=`${GZCAT} $$T | wc -l`; \ + if [ $$a != $$b ]; then \ + echo "$$a != $$b $$S $$T"; \ + else \ + echo "$$a $$S $$T"; \ + fi \ fi \ - done \ + done; \ fi remove-incomplete-latest: @echo "check ${OUTPUT_DIR}" @mkdir -p ${OUTPUT_DIR}/incomplete/latest @if [ -d ${OUTPUT_DIR}/latest ]; then \ + for T in `ls ${OUTPUT_DIR}/latest/*.${TRG}.gz`; do \ + S=`echo $$T | sed 's/.${TRG}.gz/.${SRC}.gz/'`; \ + if [ ! -e $$S ]; then \ + echo "$$S does not exist!"; \ + mv $$T ${OUTPUT_DIR}/incomplete/latest/; \ + fi \ + done; \ for S in `ls ${OUTPUT_DIR}/latest/*.${SRC}.gz`; do \ T=`echo $$S | sed 's/.${SRC}.gz/.${TRG}.gz/'`; \ - a=`${GZCAT} $$S | wc -l`; \ - b=`${GZCAT} $$T | wc -l`; \ - if [ $$a != $$b ]; then \ - echo "$$a != $$b $$S $$T"; \ + if [ ! -e $$T ]; then \ + echo "$$T does not exist!"; \ mv $$S ${OUTPUT_DIR}/incomplete/latest/; \ - mv $$T ${OUTPUT_DIR}/incomplete/latest/; \ + else \ + a=`${GZCAT} $$S | wc -l`; \ + b=`${GZCAT} $$T | wc -l`; \ + if [ $$a != $$b ]; then \ + echo "$$a != $$b $$S $$T"; \ + mv $$S ${OUTPUT_DIR}/incomplete/latest/; \ + mv $$T ${OUTPUT_DIR}/incomplete/latest/; \ + fi \ fi \ done \ fi