elg project stuff and changes done on mahti

This commit is contained in:
Joerg Tiedemann 2022-03-17 21:02:11 +02:00
parent f163518422
commit dee0f6b951
85 changed files with 320 additions and 128 deletions

View File

@ -149,13 +149,13 @@ store-wiki:
fetch-wiki fetch:
mkdir -p wiki
wget -O wiki/${SRC}.tar https://object.pouta.csc.fi/OPUS-MT-bt-wiki/${SRC}.tar
${WGET} -O wiki/${SRC}.tar https://object.pouta.csc.fi/OPUS-MT-bt-wiki/${SRC}.tar
tar -C wiki -xf wiki/${SRC}.tar
rm -f wiki/${SRC}.tar
fetch-wikidoc:
mkdir -p wikidoc
wget -O wikidoc/${SRC}.tar https://object.pouta.csc.fi/OPUS-MT-bt-wikidoc/${SRC}.tar
${WGET} -O wikidoc/${SRC}.tar https://object.pouta.csc.fi/OPUS-MT-bt-wikidoc/${SRC}.tar
tar -C wikidoc -xf wikidoc/${SRC}.tar
rm -f wikidoc/${SRC}.tar
@ -463,7 +463,7 @@ giellatekno/${SRC}/corp.${SRC}.aa.gz:
gzip -f giellatekno/${SRC}/corp.${SRC}.*
victorio.uit.no/biggies/trunk/langs/${SRC}:
wget -r -np https://victorio.uit.no/biggies/trunk/langs/${SRC}/corp
${WGET} -r -np https://victorio.uit.no/biggies/trunk/langs/${SRC}/corp
giellatekno/se: giellatekno/sme
-cd giellatekno && ln -s sme se
@ -689,11 +689,11 @@ endif
## index of all downloadable files
index.html:
wget -nv -O $@ https://dumps.wikimedia.org/other/cirrussearch/current
${WGET} -nv -O $@ https://dumps.wikimedia.org/other/cirrussearch/current
## wiki in json format
${WIKI_JSON}:
wget -nv https://dumps.wikimedia.org/other/cirrussearch/current/${WIKI_JSON}
${WGET} -nv https://dumps.wikimedia.org/other/cirrussearch/current/${WIKI_JSON}

View File

@ -76,7 +76,7 @@ fetch-data:
%.fetched:
if [ "$(firstword $(subst -, ,$(subst /, ,$@)))" == "work" ]; then \
cd $(dir $@); \
wget ${ALLAS_STORAGE_URL}OPUS-MT-train_$(subst /,-,$(dir $@))${WHOAMI}/$(notdir $(@:.fetched=.tar)); \
${WGET} ${ALLAS_STORAGE_URL}OPUS-MT-train_$(subst /,-,$(dir $@))${WHOAMI}/$(notdir $(@:.fetched=.tar)); \
tar -xf $(notdir $(@:.fetched=.tar)); \
rm -f $(notdir $(@:.fetched=.tar)); \
touch $(notdir $@); \
@ -93,7 +93,7 @@ fetch-data:
work-%/${LANGPAIRSTR}:
mkdir -p $(dir $@)
cd $(dir $@) && \
wget ${ALLAS_STORAGE_URL}OPUS-MT-train_$(subst /,-,$(dir $@))${WHOAMI}/${LANGPAIRSTR}.tar
${WGET} ${ALLAS_STORAGE_URL}OPUS-MT-train_$(subst /,-,$(dir $@))${WHOAMI}/${LANGPAIRSTR}.tar
tar -C $(dir $@) -xf $(dir $@)${LANGPAIRSTR}.tar
rm -f $(dir $@)${LANGPAIRSTR}.tar
touch $@.fetched

View File

@ -73,6 +73,25 @@ MODELTYPES = transformer \
## clean-corpus script parameters
## (for filtering subword-segmented bitexts)
##
## (TODO: should MIN_NR_TOKENS be 1?)
# MIN_NR_TOKENS = 0
# MAX_NR_TOKENS = 250
MIN_NR_TOKENS = 1
MAX_NR_TOKENS = 500
NR_TOKEN_RATIO = 2
MAX_TOKEN_LENGTH = 100
## default values in the original script:
##
# MAX_TOKEN_LENGTH = 1000
# NR_TOKEN_RATIO = 9
## name of the model-specific configuration file
## NEW: make it more model specific
#
@ -234,7 +253,7 @@ OPUSREAD_ARGS =
## get available data from the OPUS-API
OPUSAPI = http://opus.nlpl.eu/opusapi/
OPUSAPI_WGET = wget -qq --no-check-certificate -O - ${OPUSAPI}?
OPUSAPI_WGET = ${WGET} -qq --no-check-certificate -O - ${OPUSAPI}?
get-opus-mono = ${shell ${OPUSAPI_WGET}source=${1}\&corpora=True | ${JQ} '.corpora[]' | tr '"' ' '}
get-opus-bitexts = ${shell ${OPUSAPI_WGET}source=${1}\&target=${2}\&corpora=True | ${JQ} '.corpora[]' | tr '"' ' '}
@ -816,7 +835,7 @@ endif
## TODO: do we still need this?
## --> see OPUSLANGS which is directly taken from the API
opus-langs.txt:
wget -O $@.tmp ${OPUSAPI}?languages=true
${WGET} -O $@.tmp ${OPUSAPI}?languages=true
grep '",' $@.tmp | tr '",' ' ' | sort | tr "\n" ' ' | sed 's/ */ /g' > $@
rm -f $@.tmp
@ -824,7 +843,7 @@ opus-langs.txt:
## TODO: do we need this file?
opus-langpairs.txt:
for l in ${OPUS_LANGS}; do \
wget -O $@.tmp ${OPUSAPI}?source=$$l\&languages=true; \
${WGET} -O $@.tmp ${OPUSAPI}?source=$$l\&languages=true; \
grep '",' $@.tmp | tr '",' ' ' | sort | tr "\n" ' ' | sed 's/ */ /g' > $@.tmp2; \
for t in `cat $@.tmp2`; do \
if [ $$t \< $$l ]; then \

View File

@ -97,6 +97,8 @@ TATOEBA_LANGIDS_TRAINONLY = tatoeba/langids-train-only-${TATOEBA_VERSION}.txt
# TATOEBA_RAWGIT := https://raw.githubusercontent.com/Helsinki-NLP/Tatoeba-Challenge/master
TATOEBA_RAWGIT_MASTER := https://raw.githubusercontent.com/Helsinki-NLP/Tatoeba-Challenge/master
TATOEBA_RAWGIT_RELEASE := https://raw.githubusercontent.com/Helsinki-NLP/Tatoeba-Challenge/${TATOEBA_VERSION}
# TATOEBA_RAWGIT_MASTER := https://github.com/Helsinki-NLP/Tatoeba-Challenge/raw/master
# TATOEBA_RAWGIT_RELEASE := https://github.com/Helsinki-NLP/Tatoeba-Challenge/raw/${TATOEBA_VERSION}
## data count files (file basename)
@ -107,13 +109,13 @@ RELEASED_TATOEBA_DATA_FILE = tatoeba/released-bitexts-${TATOEBA_VERSION}.txt
## all released language pairs with test sets > 200 test pairs
## also extract all source languages that are available for a given target language
## and vice versa
TATOEBA_RELEASED_DATA := $(shell wget -qq -O - ${TATOEBA_DATA_COUNT_BASE}-min200.txt | cut -f1)
TATOEBA_RELEASED_DATA := $(shell ${WGET} -qq -O - ${TATOEBA_DATA_COUNT_BASE}-min200.txt | cut -f1)
TATOEBA_AVAILABLE_TRG = ${sort ${filter-out ${SRC},${subst -, ,${filter %-${SRC} ${SRC}-%,${TATOEBA_RELEASED_DATA}}}}}
TATOEBA_AVAILABLE_SRC = ${sort ${filter-out ${TRG},${subst -, ,${filter %-${TRG} ${TRG}-%,${TATOEBA_RELEASED_DATA}}}}}
## extract language pairs for a specific subset
TATOEBA_SUBSET := lower
TATOEBA_RELEASED_SUBSET := $(shell wget -qq -O - ${TATOEBA_DATA_COUNT_BASE}-${TATOEBA_SUBSET}.txt | cut -f1)
TATOEBA_RELEASED_SUBSET := $(shell ${WGET} -qq -O - ${TATOEBA_DATA_COUNT_BASE}-${TATOEBA_SUBSET}.txt | cut -f1)
TATOEBA_AVAILABLE_SUBSET_TRG = ${sort ${filter-out ${SRC},${subst -, ,${filter %-${SRC} ${SRC}-%,${TATOEBA_RELEASED_SUBSET}}}}}
TATOEBA_AVAILABLE_SUBSET_SRC = ${sort ${filter-out ${TRG},${subst -, ,${filter %-${TRG} ${TRG}-%,${TATOEBA_RELEASED_SUBSET}}}}}
@ -123,7 +125,7 @@ TATOEBA_AVAILABLE_SUBSET_SRC = ${sort ${filter-out ${TRG},${subst -, ,${filter
## all available language pairs
## (download the file once and keep it here to get the language pairs in the release)
TATOEBA_LANGPAIRS := ${shell if [ ! -e ${RELEASED_TATOEBA_DATA_FILE} ]; then \
wget -q -O ${RELEASED_TATOEBA_DATA_FILE} ${RELEASED_TATOEBA_DATA_URL}; \
${WGET} -q -O ${RELEASED_TATOEBA_DATA_FILE} ${RELEASED_TATOEBA_DATA_URL}; \
fi; \
tail -n +2 ${RELEASED_TATOEBA_DATA_FILE} | cut -f1 }

View File

@ -407,7 +407,7 @@ $(LOCAL_TRAIN_SRC).algtmp.d/%.alg: $(LOCAL_TRAIN_SRC).algtmp.d/% $(LOCAL_TRAIN_T
echo "============================================"; \
echo "fetch moses data from $$l"; \
echo "============================================"; \
wget -qq -O $@-$$c-${LANGPAIR}.zip $$l; \
${WGET} -qq -O $@-$$c-${LANGPAIR}.zip $$l; \
unzip -d ${dir $@} -n $@-$$c-${LANGPAIR}.zip; \
mv ${dir $@}$$c*.${LANGPAIR}.${SRCEXT} $@; \
mv ${dir $@}$$c*.${LANGPAIR}.${TRGEXT} ${@:.${SRCEXT}.raw=.${TRGEXT}.raw}; \
@ -486,6 +486,27 @@ ifeq (${USE_REST_DEVDATA},1)
${GZIP} -cd < ${DEV_TRG}.notused.gz >> ${LOCAL_TRAIN_TRG}; \
fi
endif
######################################
# optional second cleaning pass over the
# complete local training data, enabled
# with CLEAN_CORPUS_TRAINING_DATA=1
# --> could be useful if there is
#     noisy data in back-translations etc
#
# clean-corpus-n.perl reads <stem>.<src>/<stem>.<trg>
# and writes <out>.<src>/<out>.<trg>; the two training
# files are therefore linked temporarily under the
# common stem ${LOCAL_TRAIN_SRC}
# - links use absolute targets so that they resolve
#   even if the file names are relative paths
# - ln -f replaces stale links left by an aborted run
# - only the temporary links are removed at the end,
#   never the cleaned training files themselves
######################################
ifeq (${CLEAN_CORPUS_TRAINING_DATA},1)
	@echo ".... another cleanup of local training data"
	@ln -sf $(abspath ${LOCAL_TRAIN_SRC}) ${LOCAL_TRAIN_SRC}.${SRCEXT}
	@ln -sf $(abspath ${LOCAL_TRAIN_TRG}) ${LOCAL_TRAIN_SRC}.${TRGEXT}
	@$(MOSESSCRIPTS)/training/clean-corpus-n.perl \
		-ratio ${NR_TOKEN_RATIO} \
		-max-word-length ${MAX_TOKEN_LENGTH} \
		${LOCAL_TRAIN_SRC} $(SRCEXT) $(TRGEXT) \
		${LOCAL_TRAIN_SRC}.clean \
		${MIN_NR_TOKENS} ${MAX_NR_TOKENS}
	@mv -f ${LOCAL_TRAIN_SRC}.clean.${SRCEXT} ${LOCAL_TRAIN_SRC}
	@mv -f ${LOCAL_TRAIN_SRC}.clean.${TRGEXT} ${LOCAL_TRAIN_TRG}
	@rm -f ${LOCAL_TRAIN_SRC}.${SRCEXT} ${LOCAL_TRAIN_SRC}.${TRGEXT}
endif
ifeq (${SHUFFLE_TRAINING_DATA},1)
@echo ".... shuffle complete training data"
@paste ${LOCAL_TRAIN_SRC} ${LOCAL_TRAIN_TRG} | ${SHUFFLE} > ${LOCAL_TRAIN_SRC}.shuffled
@ -495,6 +516,7 @@ ifeq (${SHUFFLE_TRAINING_DATA},1)
endif
## everything is done in the target above
${LOCAL_TRAIN_TRG}: ${LOCAL_TRAIN_SRC}
@echo "done!"

View File

@ -44,7 +44,7 @@ MODEL_YML = ${patsubst %.npz,%.yml,${MODEL_FINAL}}
get-model-release = ${shell wget -qq -O - ${MODELINDEX} | grep '^${1}/.*-.*\.zip' | LANG=en_US.UTF-8 sort -r}
get-model-release = ${shell ${WGET} -qq -O - ${MODELINDEX} | grep '^${1}/.*-.*\.zip' | LANG=en_US.UTF-8 sort -r}
get-model-distro = ${shell echo ${wildcard ${1}/${2}/*.zip} | tr ' ' "\n" | LANG=en_US.UTF-8 sort -r}
@ -627,9 +627,9 @@ upload-models:
fetch-model:
mkdir -p ${RELEASEDIR}/${LANGPAIRSTR}
cd ${RELEASEDIR}/${LANGPAIRSTR} && \
wget ${OBJECTSTORAGE}/${MODEL_CONTAINER}/${firstword ${call get-model-release,${LANGPAIRSTR}}}
${WGET} ${OBJECTSTORAGE}/${MODEL_CONTAINER}/${firstword ${call get-model-release,${LANGPAIRSTR}}}
# wget -O ${RELEASEDIR}/${LANGPAIRSTR}/${LANGPAIRSTR}.zip \
# ${WGET} -O ${RELEASEDIR}/${LANGPAIRSTR}/${LANGPAIRSTR}.zip \
# ${OBJECTSTORAGE}/${MODEL_CONTAINER}/${firstword ${call get-model-dist,${LANGPAIRSTR}}}
# cd ${RELEASEDIR}/${LANGPAIRSTR} && unzip -n ${LANGPAIRSTR}.zip
# rm -f ${RELEASEDIR}/${LANGPAIRSTR}/${LANGPAIRSTR}.zip

View File

@ -170,7 +170,7 @@ GZIP := ${shell which ${PIGZ} 2>/dev/null || echo gzip}
GZCAT := ${GZIP} -cd
ZCAT := gzip -cd
UNIQ := ${SORT} -u
## wget command used by the download recipes (referenced as ${WGET})
## -T 1 sets wget's network timeout (DNS lookup, connect and read) to 1 second
## NOTE(review): 1 second is very short for slow or busy servers -- confirm this is intended
WGET := wget -T 1
## check that we have a GPU available

4
lib/env/mahti.mk vendored
View File

@ -8,8 +8,8 @@ DATA_PREPARE_HPCPARAMS = CPUJOB_HPC_CORES=2 CPUJOB_HPC_MEM=16g
DATA_ALIGN_HPCPARAMS = CPUJOB_HPC_CORES=128 CPUJOB_HPC_JOBS=20 CPUJOB_HPC_MEM=128g
# CSCPROJECT = project_2002688
CSCPROJECT = project_2005625
CSCPROJECT = project_2002688
# CSCPROJECT = project_2005625
WORKHOME = ${shell realpath ${PWD}/work}
OPUSHOME = /projappl/nlpl/data/OPUS
HPC_QUEUE = medium

View File

@ -1,21 +1,23 @@
# -*-makefile-*-
## clean-corpus script parameters
## (for filtering subword-segmented bitexts)
##
## (TODO: should MIN_NTOKENS be 1?)
# MIN_NR_TOKENS = 0
# MAX_NR_TOKENS = 250
MIN_NR_TOKENS = 1
MAX_NR_TOKENS = 500
NR_TOKEN_RATIO = 2
MAX_TOKEN_LENGTH = 100
## moved to config.mk
##
# ## clean-corpus script parameters
# ## (for filtering subword-segmented bitexts)
# ##
# ## (TODO: should MIN_NTOKENS be 1?)
# # MIN_NR_TOKENS = 0
# # MAX_NR_TOKENS = 250
# MIN_NR_TOKENS = 1
# MAX_NR_TOKENS = 500
# NR_TOKEN_RATIO = 2
# MAX_TOKEN_LENGTH = 100
## default values in the original script:
##
# MAX_TOKEN_LENGTH = 1000
# NR_TOKEN_RATIO = 9
# ## default values in the original script:
# ##
# # MAX_TOKEN_LENGTH = 1000
# # NR_TOKEN_RATIO = 9
## compute some ratios and thresholds that could be useful for filtering training data

View File

@ -62,7 +62,7 @@ welsh-data: ${DATADIR}/${PRE}/dic.cy-en.clean.cy.gz
${DATADIR}/${PRE}/dic.cy-en.clean.cy.gz:
for c in CofnodYCynulliad Deddfwriaeth Meddalwedd; do \
wget http://techiaith.cymru/corpws/Moses/$$c/$$c.tar.gz; \
${WGET} http://techiaith.cymru/corpws/Moses/$$c/$$c.tar.gz; \
tar -xzf $$c.tar.gz; \
$(TOKENIZER)/detokenizer.perl -l cy < $$c.cy |\
$(MOSESSCRIPTS)/recaser/detruecase.perl | gzip -c > ${DATADIR}/${PRE}/$$c.cy-en.clean.cy.gz; \
@ -70,11 +70,11 @@ ${DATADIR}/${PRE}/dic.cy-en.clean.cy.gz:
$(MOSESSCRIPTS)/recaser/detruecase.perl | gzip -c > ${DATADIR}/${PRE}/$$c.cy-en.clean.en.gz; \
rm -f $$c.tar.gz; \
done
wget http://techiaith.cymru/alinio/rhestr_geiriau.tsv
${WGET} http://techiaith.cymru/alinio/rhestr_geiriau.tsv
tail -n +16 rhestr_geiriau.tsv | cut -f1 | gzip -c > ${DATADIR}/${PRE}/rhestr_geiriau.cy-en.clean.en.gz
tail -n +16 rhestr_geiriau.tsv | cut -f2 | gzip -c > ${DATADIR}/${PRE}/rhestr_geiriau.cy-en.clean.cy.gz
rm -f rhestr_geiriau.tsv
wget http://techiaith.cymru/alinio/hunalign/cy-en.dic
${WGET} http://techiaith.cymru/alinio/hunalign/cy-en.dic
cut -f1 -d '@' < cy-en.dic | sed 's/ $$*//' | gzip -c > ${DATADIR}/${PRE}/dic.cy-en.clean.en.gz
cut -f2 -d '@' < cy-en.dic | sed 's/^ *//' | gzip -c > ${DATADIR}/${PRE}/dic.cy-en.clean.cy.gz
@ -84,7 +84,7 @@ CYMRU_BITEXTS = ${DATADIR}/${PRE}/CofnodYCynulliad.cy-en.clean.cy.gz \
${DATADIR}/${PRE}/Meddalwedd.cy-en.clean.cy.gz
${CYMRU_BITEXTS}: ${DATADIR}/${PRE}/%.cy-en.clean.cy.gz:
wget http://techiaith.cymru/corpws/Moses/$(patsubst %.cy-en.clean.cy.gz,%.tar.gz,${notdir $@})
${WGET} http://techiaith.cymru/corpws/Moses/$(patsubst %.cy-en.clean.cy.gz,%.tar.gz,${notdir $@})
tar -xzf $(patsubst %.cy-en.clean.cy.gz,%.tar.gz,${notdir $@})
$(TOKENIZER)/detokenizer.perl -l cy < $(patsubst %.cy-en.clean.cy.gz,%.cy,${notdir $@}) |\
$(MOSESSCRIPTS)/recaser/detruecase.perl | gzip -c > $@

View File

@ -89,7 +89,7 @@ ost-datasets: ${DATADIR}/${PRE}/ost-train.de-en.clean.de.gz \
## download the doc-level data set:
## fetch the zip archive, unpack it next to the target and remove the archive
## NOTE(review): DOCLEVEL_BENCHMARK_DATA is written as a bare word below and not
## as a variable reference (${DOCLEVEL_BENCHMARK_DATA}), so wget is given the
## literal string as URL -- confirm whether a variable expansion was intended
${WORKHOME}/doclevel-MT-benchmark:
wget -O $@.zip DOCLEVEL_BENCHMARK_DATA?download=1
${WGET} -O $@.zip DOCLEVEL_BENCHMARK_DATA?download=1
unzip -d ${dir $@} $@.zip
rm -f $@.zip

View File

@ -73,14 +73,50 @@ elg-ukr-students:
done
## run test-tiny11-student for the ELG tiny11 student models,
## one sub-make per translation direction between Ukrainian and
## Finnish, Hungarian, Romanian, Swedish, Polish and Lithuanian
## - the Finnish pairs use STUDENT_DATA=pft-pbt, all others pft-pbt-bt
## - EMAIL is passed as an empty value to every sub-make
##   (presumably to switch off e-mail notifications -- TODO confirm)
## the target does not create a file of its own name, hence .PHONY
.PHONY: elg-test-tiny
elg-test-tiny:
	${MAKE} EMAIL= STUDENT_DATA=pft-pbt SRCLANGS=fin TRGLANGS=ukr test-tiny11-student
	${MAKE} EMAIL= STUDENT_DATA=pft-pbt SRCLANGS=ukr TRGLANGS=fin test-tiny11-student
	${MAKE} EMAIL= STUDENT_DATA=pft-pbt-bt SRCLANGS=hun TRGLANGS=ukr test-tiny11-student
	${MAKE} EMAIL= STUDENT_DATA=pft-pbt-bt SRCLANGS=ukr TRGLANGS=hun test-tiny11-student
	${MAKE} EMAIL= STUDENT_DATA=pft-pbt-bt SRCLANGS=ron TRGLANGS=ukr test-tiny11-student
	${MAKE} EMAIL= STUDENT_DATA=pft-pbt-bt SRCLANGS=ukr TRGLANGS=ron test-tiny11-student
	${MAKE} EMAIL= STUDENT_DATA=pft-pbt-bt SRCLANGS=swe TRGLANGS=ukr test-tiny11-student
	${MAKE} EMAIL= STUDENT_DATA=pft-pbt-bt SRCLANGS=ukr TRGLANGS=swe test-tiny11-student
	${MAKE} EMAIL= STUDENT_DATA=pft-pbt-bt SRCLANGS=pol TRGLANGS=ukr test-tiny11-student
	${MAKE} EMAIL= STUDENT_DATA=pft-pbt-bt SRCLANGS=ukr TRGLANGS=pol test-tiny11-student
	${MAKE} EMAIL= STUDENT_DATA=pft-pbt-bt SRCLANGS=lit TRGLANGS=ukr test-tiny11-student
	${MAKE} EMAIL= STUDENT_DATA=pft-pbt-bt SRCLANGS=ukr TRGLANGS=lit test-tiny11-student
## run release-tiny11-student for the ELG tiny11 student models,
## one sub-make per translation direction between Ukrainian and
## Finnish, Hungarian, Romanian and Swedish
## - the Finnish pairs use STUDENT_DATA=pft-pbt, all others pft-pbt-bt
## - the Polish and Lithuanian pairs are listed but commented out,
##   i.e. they are currently not released by this target
## the target does not create a file of its own name, hence .PHONY
.PHONY: elg-dist-tiny
elg-dist-tiny:
	${MAKE} STUDENT_DATA=pft-pbt SRCLANGS=fin TRGLANGS=ukr release-tiny11-student
	${MAKE} STUDENT_DATA=pft-pbt SRCLANGS=ukr TRGLANGS=fin release-tiny11-student
	${MAKE} STUDENT_DATA=pft-pbt-bt SRCLANGS=hun TRGLANGS=ukr release-tiny11-student
	${MAKE} STUDENT_DATA=pft-pbt-bt SRCLANGS=ukr TRGLANGS=hun release-tiny11-student
	${MAKE} STUDENT_DATA=pft-pbt-bt SRCLANGS=ron TRGLANGS=ukr release-tiny11-student
	${MAKE} STUDENT_DATA=pft-pbt-bt SRCLANGS=ukr TRGLANGS=ron release-tiny11-student
	${MAKE} STUDENT_DATA=pft-pbt-bt SRCLANGS=swe TRGLANGS=ukr release-tiny11-student
	${MAKE} STUDENT_DATA=pft-pbt-bt SRCLANGS=ukr TRGLANGS=swe release-tiny11-student
#	${MAKE} STUDENT_DATA=pft-pbt-bt SRCLANGS=pol TRGLANGS=ukr release-tiny11-student
#	${MAKE} STUDENT_DATA=pft-pbt-bt SRCLANGS=ukr TRGLANGS=pol release-tiny11-student
#	${MAKE} STUDENT_DATA=pft-pbt-bt SRCLANGS=lit TRGLANGS=ukr release-tiny11-student
#	${MAKE} STUDENT_DATA=pft-pbt-bt SRCLANGS=ukr TRGLANGS=lit release-tiny11-student
## tiny11 transformer model for finnish with pivot data (reuse student recipes)
## Finnish -> Ukrainian: runs train-tiny11-student twice, once with
## STUDENT_DATA=pft-pbt (passing MARIAN_EXTRA=--no-restore-corpus) and
## once with STUDENT_DATA=pft-pbt-bt
elg-fin2ukr-tiny11:
${MAKE} STUDENT_DATA=pft-pbt SRCLANGS=fin TRGLANGS=ukr MARIAN_EXTRA=--no-restore-corpus train-tiny11-student
${MAKE} STUDENT_DATA=pft-pbt-bt SRCLANGS=fin TRGLANGS=ukr train-tiny11-student
## Ukrainian -> Finnish: same two STUDENT_DATA settings, no extra Marian flags
elg-ukr2fin-tiny11:
${MAKE} STUDENT_DATA=pft-pbt SRCLANGS=ukr TRGLANGS=fin train-tiny11-student
${MAKE} STUDENT_DATA=pft-pbt-bt SRCLANGS=ukr TRGLANGS=fin train-tiny11-student
## multi-source model: dan, isl, nno, nob, nor and swe as source languages,
## Ukrainian as target; LANGPAIRSTR is set explicitly to "gmq-ukr"
elg-gmq2ukr-tiny11:
${MAKE} STUDENT_DATA=pft-pbt-bt SRCLANGS="dan isl nno nob nor swe" TRGLANGS=ukr LANGPAIRSTR="gmq-ukr" train-tiny11-student
## tiny11 transformer model for finnish with pivot data (reuse student recipes)
@ -104,6 +140,31 @@ elg-ukr2swe-tiny11:
${MAKE} STUDENT_DATA=pft-pbt-bt SRCLANGS=ukr TRGLANGS=swe train-tiny11-student
## train tiny11 student models between Ukrainian and Polish, Lithuanian
## and German (one target per translation direction); every target calls
## train-tiny11-student with STUDENT_DATA=pft-pbt-bt
## - both Polish directions also set CHECK_TRAINDATA_SIZE=1 and
##   CLEAN_CORPUS_TRAINING_DATA=1
## - pol2ukr additionally sets MARIAN_EARLY_STOPPING=20
## - deu2ukr passes MARIAN_EXTRA=--no-restore-corpus
## none of the targets creates a file of its own name, hence .PHONY
.PHONY: elg-pol2ukr-tiny11 elg-ukr2pol-tiny11 \
	elg-lit2ukr-tiny11 elg-ukr2lit-tiny11 \
	elg-deu2ukr-tiny11 elg-ukr2deu-tiny11

elg-pol2ukr-tiny11:
	${MAKE} MARIAN_EARLY_STOPPING=20 CHECK_TRAINDATA_SIZE=1 CLEAN_CORPUS_TRAINING_DATA=1 STUDENT_DATA=pft-pbt-bt SRCLANGS=pol TRGLANGS=ukr train-tiny11-student

elg-ukr2pol-tiny11:
	${MAKE} CHECK_TRAINDATA_SIZE=1 CLEAN_CORPUS_TRAINING_DATA=1 STUDENT_DATA=pft-pbt-bt SRCLANGS=ukr TRGLANGS=pol train-tiny11-student

elg-lit2ukr-tiny11:
	${MAKE} STUDENT_DATA=pft-pbt-bt SRCLANGS=lit TRGLANGS=ukr train-tiny11-student

elg-ukr2lit-tiny11:
	${MAKE} STUDENT_DATA=pft-pbt-bt SRCLANGS=ukr TRGLANGS=lit train-tiny11-student

elg-deu2ukr-tiny11:
	${MAKE} MARIAN_EXTRA=--no-restore-corpus STUDENT_DATA=pft-pbt-bt SRCLANGS=deu TRGLANGS=ukr train-tiny11-student

elg-ukr2deu-tiny11:
	${MAKE} STUDENT_DATA=pft-pbt-bt SRCLANGS=ukr TRGLANGS=deu train-tiny11-student

View File

@ -38,28 +38,28 @@ GIELLATEKNO_SAMI_TM = fin-smn/tm/finsmn.tmx \
convert-sami-gloss:
mkdir -p ${DATADIR}/${PRE}
wget ${GIELLATEKNO_TM_HOME}/fin-smn/glossary/finsmn.utf8
${WGET} ${GIELLATEKNO_TM_HOME}/fin-smn/glossary/finsmn.utf8
cut -f1 finsmn.utf8 | gzip -c > ${DATADIR}/${PRE}/glossary.fi-smn.clean.fi.gz
cut -f2 finsmn.utf8 | gzip -c > ${DATADIR}/${PRE}/glossary.fi-smn.clean.smn.gz
rm -f finsmn.utf8
wget ${GIELLATEKNO_TM_HOME}/fin-sme/glossary/finsme.utf8
${WGET} ${GIELLATEKNO_TM_HOME}/fin-sme/glossary/finsme.utf8
cut -f1 finsme.utf8 | gzip -c > ${DATADIR}/${PRE}/glossary.fi-se.clean.fi.gz
cut -f2 finsme.utf8 | gzip -c > ${DATADIR}/${PRE}/glossary.fi-se.clean.se.gz
rm -f finsme.utf8
wget ${GIELLATEKNO_TM_HOME}/fin-sms/glossary/finsms.utf8
${WGET} ${GIELLATEKNO_TM_HOME}/fin-sms/glossary/finsms.utf8
cut -f1 finsms.utf8 | gzip -c > ${DATADIR}/${PRE}/glossary.fi-sms.clean.fi.gz
cut -f2 finsms.utf8 | gzip -c > ${DATADIR}/${PRE}/glossary.fi-sms.clean.sms.gz
rm -f finsms.utf8
wget ${GIELLATEKNO_TM_HOME}/sme-smn/glossary/smesmn.utf8
${WGET} ${GIELLATEKNO_TM_HOME}/sme-smn/glossary/smesmn.utf8
cut -f1 smesmn.utf8 | gzip -c > ${DATADIR}/${PRE}/glossary.se-smn.clean.se.gz
cut -f2 smesmn.utf8 | gzip -c > ${DATADIR}/${PRE}/glossary.se-smn.clean.smn.gz
rm -f smesmn.utf8
wget ${GIELLATEKNO_TM_HOME}/sme-smj/glossary/glossary.utf8
${WGET} ${GIELLATEKNO_TM_HOME}/sme-smj/glossary/glossary.utf8
cut -f1 glossary.utf8 | gzip -c > ${DATADIR}/${PRE}/glossary.se-smj.clean.se.gz
cut -f2 glossary.utf8 | gzip -c > ${DATADIR}/${PRE}/glossary.se-smj.clean.smj.gz
rm -f glossary.utf8
wget ${GIELLATEKNO_TM_HOME}/sme-nob/glossary/smenob.utf8
wget ${GIELLATEKNO_TM_HOME}/sme-nob/glossary/termwiki.utf8
${WGET} ${GIELLATEKNO_TM_HOME}/sme-nob/glossary/smenob.utf8
${WGET} ${GIELLATEKNO_TM_HOME}/sme-nob/glossary/termwiki.utf8
cut -f1 smenob.utf8 > ${DATADIR}/${PRE}/glossary.nb-se.clean.se
cut -f2 smenob.utf8 > ${DATADIR}/${PRE}/glossary.nb-se.clean.nb
cut -f1 termwiki.utf8 >> ${DATADIR}/${PRE}/glossary.nb-se.clean.se
@ -67,20 +67,20 @@ convert-sami-gloss:
gzip -f ${DATADIR}/${PRE}/glossary.nb-se.clean.se
gzip -f ${DATADIR}/${PRE}/glossary.nb-se.clean.nb
rm -f smenob.utf8 termwiki.utf8
wget ${GIELLATEKNO_TM_HOME}/sme-sma/glossary/glossary.utf8
${WGET} ${GIELLATEKNO_TM_HOME}/sme-sma/glossary/glossary.utf8
cut -f1 glossary.utf8 | gzip -c > ${DATADIR}/${PRE}/glossary.se-sma.clean.se.gz
cut -f2 glossary.utf8 | gzip -c > ${DATADIR}/${PRE}/glossary.se-sma.clean.sma.gz
rm -f glossary.utf8
wget ${GIELLATEKNO_TM_HOME}/nob-smj/glossary/nobsmj.utf8
${WGET} ${GIELLATEKNO_TM_HOME}/nob-smj/glossary/nobsmj.utf8
cut -f1 nobsmj.utf8 | gzip -c > ${DATADIR}/${PRE}/glossary.nb-smj.clean.nb.gz
cut -f2 nobsmj.utf8 | gzip -c > ${DATADIR}/${PRE}/glossary.nb-smj.clean.smj.gz
rm -f nobsmj.utf8
wget ${GIELLATEKNO_TM_HOME}/nob-sme/glossary/nobsme.utf8
${WGET} ${GIELLATEKNO_TM_HOME}/nob-sme/glossary/nobsme.utf8
cut -f1 nobsme.utf8 | gzip -c > ${DATADIR}/${PRE}/glossary.nb-se.clean.nb.gz
cut -f2 nobsme.utf8 | gzip -c > ${DATADIR}/${PRE}/glossary.nb-se.clean.se.gz
rm -f nobsme.utf8
wget ${GIELLATEKNO_TM_HOME}/nob-sma/glossary/nobsma.utf8
wget ${GIELLATEKNO_TM_HOME}/sma-nob/glossary/termwiki.utf8
${WGET} ${GIELLATEKNO_TM_HOME}/nob-sma/glossary/nobsma.utf8
${WGET} ${GIELLATEKNO_TM_HOME}/sma-nob/glossary/termwiki.utf8
cut -f1 nobsma.utf8 > ${DATADIR}/${PRE}/glossary.nb-sma.clean.nb
cut -f2 nobsma.utf8 > ${DATADIR}/${PRE}/glossary.nb-sma.clean.sma
cut -f1 termwiki.utf8 >>${DATADIR}/${PRE}/glossary.nb-sma.clean.sma
@ -136,7 +136,7 @@ merge-sami-data:
${GIELLATEKNO_SAMI_TM}:
mkdir -p ${dir $@}
wget -O $@ ${GIELLATEKNO_TM_HOME}/$@
${WGET} -O $@ ${GIELLATEKNO_TM_HOME}/$@
## name of the sami data sets

View File

@ -97,7 +97,7 @@ SIMPLEWIKI_DATA2_DOC = document-aligned.v2
${WORKHOME}/simplewiki/${SIMPLEWIKI_DATA1}:
mkdir -p ${dir $@}
wget -O $@.tar.gz ${SIMPLEWIKI_DATA1_URL}/${SIMPLEWIKI_DATA1}.tar.gz
${WGET} -O $@.tar.gz ${SIMPLEWIKI_DATA1_URL}/${SIMPLEWIKI_DATA1}.tar.gz
tar -C ${dir $@} -xzf $@.tar.gz
rm -f $@.tar.gz
${TOKENIZER}/detokenizer.perl -l en < $@/normal.training.txt > ${DATADIR}/${PRE}/simplewiki_v1-training.en-en.en1.raw
@ -112,7 +112,7 @@ ${WORKHOME}/simplewiki/${SIMPLEWIKI_DATA1}:
${WORKHOME}/simplewiki/${SIMPLEWIKI_DATA2_SENT}:
mkdir -p ${dir $@}
wget -O $@.tar.gz ${SIMPLEWIKI_DATA2_URL}/${SIMPLEWIKI_DATA2_SENT}.tar.gz
${WGET} -O $@.tar.gz ${SIMPLEWIKI_DATA2_URL}/${SIMPLEWIKI_DATA2_SENT}.tar.gz
tar -C ${dir $@} -xzf $@.tar.gz
rm -f $@.tar.gz
cut -f3 $@/normal.aligned | tail -n +10001 |\
@ -203,7 +203,7 @@ SIMPLEWIKI_LARGE = data-simplification/wikilarge
${WORKHOME}/simplewiki/${SIMPLEWIKI_LARGE}:
mkdir -p ${dir $@}
wget -O $@.tar.bz2 ${SIMPLEWIKI_LARGE_URL}
${WGET} -O $@.tar.bz2 ${SIMPLEWIKI_LARGE_URL}
tar -C ${dir $@} -xf $@.tar.bz2
rm -f $@.tar.bz2
${TOKENIZER}/detokenizer.perl -l en < $@/wiki.full.aner.train.src > ${DATADIR}/${PRE}/simplewiki_large-train.en-en.en1.raw

View File

@ -103,9 +103,11 @@ TATOEBA_MONO ?= ${TATOEBA_WORK}/data/mono
## (fetched from Tatoeba github)
TATOEBA_LANGIDS_TRAINONLY = tatoeba/langids-train-only-${TATOEBA_VERSION}.txt
# TATOEBA_RAWGIT := https://raw.githubusercontent.com/Helsinki-NLP/Tatoeba-Challenge/master
# TATOEBA_RAWGIT := https://raw.githubusercontent.com/Helsinki-NLP/Tatoeba-Challenge/master
TATOEBA_RAWGIT_MASTER := https://raw.githubusercontent.com/Helsinki-NLP/Tatoeba-Challenge/master
TATOEBA_RAWGIT_RELEASE := https://raw.githubusercontent.com/Helsinki-NLP/Tatoeba-Challenge/${TATOEBA_VERSION}
# TATOEBA_RAWGIT_MASTER := https://github.com/Helsinki-NLP/Tatoeba-Challenge/raw/master
# TATOEBA_RAWGIT_RELEASE := https://github.com/Helsinki-NLP/Tatoeba-Challenge/raw/${TATOEBA_VERSION}
## data count files (file basename)
@ -114,13 +116,13 @@ TATOEBA_DATA_COUNT_BASE = ${TATOEBA_RAWGIT_MASTER}/data/release/${TATOEBA_VERSIO
## all released language pairs with test sets > 200 test pairs
## also extract all source languages that are available for a given target language
## and vice versa
TATOEBA_RELEASED_DATA = $(shell wget -qq -O - ${TATOEBA_DATA_COUNT_BASE}-min200.txt | cut -f1)
TATOEBA_RELEASED_DATA = $(shell ${WGET} -qq -O - ${TATOEBA_DATA_COUNT_BASE}-min200.txt | cut -f1)
TATOEBA_AVAILABLE_TRG = ${sort ${filter-out ${SRC},${subst -, ,${filter %-${SRC} ${SRC}-%,${TATOEBA_RELEASED_DATA}}}}}
TATOEBA_AVAILABLE_SRC = ${sort ${filter-out ${TRG},${subst -, ,${filter %-${TRG} ${TRG}-%,${TATOEBA_RELEASED_DATA}}}}}
## extract language pairs for a specific subset
TATOEBA_SUBSET = lower
TATOEBA_RELEASED_SUBSET = $(shell wget -qq -O - ${TATOEBA_DATA_COUNT_BASE}-${TATOEBA_SUBSET}.txt | cut -f1)
TATOEBA_RELEASED_SUBSET = $(shell ${WGET} -qq -O - ${TATOEBA_DATA_COUNT_BASE}-${TATOEBA_SUBSET}.txt | cut -f1)
TATOEBA_AVAILABLE_SUBSET_TRG = ${sort ${filter-out ${SRC},${subst -, ,${filter %-${SRC} ${SRC}-%,${TATOEBA_RELEASED_SUBSET}}}}}
TATOEBA_AVAILABLE_SUBSET_SRC = ${sort ${filter-out ${TRG},${subst -, ,${filter %-${TRG} ${TRG}-%,${TATOEBA_RELEASED_SUBSET}}}}}
@ -770,7 +772,7 @@ all-tatoeba-langgroup-dist:
TATOEBA_RELEASED_BT = https://object.pouta.csc.fi/Tatoeba-MT-bt/released-data.txt
tatoeba-all-bt:
for b in ${shell wget -qq -O - ${TATOEBA_RELEASED_BT} | grep -v '.txt' | cut -f1 -d'/' | sort -u}; do \
for b in ${shell ${WGET} -qq -O - ${TATOEBA_RELEASED_BT} | grep -v '.txt' | cut -f1 -d'/' | sort -u}; do \
s=`echo $$b | cut -f1 -d'-'`; \
t=`echo $$b | cut -f2 -d'-'`; \
echo "${MAKE} -C bt-tatoeba SRC=$$s TRG=$$t fetch-bt"; \
@ -1256,7 +1258,7 @@ tatoeba-%-langtunealljobs:
## get the markdown page for a specific subset
tatoeba-%.md:
wget -O $@ ${TATOEBA_RAWGIT_MASTER}/subsets/${TATOEBA_VERSION}/${patsubst tatoeba-%,%,$@}
${WGET} -O $@ ${TATOEBA_RAWGIT_MASTER}/subsets/${TATOEBA_VERSION}/${patsubst tatoeba-%,%,$@}
## run all language pairs for a given subset
@ -1413,7 +1415,7 @@ tatoeba-multilingual-testsets: ${TATOEBA_WORK}/${LANGPAIRSTR}/test/Tatoeba${TATO
# @for s in ${SRCLANGS}; do \
# for t in ${TRGLANGS}; do \
# if [ ! -e ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.src ]; then \
# wget -q -O ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.txt \
# ${WGET} -q -O ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.txt \
# ${TATOEBA_RAWGIT}/data/test/$$s-$$t/test.txt; \
# if [ -s ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.txt ]; then \
# echo "make ${TATOEBA_TESTSET}.$$s-$$t"; \
@ -1428,7 +1430,7 @@ tatoeba-multilingual-testsets: ${TATOEBA_WORK}/${LANGPAIRSTR}/test/Tatoeba${TATO
# cut -f4 ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.txt \
# > ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.trg; \
# else \
# wget -q -O ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.txt \
# ${WGET} -q -O ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.txt \
# ${TATOEBA_RAWGIT}/data/test/$$t-$$s/test.txt; \
# if [ -s ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.txt ]; then \
# echo "make ${TATOEBA_TESTSET}.$$s-$$t"; \
@ -1467,7 +1469,7 @@ ${TATOEBA_WORK}/${LANGPAIRSTR}/test/Tatoeba${TATOEBA_VERSION_NOHYPHEN}-testsets.
@mkdir -p ${TATOEBA_WORK}/${LANGPAIRSTR}/test
@for s in ${SRCLANGS}; do \
for t in ${TRGLANGS}; do \
wget -q -O ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.tmp \
${WGET} -q -O ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.tmp \
${TATOEBA_RAWGIT_RELEASE}/data/test/$$s-$$t/test.txt; \
if [ -s ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.tmp ]; then \
cat ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.tmp $(FIXLANGIDS) \
@ -1514,7 +1516,7 @@ ${TATOEBA_WORK}/${LANGPAIRSTR}/test/Tatoeba${TATOEBA_VERSION_NOHYPHEN}-testsets.
done \
fi; \
else \
wget -q -O ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.tmp \
${WGET} -q -O ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.tmp \
${TATOEBA_RAWGIT_RELEASE}/data/test/$$t-$$s/test.txt; \
if [ -s ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.tmp ]; then \
cat ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.tmp $(FIXLANGIDS) \
@ -1577,7 +1579,7 @@ ${TATOEBA_WORK}/${LANGPAIRSTR}/test/Tatoeba${TATOEBA_VERSION_NOHYPHEN}-testsets.
# ${TATOEBA_WORK}/${LANGPAIRSTR}/test/Tatoeba-testsets-with-subsets.done:
# @for s in ${SRCLANGS}; do \
# for t in ${TRGLANGS}; do \
# wget -q -O ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.tmp \
# ${WGET} -q -O ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.tmp \
# ${TATOEBA_RAWGIT}/data/test/$$s-$$t/test.txt; \
# if [ -s ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.tmp ]; then \
# echo "make ${TATOEBA_TESTSET}.$$s-$$t"; \
@ -1619,7 +1621,7 @@ ${TATOEBA_WORK}/${LANGPAIRSTR}/test/Tatoeba${TATOEBA_VERSION_NOHYPHEN}-testsets.
# done \
# fi; \
# else \
# wget -q -O ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.tmp \
# ${WGET} -q -O ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.tmp \
# ${TATOEBA_RAWGIT}/data/test/$$t-$$s/test.txt; \
# if [ -s ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.tmp ]; then \
# echo "make ${TATOEBA_TESTSET}.$$s-$$t"; \
@ -1701,7 +1703,7 @@ EVAL_TATOEBA_WORKDIR = ${EVAL_TATOEBA_WORKHOME}/$(dir ${RELEASED_TATOEBA_MODEL
evaluate-released-tatoeba-model:
mkdir -p ${EVAL_TATOEBA_WORKDIR}
wget -O ${EVAL_TATOEBA_WORKHOME}/${RELEASED_TATOEBA_MODEL} ${RELEASED_TATOEBA_MODEL_URL}
${WGET} -O ${EVAL_TATOEBA_WORKHOME}/${RELEASED_TATOEBA_MODEL} ${RELEASED_TATOEBA_MODEL_URL}
cd ${EVAL_TATOEBA_WORKDIR} && unzip -o $(notdir ${RELEASED_TATOEBA_MODEL})
${MAKE} TATOEBA_WORK=${EVAL_TATOEBA_WORKHOME} \
DECODER_CONFIG=${EVAL_TATOEBA_WORKDIR}decoder.yml \
@ -1889,18 +1891,18 @@ print-skiplangids:
tatoeba/langids-train-only-${TATOEBA_VERSION}.txt:
mkdir -p ${dir $@}
wget -O $@ ${TATOEBA_RAWGIT_MASTER}/data/release/${TATOEBA_VERSION}/langids-train-only.txt
${WGET} -O $@ ${TATOEBA_RAWGIT_MASTER}/data/release/${TATOEBA_VERSION}/langids-train-only.txt
## monolingual data from Tatoeba challenge (wiki data)
${TATOEBA_MONO}/%.labels:
mkdir -p $@.d
# the old URL without versioning:
-wget -q -O $@.d/mono.tar ${TATOEBA_DATAURL}/$(patsubst %.labels,%,$(notdir $@)).tar
-${WGET} -q -O $@.d/mono.tar ${TATOEBA_DATAURL}/$(patsubst %.labels,%,$(notdir $@)).tar
-tar -C $@.d -xf $@.d/mono.tar
rm -f $@.d/mono.tar
# the new URLs with versioning:
-wget -q -O $@.d/mono.tar ${TATOEBA_MONO_URL}/$(patsubst %.labels,%,$(notdir $@)).tar
-${WGET} -q -O $@.d/mono.tar ${TATOEBA_MONO_URL}/$(patsubst %.labels,%,$(notdir $@)).tar
-tar -C $@.d -xf $@.d/mono.tar
rm -f $@.d/mono.tar
find $@.d -name '*.id.gz' | xargs ${ZCAT} | sort -u | tr "\n" ' ' | sed 's/ $$//' > $@
@ -1933,7 +1935,7 @@ TATOEBA_TMPDATADIR = data/release/${TATOEBA_VERSION}/${LANGPAIR}
%/${TATOEBA_TRAINSET}.${LANGPAIR}.clean.${SRCEXT}.gz:
@mkdir -p $@.d
-wget -q -O $@.d/train.tar ${TATOEBA_TRAIN_URL}/${LANGPAIR}.tar
-${WGET} -q -O $@.d/train.tar ${TATOEBA_TRAIN_URL}/${LANGPAIR}.tar
-tar -C $@.d -xf $@.d/train.tar
@rm -f $@.d/train.tar
@if [ -e $@.d/${TATOEBA_TMPDATADIR}/test.src ]; then \

View File

@ -17,7 +17,7 @@ ${WORKHOME}/${LANGPAIRSTR}/${DATASET}-languages.%: ${WORKHOME}/${LANGPAIRSTR}/${
## a file with all released data sets in the current Tatoeba TC release
${RELEASED_TATOEBA_DATA_FILE}:
wget -O $@ ${RELEASED_TATOEBA_DATA_URL}
${WGET} -O $@ ${RELEASED_TATOEBA_DATA_URL}
## don't delete intermediate label files
@ -168,18 +168,18 @@ print-skiplangids:
tatoeba/langids-train-only-${TATOEBA_VERSION}.txt:
mkdir -p ${dir $@}
wget -O $@ ${TATOEBA_RAWGIT_MASTER}/data/release/${TATOEBA_VERSION}/langids-train-only.txt
${WGET} -O $@ ${TATOEBA_RAWGIT_MASTER}/data/release/${TATOEBA_VERSION}/langids-train-only.txt
## monolingual data from Tatoeba challenge (wiki data)
${TATOEBA_MONO}/%.labels:
mkdir -p $@.d
# the old URL without versioning:
-wget -q -O $@.d/mono.tar ${TATOEBA_DATAURL}/$(patsubst %.labels,%,$(notdir $@)).tar
-${WGET} -q -O $@.d/mono.tar ${TATOEBA_DATAURL}/$(patsubst %.labels,%,$(notdir $@)).tar
-tar -C $@.d -xf $@.d/mono.tar
rm -f $@.d/mono.tar
# the new URLs with versioning:
-wget -q -O $@.d/mono.tar ${TATOEBA_MONO_URL}/$(patsubst %.labels,%,$(notdir $@)).tar
-${WGET} -q -O $@.d/mono.tar ${TATOEBA_MONO_URL}/$(patsubst %.labels,%,$(notdir $@)).tar
-tar -C $@.d -xf $@.d/mono.tar
rm -f $@.d/mono.tar
find $@.d -name '*.id.gz' | xargs ${ZCAT} | sort -u | tr "\n" ' ' | sed 's/ $$//' > $@
@ -295,7 +295,7 @@ endif
%.gz.d/data.fetched:
@echo ".... fetch data (${LANGPAIR}.tar)"
@mkdir -p ${dir $@}
-wget -q -O ${dir $@}train.tar ${TATOEBA_TRAIN_URL}/${LANGPAIR}.tar
-${WGET} -q -O ${dir $@}train.tar ${TATOEBA_TRAIN_URL}/${LANGPAIR}.tar
@if [ -e ${dir $@}train.tar ]; then \
tar -C ${dir $@} -xf ${dir $@}train.tar; \
rm -f ${dir $@}train.tar; \
@ -428,7 +428,7 @@ ${MULTILING_TESTSETS_DONE}:
@mkdir -p ${WORKHOME}/${LANGPAIRSTR}/test
@for s in ${SRCLANGS}; do \
for t in ${TRGLANGS}; do \
wget -q -O ${WORKHOME}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.tmp \
${WGET} -q -O ${WORKHOME}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.tmp \
${TATOEBA_RAWGIT_RELEASE}/data/test/$$s-$$t/test.txt; \
if [ -s ${WORKHOME}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.tmp ]; then \
cat ${WORKHOME}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.tmp $(FIXLANGIDS) \
@ -475,7 +475,7 @@ ${MULTILING_TESTSETS_DONE}:
done \
fi; \
else \
wget -q -O ${WORKHOME}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.tmp \
${WGET} -q -O ${WORKHOME}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.tmp \
${TATOEBA_RAWGIT_RELEASE}/data/test/$$t-$$s/test.txt; \
if [ -s ${WORKHOME}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.tmp ]; then \
cat ${WORKHOME}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.tmp $(FIXLANGIDS) \

View File

@ -88,7 +88,7 @@ EVAL_WORKHOMEDIR = ${EVAL_WORKHOMEHOME}/$(dir ${RELEASED_TATOEBA_MODEL})
evaluate-released-tatoeba-model:
mkdir -p ${EVAL_WORKHOMEDIR}
wget -O ${EVAL_WORKHOMEHOME}/${RELEASED_TATOEBA_MODEL} ${RELEASED_TATOEBA_MODEL_URL}
${WGET} -O ${EVAL_WORKHOMEHOME}/${RELEASED_TATOEBA_MODEL} ${RELEASED_TATOEBA_MODEL_URL}
cd ${EVAL_WORKHOMEDIR} && unzip -o $(notdir ${RELEASED_TATOEBA_MODEL})
${MAKE} WORKHOME=${EVAL_WORKHOMEHOME} \
DECODER_CONFIG=${EVAL_WORKHOMEDIR}decoder.yml \

View File

@ -246,7 +246,7 @@ print-excludes:
${OUTPUT_DIR}/${PIVOT_MODEL_NAME}/decoder.yml:
ifneq (${PIVOT_MODEL_ZIP},)
mkdir -p ${dir $@}
# wget -O ${dir $@}${PIVOT_MODEL_NAME}.zip ${OBJECTSTORAGE}/${MODEL_CONTAINER}/${PIVOT_MODEL_ZIP}
# ${WGET} -O ${dir $@}${PIVOT_MODEL_NAME}.zip ${OBJECTSTORAGE}/${MODEL_CONTAINER}/${PIVOT_MODEL_ZIP}
cp ${PIVOT_MODEL_ZIP} ${dir $@}
cd ${dir $@} && unzip *.zip
rm -f ${dir $@}*.zip

View File

@ -1,5 +1,6 @@
19.0 https://object.pouta.csc.fi/Tatoeba-MT-models/fin-ukr/opusTCv20210807+pbt_transformer-align_2022-03-07.zip
17.1 https://object.pouta.csc.fi/Tatoeba-MT-models/fin-zle/opus4m+btTCv20210807-2022-01-19.zip
16.6 https://object.pouta.csc.fi/Tatoeba-MT-models/fin-ukr/opusTCv20210807+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-16.zip
11.3 https://object.pouta.csc.fi/Tatoeba-MT-models/fiu-zle/opus-2021-02-11.zip
10.8 https://object.pouta.csc.fi/Tatoeba-MT-models/fiu-sla/opus-2021-02-16.zip
6.1 https://object.pouta.csc.fi/Tatoeba-MT-models/tatoeba-zero/opus-2020-06-19.zip

View File

@ -1,4 +1,5 @@
0.48750 https://object.pouta.csc.fi/Tatoeba-MT-models/fin-ukr/opusTCv20210807+pbt_transformer-align_2022-03-07.zip
0.46455 https://object.pouta.csc.fi/Tatoeba-MT-models/fin-ukr/opusTCv20210807+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-16.zip
0.46435 https://object.pouta.csc.fi/Tatoeba-MT-models/fin-zle/opus4m+btTCv20210807-2022-01-19.zip
0.388 https://object.pouta.csc.fi/Tatoeba-MT-models/fiu-zle/opus-2021-02-11.zip
0.387 https://object.pouta.csc.fi/Tatoeba-MT-models/fiu-sla/opus-2021-02-16.zip

View File

@ -1,5 +1,6 @@
19.7 https://object.pouta.csc.fi/Tatoeba-MT-models/fin-ukr/opusTCv20210807+pbt_transformer-align_2022-03-07.zip
17.7 https://object.pouta.csc.fi/Tatoeba-MT-models/fin-zle/opus4m+btTCv20210807-2022-01-19.zip
17.1 https://object.pouta.csc.fi/Tatoeba-MT-models/fin-ukr/opusTCv20210807+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-16.zip
11.7 https://object.pouta.csc.fi/Tatoeba-MT-models/fiu-zle/opus-2021-02-11.zip
11.5 https://object.pouta.csc.fi/Tatoeba-MT-models/fiu-sla/opus-2021-02-16.zip
5.9 https://object.pouta.csc.fi/Tatoeba-MT-models/tatoeba-zero/opus-2020-06-19.zip

View File

@ -1,4 +1,5 @@
0.49562 https://object.pouta.csc.fi/Tatoeba-MT-models/fin-ukr/opusTCv20210807+pbt_transformer-align_2022-03-07.zip
0.47210 https://object.pouta.csc.fi/Tatoeba-MT-models/fin-ukr/opusTCv20210807+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-16.zip
0.46954 https://object.pouta.csc.fi/Tatoeba-MT-models/fin-zle/opus4m+btTCv20210807-2022-01-19.zip
0.390 https://object.pouta.csc.fi/Tatoeba-MT-models/fiu-sla/opus-2021-02-16.zip
0.386 https://object.pouta.csc.fi/Tatoeba-MT-models/fiu-zle/opus-2021-02-11.zip

View File

@ -1,4 +1,5 @@
19.8 https://object.pouta.csc.fi/Tatoeba-MT-models/hun-ukr/opusTCv20210807+pbt_transformer-align_2022-03-08.zip
17.7 https://object.pouta.csc.fi/Tatoeba-MT-models/hun-ukr/opusTCv20210807+bt+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-16.zip
12.0 https://object.pouta.csc.fi/Tatoeba-MT-models/hun-ukr/opus-2021-02-19.zip
11.6 https://object.pouta.csc.fi/Tatoeba-MT-models/fiu-zle/opus-2021-02-11.zip
10.9 https://object.pouta.csc.fi/Tatoeba-MT-models/fiu-sla/opus-2021-02-16.zip

View File

@ -1,3 +1,4 @@
0.48918 https://object.pouta.csc.fi/Tatoeba-MT-models/hun-ukr/opusTCv20210807+pbt_transformer-align_2022-03-08.zip
0.47575 https://object.pouta.csc.fi/Tatoeba-MT-models/hun-ukr/opusTCv20210807+bt+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-16.zip
0.397 https://object.pouta.csc.fi/Tatoeba-MT-models/hun-ukr/opus-2021-02-19.zip
0.394 https://object.pouta.csc.fi/Tatoeba-MT-models/fiu-sla/opus-2021-02-16.zip

View File

@ -1,4 +1,5 @@
19.8 https://object.pouta.csc.fi/Tatoeba-MT-models/hun-ukr/opusTCv20210807+pbt_transformer-align_2022-03-08.zip
18.3 https://object.pouta.csc.fi/Tatoeba-MT-models/hun-ukr/opusTCv20210807+bt+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-16.zip
11.6 https://object.pouta.csc.fi/Tatoeba-MT-models/fiu-zle/opus-2021-02-11.zip
11.2 https://object.pouta.csc.fi/Tatoeba-MT-models/hun-ukr/opus-2021-02-19.zip
11.0 https://object.pouta.csc.fi/Tatoeba-MT-models/fiu-sla/opus-2021-02-16.zip

View File

@ -1,2 +1,3 @@
0.49490 https://object.pouta.csc.fi/Tatoeba-MT-models/hun-ukr/opusTCv20210807+pbt_transformer-align_2022-03-08.zip
0.48393 https://object.pouta.csc.fi/Tatoeba-MT-models/hun-ukr/opusTCv20210807+bt+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-16.zip
0.396 https://object.pouta.csc.fi/Tatoeba-MT-models/fiu-sla/opus-2021-02-16.zip

View File

@ -1,4 +1,5 @@
40.7 https://object.pouta.csc.fi/Tatoeba-MT-models/hun-ukr/opus-2021-02-19.zip
40.3 https://object.pouta.csc.fi/Tatoeba-MT-models/hun-ukr/opusTCv20210807+bt+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-16.zip
38.2 https://object.pouta.csc.fi/Tatoeba-MT-models/hun-ukr/opusTCv20210807+pbt_transformer-align_2022-03-08.zip
38.0 https://object.pouta.csc.fi/Tatoeba-MT-models/fiu-sla/opus-2021-02-16.zip
37.7 https://object.pouta.csc.fi/Tatoeba-MT-models/fiu-zle/opus-2021-02-11.zip

View File

@ -1,3 +1,4 @@
0.61575 https://object.pouta.csc.fi/Tatoeba-MT-models/hun-ukr/opusTCv20210807+bt+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-16.zip
0.61129 https://object.pouta.csc.fi/Tatoeba-MT-models/hun-ukr/opusTCv20210807+pbt_transformer-align_2022-03-08.zip
0.611 https://object.pouta.csc.fi/Tatoeba-MT-models/hun-ukr/opus-2021-02-19.zip
0.589 https://object.pouta.csc.fi/Tatoeba-MT-models/fiu-sla/opus-2021-02-16.zip

View File

@ -1,4 +1,5 @@
40.5 https://object.pouta.csc.fi/Tatoeba-MT-models/hun-ukr/opus-2021-02-19.zip
39.8 https://object.pouta.csc.fi/Tatoeba-MT-models/hun-ukr/opusTCv20210807+bt+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-16.zip
37.9 https://object.pouta.csc.fi/Tatoeba-MT-models/hun-ukr/opusTCv20210807+pbt_transformer-align_2022-03-08.zip
37.7 https://object.pouta.csc.fi/Tatoeba-MT-models/fiu-sla/opus-2021-02-16.zip
37.1 https://object.pouta.csc.fi/Tatoeba-MT-models/fiu-zle/opus-2021-02-11.zip

View File

@ -1,3 +1,4 @@
0.61193 https://object.pouta.csc.fi/Tatoeba-MT-models/hun-ukr/opusTCv20210807+bt+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-16.zip
0.60904 https://object.pouta.csc.fi/Tatoeba-MT-models/hun-ukr/opusTCv20210807+pbt_transformer-align_2022-03-08.zip
0.609 https://object.pouta.csc.fi/Tatoeba-MT-models/hun-ukr/opus-2021-02-19.zip
0.585 https://object.pouta.csc.fi/Tatoeba-MT-models/fiu-sla/opus-2021-02-16.zip

View File

@ -1,4 +1,5 @@
40.9 https://object.pouta.csc.fi/Tatoeba-MT-models/hun-ukr/opus-2021-02-19.zip
40.4 https://object.pouta.csc.fi/Tatoeba-MT-models/hun-ukr/opusTCv20210807+bt+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-16.zip
38.2 https://object.pouta.csc.fi/Tatoeba-MT-models/fiu-sla/opus-2021-02-16.zip
38.1 https://object.pouta.csc.fi/Tatoeba-MT-models/hun-ukr/opusTCv20210807+pbt_transformer-align_2022-03-08.zip
37.8 https://object.pouta.csc.fi/Tatoeba-MT-models/fiu-zle/opus-2021-02-11.zip

View File

@ -1,3 +1,4 @@
0.61451 https://object.pouta.csc.fi/Tatoeba-MT-models/hun-ukr/opusTCv20210807+bt+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-16.zip
0.612 https://object.pouta.csc.fi/Tatoeba-MT-models/hun-ukr/opus-2021-02-19.zip
0.61006 https://object.pouta.csc.fi/Tatoeba-MT-models/hun-ukr/opusTCv20210807+pbt_transformer-align_2022-03-08.zip
0.589 https://object.pouta.csc.fi/Tatoeba-MT-models/fiu-sla/opus-2021-02-16.zip

View File

@ -1,2 +1,3 @@
18.2 https://object.pouta.csc.fi/Tatoeba-MT-models/lit-ukr/opusTCv20210807+bt+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-17.zip
7.6 https://object.pouta.csc.fi/Tatoeba-MT-models/ine-ine/opus-2020-07-27.zip
7.1 https://object.pouta.csc.fi/Tatoeba-MT-models/ine-ine/opus-2020-07-21.zip

View File

@ -1,2 +1,3 @@
0.48070 https://object.pouta.csc.fi/Tatoeba-MT-models/lit-ukr/opusTCv20210807+bt+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-17.zip
0.334 https://object.pouta.csc.fi/Tatoeba-MT-models/ine-ine/opus-2020-07-27.zip
0.324 https://object.pouta.csc.fi/Tatoeba-MT-models/ine-ine/opus-2020-07-21.zip

View File

@ -1,2 +1,3 @@
18.5 https://object.pouta.csc.fi/Tatoeba-MT-models/lit-ukr/opusTCv20210807+bt+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-17.zip
7.2 https://object.pouta.csc.fi/Tatoeba-MT-models/ine-ine/opus-2020-07-27.zip
6.8 https://object.pouta.csc.fi/Tatoeba-MT-models/ine-ine/opus-2020-07-21.zip

View File

@ -1,2 +1,3 @@
0.48759 https://object.pouta.csc.fi/Tatoeba-MT-models/lit-ukr/opusTCv20210807+bt+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-17.zip
0.332 https://object.pouta.csc.fi/Tatoeba-MT-models/ine-ine/opus-2020-07-27.zip
0.322 https://object.pouta.csc.fi/Tatoeba-MT-models/ine-ine/opus-2020-07-21.zip

View File

@ -1,4 +1,5 @@
17.3 https://object.pouta.csc.fi/Tatoeba-MT-models/zlw-zle/opusTCv20210807+bt_transformer-big_2022-03-07.zip
15.6 https://object.pouta.csc.fi/Tatoeba-MT-models/pol-ukr/opusTCv20210807+bt+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-16.zip
11.9 https://object.pouta.csc.fi/Tatoeba-MT-models/pol-ukr/opus-2021-02-18.zip
11.7 https://object.pouta.csc.fi/Tatoeba-MT-models/sla-sla/opus-2020-07-27.zip
11.6 https://object.pouta.csc.fi/Tatoeba-MT-models/sla-sla/opus-2020-10-04.zip

View File

@ -1,4 +1,5 @@
0.46995 https://object.pouta.csc.fi/Tatoeba-MT-models/zlw-zle/opusTCv20210807+bt_transformer-big_2022-03-07.zip
0.46068 https://object.pouta.csc.fi/Tatoeba-MT-models/pol-ukr/opusTCv20210807+bt+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-16.zip
0.405 https://object.pouta.csc.fi/Tatoeba-MT-models/sla-sla/opus-2020-07-27.zip
0.404 https://object.pouta.csc.fi/Tatoeba-MT-models/sla-sla/opus-2020-09-26.zip
0.402 https://object.pouta.csc.fi/Tatoeba-MT-models/sla-sla/opus-2020-10-04.zip

View File

@ -1,4 +1,5 @@
17.5 https://object.pouta.csc.fi/Tatoeba-MT-models/zlw-zle/opusTCv20210807+bt_transformer-big_2022-03-07.zip
15.7 https://object.pouta.csc.fi/Tatoeba-MT-models/pol-ukr/opusTCv20210807+bt+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-16.zip
12.3 https://object.pouta.csc.fi/Tatoeba-MT-models/sla-sla/opus-2020-07-27.zip
12.0 https://object.pouta.csc.fi/Tatoeba-MT-models/pol-ukr/opus-2021-02-18.zip
11.8 https://object.pouta.csc.fi/Tatoeba-MT-models/sla-sla/opus-2020-10-04.zip

View File

@ -1,4 +1,5 @@
0.47503 https://object.pouta.csc.fi/Tatoeba-MT-models/zlw-zle/opusTCv20210807+bt_transformer-big_2022-03-07.zip
0.46388 https://object.pouta.csc.fi/Tatoeba-MT-models/pol-ukr/opusTCv20210807+bt+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-16.zip
0.411 https://object.pouta.csc.fi/Tatoeba-MT-models/sla-sla/opus-2020-07-27.zip
0.408 https://object.pouta.csc.fi/Tatoeba-MT-models/sla-sla/opus-2020-09-26.zip
0.407 https://object.pouta.csc.fi/Tatoeba-MT-models/sla-sla/opus-2020-07-21.zip

View File

@ -1,4 +1,5 @@
48.3 https://object.pouta.csc.fi/Tatoeba-MT-models/zlw-zle/opusTCv20210807+bt_transformer-big_2022-03-07.zip
48.1 https://object.pouta.csc.fi/Tatoeba-MT-models/pol-ukr/opusTCv20210807+bt+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-16.zip
47.1 https://object.pouta.csc.fi/Tatoeba-MT-models/pol-ukr/opus-2021-02-18.zip
46.7 https://object.pouta.csc.fi/Tatoeba-MT-models/sla-sla/opus-2020-07-27.zip
46.6 https://object.pouta.csc.fi/Tatoeba-MT-models/sla-sla/opus-2020-07-21.zip

View File

@ -1,3 +1,4 @@
0.68443 https://object.pouta.csc.fi/Tatoeba-MT-models/pol-ukr/opusTCv20210807+bt+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-16.zip
0.67941 https://object.pouta.csc.fi/Tatoeba-MT-models/zlw-zle/opusTCv20210807+bt_transformer-big_2022-03-07.zip
0.664 https://object.pouta.csc.fi/Tatoeba-MT-models/sla-sla/opus-2020-07-21.zip
0.663 https://object.pouta.csc.fi/Tatoeba-MT-models/pol-ukr/opus-2021-02-18.zip

View File

@ -1,4 +1,5 @@
48.3 https://object.pouta.csc.fi/Tatoeba-MT-models/zlw-zle/opusTCv20210807+bt_transformer-big_2022-03-07.zip
48.1 https://object.pouta.csc.fi/Tatoeba-MT-models/pol-ukr/opusTCv20210807+bt+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-16.zip
47.1 https://object.pouta.csc.fi/Tatoeba-MT-models/pol-ukr/opus-2021-02-18.zip
46.9 https://object.pouta.csc.fi/Tatoeba-MT-models/sla-sla/opus-2020-07-27.zip
46.6 https://object.pouta.csc.fi/Tatoeba-MT-models/sla-sla/opus-2020-07-21.zip

View File

@ -1,3 +1,4 @@
0.68463 https://object.pouta.csc.fi/Tatoeba-MT-models/pol-ukr/opusTCv20210807+bt+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-16.zip
0.67911 https://object.pouta.csc.fi/Tatoeba-MT-models/zlw-zle/opusTCv20210807+bt_transformer-big_2022-03-07.zip
0.664 https://object.pouta.csc.fi/Tatoeba-MT-models/sla-sla/opus-2020-07-27.zip
0.663 https://object.pouta.csc.fi/Tatoeba-MT-models/pol-ukr/opus-2021-02-18.zip

View File

@ -1,4 +1,5 @@
48.4 https://object.pouta.csc.fi/Tatoeba-MT-models/zlw-zle/opusTCv20210807+bt_transformer-big_2022-03-07.zip
48.2 https://object.pouta.csc.fi/Tatoeba-MT-models/pol-ukr/opusTCv20210807+bt+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-16.zip
47.0 https://object.pouta.csc.fi/Tatoeba-MT-models/pol-ukr/opus-2021-02-18.zip
46.8 https://object.pouta.csc.fi/Tatoeba-MT-models/sla-sla/opus-2020-07-27.zip
46.6 https://object.pouta.csc.fi/Tatoeba-MT-models/sla-sla/opus-2020-07-21.zip

View File

@ -1,3 +1,4 @@
0.68493 https://object.pouta.csc.fi/Tatoeba-MT-models/pol-ukr/opusTCv20210807+bt+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-16.zip
0.67968 https://object.pouta.csc.fi/Tatoeba-MT-models/zlw-zle/opusTCv20210807+bt_transformer-big_2022-03-07.zip
0.664 https://object.pouta.csc.fi/Tatoeba-MT-models/sla-sla/opus-2020-07-21.zip
0.662 https://object.pouta.csc.fi/Tatoeba-MT-models/pol-ukr/opus-2021-02-18.zip

View File

@ -1,4 +1,5 @@
23.4 https://object.pouta.csc.fi/Tatoeba-MT-models/ron-ukr/opusTCv20210807+pbt_transformer-align_2022-03-08.zip
21.6 https://object.pouta.csc.fi/Tatoeba-MT-models/ron-ukr/opusTCv20210807+bt+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-16.zip
13.5 https://object.pouta.csc.fi/Tatoeba-MT-models/roa-zle/opus1m-2021-02-18.zip
9.2 https://object.pouta.csc.fi/Tatoeba-MT-models/ine-ine/opus-2020-07-27.zip
8.9 https://object.pouta.csc.fi/Tatoeba-MT-models/ine-ine/opus-2020-07-21.zip

View File

@ -1,4 +1,5 @@
0.52131 https://object.pouta.csc.fi/Tatoeba-MT-models/ron-ukr/opusTCv20210807+pbt_transformer-align_2022-03-08.zip
0.50866 https://object.pouta.csc.fi/Tatoeba-MT-models/ron-ukr/opusTCv20210807+bt+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-16.zip
0.423 https://object.pouta.csc.fi/Tatoeba-MT-models/roa-zle/opus1m-2021-02-18.zip
0.365 https://object.pouta.csc.fi/Tatoeba-MT-models/ine-ine/opus-2020-07-27.zip
0.359 https://object.pouta.csc.fi/Tatoeba-MT-models/ine-ine/opus-2020-07-21.zip

View File

@ -1,4 +1,5 @@
22.3 https://object.pouta.csc.fi/Tatoeba-MT-models/ron-ukr/opusTCv20210807+pbt_transformer-align_2022-03-08.zip
21.7 https://object.pouta.csc.fi/Tatoeba-MT-models/ron-ukr/opusTCv20210807+bt+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-16.zip
13.8 https://object.pouta.csc.fi/Tatoeba-MT-models/roa-zle/opus1m-2021-02-18.zip
9.4 https://object.pouta.csc.fi/Tatoeba-MT-models/ine-ine/opus-2020-07-27.zip
8.6 https://object.pouta.csc.fi/Tatoeba-MT-models/ine-ine/opus-2020-07-21.zip

View File

@ -1,4 +1,5 @@
0.52391 https://object.pouta.csc.fi/Tatoeba-MT-models/ron-ukr/opusTCv20210807+pbt_transformer-align_2022-03-08.zip
0.51692 https://object.pouta.csc.fi/Tatoeba-MT-models/ron-ukr/opusTCv20210807+bt+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-16.zip
0.427 https://object.pouta.csc.fi/Tatoeba-MT-models/roa-zle/opus1m-2021-02-18.zip
0.368 https://object.pouta.csc.fi/Tatoeba-MT-models/ine-ine/opus-2020-07-27.zip
0.359 https://object.pouta.csc.fi/Tatoeba-MT-models/ine-ine/opus-2020-07-21.zip

View File

@ -1,3 +1,4 @@
23.5 https://object.pouta.csc.fi/Tatoeba-MT-models/swe-ukr/opusTCv20210807+pbt_transformer-align_2022-03-07.zip
21.2 https://object.pouta.csc.fi/Tatoeba-MT-models/swe-ukr/opusTCv20210807+bt+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-16.zip
9.6 https://object.pouta.csc.fi/Tatoeba-MT-models/ine-ine/opus-2020-07-27.zip
8.9 https://object.pouta.csc.fi/Tatoeba-MT-models/ine-ine/opus-2020-07-21.zip

View File

@ -1,3 +1,4 @@
0.52149 https://object.pouta.csc.fi/Tatoeba-MT-models/swe-ukr/opusTCv20210807+pbt_transformer-align_2022-03-07.zip
0.50707 https://object.pouta.csc.fi/Tatoeba-MT-models/swe-ukr/opusTCv20210807+bt+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-16.zip
0.359 https://object.pouta.csc.fi/Tatoeba-MT-models/ine-ine/opus-2020-07-27.zip
0.348 https://object.pouta.csc.fi/Tatoeba-MT-models/ine-ine/opus-2020-07-21.zip

View File

@ -1,3 +1,4 @@
24.8 https://object.pouta.csc.fi/Tatoeba-MT-models/swe-ukr/opusTCv20210807+pbt_transformer-align_2022-03-07.zip
21.7 https://object.pouta.csc.fi/Tatoeba-MT-models/swe-ukr/opusTCv20210807+bt+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-16.zip
9.1 https://object.pouta.csc.fi/Tatoeba-MT-models/ine-ine/opus-2020-07-27.zip
8.6 https://object.pouta.csc.fi/Tatoeba-MT-models/ine-ine/opus-2020-07-21.zip

View File

@ -1,3 +1,4 @@
0.53176 https://object.pouta.csc.fi/Tatoeba-MT-models/swe-ukr/opusTCv20210807+pbt_transformer-align_2022-03-07.zip
0.50968 https://object.pouta.csc.fi/Tatoeba-MT-models/swe-ukr/opusTCv20210807+bt+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-16.zip
0.363 https://object.pouta.csc.fi/Tatoeba-MT-models/ine-ine/opus-2020-07-27.zip
0.351 https://object.pouta.csc.fi/Tatoeba-MT-models/ine-ine/opus-2020-07-21.zip

View File

@ -1,5 +1,6 @@
18.9 https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-fin/opusTCv20210807+pft_transformer-align_2022-03-07.zip
18.7 https://object.pouta.csc.fi/Tatoeba-MT-models/zle-fin/opusTCv20210807+bt_transformer-big_2022-03-07.zip
16.8 https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-fin/opusTCv20210807+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-16.zip
11.9 https://object.pouta.csc.fi/Tatoeba-MT-models/sla-fiu/opus-2021-02-10.zip
11.6 https://object.pouta.csc.fi/Tatoeba-MT-models/zle-fiu/opus-2021-02-10.zip
6.0 https://object.pouta.csc.fi/Tatoeba-MT-models/tatoeba-zero/opus-2020-06-21.zip

View File

@ -1,5 +1,6 @@
0.53196 https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-fin/opusTCv20210807+pft_transformer-align_2022-03-07.zip
0.53075 https://object.pouta.csc.fi/Tatoeba-MT-models/zle-fin/opusTCv20210807+bt_transformer-big_2022-03-07.zip
0.51350 https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-fin/opusTCv20210807+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-16.zip
0.448 https://object.pouta.csc.fi/Tatoeba-MT-models/sla-fiu/opus-2021-02-10.zip
0.439 https://object.pouta.csc.fi/Tatoeba-MT-models/zle-fiu/opus-2021-02-10.zip
0.354 https://object.pouta.csc.fi/Tatoeba-MT-models/tatoeba-zero/opus-2020-06-21.zip

View File

@ -1,5 +1,6 @@
18.7 https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-fin/opusTCv20210807+pft_transformer-align_2022-03-07.zip
18.0 https://object.pouta.csc.fi/Tatoeba-MT-models/zle-fin/opusTCv20210807+bt_transformer-big_2022-03-07.zip
16.7 https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-fin/opusTCv20210807+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-16.zip
11.6 https://object.pouta.csc.fi/Tatoeba-MT-models/sla-fiu/opus-2021-02-10.zip
11.5 https://object.pouta.csc.fi/Tatoeba-MT-models/zle-fiu/opus-2021-02-10.zip
5.6 https://object.pouta.csc.fi/Tatoeba-MT-models/tatoeba-zero/opus-2020-06-19.zip

View File

@ -1,5 +1,6 @@
0.54119 https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-fin/opusTCv20210807+pft_transformer-align_2022-03-07.zip
0.53440 https://object.pouta.csc.fi/Tatoeba-MT-models/zle-fin/opusTCv20210807+bt_transformer-big_2022-03-07.zip
0.52174 https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-fin/opusTCv20210807+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-16.zip
0.456 https://object.pouta.csc.fi/Tatoeba-MT-models/sla-fiu/opus-2021-02-10.zip
0.447 https://object.pouta.csc.fi/Tatoeba-MT-models/zle-fiu/opus-2021-02-10.zip
0.357 https://object.pouta.csc.fi/Tatoeba-MT-models/tatoeba-zero/opus-2020-06-19.zip

View File

@ -1,3 +1,4 @@
21.2 https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-hun/opusTCv20210807+pft_transformer-align_2022-03-08.zip
19.6 https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-hun/opusTCv20210807+bt+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-16.zip
14.2 https://object.pouta.csc.fi/Tatoeba-MT-models/sla-fiu/opus-2021-02-10.zip
13.5 https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-hun/opus-2021-02-18.zip

View File

@ -1,4 +1,5 @@
0.52022 https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-hun/opusTCv20210807+pft_transformer-align_2022-03-08.zip
0.50315 https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-hun/opusTCv20210807+bt+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-16.zip
0.443 https://object.pouta.csc.fi/Tatoeba-MT-models/sla-fiu/opus-2021-02-10.zip
0.433 https://object.pouta.csc.fi/Tatoeba-MT-models/zle-fiu/opus-2021-02-10.zip
0.432 https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-hun/opus-2021-02-18.zip

View File

@ -1,4 +1,5 @@
20.2 https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-hun/opusTCv20210807+pft_transformer-align_2022-03-08.zip
18.8 https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-hun/opusTCv20210807+bt+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-16.zip
14.3 https://object.pouta.csc.fi/Tatoeba-MT-models/sla-fiu/opus-2021-02-10.zip
13.3 https://object.pouta.csc.fi/Tatoeba-MT-models/zle-fiu/opus-2021-02-10.zip
13.1 https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-hun/opus-2021-02-18.zip

View File

@ -1,4 +1,5 @@
0.51953 https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-hun/opusTCv20210807+pft_transformer-align_2022-03-08.zip
0.50442 https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-hun/opusTCv20210807+bt+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-16.zip
0.452 https://object.pouta.csc.fi/Tatoeba-MT-models/sla-fiu/opus-2021-02-10.zip
0.441 https://object.pouta.csc.fi/Tatoeba-MT-models/zle-fiu/opus-2021-02-10.zip
0.438 https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-hun/opus-2021-02-18.zip

View File

@ -1,4 +1,5 @@
43.9 https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-hun/opusTCv20210807+pft_transformer-align_2022-03-08.zip
42.4 https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-hun/opusTCv20210807+bt+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-16.zip
41.1 https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-hun/opus-2021-02-18.zip
40.0 https://object.pouta.csc.fi/Tatoeba-MT-models/sla-fiu/opus-2021-02-10.zip
39.6 https://object.pouta.csc.fi/Tatoeba-MT-models/zle-fiu/opus-2021-02-10.zip

View File

@ -1,3 +1,4 @@
0.67495 https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-hun/opusTCv20210807+pft_transformer-align_2022-03-08.zip
0.66780 https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-hun/opusTCv20210807+bt+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-16.zip
0.646 https://object.pouta.csc.fi/Tatoeba-MT-models/sla-fiu/opus-2021-02-10.zip
0.645 https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-hun/opus-2021-02-18.zip

View File

@ -1,4 +1,5 @@
43.8 https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-hun/opusTCv20210807+pft_transformer-align_2022-03-08.zip
42.2 https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-hun/opusTCv20210807+bt+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-16.zip
40.7 https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-hun/opus-2021-02-18.zip
39.9 https://object.pouta.csc.fi/Tatoeba-MT-models/sla-fiu/opus-2021-02-10.zip
39.3 https://object.pouta.csc.fi/Tatoeba-MT-models/zle-fiu/opus-2021-02-10.zip

View File

@ -1,4 +1,5 @@
0.67383 https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-hun/opusTCv20210807+pft_transformer-align_2022-03-08.zip
0.66714 https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-hun/opusTCv20210807+bt+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-16.zip
0.645 https://object.pouta.csc.fi/Tatoeba-MT-models/sla-fiu/opus-2021-02-10.zip
0.643 https://object.pouta.csc.fi/Tatoeba-MT-models/zle-fiu/opus-2021-02-10.zip
0.642 https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-hun/opus-2021-02-18.zip

View File

@ -1,4 +1,5 @@
44.0 https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-hun/opusTCv20210807+pft_transformer-align_2022-03-08.zip
42.5 https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-hun/opusTCv20210807+bt+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-16.zip
41.4 https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-hun/opus-2021-02-18.zip
40.1 https://object.pouta.csc.fi/Tatoeba-MT-models/sla-fiu/opus-2021-02-10.zip
39.8 https://object.pouta.csc.fi/Tatoeba-MT-models/zle-fiu/opus-2021-02-10.zip

View File

@ -1,3 +1,4 @@
0.67544 https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-hun/opusTCv20210807+pft_transformer-align_2022-03-08.zip
0.66840 https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-hun/opusTCv20210807+bt+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-16.zip
0.647 https://object.pouta.csc.fi/Tatoeba-MT-models/sla-fiu/opus-2021-02-10.zip
0.646 https://object.pouta.csc.fi/Tatoeba-MT-models/zle-fiu/opus-2021-02-10.zip

View File

@ -1,2 +1,3 @@
20.6 https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-lit/opusTCv20210807+bt+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-17.zip
7.8 https://object.pouta.csc.fi/Tatoeba-MT-models/ine-ine/opus-2020-07-27.zip
7.6 https://object.pouta.csc.fi/Tatoeba-MT-models/ine-ine/opus-2020-07-21.zip

View File

@ -1,2 +1,3 @@
0.52788 https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-lit/opusTCv20210807+bt+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-17.zip
0.367 https://object.pouta.csc.fi/Tatoeba-MT-models/ine-ine/opus-2020-07-27.zip
0.358 https://object.pouta.csc.fi/Tatoeba-MT-models/ine-ine/opus-2020-07-21.zip

View File

@ -1,2 +1,3 @@
21.0 https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-lit/opusTCv20210807+bt+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-17.zip
8.0 https://object.pouta.csc.fi/Tatoeba-MT-models/ine-ine/opus-2020-07-27.zip
7.6 https://object.pouta.csc.fi/Tatoeba-MT-models/ine-ine/opus-2020-07-21.zip

View File

@ -1,2 +1,3 @@
0.53907 https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-lit/opusTCv20210807+bt+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-17.zip
0.372 https://object.pouta.csc.fi/Tatoeba-MT-models/ine-ine/opus-2020-07-27.zip
0.362 https://object.pouta.csc.fi/Tatoeba-MT-models/ine-ine/opus-2020-07-21.zip

View File

@ -1,4 +1,5 @@
27.8 https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-ron/opusTCv20210807+pft_transformer-align_2022-03-08.zip
27.6 https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-ron/opusTCv20210807+bt+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-16.zip
15.3 https://object.pouta.csc.fi/Tatoeba-MT-models/zle-roa/opus1m-2021-02-18.zip
9.8 https://object.pouta.csc.fi/Tatoeba-MT-models/ine-ine/opus-2020-07-27.zip
9.1 https://object.pouta.csc.fi/Tatoeba-MT-models/ine-ine/opus-2020-07-21.zip

View File

@ -1,4 +1,5 @@
0.55091 https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-ron/opusTCv20210807+pft_transformer-align_2022-03-08.zip
0.54743 https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-ron/opusTCv20210807+bt+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-16.zip
0.448 https://object.pouta.csc.fi/Tatoeba-MT-models/zle-roa/opus1m-2021-02-18.zip
0.380 https://object.pouta.csc.fi/Tatoeba-MT-models/ine-ine/opus-2020-07-27.zip
0.371 https://object.pouta.csc.fi/Tatoeba-MT-models/ine-ine/opus-2020-07-21.zip

View File

@ -1,4 +1,5 @@
27.7 https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-ron/opusTCv20210807+pft_transformer-align_2022-03-08.zip
26.8 https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-ron/opusTCv20210807+bt+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-16.zip
15.3 https://object.pouta.csc.fi/Tatoeba-MT-models/zle-roa/opus1m-2021-02-18.zip
9.9 https://object.pouta.csc.fi/Tatoeba-MT-models/ine-ine/opus-2020-07-27.zip
9.3 https://object.pouta.csc.fi/Tatoeba-MT-models/ine-ine/opus-2020-07-21.zip

View File

@ -1,4 +1,5 @@
0.55343 https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-ron/opusTCv20210807+pft_transformer-align_2022-03-08.zip
0.54673 https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-ron/opusTCv20210807+bt+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-16.zip
0.449 https://object.pouta.csc.fi/Tatoeba-MT-models/zle-roa/opus1m-2021-02-18.zip
0.379 https://object.pouta.csc.fi/Tatoeba-MT-models/ine-ine/opus-2020-07-27.zip
0.369 https://object.pouta.csc.fi/Tatoeba-MT-models/ine-ine/opus-2020-07-21.zip

View File

@ -1,3 +1,4 @@
28.6 https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-swe/opusTCv20210807+pft_transformer-align_2022-03-07.zip
25.9 https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-swe/opusTCv20210807+bt+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-16.zip
13.2 https://object.pouta.csc.fi/Tatoeba-MT-models/ine-ine/opus-2020-07-27.zip
12.4 https://object.pouta.csc.fi/Tatoeba-MT-models/ine-ine/opus-2020-07-21.zip

View File

@ -1,3 +1,4 @@
0.57272 https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-swe/opusTCv20210807+pft_transformer-align_2022-03-07.zip
0.55350 https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-swe/opusTCv20210807+bt+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-16.zip
0.411 https://object.pouta.csc.fi/Tatoeba-MT-models/ine-ine/opus-2020-07-27.zip
0.401 https://object.pouta.csc.fi/Tatoeba-MT-models/ine-ine/opus-2020-07-21.zip

View File

@ -1,3 +1,4 @@
28.2 https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-swe/opusTCv20210807+pft_transformer-align_2022-03-07.zip
25.5 https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-swe/opusTCv20210807+bt+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-16.zip
12.3 https://object.pouta.csc.fi/Tatoeba-MT-models/ine-ine/opus-2020-07-27.zip
11.6 https://object.pouta.csc.fi/Tatoeba-MT-models/ine-ine/opus-2020-07-21.zip

View File

@ -1,3 +1,4 @@
0.57231 https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-swe/opusTCv20210807+pft_transformer-align_2022-03-07.zip
0.55204 https://object.pouta.csc.fi/Tatoeba-MT-models/ukr-swe/opusTCv20210807+bt+pbt+pft-sepvoc_transformer-tiny11-align_2022-03-16.zip
0.406 https://object.pouta.csc.fi/Tatoeba-MT-models/ine-ine/opus-2020-07-27.zip
0.396 https://object.pouta.csc.fi/Tatoeba-MT-models/ine-ine/opus-2020-07-21.zip

View File

@ -121,7 +121,6 @@ include ${REPOHOME}lib/projects/distill.mk
include ${REPOHOME}lib/projects/elg.mk
.PHONY: all
all:
${MAKE} tatoeba-prepare

View File

@ -23,8 +23,8 @@ TRG = eng
## release tag; it is part of the TATOEBA_STORAGE URL below
TATOEBA_RELEASE = v2020-07-28
## base URLs on object.pouta.csc.fi (the first one is versioned by ${TATOEBA_RELEASE})
TATOEBA_STORAGE = https://object.pouta.csc.fi/Tatoeba-Challenge-${TATOEBA_RELEASE}
TATOEBA_WIKI_STORAGE = https://object.pouta.csc.fi/Tatoeba-Challenge-WikiShuffled
## base URL for files taken from the Tatoeba-Challenge GitHub repository
## NOTE(review): TATOEBA_GITRAW is assigned twice below (github.com/.../raw,
## then raw.githubusercontent.com/.../master) and each form also appears
## commented out. make uses the last assignment, i.e. the
## raw.githubusercontent.com URL -- confirm that the github.com assignment
## is a leftover of the previous version and can be dropped.
# TATOEBA_GITRAW = https://raw.githubusercontent.com/Helsinki-NLP/Tatoeba-Challenge/master
TATOEBA_GITRAW = https://github.com/Helsinki-NLP/Tatoeba-Challenge/raw
TATOEBA_GITRAW = https://raw.githubusercontent.com/Helsinki-NLP/Tatoeba-Challenge/master
# TATOEBA_GITRAW = https://github.com/Helsinki-NLP/Tatoeba-Challenge/raw
## index files: released-model result lists below ${TATOEBA_GITRAW}/models
## and the released-data list below Tatoeba-MT-bt
TATOEBA_RELEASED = ${TATOEBA_GITRAW}/models/released-model-results.txt
TATOEBA_RELEASED_ALL = ${TATOEBA_GITRAW}/models/released-model-results-all.txt
TATOEBA_RELEASED_BT = https://object.pouta.csc.fi/Tatoeba-MT-bt/released-data.txt
@ -60,13 +60,13 @@ PWD := $(shell pwd)
## MODELZIP: taken from the first line of ${TATOEBA_RELEASED_ALL} that starts
## with ${LANGPAIR}; the value is that line's 4th field (cut's default
## delimiter is TAB).
## NOTE(review): every `:= ${shell ...}` assignment below appears twice, once
## with a literal `wget` and once with `${WGET}`. Both are evaluated when the
## makefile is read and the second overrides the first; they look like the
## before/after form of one line -- confirm that only `${WGET}` should remain.
# MODELZIP = https://object.pouta.csc.fi/Tatoeba-Challenge/ang.tar
MODELZIP := ${shell wget -qq -O - ${TATOEBA_RELEASED_ALL} | grep '^${LANGPAIR}' | head -1 | cut -f4}
MODELZIP := ${shell ${WGET} -qq -O - ${TATOEBA_RELEASED_ALL} | grep '^${LANGPAIR}' | head -1 | cut -f4}
## MODELINFO: MODELZIP with the ${TATOEBA_MODEL_STORAGE}/ prefix replaced by
## ${TATOEBA_GITRAW}/models/ and the .zip suffix replaced by .yml
MODELINFO = ${patsubst ${TATOEBA_MODEL_STORAGE}/%.zip,${TATOEBA_GITRAW}/models/%.yml,${MODELZIP}}
## MODELNAME: file name of MODELZIP without directory and without .zip
MODELNAME = ${patsubst %.zip,%,${notdir ${MODELZIP}}}
## MULTI_TARGET_MODEL: number of lines in ${MODELINFO} containing 'use-target-labels'
MULTI_TARGET_MODEL := ${shell wget -qq -O - ${MODELINFO} | grep 'use-target-labels' | wc -l}
MULTI_TARGET_MODEL := ${shell ${WGET} -qq -O - ${MODELINFO} | grep 'use-target-labels' | wc -l}
## only when that count differs from 0: TARGET_LANG_LABEL is set to the text
## in ${MODELINFO} that matches `>>${TRG}.*<<` (grep -o prints only the match)
ifneq (${MULTI_TARGET_MODEL},0)
TARGET_LANG_LABEL := ${shell wget -qq -O - ${MODELINFO} | grep -o '>>${TRG}.*<<'}
TARGET_LANG_LABEL := ${shell ${WGET} -qq -O - ${MODELINFO} | grep -o '>>${TRG}.*<<'}
endif
@ -78,11 +78,11 @@ TATOEBA_MACRO_LANGS = hbs nor msa
## target languages of reliable models for current source language
## reliable is defined as a BLEU score of 20.0 or higher
##
TATOEBA_RELIABLE_TRG_BLEU := ${shell wget -qq -O - ${TATOEBA_RELEASED} | grep '^${SRC}-' | \
TATOEBA_RELIABLE_TRG_BLEU := ${shell ${WGET} -qq -O - ${TATOEBA_RELEASED} | grep '^${SRC}-' | \
egrep '\s[2-9][0-9]\.' | cut -f1 | sort -u | cut -f2 -d-}
## alternative: chr-F2 >= 0.4
TATOEBA_RELIABLE_TRG_CHRF := ${shell wget -qq -O - ${TATOEBA_RELEASED} | grep '^${SRC}-' | \
TATOEBA_RELIABLE_TRG_CHRF := ${shell ${WGET} -qq -O - ${TATOEBA_RELEASED} | grep '^${SRC}-' | \
egrep '[a-z]\s0\.[4-9]' | cut -f1 | sort -u | cut -f2 -d-}
## accept both
@ -95,12 +95,12 @@ TATOEBA_RELIABLE_TRG = $(filter-out ${TATOEBA_MACRO_LANGS},$(sort ${TATOEBA_RELI
#####################################################################################
## all "reliable" released translation models
# TATOEBA_AVAILABLE_NMT := ${shell wget -qq -O - ${TATOEBA_RELEASED} | egrep '\s[2-9][0-9]\.' | cut -f1 | sort -u}
# TATOEBA_AVAILABLE_NMT := ${shell ${WGET} -qq -O - ${TATOEBA_RELEASED} | egrep '\s[2-9][0-9]\.' | cut -f1 | sort -u}
TATOEBA_RELIABLE_SRC_BLEU := ${shell wget -qq -O - ${TATOEBA_RELEASED} | grep -- '-${TRG} ' | \
TATOEBA_RELIABLE_SRC_BLEU := ${shell ${WGET} -qq -O - ${TATOEBA_RELEASED} | grep -- '-${TRG} ' | \
egrep '\s[2-9][0-9]\.' | cut -f1 | sort -u | cut -f1 -d-}
TATOEBA_RELIABLE_SRC_CHRF := ${shell wget -qq -O - ${TATOEBA_RELEASED} | grep -- '-${TRG} ' | \
TATOEBA_RELIABLE_SRC_CHRF := ${shell ${WGET} -qq -O - ${TATOEBA_RELEASED} | grep -- '-${TRG} ' | \
egrep '[a-z]\s0\.[4-9]' | cut -f1 | sort -u | cut -f1 -d-}
TATOEBA_RELIABLE_SRC = $(sort ${TATOEBA_RELIABLE_SRC_BLEU} ${TATOEBA_RELIABLE_SRC_CHRF})
@ -108,7 +108,7 @@ TATOEBA_RELIABLE_SRC = $(sort ${TATOEBA_RELIABLE_SRC_BLEU} ${TATOEBA_RELIABLE_SR
## TODO: is it OK to turn zho into cmn?
## NOTE: also needs to fix the grep pattern in recipe for ${WIKI_DIR}/${SRC} !!!!
TATOEBA_WIKILANGS := ${shell wget -qq -O - ${TATOEBA_GITRAW}/data/release/${TATOEBA_RELEASE}/wiki.langs.txt | \
TATOEBA_WIKILANGS := ${shell ${WGET} -qq -O - ${TATOEBA_GITRAW}/data/release/${TATOEBA_RELEASE}/wiki.langs.txt | \
cut -f2 | sed 's/zho/cmn/' | sed 's/nor.*/nob/' | sort -u }
TATOEBA_TRANSLATABLE_WIKILANGS := ${filter ${TATOEBA_RELIABLE_SRC},${TATOEBA_WIKILANGS}}
@ -129,7 +129,7 @@ print-wikilangs:
### OBSOLETE??
## languages of released wikis
RELEASED_WIKIS := $(patsubst %.tar,%,${shell wget -qq -O - ${TATOEBA_GITRAW}/Wiki.md | \
RELEASED_WIKIS := $(patsubst %.tar,%,${shell ${WGET} -qq -O - ${TATOEBA_GITRAW}/Wiki.md | \
grep -o 'WikiShuffled/...\.tar' | cut -f2 -d'/'})
## reverse list
@ -244,21 +244,21 @@ src2all:
RELEASED_BT_ALL := ${shell wget -qq -O - ${TATOEBA_RELEASED_BT}}
RELEASED_BT := ${shell wget -qq -O - ${TATOEBA_RELEASED_BT} | grep '^${LANGPAIR}/'}
RELEASED_BT_ALL := ${shell ${WGET} -qq -O - ${TATOEBA_RELEASED_BT}}
RELEASED_BT := ${shell ${WGET} -qq -O - ${TATOEBA_RELEASED_BT} | grep '^${LANGPAIR}/'}
fetch-bt:
for d in ${RELEASED_BT}; do \
echo "fetch $$d"; \
mkdir -p `dirname $$d`; \
wget -qq -O $$d https://object.pouta.csc.fi/Tatoeba-MT-bt/$$d; \
${WGET} -qq -O $$d https://object.pouta.csc.fi/Tatoeba-MT-bt/$$d; \
done
fetch-all-bt:
for d in ${RELEASED_BT_ALL}; do \
echo "fetch $$d"; \
mkdir -p `dirname $$d`; \
wget -qq -O $$d https://object.pouta.csc.fi/Tatoeba-MT-bt/$$d; \
${WGET} -qq -O $$d https://object.pouta.csc.fi/Tatoeba-MT-bt/$$d; \
done
@ -413,7 +413,7 @@ print-modelinfo:
${LANGPAIR}/${MODELNAME}/decoder.yml:
ifneq (${MODELZIP},)
mkdir -p ${dir $@}
wget -O ${dir $@}/model.zip ${MODELZIP}
${WGET} -O ${dir $@}/model.zip ${MODELZIP}
cd ${dir $@} && unzip model.zip
rm -f ${dir $@}/model.zip
mv ${dir $@}/preprocess.sh ${dir $@}/preprocess-old.sh
@ -451,7 +451,7 @@ ${WIKI_DIR}/${SRC}/%.txt.gz: ${WIKI_DIR}/${SRC}/.done
# fetch
${WIKI_DIR}/${SRC}/data:
mkdir -p ${dir $@}
wget -O $@.tar ${TATOEBA_STORAGE}/${shell iso639 -m -n ${SRC}}.tar
${WGET} -O $@.tar ${TATOEBA_STORAGE}/${shell iso639 -m -n ${SRC}}.tar
tar -C ${dir $@} -xf $@.tar
rm -f $@.tar
@ -486,7 +486,7 @@ ${WIKI_DIR}/${SRC}/.done:
# ${WIKI_DIR}/${SRC}:
# mkdir -p $@
# wget -O $@.tar ${TATOEBA_WIKI_STORAGE}/${SRC}.tar
# ${WGET} -O $@.tar ${TATOEBA_WIKI_STORAGE}/${SRC}.tar
# tar -C ${dir $@} -xf $@.tar
# if [ -d ${WIKI_DIR}/data/${SRC} ]; then \
# mv ${WIKI_DIR}/data/${SRC}/*.txt.gz $@/;\

View File

@ -20,7 +20,7 @@ GPUJOB_HPC_MEM = 20g
MODEL_STORAGE := https://object.pouta.csc.fi/Tatoeba-MT-models
MODEL_DISTS := ${shell wget -q -O - ${MODEL_STORAGE}/index.txt | grep '.zip$$' | grep -v '.eval.zip$$'}
MODEL_DISTS := ${shell ${WGET} -q -O - ${MODEL_STORAGE}/index.txt | grep '.zip$$' | grep -v '.eval.zip$$'}
MODEL_DIST = ${firstword ${MODEL_DISTS}}
MODEL = ${MODEL_DIST:.zip=}
MODEL_LANGPAIR = ${firstword ${subst /, ,${MODEL_DIST}}}
@ -151,7 +151,7 @@ fetch: ${WORK_DIR}/model/decoder.yml
${WORK_DIR}/model/decoder.yml:
mkdir -p ${dir $@}
wget -q -O ${dir $@}model.zip ${MODEL_URL}
${WGET} -q -O ${dir $@}model.zip ${MODEL_URL}
unzip -d ${dir $@} ${dir $@}model.zip
## fix an old problem with the pre-process script
mv ${dir $@}preprocess.sh ${dir $@}preprocess-old.sh

View File

@ -70,16 +70,16 @@ PWD := $(shell pwd)
# MODELZIP = https://object.pouta.csc.fi/Tatoeba-Challenge/ang.tar
MODELZIP := ${shell wget -qq -O - ${TATOEBA_RELEASED} | grep '^${LANGPAIR}' | head -1 | cut -f4}
MODELZIP := ${shell ${WGET} -qq -O - ${TATOEBA_RELEASED} | grep '^${LANGPAIR}' | head -1 | cut -f4}
MODELINFO = ${patsubst ${TATOEBA_MODEL_STORAGE}/%.zip,${TATOEBA_GITRAW}/models/%.yml,${MODELZIP}}
MODELNAME = ${patsubst %.zip,%,${notdir ${MODELZIP}}}
MULTI_TARGET_MODEL := ${shell wget -qq -O - ${MODELINFO} | grep 'use-target-labels' | wc -l}
MULTI_TARGET_MODEL := ${shell ${WGET} -qq -O - ${MODELINFO} | grep 'use-target-labels' | wc -l}
ifneq (${MULTI_TARGET_MODEL},0)
TARGET_LANG_LABEL := ${shell wget -qq -O - ${MODELINFO} | grep -o '>>${TRG}.*<<' | head -1}
TARGET_LANG_LABEL := ${shell ${WGET} -qq -O - ${MODELINFO} | grep -o '>>${TRG}.*<<' | head -1}
endif
RELEASED_BITEXTS := $(patsubst %.tar,%,${shell wget -qq -O - ${TATOEBA_GITRAW}/Wiki.md | \
RELEASED_BITEXTS := $(patsubst %.tar,%,${shell ${WGET} -qq -O - ${TATOEBA_GITRAW}/Wiki.md | \
grep -o 'WikiShuffled/...\.tar' | cut -f2 -d'/'})
RELEASED_BITEXTS_REV = ${shell (for d in ${RELEASED_BITEXTS}; do echo $$d; done) | tac}
@ -154,11 +154,11 @@ print-modelname:
##-------------------------------------------
REV_LANGPAIR = ${TRG}-${SRC}
REV_MODELZIP := ${shell wget -qq -O - ${TATOEBA_RELEASED} | grep '^${REV_LANGPAIR}' | head -1 | cut -f4}
REV_MODELZIP := ${shell ${WGET} -qq -O - ${TATOEBA_RELEASED} | grep '^${REV_LANGPAIR}' | head -1 | cut -f4}
REV_MODELINFO = ${patsubst ${TATOEBA_MODEL_STORAGE}/%.zip,${TATOEBA_GITRAW}/models/%.yml,${REV_MODELZIP}}
REV_MODELNAME = ${patsubst %.zip,%,${notdir ${REV_MODELZIP}}}
REV_MULTI_TARGET_MODEL := ${shell wget -qq -O - ${REV_MODELINFO} | grep 'use-target-labels' | wc -l}
REV_MULTI_TARGET_MODEL := ${shell ${WGET} -qq -O - ${REV_MODELINFO} | grep 'use-target-labels' | wc -l}
ifeq (${REV_MULTI_TARGET_MODEL},1)
REV_SRC_PREPROCESS_ARGS = ${TRG} ${SRC} ${REV_LANGPAIR}/${REV_MODELNAME}/source.spm
REV_TRG_PREPROCESS_ARGS = ${SRC} ${TRG} ${REV_LANGPAIR}/${REV_MODELNAME}/target.spm noflags
@ -267,7 +267,7 @@ extract-rawbest-translations: ${OUTPUT_DIR}/latest/Tatoeba-train.${SRC}.rawbest$
${LANGPAIR}/${MODELNAME}/decoder.yml:
ifneq (${MODELZIP},)
mkdir -p ${dir $@}
wget -O ${dir $@}/model.zip ${MODELZIP}
${WGET} -O ${dir $@}/model.zip ${MODELZIP}
cd ${dir $@} && unzip model.zip
rm -f ${dir $@}/model.zip
mv ${dir $@}/preprocess.sh ${dir $@}/preprocess-old.sh

View File

@ -30,8 +30,8 @@ MARIAN_WORKSPACE=12000
TATOEBA_VERSION ?= v2021-08-07
TATOEBA_VERSION_NOHYPHEN ?= $(subst -,,${TATOEBA_VERSION})
# TATOEBA_GITRAW = https://raw.githubusercontent.com/Helsinki-NLP/Tatoeba-Challenge/master
TATOEBA_GITRAW = https://github.com/Helsinki-NLP/Tatoeba-Challenge/raw
# TATOEBA_GITRAW = https://github.com/Helsinki-NLP/Tatoeba-Challenge/raw
TATOEBA_GITRAW = https://raw.githubusercontent.com/Helsinki-NLP/Tatoeba-Challenge/master
TATOEBA_RELEASED = ${TATOEBA_GITRAW}/models/released-model-results-all.txt
TATOEBA_RELEASED_BT = https://object.pouta.csc.fi/Tatoeba-MT-bt/released-data.txt
TATOEBA_MODEL_STORAGE = https://object.pouta.csc.fi/Tatoeba-MT-models
@ -61,14 +61,14 @@ OUTPUT_DIR ?= ${NEW_LANGPAIR}
# MODELZIP = https://object.pouta.csc.fi/Tatoeba-Challenge/ang.tar
MODELZIP := ${shell wget -qq -O - ${TATOEBA_RELEASED} | grep '^${TRANSLATE_LANGPAIR}' | head -1 | cut -f4}
MODELZIP := ${shell ${WGET} -qq -O - ${TATOEBA_RELEASED} | grep '^${TRANSLATE_LANGPAIR}' | head -1 | cut -f4}
MODELINFO = ${patsubst ${TATOEBA_MODEL_STORAGE}/%.zip,${TATOEBA_GITRAW}/models/%.yml,${MODELZIP}}
MODELNAME = ${patsubst %.zip,%,${notdir ${MODELZIP}}}
MODELDIR = ${OUTPUT_DIR}/${TRANSLATE_LANGPAIR}/${MODELNAME}
MULTI_TARGET_MODEL := ${shell wget -qq -O - ${MODELINFO} | grep 'use-target-labels' | wc -l}
MULTI_TARGET_MODEL := ${shell ${WGET} -qq -O - ${MODELINFO} | grep 'use-target-labels' | wc -l}
ifneq (${MULTI_TARGET_MODEL},0)
TARGET_LANG_LABEL := ${shell wget -qq -O - ${MODELINFO} | grep -o '>>${TRG}.*<<'}
TARGET_LANG_LABEL := ${shell ${WGET} -qq -O - ${MODELINFO} | grep -o '>>${TRG}.*<<'}
endif
@ -131,7 +131,7 @@ print-modelinfo:
${MODELDIR}/decoder.yml:
ifneq (${MODELZIP},)
mkdir -p ${dir $@}
wget -O ${dir $@}/model.zip ${MODELZIP}
${WGET} -O ${dir $@}/model.zip ${MODELZIP}
cd ${dir $@} && unzip model.zip
rm -f ${dir $@}/model.zip
mv ${dir $@}/preprocess.sh ${dir $@}/preprocess-old.sh
@ -203,32 +203,55 @@ ifneq (${MODELZIP},)
gzip -c > ${PWD}/$@
endif
check-latest:
@if [ -d ${OUTPUT_DIR}/latest ]; then \
for T in `ls ${OUTPUT_DIR}/latest/*.${TRG}.gz`; do \
S=`echo $$T | sed 's/.${TRG}.gz/.${SRC}.gz/'`; \
if [ ! -e $$S ]; then \
echo "$$S does not exist!"; \
fi \
done; \
for S in `ls ${OUTPUT_DIR}/latest/*.${SRC}.gz`; do \
T=`echo $$S | sed 's/.${SRC}.gz/.${TRG}.gz/'`; \
a=`${GZCAT} $$S | wc -l`; \
b=`${GZCAT} $$T | wc -l`; \
if [ $$a != $$b ]; then \
echo "$$a != $$b $$S $$T"; \
if [ ! -e $$T ]; then \
echo "$$T does not exist!"; \
else \
echo "$$a $$S $$T"; \
a=`${GZCAT} $$S | wc -l`; \
b=`${GZCAT} $$T | wc -l`; \
if [ $$a != $$b ]; then \
echo "$$a != $$b $$S $$T"; \
else \
echo "$$a $$S $$T"; \
fi \
fi \
done \
done; \
fi
remove-incomplete-latest:
@echo "check ${OUTPUT_DIR}"
@mkdir -p ${OUTPUT_DIR}/incomplete/latest
@if [ -d ${OUTPUT_DIR}/latest ]; then \
for T in `ls ${OUTPUT_DIR}/latest/*.${TRG}.gz`; do \
S=`echo $$T | sed 's/.${TRG}.gz/.${SRC}.gz/'`; \
if [ ! -e $$S ]; then \
echo "$$S does not exist!"; \
mv $$T ${OUTPUT_DIR}/incomplete/latest/; \
fi \
done; \
for S in `ls ${OUTPUT_DIR}/latest/*.${SRC}.gz`; do \
T=`echo $$S | sed 's/.${SRC}.gz/.${TRG}.gz/'`; \
a=`${GZCAT} $$S | wc -l`; \
b=`${GZCAT} $$T | wc -l`; \
if [ $$a != $$b ]; then \
echo "$$a != $$b $$S $$T"; \
if [ ! -e $$T ]; then \
echo "$$T does not exist!"; \
mv $$S ${OUTPUT_DIR}/incomplete/latest/; \
mv $$T ${OUTPUT_DIR}/incomplete/latest/; \
else \
a=`${GZCAT} $$S | wc -l`; \
b=`${GZCAT} $$T | wc -l`; \
if [ $$a != $$b ]; then \
echo "$$a != $$b $$S $$T"; \
mv $$S ${OUTPUT_DIR}/incomplete/latest/; \
mv $$T ${OUTPUT_DIR}/incomplete/latest/; \
fi \
fi \
done \
fi