added recipe for refreshing release info

This commit is contained in:
Joerg Tiedemann 2021-03-13 00:29:23 +02:00
parent 2067577021
commit bb39c060c0
11 changed files with 1085 additions and 317 deletions

View File

@ -141,6 +141,8 @@
#
#--------------------------------------------------------------------
## model-specific configuration file
MODELCONFIG = config.mk
# check and adjust lib/env.mk and lib/config.mk
@ -155,8 +157,8 @@ include lib/config.mk
# load model-specific configuration parameters
# if they exist in the work directory
ifneq ($(wildcard ${WORKDIR}/config.mk),)
include ${WORKDIR}/config.mk
ifneq ($(wildcard ${WORKDIR}/${MODELCONFIG}),)
include ${WORKDIR}/${MODELCONFIG}
endif
include lib/data.mk
@ -174,7 +176,7 @@ include lib/projects.mk
.PHONY: all
all: ${WORKDIR}/config.mk
all: ${WORKDIR}/${MODELCONFIG}
${MAKE} data
${MAKE} train
${MAKE} eval
@ -191,7 +193,7 @@ all: ${WORKDIR}/config.mk
#---------------------------------------------------------------------
.PHONY: all-and-backtranslate
all-and-backtranslate: ${WORKDIR}/config.mk
all-and-backtranslate: ${WORKDIR}/${MODELCONFIG}
${MAKE} data
${MAKE} train
${MAKE} eval
@ -210,7 +212,7 @@ all-and-backtranslate: ${WORKDIR}/config.mk
done
.PHONY: all-and-backtranslate-allwikis
all-and-backtranslate-allwikis: ${WORKDIR}/config.mk
all-and-backtranslate-allwikis: ${WORKDIR}/${MODELCONFIG}
${MAKE} data
${MAKE} train
${MAKE} eval
@ -230,7 +232,7 @@ all-and-backtranslate-allwikis: ${WORKDIR}/config.mk
done
.PHONY: all-and-backtranslate-allwikiparts
all-and-backtranslate-allwikiparts: ${WORKDIR}/config.mk
all-and-backtranslate-allwikiparts: ${WORKDIR}/${MODELCONFIG}
${MAKE} data
${MAKE} train
${MAKE} eval
@ -276,7 +278,7 @@ all-with-bt-allparts:
## job1: submit jobs to create data, train models, backtranslate all, and train again
job1: ${WORKDIR}/config.mk
job1: ${WORKDIR}/${MODELCONFIG}
${MAKE} HPC_MEM=12g HPC_CORES=4 job1-step1.submitcpu
job1-step1:
@ -307,7 +309,7 @@ job1-step3:
#------------------------------------------------------------------------
.PHONY: all-job
all-job: ${WORKDIR}/config.mk
all-job: ${WORKDIR}/${MODELCONFIG}
${MAKE} data
${MAKE} train-and-eval-job

View File

@ -19,6 +19,7 @@ TRG = eng
TATOEBA_STORAGE = https://object.pouta.csc.fi/Tatoeba-Challenge-WikiShuffled
TATOEBA_GITRAW = https://raw.githubusercontent.com/Helsinki-NLP/Tatoeba-Challenge/master
TATOEBA_RELEASED = ${TATOEBA_GITRAW}/models/released-model-results-all.txt
TATOEBA_RELEASED_BT = https://object.pouta.csc.fi/Tatoeba-MT-bt/released-data.txt
TATOEBA_MODEL_STORAGE = https://object.pouta.csc.fi/Tatoeba-MT-models
## container for storing backtranslations
@ -131,6 +132,16 @@ all2eng:
done
# RELEASED_BT_ALL := ${shell wget -qq -O - ${TATOEBA_RELEASED_BT}}
RELEASED_BT := ${shell wget -qq -O - ${TATOEBA_RELEASED_BT} | grep '^${LANGPAIR}/'}
fetch-bt:
for d in ${RELEASED_BT}; do \
echo "fetch $$d"; \
mkdir -p `dirname $$d`; \
wget -qq -O $$d https://object.pouta.csc.fi/Tatoeba-MT-bt/$$d; \
done
release-all: upload-all released-data.txt released-data-size.txt
swift upload ${BT_CONTAINER} released-data-size.txt
@ -148,7 +159,7 @@ upload-all:
done
released-data.txt: .
swift list ${BT_CONTAINER} | grep -v README.md > $@
swift list ${BT_CONTAINER} | grep -v README.md | grep -v '.txt' > $@
swift upload ${BT_CONTAINER} $@
released-data-size.txt: .

View File

@ -3,6 +3,11 @@
# model configurations
#
## name of the model-specific configuration file
MODELCONFIG ?= config.mk
## various ways of setting the model languages
## (1) explicitly set source and target languages, for example:
@ -491,7 +496,7 @@ endif
## TODO: is it OK to delete LOCAL_TRAIN data?
.PHONY: config local-config
config local-config: ${WORKDIR}/config.mk
config local-config: ${WORKDIR}/${MODELCONFIG}
SMALLEST_TRAINSIZE = 10000
SMALL_TRAINSIZE = 100000
@ -499,7 +504,7 @@ MEDIUM_TRAINSIZE = 500000
LARGE_TRAINSIZE = 1000000
LARGEST_TRAINSIZE = 10000000
${WORKDIR}/config.mk:
${WORKDIR}/${MODELCONFIG}:
mkdir -p ${dir $@}
if [ -e ${TRAIN_SRC}.clean.${PRE_SRC}${TRAINSIZE}.gz ]; then \
${MAKE} ${TRAIN_SRC}.clean.${PRE_SRC}${TRAINSIZE}.charfreq \

View File

@ -52,10 +52,12 @@ endif
## - use only the latest backtranslations
## if such a subdir exists
ifneq (${wildcard backtranslate/${TRG}-${SRC}/latest},)
BACKTRANS_DIR = backtranslate/${TRG}-${SRC}/latest
BACKTRANS_HOME = backtranslate
ifneq (${wildcard ${BACKTRANS_HOME}/${TRG}-${SRC}/latest},)
BACKTRANS_DIR = ${BACKTRANS_HOME}/${TRG}-${SRC}/latest
else
BACKTRANS_DIR = backtranslate/${TRG}-${SRC}
BACKTRANS_DIR = ${BACKTRANS_HOME}/${TRG}-${SRC}
endif
## TODO: make it possible to select only parts of the BT data
@ -611,12 +613,15 @@ add-to-test-data: ${CLEAN_TEST_SRC}
@echo "* ${LANGPAIR}: ${TESTSET}" >> ${dir ${TEST_SRC}}README.md
ifeq (${USE_TARGET_LABELS},1)
@echo "more than one target language";
@echo "${ZCAT} ${CLEAN_TEST_SRC} | sed 's/^/>>${TRG}<< /' >> ${TEST_SRC}"
@${ZCAT} ${CLEAN_TEST_SRC} 2>/dev/null |\
sed "s/^/>>${TRG}<< /" >> ${TEST_SRC}
else
@echo "only one target language"
@echo "${ZCAT} ${CLEAN_TEST_SRC} >> ${TEST_SRC}"
@${ZCAT} ${CLEAN_TEST_SRC} 2>/dev/null >> ${TEST_SRC}
endif
@echo "${ZCAT} ${CLEAN_TEST_TRG} >> ${TEST_TRG}"
@${ZCAT} ${CLEAN_TEST_TRG} 2>/dev/null >> ${TEST_TRG}

View File

@ -3,6 +3,10 @@
# make distribution packages
# and upload them to cPouta ObjectStorage
#
TODAY := ${shell date +%F}
DATE ?= ${TODAY}
OBJECTSTORAGE = https://object.pouta.csc.fi
MODEL_CONTAINER = OPUS-MT-models
DEV_MODEL_CONTAINER = OPUS-MT-dev
@ -14,7 +18,14 @@ RELEASEDIR = ${PWD}/models
## TODO: better create a recipe for the yaml file and not the zip file
## because we can keep the yaml files in the repo but not the zip files!
## --> better dependency in case we need to update and create new distributions!
DIST_PACKAGE = ${MODELSHOME}/${LANGPAIRSTR}/${DATASET}.zip
DIST_PACKAGE = ${MODELSHOME}/${LANGPAIRSTR}/${DATASET}.zip
DIST_YML = ${MODELSHOME}/${LANGPAIRSTR}/${DATASET}.yml
RELEASE_README = ${MODELSHOME}/${LANGPAIRSTR}/README.md
RELEASE_PACKAGE = ${MODELSHOME}/${LANGPAIRSTR}/${DATASET}-${DATE}.zip
RELEASE_YML = ${MODELSHOME}/${LANGPAIRSTR}/${DATASET}-${DATE}.yml
MODEL_README = ${WORKDIR}/README.md
MODEL_YML = ${patsubst %.npz,%.yml,${MODEL_FINAL}}
@ -151,7 +162,6 @@ best-dist best_dist:
## old: only accept models with a certain evaluation score:
# if [ `grep BLEU $(TEST_EVALUATION) | cut -f3 -d ' ' | cut -f1 -d '.'` -ge ${MIN_BLEU_SCORE} ]; then \
DATE = ${shell date +%F}
MODELS_URL = https://object.pouta.csc.fi/${DEV_MODEL_CONTAINER}
SKIP_DIST_EVAL = 0
@ -197,98 +207,151 @@ RAWTRGLANGS = ${sort ${basename ${basename ${subst _,.,${subst -,.,${TRGLANGS}}}
LANGUAGELABELS = ${shell grep '">>.*<<"' ${MODEL_SRCVOCAB} | cut -f1 -d: | sed 's/"//g'}
${DIST_PACKAGE}: ${MODEL_FINAL}
ifneq (${SKIP_DIST_EVAL},1)
@${MAKE} $(TEST_EVALUATION)
@${MAKE} $(TEST_COMPARISON)
endif
model-yml: ${MODEL_YML}
model-readme: ${MODEL_README}
release-yml: ${RELEASE_YML}
release-readme: ${RELEASE_README}
${RELEASE_YML}: ${MODEL_YML}
@mkdir -p ${dir $@}
@touch ${WORKDIR}/source.tcmodel
@cp ${PREPROCESS_SRCMODEL} ${WORKDIR}/source.${SUBWORD_TYPE}
@cp ${PREPROCESS_TRGMODEL} ${WORKDIR}/target.${SUBWORD_TYPE}
@cp ${PREPROCESS_SCRIPT} ${WORKDIR}/preprocess.sh
@cp ${POSTPROCESS_SCRIPT} ${WORKDIR}/postprocess.sh
##-----------------------------
## create YAML file
##-----------------------------
@echo "release: ${LANGPAIRSTR}/$(notdir ${@:.zip=})-${DATE}.zip" > ${@:.zip=}-${DATE}.yml
@echo "release-date: $(DATE)" >> ${@:.zip=}-${DATE}.yml
@echo "dataset-name: $(DATASET)" >> ${@:.zip=}-${DATE}.yml
@echo "modeltype: $(MODELTYPE)" >> ${@:.zip=}-${DATE}.yml
@echo "pre-processing: ${PREPROCESS_DESCRIPTION}" >> ${@:.zip=}-${DATE}.yml
@echo "subwords:" >> ${@:.zip=}-${DATE}.yml
@echo " - source: ${PRE_SRC}" >> ${@:.zip=}-${DATE}.yml
@echo " - target: ${PRE_TRG}" >> ${@:.zip=}-${DATE}.yml
@echo "subword-models:" >> ${@:.zip=}-${DATE}.yml
@echo " - source: source.${SUBWORD_TYPE}" >> ${@:.zip=}-${DATE}.yml
@echo " - target: target.${SUBWORD_TYPE}" >> ${@:.zip=}-${DATE}.yml
if [ -e $@ ]; then \
mkdir -p models-backup/${LANGPAIRSTR}/${TODAY}; \
mv -f $@ models-backup/${LANGPAIRSTR}/${TODAY}/; \
fi
cp $< $@
${RELEASE_README}: ${MODEL_README}
@mkdir -p ${dir $@}
if [ -e $@ ]; then \
mkdir -p models-backup/${LANGPAIRSTR}/${TODAY}; \
mv -f $@ models-backup/${LANGPAIRSTR}/${TODAY}/; \
cat models-backup/${LANGPAIRSTR}/${TODAY}/${notdir $@} |\
sed 's/^# /§/g' | tr "\n" '~' | tr '§' "\n" | grep . |\
grep -v '^${notdir ${RELEASE_PACKAGE}}' | \
sed 's/^/# /' | tr '~' "\n" > $@; \
fi
cat $< >> $@
echo '' >> $@
##---------------------------------------
## create release description file (yml)
##---------------------------------------
${MODEL_YML}: ${MODEL_FINAL}
@mkdir -p ${dir $@}
@echo "release: ${LANGPAIRSTR}/$(notdir ${RELEASE_PACKAGE})" > $@
@echo "release-date: $(DATE)" >> $@
@echo "dataset-name: $(DATASET)" >> $@
@echo "modeltype: $(MODELTYPE)" >> $@
@echo "pre-processing: ${PREPROCESS_DESCRIPTION}" >> $@
@echo "subwords:" >> $@
@echo " - source: ${PRE_SRC}" >> $@
@echo " - target: ${PRE_TRG}" >> $@
@echo "subword-models:" >> $@
@echo " - source: source.${SUBWORD_TYPE}" >> $@
@echo " - target: target.${SUBWORD_TYPE}" >> $@
ifdef USE_TARGET_LABELS
@echo "use-target-labels:" >> ${@:.zip=}-${DATE}.yml
@echo "use-target-labels:" >> $@
@for t in ${TRGLANGS}; do \
echo " - >>$$t<<" >> ${@:.zip=}-${DATE}.yml; \
echo " - >>$$t<<" >> $@; \
done
endif
@echo "source-languages:" >> ${@:.zip=}-${DATE}.yml
@echo "source-languages:" >> $@
@for s in ${RAWSRCLANGS}; do\
echo " - $$s" >> ${@:.zip=}-${DATE}.yml; \
echo " - $$s" >> $@; \
done
@echo "target-languages:" >> ${@:.zip=}-${DATE}.yml
@echo "target-languages:" >> $@
@for t in ${RAWTRGLANGS}; do\
echo " - $$t" >> ${@:.zip=}-${DATE}.yml; \
echo " - $$t" >> $@; \
done
ifneq ("$(wildcard ${WORKDIR}/train/README.md)","")
@echo "training-data:" >> ${@:.zip=}-${DATE}.yml
@echo "training-data:" >> $@
@tr "\n" "~" < ${WORKDIR}/train/README.md |\
tr "#" "\n" | grep '^ ${DATASET}~' | \
tail -1 | tr "~" "\n" | grep '^\* ' | \
grep -v ': *$$' | grep -v ' 0$$' | \
grep -v 'total size' | sed 's/^\* / - /' >> ${@:.zip=}-${DATE}.yml
grep -v 'total size' | sed 's/^\* / - /' >> $@
endif
ifneq ("$(wildcard ${WORKDIR}/val/README.md)","")
@echo "validation-data:" >> ${@:.zip=}-${DATE}.yml
@echo "validation-data:" >> $@
grep '^\* ' ${WORKDIR}/val/README.md | \
grep -v ' 0$$' | \
sed 's/^\* / - /' >> ${@:.zip=}-${DATE}.yml
sed 's/^\* / - /' >> $@
endif
##-----------------------------
## add benchmark results
##
## - grep and normalise test set names
## - ugly perl script that does some transformation of language codes
##-----------------------------
ifneq ("$(wildcard ${TEST_EVALUATION})","")
@grep -H BLEU ${WORKDIR}/*.${DATASET}.${PRE_SRC}-${PRE_TRG}${NR}.${MODELTYPE}.*.eval | \
sed 's#^${WORKDIR}/\(.*\)\.${DATASET}.${PRE_SRC}-${PRE_TRG}${NR}.${MODELTYPE}\.\(.*\)\.eval:.*$$#\1.\2#' | \
perl -pe 'if (/\.([^\.]+)\.([^\.\s]+)$$/){$$s=$$1;$$t=$$2;s/[\-\.]$$s?\-?$$t\.$$s\.$$t?$$/.$$s.$$t/;s/\.$$s\.$$t$$/.$$s-$$t/}' > $@.1
@grep BLEU ${WORKDIR}/*.${DATASET}.${PRE_SRC}-${PRE_TRG}${NR}.${MODELTYPE}.*.eval | \
cut -f3 -d ' ' > $@.2
@grep chrF ${WORKDIR}/*.${DATASET}.${PRE_SRC}-${PRE_TRG}${NR}.${MODELTYPE}.*.eval | \
cut -f3 -d ' ' > $@.3
@ls ${WORKDIR}/*.${DATASET}.${PRE_SRC}-${PRE_TRG}${NR}.${MODELTYPE}.*.eval | \
sed 's/\.eval//' | xargs wc -l | grep -v total | sed 's/^ *//' | cut -f1 -d' ' > $@.4
@grep BLEU ${WORKDIR}/*.${DATASET}.${PRE_SRC}-${PRE_TRG}${NR}.${MODELTYPE}.*.eval | \
cut -f16 -d ' ' | sed 's/)//' > $@.5
@grep BLEU ${WORKDIR}/*.${DATASET}.${PRE_SRC}-${PRE_TRG}${NR}.${MODELTYPE}.*.eval | \
cut -f7 -d ' ' > $@.6
@paste -d '/' $@.4 $@.5 > $@.7
@echo "test-data:" >> $@
@paste -d' ' $@.1 $@.7 | sed 's/ /: /;s/^/ - /;' >> $@
@echo "BLEU-scores:" >> $@
@paste -d' ' $@.1 $@.2 | sed 's/ /: /;s/^/ - /' >> $@
@echo "chr-F-scores:" >> $@
@paste -d' ' $@.1 $@.3 | sed 's/ /: /;s/^/ - /' >> $@
@rm -f $@.1 $@.2 $@.3 $@.4 $@.5 $@.6 $@.7
endif
##-----------------------------
## create README-file
##-----------------------------
@echo "# $(notdir ${@:.zip=})-${DATE}.zip" > ${WORKDIR}/README.md
@echo '' >> ${WORKDIR}/README.md
@echo "* dataset: ${DATASET}" >> ${WORKDIR}/README.md
@echo "* model: ${MODELTYPE}" >> ${WORKDIR}/README.md
@echo "* source language(s): ${RAWSRCLANGS}" >> ${WORKDIR}/README.md
@echo "* target language(s): ${RAWTRGLANGS}" >> ${WORKDIR}/README.md
@echo "* model: ${MODELTYPE}" >> ${WORKDIR}/README.md
@echo "* pre-processing: ${PREPROCESS_DESCRIPTION}" >> ${WORKDIR}/README.md
${MODEL_README}: ${MODEL_FINAL}
@echo "# $(notdir ${RELEASE_PACKAGE})" > $@
@echo '' >> $@
@echo "* dataset: ${DATASET}" >> $@
@echo "* model: ${MODELTYPE}" >> $@
@echo "* source language(s): ${RAWSRCLANGS}" >> $@
@echo "* target language(s): ${RAWTRGLANGS}" >> $@
@echo "* model: ${MODELTYPE}" >> $@
@echo "* pre-processing: ${PREPROCESS_DESCRIPTION}" >> $@
ifdef USE_TARGET_LABELS
echo '* a sentence initial language token is required in the form of `>>id<<` (id = valid target language ID)' >> ${WORKDIR}/README.md
@echo "* valid language labels: ${LANGUAGELABELS}" >> ${WORKDIR}/README.md
echo '* a sentence initial language token is required in the form of `>>id<<` (id = valid target language ID)' >> $@
@echo "* valid language labels: ${LANGUAGELABELS}" >> $@
endif
@echo "* download: [$(notdir ${@:.zip=})-${DATE}.zip](${MODELS_URL}/${LANGPAIRSTR}/$(notdir ${@:.zip=})-${DATE}.zip)" >> ${WORKDIR}/README.md
@echo "* download: [$(notdir ${RELEASE_PACKAGE})](${MODELS_URL}/${LANGPAIRSTR}/$(notdir ${RELEASE_PACKAGE})" >> $@
ifneq (${SKIP_DATA_DETAILS},1)
ifneq ("$(wildcard ${WORKDIR}/train/README.md)","")
@echo -n "## Training data: " >> ${WORKDIR}/README.md
@echo -n "## Training data: " >> $@
@tr "\n" "~" < ${WORKDIR}/train/README.md |\
tr "#" "\n" | grep '${DATASET}' | \
tail -1 | tr "~" "\n" >> ${WORKDIR}/README.md
@echo '' >> ${WORKDIR}/README.md
tail -1 | tr "~" "\n" >> $@
@echo '' >> $@
endif
ifneq ("$(wildcard ${WORKDIR}/val/README.md)","")
@echo -n "#" >> ${WORKDIR}/README.md
@cat ${WORKDIR}/val/README.md >> ${WORKDIR}/README.md
@echo '' >> ${WORKDIR}/README.md
@echo -n "#" >> $@
@cat ${WORKDIR}/val/README.md >> $@
@echo '' >> $@
endif
endif
##-----------------------------
## add benchmark results
##-----------------------------
ifneq ("$(wildcard ${TEST_EVALUATION})","")
@echo "* test set translations: [$(notdir ${@:.zip=})-${DATE}.test.txt](${MODELS_URL}/${LANGPAIRSTR}/$(notdir ${@:.zip=})-${DATE}.test.txt)" >> ${WORKDIR}/README.md
@echo "* test set scores: [$(notdir ${@:.zip=})-${DATE}.eval.txt](${MODELS_URL}/${LANGPAIRSTR}/$(notdir ${@:.zip=})-${DATE}.eval.txt)" >> ${WORKDIR}/README.md
@echo '' >> ${WORKDIR}/README.md
@echo '## Benchmarks' >> ${WORKDIR}/README.md
@echo '' >> ${WORKDIR}/README.md
@echo "* test set translations: [$(notdir ${@:.zip=})-${DATE}.test.txt](${MODELS_URL}/${LANGPAIRSTR}/$(notdir ${@:.zip=})-${DATE}.test.txt)" >> $@
@echo "* test set scores: [$(notdir ${@:.zip=})-${DATE}.eval.txt](${MODELS_URL}/${LANGPAIRSTR}/$(notdir ${@:.zip=})-${DATE}.eval.txt)" >> $@
@echo '' >> $@
@echo '## Benchmarks' >> $@
@echo '' >> $@
## grep and normalise test set names
## ugly perl script that does some transformation of language codes
@grep -H BLEU ${WORKDIR}/*.${DATASET}.${PRE_SRC}-${PRE_TRG}${NR}.${MODELTYPE}.*.eval | \
@ -305,62 +368,299 @@ ifneq ("$(wildcard ${TEST_EVALUATION})","")
@grep BLEU ${WORKDIR}/*.${DATASET}.${PRE_SRC}-${PRE_TRG}${NR}.${MODELTYPE}.*.eval | \
cut -f7 -d ' ' > $@.6
@paste -d '/' $@.4 $@.5 > $@.7
@echo '| testset | BLEU | chr-F | #sent | #words | BP |' >> ${WORKDIR}/README.md
@echo '|---------|-------|-------|-------|--------|----|' >> ${WORKDIR}/README.md
@echo '| testset | BLEU | chr-F | #sent | #words | BP |' >> $@
@echo '|---------|-------|-------|-------|--------|----|' >> $@
@paste $@.1 $@.2 $@.3 $@.4 $@.5 $@.6 | \
sed "s/\t/ | /g;s/^/| /;s/$$/ |/" | \
sort | uniq >> ${WORKDIR}/README.md
@echo "test-data:" >> ${@:.zip=}-${DATE}.yml
@paste -d' ' $@.1 $@.7 | sed 's/ /: /;s/^/ - /;' >> ${@:.zip=}-${DATE}.yml
@echo "BLEU-scores:" >> ${@:.zip=}-${DATE}.yml
@paste -d' ' $@.1 $@.2 | sed 's/ /: /;s/^/ - /' >> ${@:.zip=}-${DATE}.yml
@echo "chr-F-scores:" >> ${@:.zip=}-${DATE}.yml
@paste -d' ' $@.1 $@.3 | sed 's/ /: /;s/^/ - /' >> ${@:.zip=}-${DATE}.yml
@rm -f $@.1 $@.2 $@.3 $@.4 $@.5 $@.6 $@.7 $@.testsize $@.testset
sort | uniq >> $@
@rm -f $@.1 $@.2 $@.3 $@.4 $@.5 $@.6 $@.7
endif
${DIST_PACKAGE}: ${MODEL_FINAL}
ifneq (${SKIP_DIST_EVAL},1)
@${MAKE} $(TEST_EVALUATION)
@${MAKE} $(TEST_COMPARISON)
endif
##-----------------------------
## create the package
## collect all files we need
##-----------------------------
@cat ${WORKDIR}/README.md >> ${dir $@}README.md
@echo '' >> ${dir $@}README.md
@cp models/LICENSE ${WORKDIR}/
@${MAKE} ${MODEL_README}
@${MAKE} ${MODEL_YML}
@touch ${WORKDIR}/source.tcmodel
@cp ${PREPROCESS_SRCMODEL} ${WORKDIR}/source.${SUBWORD_TYPE}
@cp ${PREPROCESS_TRGMODEL} ${WORKDIR}/target.${SUBWORD_TYPE}
@cp ${PREPROCESS_SCRIPT} ${WORKDIR}/preprocess.sh
@cp ${POSTPROCESS_SCRIPT} ${WORKDIR}/postprocess.sh
@chmod +x ${WORKDIR}/preprocess.sh
@cp models/LICENSE ${WORKDIR}/
@sed -e 's# - .*/\([^/]*\)$$# - \1#' \
-e 's/beam-size: [0-9]*$$/beam-size: 6/' \
-e 's/mini-batch: [0-9]*$$/mini-batch: 1/' \
-e 's/maxi-batch: [0-9]*$$/maxi-batch: 1/' \
-e 's/relative-paths: false/relative-paths: true/' \
< ${MODEL_DECODER} > ${WORKDIR}/decoder.yml
##-----------------------------
## create the package
##-----------------------------
cd ${WORKDIR} && zip ${notdir $@} \
README.md LICENSE \
${notdir ${MODEL_FINAL}} \
${notdir ${MODEL_YML}} \
${notdir ${MODEL_SRCVOCAB}} \
${notdir ${MODEL_TRGVOCAB}} \
${notdir ${MODEL_VALIDLOG}} \
${notdir ${MODEL_TRAINLOG}} \
source.* target.* decoder.yml \
preprocess.sh postprocess.sh
ifneq ("$(wildcard ${WORKDIR}/config.mk)","")
@cd ${WORKDIR} && zip -u ${notdir $@} config.mk
ifneq ("$(wildcard ${WORKDIR}/${MODELCONFIG})","")
@cd ${WORKDIR} && zip -u ${notdir $@} ${MODELCONFIG}
endif
##-----------------------------
## move files to release dir and cleanup
##-----------------------------
@mkdir -p ${dir $@}
@mv -f ${WORKDIR}/${notdir $@} ${@:.zip=}-${DATE}.zip
@cd ${dir $@} && zip -u ${notdir ${@:.zip=}-${DATE}.zip} ${notdir ${@:.zip=}-${DATE}.yml}
@if [ -e ${RELEASE_PACKAGE} ]; then \
mkdir -p models-backup/${LANGPAIRSTR}/${DATE}; \
mv -f ${RELEASE_PACKAGE} models-backup/${LANGPAIRSTR}/${DATE}/; \
mv -f ${@:.zip=}-${DATE}.eval.txt models-backup/${LANGPAIRSTR}/${DATE}/; \
mv -f ${@:.zip=}-${DATE}.test.txt models-backup/${LANGPAIRSTR}/${DATE}/; \
fi
@mv -f ${WORKDIR}/${notdir $@} ${RELEASE_PACKAGE}
@${MAKE} ${RELEASE_YML}
@${MAKE} ${RELEASE_README}
ifneq ("$(wildcard ${TEST_EVALUATION})","")
@cp $(TEST_EVALUATION) ${@:.zip=}-${DATE}.eval.txt
@cp $(TEST_COMPARISON) ${@:.zip=}-${DATE}.test.txt
endif
@rm -f $@
@cd ${dir $@} && ln -s $(notdir ${@:.zip=})-${DATE}.zip ${notdir $@}
@cd ${dir $@} && ln -s $(notdir ${RELEASE_PACKAGE}) ${notdir $@}
@rm -f ${WORKDIR}/decoder.yml ${WORKDIR}/source.* ${WORKDIR}/target.*
@rm -f ${WORKDIR}/preprocess.sh ${WORKDIR}/postprocess.sh
## refresh a release with the same time stamp
## in case it is already the newest one
## --> this is kind of dangerous as we may overwrite existing newer ones with older ones
## --> the reason for doing this is to update yml files and evaluation scores
refresh-release:
if [[ ${DIST_PACKAGE} -nt ${MODEL_FINAL} ]]; then \
echo "updating ${shell realpath ${DIST_PACKAGE}}"; \
d=`realpath ${DIST_PACKAGE} | xargs basename | sed 's/^[^\-]*\-//;s/\.zip$$//'`; \
mkdir -p models-backup/${LANGPAIRSTR}/${DATE}; \
mv -f ${shell realpath ${DIST_PACKAGE}} models-backup/${LANGPAIRSTR}/${DATE}/; \
make DATE="$$d" release; \
fi
refresh-release-yml:
if [[ ${DIST_PACKAGE} -nt ${MODEL_FINAL} ]]; then \
echo "updating ${patsubst %.zip,%.yml,${shell realpath ${DIST_PACKAGE}}}"; \
d=`realpath ${DIST_PACKAGE} | xargs basename | sed 's/^[^\-]*\-//;s/\.zip$$//'`; \
if [ -e ${MODEL_YML} ]; then \
mv ${MODEL_YML} ${MODEL_YML}.${DATE}; \
fi; \
make DATE="$$d" release-yml; \
fi
refresh-release-readme:
if [[ ${DIST_PACKAGE} -nt ${MODEL_FINAL} ]]; then \
echo "updating ${LANGPAIRSTR}/README.md for ${notdir ${shell realpath ${DIST_PACKAGE}}}"; \
d=`realpath ${DIST_PACKAGE} | xargs basename | sed 's/^[^\-]*\-//;s/\.zip$$//'`; \
if [ -e ${MODEL_README} ]; then \
mv ${MODEL_README} ${MODEL_README}.${DATE}; \
fi; \
make DATE="$$d" release-readme; \
fi
##### ------------------------------------
##### OLD release recipe: all in one
##### ------------------------------------
# ${DIST_PACKAGE}: ${MODEL_FINAL}
# ifneq (${SKIP_DIST_EVAL},1)
# @${MAKE} $(TEST_EVALUATION)
# @${MAKE} $(TEST_COMPARISON)
# endif
# @mkdir -p ${dir $@}
# @touch ${WORKDIR}/source.tcmodel
# @cp ${PREPROCESS_SRCMODEL} ${WORKDIR}/source.${SUBWORD_TYPE}
# @cp ${PREPROCESS_TRGMODEL} ${WORKDIR}/target.${SUBWORD_TYPE}
# @cp ${PREPROCESS_SCRIPT} ${WORKDIR}/preprocess.sh
# @cp ${POSTPROCESS_SCRIPT} ${WORKDIR}/postprocess.sh
# ##-----------------------------
# ## create YML file
# ##-----------------------------
# @echo "release: ${LANGPAIRSTR}/$(notdir ${@:.zip=})-${DATE}.zip" > ${@:.zip=}-${DATE}.yml
# @echo "release-date: $(DATE)" >> ${@:.zip=}-${DATE}.yml
# @echo "dataset-name: $(DATASET)" >> ${@:.zip=}-${DATE}.yml
# @echo "modeltype: $(MODELTYPE)" >> ${@:.zip=}-${DATE}.yml
# @echo "pre-processing: ${PREPROCESS_DESCRIPTION}" >> ${@:.zip=}-${DATE}.yml
# @echo "subwords:" >> ${@:.zip=}-${DATE}.yml
# @echo " - source: ${PRE_SRC}" >> ${@:.zip=}-${DATE}.yml
# @echo " - target: ${PRE_TRG}" >> ${@:.zip=}-${DATE}.yml
# @echo "subword-models:" >> ${@:.zip=}-${DATE}.yml
# @echo " - source: source.${SUBWORD_TYPE}" >> ${@:.zip=}-${DATE}.yml
# @echo " - target: target.${SUBWORD_TYPE}" >> ${@:.zip=}-${DATE}.yml
# ifdef USE_TARGET_LABELS
# @echo "use-target-labels:" >> ${@:.zip=}-${DATE}.yml
# @for t in ${TRGLANGS}; do \
# echo " - >>$$t<<" >> ${@:.zip=}-${DATE}.yml; \
# done
# endif
# @echo "source-languages:" >> ${@:.zip=}-${DATE}.yml
# @for s in ${RAWSRCLANGS}; do\
# echo " - $$s" >> ${@:.zip=}-${DATE}.yml; \
# done
# @echo "target-languages:" >> ${@:.zip=}-${DATE}.yml
# @for t in ${RAWTRGLANGS}; do\
# echo " - $$t" >> ${@:.zip=}-${DATE}.yml; \
# done
# ifneq ("$(wildcard ${WORKDIR}/train/README.md)","")
# @echo "training-data:" >> ${@:.zip=}-${DATE}.yml
# @tr "\n" "~" < ${WORKDIR}/train/README.md |\
# tr "#" "\n" | grep '^ ${DATASET}~' | \
# tail -1 | tr "~" "\n" | grep '^\* ' | \
# grep -v ': *$$' | grep -v ' 0$$' | \
# grep -v 'total size' | sed 's/^\* / - /' >> ${@:.zip=}-${DATE}.yml
# endif
# ifneq ("$(wildcard ${WORKDIR}/val/README.md)","")
# @echo "validation-data:" >> ${@:.zip=}-${DATE}.yml
# grep '^\* ' ${WORKDIR}/val/README.md | \
# grep -v ' 0$$' | \
# sed 's/^\* / - /' >> ${@:.zip=}-${DATE}.yml
# endif
# ##-----------------------------
# ## create README-file
# ##-----------------------------
# @echo "# $(notdir ${@:.zip=})-${DATE}.zip" > ${WORKDIR}/README.md
# @echo '' >> ${WORKDIR}/README.md
# @echo "* dataset: ${DATASET}" >> ${WORKDIR}/README.md
# @echo "* model: ${MODELTYPE}" >> ${WORKDIR}/README.md
# @echo "* source language(s): ${RAWSRCLANGS}" >> ${WORKDIR}/README.md
# @echo "* target language(s): ${RAWTRGLANGS}" >> ${WORKDIR}/README.md
# @echo "* model: ${MODELTYPE}" >> ${WORKDIR}/README.md
# @echo "* pre-processing: ${PREPROCESS_DESCRIPTION}" >> ${WORKDIR}/README.md
# ifdef USE_TARGET_LABELS
# echo '* a sentence initial language token is required in the form of `>>id<<` (id = valid target language ID)' >> ${WORKDIR}/README.md
# @echo "* valid language labels: ${LANGUAGELABELS}" >> ${WORKDIR}/README.md
# endif
# @echo "* download: [$(notdir ${@:.zip=})-${DATE}.zip](${MODELS_URL}/${LANGPAIRSTR}/$(notdir ${@:.zip=})-${DATE}.zip)" >> ${WORKDIR}/README.md
# ifneq (${SKIP_DATA_DETAILS},1)
# ifneq ("$(wildcard ${WORKDIR}/train/README.md)","")
# @echo -n "## Training data: " >> ${WORKDIR}/README.md
# @tr "\n" "~" < ${WORKDIR}/train/README.md |\
# tr "#" "\n" | grep '${DATASET}' | \
# tail -1 | tr "~" "\n" >> ${WORKDIR}/README.md
# @echo '' >> ${WORKDIR}/README.md
# endif
# ifneq ("$(wildcard ${WORKDIR}/val/README.md)","")
# @echo -n "#" >> ${WORKDIR}/README.md
# @cat ${WORKDIR}/val/README.md >> ${WORKDIR}/README.md
# @echo '' >> ${WORKDIR}/README.md
# endif
# endif
# ##-----------------------------
# ## add benchmark results
# ##-----------------------------
# ifneq ("$(wildcard ${TEST_EVALUATION})","")
# @echo "* test set translations: [$(notdir ${@:.zip=})-${DATE}.test.txt](${MODELS_URL}/${LANGPAIRSTR}/$(notdir ${@:.zip=})-${DATE}.test.txt)" >> ${WORKDIR}/README.md
# @echo "* test set scores: [$(notdir ${@:.zip=})-${DATE}.eval.txt](${MODELS_URL}/${LANGPAIRSTR}/$(notdir ${@:.zip=})-${DATE}.eval.txt)" >> ${WORKDIR}/README.md
# @echo '' >> ${WORKDIR}/README.md
# @echo '## Benchmarks' >> ${WORKDIR}/README.md
# @echo '' >> ${WORKDIR}/README.md
# ## grep and normalise test set names
# ## ugly perl script that does some tansformation of language codes
# @grep -H BLEU ${WORKDIR}/*.${DATASET}.${PRE_SRC}-${PRE_TRG}${NR}.${MODELTYPE}.*.eval | \
# sed 's#^${WORKDIR}/\(.*\)\.${DATASET}.${PRE_SRC}-${PRE_TRG}${NR}.${MODELTYPE}\.\(.*\)\.eval:.*$$#\1.\2#' | \
# perl -pe 'if (/\.([^\.]+)\.([^\.\s]+)$$/){$$s=$$1;$$t=$$2;s/[\-\.]$$s?\-?$$t\.$$s\.$$t?$$/.$$s.$$t/;s/\.$$s\.$$t$$/.$$s-$$t/}' > $@.1
# @grep BLEU ${WORKDIR}/*.${DATASET}.${PRE_SRC}-${PRE_TRG}${NR}.${MODELTYPE}.*.eval | \
# cut -f3 -d ' ' > $@.2
# @grep chrF ${WORKDIR}/*.${DATASET}.${PRE_SRC}-${PRE_TRG}${NR}.${MODELTYPE}.*.eval | \
# cut -f3 -d ' ' > $@.3
# @ls ${WORKDIR}/*.${DATASET}.${PRE_SRC}-${PRE_TRG}${NR}.${MODELTYPE}.*.eval | \
# sed 's/\.eval//' | xargs wc -l | grep -v total | sed 's/^ *//' | cut -f1 -d' ' > $@.4
# @grep BLEU ${WORKDIR}/*.${DATASET}.${PRE_SRC}-${PRE_TRG}${NR}.${MODELTYPE}.*.eval | \
# cut -f16 -d ' ' | sed 's/)//' > $@.5
# @grep BLEU ${WORKDIR}/*.${DATASET}.${PRE_SRC}-${PRE_TRG}${NR}.${MODELTYPE}.*.eval | \
# cut -f7 -d ' ' > $@.6
# @paste -d '/' $@.4 $@.5 > $@.7
# @echo '| testset | BLEU | chr-F | #sent | #words | BP |' >> ${WORKDIR}/README.md
# @echo '|---------|-------|-------|-------|--------|----|' >> ${WORKDIR}/README.md
# @paste $@.1 $@.2 $@.3 $@.4 $@.5 $@.6 | \
# sed "s/\t/ | /g;s/^/| /;s/$$/ |/" | \
# sort | uniq >> ${WORKDIR}/README.md
# @echo "test-data:" >> ${@:.zip=}-${DATE}.yml
# @paste -d' ' $@.1 $@.7 | sed 's/ /: /;s/^/ - /;' >> ${@:.zip=}-${DATE}.yml
# @echo "BLEU-scores:" >> ${@:.zip=}-${DATE}.yml
# @paste -d' ' $@.1 $@.2 | sed 's/ /: /;s/^/ - /' >> ${@:.zip=}-${DATE}.yml
# @echo "chr-F-scores:" >> ${@:.zip=}-${DATE}.yml
# @paste -d' ' $@.1 $@.3 | sed 's/ /: /;s/^/ - /' >> ${@:.zip=}-${DATE}.yml
# @rm -f $@.1 $@.2 $@.3 $@.4 $@.5 $@.6 $@.7 $@.testsize $@.testset
# endif
# ##-----------------------------
# ## create the package
# ##-----------------------------
# @cat ${WORKDIR}/README.md >> ${dir $@}README.md
# @echo '' >> ${dir $@}README.md
# @cp models/LICENSE ${WORKDIR}/
# @chmod +x ${WORKDIR}/preprocess.sh
# @sed -e 's# - .*/\([^/]*\)$$# - \1#' \
# -e 's/beam-size: [0-9]*$$/beam-size: 6/' \
# -e 's/mini-batch: [0-9]*$$/mini-batch: 1/' \
# -e 's/maxi-batch: [0-9]*$$/maxi-batch: 1/' \
# -e 's/relative-paths: false/relative-paths: true/' \
# < ${MODEL_DECODER} > ${WORKDIR}/decoder.yml
# cd ${WORKDIR} && zip ${notdir $@} \
# README.md LICENSE \
# ${notdir ${MODEL_FINAL}} \
# ${notdir ${MODEL_SRCVOCAB}} \
# ${notdir ${MODEL_TRGVOCAB}} \
# ${notdir ${MODEL_VALIDLOG}} \
# ${notdir ${MODEL_TRAINLOG}} \
# source.* target.* decoder.yml \
# preprocess.sh postprocess.sh
# ifneq ("$(wildcard ${WORKDIR}/${MODELCONFIG})","")
# @cd ${WORKDIR} && zip -u ${notdir $@} ${MODELCONFIG}
# endif
# ##-----------------------------
# ## move files to release dir and cleanup
# ##-----------------------------
# @mkdir -p ${dir $@}
# @mv -f ${WORKDIR}/${notdir $@} ${@:.zip=}-${DATE}.zip
# @cd ${dir $@} && zip -u ${notdir ${@:.zip=}-${DATE}.zip} ${notdir ${@:.zip=}-${DATE}.yml}
# ifneq ("$(wildcard ${TEST_EVALUATION})","")
# @cp $(TEST_EVALUATION) ${@:.zip=}-${DATE}.eval.txt
# @cp $(TEST_COMPARISON) ${@:.zip=}-${DATE}.test.txt
# endif
# @rm -f $@
# @cd ${dir $@} && ln -s $(notdir ${@:.zip=})-${DATE}.zip ${notdir $@}
# @rm -f ${WORKDIR}/decoder.yml ${WORKDIR}/source.* ${WORKDIR}/target.*
# @rm -f ${WORKDIR}/preprocess.sh ${WORKDIR}/postprocess.sh
## do this only if the flag is set

View File

@ -247,6 +247,9 @@ BT_MODEL_BASE = ${BT_MODEL}.${MODELTYPE}.model${NR}
BT_MODEL_START = ${WORKDIR}/${BT_MODEL_BASE}.npz
BT_MODEL_VOCAB = ${WORKDIR}/${BT_MODEL}.vocab.yml
BT_MARIAN_EARLY_STOPPING = 15
# %-add-backtranslations:
%-bt:
ifneq (${wildcard ${MODEL_FINAL}},)
@ -256,8 +259,11 @@ ifeq (${wildcard ${BT_MODEL_START}},)
endif
endif
rm -f ${WORKHOME}/${LANGPAIRSTR}/train.submit
${MAKE} DATASET=${DATASET}+bt USE_BACKTRANS=1 \
MARIAN_EARLY_STOPPING=15 \
${MAKE} DATASET=${DATASET}+bt \
USE_BACKTRANS=1 \
CONTINUE_EXISTING=1 \
MODELCONFIG=config-bt.mk \
MARIAN_EARLY_STOPPING=${BT_MARIAN_EARLY_STOPPING} \
${@:-bt=}
# CLEAN_TRAIN_SRC="${CLEAN_TRAIN_SRC} ${BACKTRANS_SRC}" \
@ -280,6 +286,7 @@ endif
rm -f ${WORKHOME}/${LANGPAIRSTR}/train.submit
${MAKE} DATASET=${DATASET}+pivot \
USE_PIVOTING=1 \
CONTINUE_EXISTING=1 \
MARIAN_EARLY_STOPPING=10 \
${@:-pivot=}

View File

@ -1,6 +1,64 @@
# -*-makefile-*-
FIU2XXX = $(wildcard models-tatoeba/fiu-???)
XXX2FIU = $(wildcard models-tatoeba/???-fiu)
fiu2xxx-print-results:
@for d in ${FIU2XXX}; do \
s='fiu';\
t=`echo $$d | cut -f3 -d'-'`;\
echo '\begin{table}[]'; \
echo '\centering'; \
echo '\begin{tabular}{|c|cc|}'; \
echo '\hline'; \
echo "$$s-$$t & chr-F2 & BLEU \\\\"; \
echo '\hline'; \
cat $$d/README.md |\
tr "\n#" "~\n" | tail -1 | tr '~' "\n" |\
grep 'Tatoeba-test' | \
sed 's/Tatoeba-test\.//' |\
perl -e 'while (<>){@a=split(/\s*\|\s*/);print if ($$a[4]>=100);}' |\
cut -f2-4 -d'|' | tr '|' '&' | sed 's/$$/\\\\/'; \
echo '\end{tabular}'; \
echo -n '\caption{Results from the multilingual translation model between Finno-Ugric languages and '; \
iso639 $$t | tr '"' ' '; \
echo 'measured on the Tatoeba test set.}'; \
echo '\label{tab:my_label}'; \
echo '\end{table}'; \
echo ""; \
done
xxx2fiu-print-results:
@for d in ${XXX2FIU}; do \
t='fiu';\
s=`echo $$d | cut -f2 -d'/' | cut -f1 -d'-'`;\
echo '\begin{table}[]'; \
echo '\centering'; \
echo '\begin{tabular}{|c|cc|}'; \
echo '\hline'; \
echo "$$s-$$t & chr-F2 & BLEU \\\\"; \
echo '\hline'; \
cat $$d/README.md |\
tr "\n#" "~\n" | tail -1 | tr '~' "\n" |\
grep 'Tatoeba-test' | \
sed 's/Tatoeba-test\.//' |\
perl -e 'while (<>){@a=split(/\s*\|\s*/);print if ($$a[4]>=100);}' |\
cut -f2-4 -d'|' | tr '|' '&' | sed 's/$$/\\\\/'; \
echo '\end{tabular}'; \
echo -n '\caption{Results from the multilingual translation model between '; \
iso639 $$s | tr '"' ' '; \
echo 'and Finno-Ugric languages measured on the Tatoeba test set.}'; \
echo '\label{tab:my_label}'; \
echo '\end{table}'; \
echo ""; \
done
# FIU_DATASIZE = -1m
train-tatoeba-crossfiu: train-tatoeba-group2fiu train-tatoeba-fiu2group

File diff suppressed because it is too large Load Diff

87
scripts/pivot-bt.pl Normal file
View File

@ -0,0 +1,87 @@
#!/usr/bin/env perl
#
# pivot-bt.pl - pair source and target sentences via a shared pivot
# translation: source sentences are indexed by a normalised key built
# from their pivot-language translation, and target sentences whose
# pivot translation maps to the same key are printed as matched pairs.
#
# Options:
#   -s  --srcfiles        source-language files (gzipped)
#   -p1 --srcpivotfiles   pivot translations aligned line-by-line with -s
#   -p2 --trgpivotfiles   pivot translations aligned line-by-line with -t
#   -t  --trgfiles        target-language files (gzipped)
#   -a  --alpha           key normalisation: keep alphabetic chars only
#   -w  --word            key normalisation: keep word chars only (default on)
#   -l  --lower-case      key normalisation: lower-case (default on)
#   -v  --verbose         print matched keys to STDERR
#
use strict;
use warnings;
use open qw/:std :utf8/;
use Getopt::Long;

## key-normalisation flags (see make_key)
my $AlphaOnly = 0;
my $WordOnly  = 1;
my $LowerCase = 1;
my $verbose   = 0;

my @SrcFiles      = ();   # source sentences
my @SrcPivotFiles = ();   # pivot translations of the source files
my @TrgPivotFiles = ();   # pivot translations of the target files
my @TrgFiles      = ();   # target sentences

## abort on unknown or malformed options
## (the original silently ignored GetOptions failures)
GetOptions(
    "srcfiles|s=s{,}"       => \@SrcFiles,
    "srcpivotfiles|p1=s{,}" => \@SrcPivotFiles,
    "trgpivotfiles|p2=s{,}" => \@TrgPivotFiles,
    "trgfiles|t=s{,}"       => \@TrgFiles,
    "alpha|a"               => \$AlphaOnly,
    "word|w"                => \$WordOnly,
    "lower-case|l"          => \$LowerCase,
    "verbose|v"             => \$verbose )
    or die "usage: pivot-bt.pl -s src.gz -p1 srcpivot.gz -p2 trgpivot.gz -t trg.gz\n";
## index all source sentences by the key derived from their
## pivot-language translation (aligned line-by-line)
my %pivot2src = ();
while (@SrcFiles){
    my $srcfile  = shift(@SrcFiles);
    my $srcpivot = shift(@SrcPivotFiles);
    print STDERR "read $srcfile $srcpivot ...\n";
    ## NOTE: the original 'open S,"gzip ... |" || die' never dies
    ## ('||' binds to the string, not to open) - use 'or die' with
    ## lexical handles and a list-form pipe open (no shell involved)
    open my $sfh, '-|', 'gzip', '-cd', $srcfile  or die "cannot read from $srcfile: $!\n";
    open my $pfh, '-|', 'gzip', '-cd', $srcpivot or die "cannot read from $srcpivot: $!\n";
    while (my $src = <$sfh>){
	chomp($src);
	my $pivot = <$pfh>;
	chomp($pivot);
	## empty keys (e.g. lines with no word characters) are skipped;
	## later occurrences of the same key overwrite earlier ones
	my $key = make_key($pivot);
	$pivot2src{$key} = $src if ($key);
    }
    close $sfh;
    close $pfh;
}
## stream through the target files and print "source<TAB>target"
## for every target sentence whose pivot translation matches an
## indexed source-side pivot key
while (@TrgFiles){
    my $trgfile  = shift(@TrgFiles);
    my $trgpivot = shift(@TrgPivotFiles);
    print STDERR "checking $trgfile $trgpivot ...\n";
    ## same precedence fix as above: 'or die' instead of '|| die',
    ## lexical handles, list-form pipe open
    open my $pfh, '-|', 'gzip', '-cd', $trgpivot or die "cannot read from $trgpivot: $!\n";
    open my $tfh, '-|', 'gzip', '-cd', $trgfile  or die "cannot read from $trgfile: $!\n";
    while (my $pivot = <$pfh>){
	chomp($pivot);
	my $trg = <$tfh>;
	chomp($trg);
	my $key = make_key($pivot);
	next unless ($key);
	if (exists $pivot2src{$key}){
	    print $pivot2src{$key},"\t",$trg,"\n";
	    print STDERR "matching key '$key'\n" if ($verbose);
	}
    }
    close $pfh;
    close $tfh;
}
## Normalise a sentence into a matching key according to the global
## flags: optionally strip non-alphabetic / non-word characters and
## lower-case the result. Returns the (possibly empty) key string.
sub make_key{
    my ($text) = @_;
    $text =~ s/\P{IsAlpha}//gs if $AlphaOnly;
    $text =~ s/\P{IsWord}//gs  if $WordOnly;
    $text = lc($text)          if $LowerCase;
    return $text;
}

View File

@ -0,0 +1 @@
newstest2014-deen.de.gz

View File

@ -0,0 +1 @@
newstest2014-deen.en.gz