mirror of
https://github.com/Helsinki-NLP/OPUS-MT-train.git
synced 2024-07-14 15:40:33 +03:00
added recipe for refreshing release info
This commit is contained in:
parent
2067577021
commit
bb39c060c0
18
Makefile
18
Makefile
@ -141,6 +141,8 @@
|
||||
#
|
||||
#--------------------------------------------------------------------
|
||||
|
||||
## model-specific configuration file
|
||||
MODELCONFIG = config.mk
|
||||
|
||||
# check and adjust lib/env.mk and lib/config.mk
|
||||
|
||||
@ -155,8 +157,8 @@ include lib/config.mk
|
||||
# load model-specific configuration parameters
|
||||
# if they exist in the work directory
|
||||
|
||||
ifneq ($(wildcard ${WORKDIR}/config.mk),)
|
||||
include ${WORKDIR}/config.mk
|
||||
ifneq ($(wildcard ${WORKDIR}/${MODELCONFIG}),)
|
||||
include ${WORKDIR}/${MODELCONFIG}
|
||||
endif
|
||||
|
||||
include lib/data.mk
|
||||
@ -174,7 +176,7 @@ include lib/projects.mk
|
||||
|
||||
|
||||
.PHONY: all
|
||||
all: ${WORKDIR}/config.mk
|
||||
all: ${WORKDIR}/${MODELCONFIG}
|
||||
${MAKE} data
|
||||
${MAKE} train
|
||||
${MAKE} eval
|
||||
@ -191,7 +193,7 @@ all: ${WORKDIR}/config.mk
|
||||
#---------------------------------------------------------------------
|
||||
|
||||
.PHONY: all-and-backtranslate
|
||||
all-and-backtranslate: ${WORKDIR}/config.mk
|
||||
all-and-backtranslate: ${WORKDIR}/${MODELCONFIG}
|
||||
${MAKE} data
|
||||
${MAKE} train
|
||||
${MAKE} eval
|
||||
@ -210,7 +212,7 @@ all-and-backtranslate: ${WORKDIR}/config.mk
|
||||
done
|
||||
|
||||
.PHONY: all-and-backtranslate-allwikis
|
||||
all-and-backtranslate-allwikis: ${WORKDIR}/config.mk
|
||||
all-and-backtranslate-allwikis: ${WORKDIR}/${MODELCONFIG}
|
||||
${MAKE} data
|
||||
${MAKE} train
|
||||
${MAKE} eval
|
||||
@ -230,7 +232,7 @@ all-and-backtranslate-allwikis: ${WORKDIR}/config.mk
|
||||
done
|
||||
|
||||
.PHONY: all-and-backtranslate-allwikiparts
|
||||
all-and-backtranslate-allwikiparts: ${WORKDIR}/config.mk
|
||||
all-and-backtranslate-allwikiparts: ${WORKDIR}/${MODELCONFIG}
|
||||
${MAKE} data
|
||||
${MAKE} train
|
||||
${MAKE} eval
|
||||
@ -276,7 +278,7 @@ all-with-bt-allparts:
|
||||
|
||||
## job1: submit jobs to create data, train models, backtranslate all, and train again
|
||||
|
||||
job1: ${WORKDIR}/config.mk
|
||||
job1: ${WORKDIR}/${MODELCONFIG}
|
||||
${MAKE} HPC_MEM=12g HPC_CORES=4 job1-step1.submitcpu
|
||||
|
||||
job1-step1:
|
||||
@ -307,7 +309,7 @@ job1-step3:
|
||||
#------------------------------------------------------------------------
|
||||
|
||||
.PHONY: all-job
|
||||
all-job: ${WORKDIR}/config.mk
|
||||
all-job: ${WORKDIR}/${MODELCONFIG}
|
||||
${MAKE} data
|
||||
${MAKE} train-and-eval-job
|
||||
|
||||
|
@ -19,6 +19,7 @@ TRG = eng
|
||||
TATOEBA_STORAGE = https://object.pouta.csc.fi/Tatoeba-Challenge-WikiShuffled
|
||||
TATOEBA_GITRAW = https://raw.githubusercontent.com/Helsinki-NLP/Tatoeba-Challenge/master
|
||||
TATOEBA_RELEASED = ${TATOEBA_GITRAW}/models/released-model-results-all.txt
|
||||
TATOEBA_RELEASED_BT = https://object.pouta.csc.fi/Tatoeba-MT-bt/released-data.txt
|
||||
TATOEBA_MODEL_STORAGE = https://object.pouta.csc.fi/Tatoeba-MT-models
|
||||
|
||||
## container for storing backtranslations
|
||||
@ -131,6 +132,16 @@ all2eng:
|
||||
done
|
||||
|
||||
|
||||
# RELEASED_BT_ALL := ${shell wget -qq -O - ${TATOEBA_RELEASED_BT}}
|
||||
RELEASED_BT := ${shell wget -qq -O - ${TATOEBA_RELEASED_BT} | grep '^${LANGPAIR}/'}
|
||||
|
||||
fetch-bt:
|
||||
for d in ${RELEASED_BT}; do \
|
||||
echo "fetch $$d"; \
|
||||
mkdir -p `dirname $$d`; \
|
||||
wget -qq -O $$d https://object.pouta.csc.fi/Tatoeba-MT-bt/$$d; \
|
||||
done
|
||||
|
||||
release-all: upload-all released-data.txt released-data-size.txt
|
||||
swift upload ${BT_CONTAINER} released-data-size.txt
|
||||
|
||||
@ -148,7 +159,7 @@ upload-all:
|
||||
done
|
||||
|
||||
released-data.txt: .
|
||||
swift list ${BT_CONTAINER} | grep -v README.md > $@
|
||||
swift list ${BT_CONTAINER} | grep -v README.md | grep -v '.txt' > $@
|
||||
swift upload ${BT_CONTAINER} $@
|
||||
|
||||
released-data-size.txt: .
|
||||
|
@ -3,6 +3,11 @@
|
||||
# model configurations
|
||||
#
|
||||
|
||||
|
||||
## name of the model-specific configuration file
|
||||
MODELCONFIG ?= config.mk
|
||||
|
||||
|
||||
## various ways of setting the model languages
|
||||
|
||||
## (1) explicitly set source and target languages, for example:
|
||||
@ -491,7 +496,7 @@ endif
|
||||
## TODO: is it OK to delete LOCAL_TRAIN data?
|
||||
|
||||
.PHONY: config local-config
|
||||
config local-config: ${WORKDIR}/config.mk
|
||||
config local-config: ${WORKDIR}/${MODELCONFIG}
|
||||
|
||||
SMALLEST_TRAINSIZE = 10000
|
||||
SMALL_TRAINSIZE = 100000
|
||||
@ -499,7 +504,7 @@ MEDIUM_TRAINSIZE = 500000
|
||||
LARGE_TRAINSIZE = 1000000
|
||||
LARGEST_TRAINSIZE = 10000000
|
||||
|
||||
${WORKDIR}/config.mk:
|
||||
${WORKDIR}/${MODELCONFIG}:
|
||||
mkdir -p ${dir $@}
|
||||
if [ -e ${TRAIN_SRC}.clean.${PRE_SRC}${TRAINSIZE}.gz ]; then \
|
||||
${MAKE} ${TRAIN_SRC}.clean.${PRE_SRC}${TRAINSIZE}.charfreq \
|
||||
|
11
lib/data.mk
11
lib/data.mk
@ -52,10 +52,12 @@ endif
|
||||
## - use only the latest backtranslations
|
||||
## if such a subdir exists
|
||||
|
||||
ifneq (${wildcard backtranslate/${TRG}-${SRC}/latest},)
|
||||
BACKTRANS_DIR = backtranslate/${TRG}-${SRC}/latest
|
||||
BACKTRANS_HOME = backtranslate
|
||||
|
||||
ifneq (${wildcard ${BACKTRANS_HOME}/${TRG}-${SRC}/latest},)
|
||||
BACKTRANS_DIR = ${BACKTRANS_HOME}/${TRG}-${SRC}/latest
|
||||
else
|
||||
BACKTRANS_DIR = backtranslate/${TRG}-${SRC}
|
||||
BACKTRANS_DIR = ${BACKTRANS_HOME}/${TRG}-${SRC}
|
||||
endif
|
||||
|
||||
## TODO: make it possible to select only parts of the BT data
|
||||
@ -611,12 +613,15 @@ add-to-test-data: ${CLEAN_TEST_SRC}
|
||||
@echo "* ${LANGPAIR}: ${TESTSET}" >> ${dir ${TEST_SRC}}README.md
|
||||
ifeq (${USE_TARGET_LABELS},1)
|
||||
@echo "more than one target language";
|
||||
@echo "${ZCAT} ${CLEAN_TEST_SRC} | sed 's/^/>>${TRG}<< /' >> ${TEST_SRC}"
|
||||
@${ZCAT} ${CLEAN_TEST_SRC} 2>/dev/null |\
|
||||
sed "s/^/>>${TRG}<< /" >> ${TEST_SRC}
|
||||
else
|
||||
@echo "only one target language"
|
||||
@echo "${ZCAT} ${CLEAN_TEST_SRC} >> ${TEST_SRC}"
|
||||
@${ZCAT} ${CLEAN_TEST_SRC} 2>/dev/null >> ${TEST_SRC}
|
||||
endif
|
||||
@echo "${ZCAT} ${CLEAN_TEST_TRG} >> ${TEST_TRG}"
|
||||
@${ZCAT} ${CLEAN_TEST_TRG} 2>/dev/null >> ${TEST_TRG}
|
||||
|
||||
|
||||
|
454
lib/dist.mk
454
lib/dist.mk
@ -3,6 +3,10 @@
|
||||
# make distribution packages
|
||||
# and upload them to cPouta ObjectStorage
|
||||
#
|
||||
|
||||
TODAY := ${shell date +%F}
|
||||
DATE ?= ${TODAY}
|
||||
|
||||
OBJECTSTORAGE = https://object.pouta.csc.fi
|
||||
MODEL_CONTAINER = OPUS-MT-models
|
||||
DEV_MODEL_CONTAINER = OPUS-MT-dev
|
||||
@ -14,7 +18,14 @@ RELEASEDIR = ${PWD}/models
|
||||
## TODO: better create a recipe for the yaml file and not the zip file
|
||||
## becaue we can keep the yaml files in the repo but not the zip files!
|
||||
## --> better dependency in case we need to update and create new distributions!
|
||||
DIST_PACKAGE = ${MODELSHOME}/${LANGPAIRSTR}/${DATASET}.zip
|
||||
DIST_PACKAGE = ${MODELSHOME}/${LANGPAIRSTR}/${DATASET}.zip
|
||||
DIST_YML = ${MODELSHOME}/${LANGPAIRSTR}/${DATASET}.yml
|
||||
RELEASE_README = ${MODELSHOME}/${LANGPAIRSTR}/README.md
|
||||
RELEASE_PACKAGE = ${MODELSHOME}/${LANGPAIRSTR}/${DATASET}-${DATE}.zip
|
||||
RELEASE_YML = ${MODELSHOME}/${LANGPAIRSTR}/${DATASET}-${DATE}.yml
|
||||
|
||||
MODEL_README = ${WORKDIR}/README.md
|
||||
MODEL_YML = ${patsubst %.npz,%.yml,${MODEL_FINAL}}
|
||||
|
||||
|
||||
|
||||
@ -151,7 +162,6 @@ best-dist best_dist:
|
||||
## old: only accept models with a certain evaluation score:
|
||||
# if [ `grep BLEU $(TEST_EVALUATION) | cut -f3 -d ' ' | cut -f1 -d '.'` -ge ${MIN_BLEU_SCORE} ]; then \
|
||||
|
||||
DATE = ${shell date +%F}
|
||||
MODELS_URL = https://object.pouta.csc.fi/${DEV_MODEL_CONTAINER}
|
||||
SKIP_DIST_EVAL = 0
|
||||
|
||||
@ -197,98 +207,151 @@ RAWTRGLANGS = ${sort ${basename ${basename ${subst _,.,${subst -,.,${TRGLANGS}}}
|
||||
LANGUAGELABELS = ${shell grep '">>.*<<"' ${MODEL_SRCVOCAB} | cut -f1 -d: | sed 's/"//g'}
|
||||
|
||||
|
||||
${DIST_PACKAGE}: ${MODEL_FINAL}
|
||||
ifneq (${SKIP_DIST_EVAL},1)
|
||||
@${MAKE} $(TEST_EVALUATION)
|
||||
@${MAKE} $(TEST_COMPARISON)
|
||||
endif
|
||||
model-yml: ${MODEL_YML}
|
||||
model-readme: ${MODEL_README}
|
||||
release-yml: ${RELEASE_YML}
|
||||
release-readme: ${RELEASE_README}
|
||||
|
||||
${RELEASE_YML}: ${MODEL_YML}
|
||||
@mkdir -p ${dir $@}
|
||||
@touch ${WORKDIR}/source.tcmodel
|
||||
@cp ${PREPROCESS_SRCMODEL} ${WORKDIR}/source.${SUBWORD_TYPE}
|
||||
@cp ${PREPROCESS_TRGMODEL} ${WORKDIR}/target.${SUBWORD_TYPE}
|
||||
@cp ${PREPROCESS_SCRIPT} ${WORKDIR}/preprocess.sh
|
||||
@cp ${POSTPROCESS_SCRIPT} ${WORKDIR}/postprocess.sh
|
||||
##-----------------------------
|
||||
## create YAML file
|
||||
##-----------------------------
|
||||
@echo "release: ${LANGPAIRSTR}/$(notdir ${@:.zip=})-${DATE}.zip" > ${@:.zip=}-${DATE}.yml
|
||||
@echo "release-date: $(DATE)" >> ${@:.zip=}-${DATE}.yml
|
||||
@echo "dataset-name: $(DATASET)" >> ${@:.zip=}-${DATE}.yml
|
||||
@echo "modeltype: $(MODELTYPE)" >> ${@:.zip=}-${DATE}.yml
|
||||
@echo "pre-processing: ${PREPROCESS_DESCRIPTION}" >> ${@:.zip=}-${DATE}.yml
|
||||
@echo "subwords:" >> ${@:.zip=}-${DATE}.yml
|
||||
@echo " - source: ${PRE_SRC}" >> ${@:.zip=}-${DATE}.yml
|
||||
@echo " - target: ${PRE_TRG}" >> ${@:.zip=}-${DATE}.yml
|
||||
@echo "subword-models:" >> ${@:.zip=}-${DATE}.yml
|
||||
@echo " - source: source.${SUBWORD_TYPE}" >> ${@:.zip=}-${DATE}.yml
|
||||
@echo " - target: target.${SUBWORD_TYPE}" >> ${@:.zip=}-${DATE}.yml
|
||||
if [ -e $@ ]; then \
|
||||
mkdir -p models-backup/${LANGPAIRSTR}/${TODAY}; \
|
||||
mv -f $@ models-backup/${LANGPAIRSTR}/${TODAY}/; \
|
||||
fi
|
||||
cp $< $@
|
||||
|
||||
${RELEASE_README}: ${MODEL_README}
|
||||
@mkdir -p ${dir $@}
|
||||
if [ -e $@ ]; then \
|
||||
mkdir -p models-backup/${LANGPAIRSTR}/${TODAY}; \
|
||||
mv -f $@ models-backup/${LANGPAIRSTR}/${TODAY}/; \
|
||||
cat models-backup/${LANGPAIRSTR}/${TODAY}/${notdir $@} |\
|
||||
sed 's/^# /§/g' | tr "\n" '~' | tr '§' "\n" | grep . |\
|
||||
grep -v '^${notdir ${RELEASE_PACKAGE}}' | \
|
||||
sed 's/^/# /' | tr '~' "\n" > $@; \
|
||||
fi
|
||||
cat $< >> $@
|
||||
echo '' >> $@
|
||||
|
||||
|
||||
##---------------------------------------
|
||||
## create release description file (yml)
|
||||
##---------------------------------------
|
||||
|
||||
${MODEL_YML}: ${MODEL_FINAL}
|
||||
@mkdir -p ${dir $@}
|
||||
@echo "release: ${LANGPAIRSTR}/$(notdir ${RELEASE_PACKAGE})" > $@
|
||||
@echo "release-date: $(DATE)" >> $@
|
||||
@echo "dataset-name: $(DATASET)" >> $@
|
||||
@echo "modeltype: $(MODELTYPE)" >> $@
|
||||
@echo "pre-processing: ${PREPROCESS_DESCRIPTION}" >> $@
|
||||
@echo "subwords:" >> $@
|
||||
@echo " - source: ${PRE_SRC}" >> $@
|
||||
@echo " - target: ${PRE_TRG}" >> $@
|
||||
@echo "subword-models:" >> $@
|
||||
@echo " - source: source.${SUBWORD_TYPE}" >> $@
|
||||
@echo " - target: target.${SUBWORD_TYPE}" >> $@
|
||||
ifdef USE_TARGET_LABELS
|
||||
@echo "use-target-labels:" >> ${@:.zip=}-${DATE}.yml
|
||||
@echo "use-target-labels:" >> $@
|
||||
@for t in ${TRGLANGS}; do \
|
||||
echo " - >>$$t<<" >> ${@:.zip=}-${DATE}.yml; \
|
||||
echo " - >>$$t<<" >> $@; \
|
||||
done
|
||||
endif
|
||||
@echo "source-languages:" >> ${@:.zip=}-${DATE}.yml
|
||||
@echo "source-languages:" >> $@
|
||||
@for s in ${RAWSRCLANGS}; do\
|
||||
echo " - $$s" >> ${@:.zip=}-${DATE}.yml; \
|
||||
echo " - $$s" >> $@; \
|
||||
done
|
||||
@echo "target-languages:" >> ${@:.zip=}-${DATE}.yml
|
||||
@echo "target-languages:" >> $@
|
||||
@for t in ${RAWTRGLANGS}; do\
|
||||
echo " - $$t" >> ${@:.zip=}-${DATE}.yml; \
|
||||
echo " - $$t" >> $@; \
|
||||
done
|
||||
ifneq ("$(wildcard ${WORKDIR}/train/README.md)","")
|
||||
@echo "training-data:" >> ${@:.zip=}-${DATE}.yml
|
||||
@echo "training-data:" >> $@
|
||||
@tr "\n" "~" < ${WORKDIR}/train/README.md |\
|
||||
tr "#" "\n" | grep '^ ${DATASET}~' | \
|
||||
tail -1 | tr "~" "\n" | grep '^\* ' | \
|
||||
grep -v ': *$$' | grep -v ' 0$$' | \
|
||||
grep -v 'total size' | sed 's/^\* / - /' >> ${@:.zip=}-${DATE}.yml
|
||||
grep -v 'total size' | sed 's/^\* / - /' >> $@
|
||||
endif
|
||||
ifneq ("$(wildcard ${WORKDIR}/val/README.md)","")
|
||||
@echo "validation-data:" >> ${@:.zip=}-${DATE}.yml
|
||||
@echo "validation-data:" >> $@
|
||||
grep '^\* ' ${WORKDIR}/val/README.md | \
|
||||
grep -v ' 0$$' | \
|
||||
sed 's/^\* / - /' >> ${@:.zip=}-${DATE}.yml
|
||||
sed 's/^\* / - /' >> $@
|
||||
endif
|
||||
##-----------------------------
|
||||
## add benchmark results
|
||||
##
|
||||
## - grep and normalise test set names
|
||||
## - ugly perl script that does some tansformation of language codes
|
||||
##-----------------------------
|
||||
ifneq ("$(wildcard ${TEST_EVALUATION})","")
|
||||
@grep -H BLEU ${WORKDIR}/*.${DATASET}.${PRE_SRC}-${PRE_TRG}${NR}.${MODELTYPE}.*.eval | \
|
||||
sed 's#^${WORKDIR}/\(.*\)\.${DATASET}.${PRE_SRC}-${PRE_TRG}${NR}.${MODELTYPE}\.\(.*\)\.eval:.*$$#\1.\2#' | \
|
||||
perl -pe 'if (/\.([^\.]+)\.([^\.\s]+)$$/){$$s=$$1;$$t=$$2;s/[\-\.]$$s?\-?$$t\.$$s\.$$t?$$/.$$s.$$t/;s/\.$$s\.$$t$$/.$$s-$$t/}' > $@.1
|
||||
@grep BLEU ${WORKDIR}/*.${DATASET}.${PRE_SRC}-${PRE_TRG}${NR}.${MODELTYPE}.*.eval | \
|
||||
cut -f3 -d ' ' > $@.2
|
||||
@grep chrF ${WORKDIR}/*.${DATASET}.${PRE_SRC}-${PRE_TRG}${NR}.${MODELTYPE}.*.eval | \
|
||||
cut -f3 -d ' ' > $@.3
|
||||
@ls ${WORKDIR}/*.${DATASET}.${PRE_SRC}-${PRE_TRG}${NR}.${MODELTYPE}.*.eval | \
|
||||
sed 's/\.eval//' | xargs wc -l | grep -v total | sed 's/^ *//' | cut -f1 -d' ' > $@.4
|
||||
@grep BLEU ${WORKDIR}/*.${DATASET}.${PRE_SRC}-${PRE_TRG}${NR}.${MODELTYPE}.*.eval | \
|
||||
cut -f16 -d ' ' | sed 's/)//' > $@.5
|
||||
@grep BLEU ${WORKDIR}/*.${DATASET}.${PRE_SRC}-${PRE_TRG}${NR}.${MODELTYPE}.*.eval | \
|
||||
cut -f7 -d ' ' > $@.6
|
||||
@paste -d '/' $@.4 $@.5 > $@.7
|
||||
@echo "test-data:" >> $@
|
||||
@paste -d' ' $@.1 $@.7 | sed 's/ /: /;s/^/ - /;' >> $@
|
||||
@echo "BLEU-scores:" >> $@
|
||||
@paste -d' ' $@.1 $@.2 | sed 's/ /: /;s/^/ - /' >> $@
|
||||
@echo "chr-F-scores:" >> $@
|
||||
@paste -d' ' $@.1 $@.3 | sed 's/ /: /;s/^/ - /' >> $@
|
||||
@rm -f $@.1 $@.2 $@.3 $@.4 $@.5 $@.6 $@.7
|
||||
endif
|
||||
|
||||
|
||||
|
||||
##-----------------------------
|
||||
## create README-file
|
||||
##-----------------------------
|
||||
@echo "# $(notdir ${@:.zip=})-${DATE}.zip" > ${WORKDIR}/README.md
|
||||
@echo '' >> ${WORKDIR}/README.md
|
||||
@echo "* dataset: ${DATASET}" >> ${WORKDIR}/README.md
|
||||
@echo "* model: ${MODELTYPE}" >> ${WORKDIR}/README.md
|
||||
@echo "* source language(s): ${RAWSRCLANGS}" >> ${WORKDIR}/README.md
|
||||
@echo "* target language(s): ${RAWTRGLANGS}" >> ${WORKDIR}/README.md
|
||||
@echo "* model: ${MODELTYPE}" >> ${WORKDIR}/README.md
|
||||
@echo "* pre-processing: ${PREPROCESS_DESCRIPTION}" >> ${WORKDIR}/README.md
|
||||
|
||||
${MODEL_README}: ${MODEL_FINAL}
|
||||
@echo "# $(notdir ${RELEASE_PACKAGE})" > $@
|
||||
@echo '' >> $@
|
||||
@echo "* dataset: ${DATASET}" >> $@
|
||||
@echo "* model: ${MODELTYPE}" >> $@
|
||||
@echo "* source language(s): ${RAWSRCLANGS}" >> $@
|
||||
@echo "* target language(s): ${RAWTRGLANGS}" >> $@
|
||||
@echo "* model: ${MODELTYPE}" >> $@
|
||||
@echo "* pre-processing: ${PREPROCESS_DESCRIPTION}" >> $@
|
||||
ifdef USE_TARGET_LABELS
|
||||
echo '* a sentence initial language token is required in the form of `>>id<<` (id = valid target language ID)' >> ${WORKDIR}/README.md
|
||||
@echo "* valid language labels: ${LANGUAGELABELS}" >> ${WORKDIR}/README.md
|
||||
echo '* a sentence initial language token is required in the form of `>>id<<` (id = valid target language ID)' >> $@
|
||||
@echo "* valid language labels: ${LANGUAGELABELS}" >> $@
|
||||
endif
|
||||
@echo "* download: [$(notdir ${@:.zip=})-${DATE}.zip](${MODELS_URL}/${LANGPAIRSTR}/$(notdir ${@:.zip=})-${DATE}.zip)" >> ${WORKDIR}/README.md
|
||||
@echo "* download: [$(notdir ${RELEASE_PACKAGE})](${MODELS_URL}/${LANGPAIRSTR}/$(notdir ${RELEASE_PACKAGE})" >> $@
|
||||
ifneq (${SKIP_DATA_DETAILS},1)
|
||||
ifneq ("$(wildcard ${WORKDIR}/train/README.md)","")
|
||||
@echo -n "## Training data: " >> ${WORKDIR}/README.md
|
||||
@echo -n "## Training data: " >> $@
|
||||
@tr "\n" "~" < ${WORKDIR}/train/README.md |\
|
||||
tr "#" "\n" | grep '${DATASET}' | \
|
||||
tail -1 | tr "~" "\n" >> ${WORKDIR}/README.md
|
||||
@echo '' >> ${WORKDIR}/README.md
|
||||
tail -1 | tr "~" "\n" >> $@
|
||||
@echo '' >> $@
|
||||
endif
|
||||
ifneq ("$(wildcard ${WORKDIR}/val/README.md)","")
|
||||
@echo -n "#" >> ${WORKDIR}/README.md
|
||||
@cat ${WORKDIR}/val/README.md >> ${WORKDIR}/README.md
|
||||
@echo '' >> ${WORKDIR}/README.md
|
||||
@echo -n "#" >> $@
|
||||
@cat ${WORKDIR}/val/README.md >> $@
|
||||
@echo '' >> $@
|
||||
endif
|
||||
endif
|
||||
##-----------------------------
|
||||
## add benchmark results
|
||||
##-----------------------------
|
||||
ifneq ("$(wildcard ${TEST_EVALUATION})","")
|
||||
@echo "* test set translations: [$(notdir ${@:.zip=})-${DATE}.test.txt](${MODELS_URL}/${LANGPAIRSTR}/$(notdir ${@:.zip=})-${DATE}.test.txt)" >> ${WORKDIR}/README.md
|
||||
@echo "* test set scores: [$(notdir ${@:.zip=})-${DATE}.eval.txt](${MODELS_URL}/${LANGPAIRSTR}/$(notdir ${@:.zip=})-${DATE}.eval.txt)" >> ${WORKDIR}/README.md
|
||||
@echo '' >> ${WORKDIR}/README.md
|
||||
@echo '## Benchmarks' >> ${WORKDIR}/README.md
|
||||
@echo '' >> ${WORKDIR}/README.md
|
||||
@echo "* test set translations: [$(notdir ${@:.zip=})-${DATE}.test.txt](${MODELS_URL}/${LANGPAIRSTR}/$(notdir ${@:.zip=})-${DATE}.test.txt)" >> $@
|
||||
@echo "* test set scores: [$(notdir ${@:.zip=})-${DATE}.eval.txt](${MODELS_URL}/${LANGPAIRSTR}/$(notdir ${@:.zip=})-${DATE}.eval.txt)" >> $@
|
||||
@echo '' >> $@
|
||||
@echo '## Benchmarks' >> $@
|
||||
@echo '' >> $@
|
||||
## grep and normalise test set names
|
||||
## ugly perl script that does some tansformation of language codes
|
||||
@grep -H BLEU ${WORKDIR}/*.${DATASET}.${PRE_SRC}-${PRE_TRG}${NR}.${MODELTYPE}.*.eval | \
|
||||
@ -305,62 +368,299 @@ ifneq ("$(wildcard ${TEST_EVALUATION})","")
|
||||
@grep BLEU ${WORKDIR}/*.${DATASET}.${PRE_SRC}-${PRE_TRG}${NR}.${MODELTYPE}.*.eval | \
|
||||
cut -f7 -d ' ' > $@.6
|
||||
@paste -d '/' $@.4 $@.5 > $@.7
|
||||
@echo '| testset | BLEU | chr-F | #sent | #words | BP |' >> ${WORKDIR}/README.md
|
||||
@echo '|---------|-------|-------|-------|--------|----|' >> ${WORKDIR}/README.md
|
||||
@echo '| testset | BLEU | chr-F | #sent | #words | BP |' >> $@
|
||||
@echo '|---------|-------|-------|-------|--------|----|' >> $@
|
||||
@paste $@.1 $@.2 $@.3 $@.4 $@.5 $@.6 | \
|
||||
sed "s/\t/ | /g;s/^/| /;s/$$/ |/" | \
|
||||
sort | uniq >> ${WORKDIR}/README.md
|
||||
@echo "test-data:" >> ${@:.zip=}-${DATE}.yml
|
||||
@paste -d' ' $@.1 $@.7 | sed 's/ /: /;s/^/ - /;' >> ${@:.zip=}-${DATE}.yml
|
||||
@echo "BLEU-scores:" >> ${@:.zip=}-${DATE}.yml
|
||||
@paste -d' ' $@.1 $@.2 | sed 's/ /: /;s/^/ - /' >> ${@:.zip=}-${DATE}.yml
|
||||
@echo "chr-F-scores:" >> ${@:.zip=}-${DATE}.yml
|
||||
@paste -d' ' $@.1 $@.3 | sed 's/ /: /;s/^/ - /' >> ${@:.zip=}-${DATE}.yml
|
||||
@rm -f $@.1 $@.2 $@.3 $@.4 $@.5 $@.6 $@.7 $@.testsize $@.testset
|
||||
sort | uniq >> $@
|
||||
@rm -f $@.1 $@.2 $@.3 $@.4 $@.5 $@.6 $@.7
|
||||
endif
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
${DIST_PACKAGE}: ${MODEL_FINAL}
|
||||
ifneq (${SKIP_DIST_EVAL},1)
|
||||
@${MAKE} $(TEST_EVALUATION)
|
||||
@${MAKE} $(TEST_COMPARISON)
|
||||
endif
|
||||
##-----------------------------
|
||||
## create the package
|
||||
## collect all files we need
|
||||
##-----------------------------
|
||||
@cat ${WORKDIR}/README.md >> ${dir $@}README.md
|
||||
@echo '' >> ${dir $@}README.md
|
||||
@cp models/LICENSE ${WORKDIR}/
|
||||
@${MAKE} ${MODEL_README}
|
||||
@${MAKE} ${MODEL_YML}
|
||||
@touch ${WORKDIR}/source.tcmodel
|
||||
@cp ${PREPROCESS_SRCMODEL} ${WORKDIR}/source.${SUBWORD_TYPE}
|
||||
@cp ${PREPROCESS_TRGMODEL} ${WORKDIR}/target.${SUBWORD_TYPE}
|
||||
@cp ${PREPROCESS_SCRIPT} ${WORKDIR}/preprocess.sh
|
||||
@cp ${POSTPROCESS_SCRIPT} ${WORKDIR}/postprocess.sh
|
||||
@chmod +x ${WORKDIR}/preprocess.sh
|
||||
@cp models/LICENSE ${WORKDIR}/
|
||||
@sed -e 's# - .*/\([^/]*\)$$# - \1#' \
|
||||
-e 's/beam-size: [0-9]*$$/beam-size: 6/' \
|
||||
-e 's/mini-batch: [0-9]*$$/mini-batch: 1/' \
|
||||
-e 's/maxi-batch: [0-9]*$$/maxi-batch: 1/' \
|
||||
-e 's/relative-paths: false/relative-paths: true/' \
|
||||
< ${MODEL_DECODER} > ${WORKDIR}/decoder.yml
|
||||
##-----------------------------
|
||||
## create the package
|
||||
##-----------------------------
|
||||
cd ${WORKDIR} && zip ${notdir $@} \
|
||||
README.md LICENSE \
|
||||
${notdir ${MODEL_FINAL}} \
|
||||
${notdir ${MODEL_YML}} \
|
||||
${notdir ${MODEL_SRCVOCAB}} \
|
||||
${notdir ${MODEL_TRGVOCAB}} \
|
||||
${notdir ${MODEL_VALIDLOG}} \
|
||||
${notdir ${MODEL_TRAINLOG}} \
|
||||
source.* target.* decoder.yml \
|
||||
preprocess.sh postprocess.sh
|
||||
ifneq ("$(wildcard ${WORKDIR}/config.mk)","")
|
||||
@cd ${WORKDIR} && zip -u ${notdir $@} config.mk
|
||||
ifneq ("$(wildcard ${WORKDIR}/${MODELCONFIG})","")
|
||||
@cd ${WORKDIR} && zip -u ${notdir $@} ${MODELCONFIG}
|
||||
endif
|
||||
##-----------------------------
|
||||
## move files to release dir and cleanup
|
||||
##-----------------------------
|
||||
@mkdir -p ${dir $@}
|
||||
@mv -f ${WORKDIR}/${notdir $@} ${@:.zip=}-${DATE}.zip
|
||||
@cd ${dir $@} && zip -u ${notdir ${@:.zip=}-${DATE}.zip} ${notdir ${@:.zip=}-${DATE}.yml}
|
||||
@if [ -e ${RELEASE_PACKAGE} ]; then \
|
||||
mkdir -p models-backup/${LANGPAIRSTR}/${DATE}; \
|
||||
mv -f ${RELEASE_PACKAGE} models-backup/${LANGPAIRSTR}/${DATE}/; \
|
||||
mv -f ${@:.zip=}-${DATE}.eval.txt models-backup/${LANGPAIRSTR}/${DATE}/; \
|
||||
mv -f ${@:.zip=}-${DATE}.test.txt models-backup/${LANGPAIRSTR}/${DATE}/; \
|
||||
fi
|
||||
@mv -f ${WORKDIR}/${notdir $@} ${RELEASE_PACKAGE}
|
||||
@${MAKE} ${RELEASE_YML}
|
||||
@${MAKE} ${RELEASE_README}
|
||||
ifneq ("$(wildcard ${TEST_EVALUATION})","")
|
||||
@cp $(TEST_EVALUATION) ${@:.zip=}-${DATE}.eval.txt
|
||||
@cp $(TEST_COMPARISON) ${@:.zip=}-${DATE}.test.txt
|
||||
endif
|
||||
@rm -f $@
|
||||
@cd ${dir $@} && ln -s $(notdir ${@:.zip=})-${DATE}.zip ${notdir $@}
|
||||
@cd ${dir $@} && ln -s $(notdir ${RELEASE_PACKAGE}) ${notdir $@}
|
||||
@rm -f ${WORKDIR}/decoder.yml ${WORKDIR}/source.* ${WORKDIR}/target.*
|
||||
@rm -f ${WORKDIR}/preprocess.sh ${WORKDIR}/postprocess.sh
|
||||
|
||||
|
||||
|
||||
|
||||
## refresh a release with the same time stamp
|
||||
## in case it is already the newest one
|
||||
## --> this is kind of dangerous as we may overwrite existing newer ones with older ones
|
||||
## --> the reason for doing this is to update yml files and evaluation scores
|
||||
|
||||
refresh-release:
|
||||
if [[ ${DIST_PACKAGE} -nt ${MODEL_FINAL} ]]; then \
|
||||
echo "updating ${shell realpath ${DIST_PACKAGE}}"; \
|
||||
d=`realpath ${DIST_PACKAGE} | xargs basename | sed 's/^[^\-]*\-//;s/\.zip$$//'`; \
|
||||
mkdir -p models-backup/${LANGPAIRSTR}/${DATE}; \
|
||||
mv -f ${shell realpath ${DIST_PACKAGE}} models-backup/${LANGPAIRSTR}/${DATE}/; \
|
||||
make DATE="$$d" release; \
|
||||
fi
|
||||
|
||||
refresh-release-yml:
|
||||
if [[ ${DIST_PACKAGE} -nt ${MODEL_FINAL} ]]; then \
|
||||
echo "updating ${patsubst %.zip,%.yml,${shell realpath ${DIST_PACKAGE}}}"; \
|
||||
d=`realpath ${DIST_PACKAGE} | xargs basename | sed 's/^[^\-]*\-//;s/\.zip$$//'`; \
|
||||
if [ -e ${MODEL_YML} ]; then \
|
||||
mv ${MODEL_YML} ${MODEL_YML}.${DATE}; \
|
||||
fi; \
|
||||
make DATE="$$d" release-yml; \
|
||||
fi
|
||||
|
||||
refresh-release-readme:
|
||||
if [[ ${DIST_PACKAGE} -nt ${MODEL_FINAL} ]]; then \
|
||||
echo "updating ${LANGPAIRSTR}/README.md for ${notdir ${shell realpath ${DIST_PACKAGE}}}"; \
|
||||
d=`realpath ${DIST_PACKAGE} | xargs basename | sed 's/^[^\-]*\-//;s/\.zip$$//'`; \
|
||||
if [ -e ${MODEL_README} ]; then \
|
||||
mv ${MODEL_README} ${MODEL_README}.${DATE}; \
|
||||
fi; \
|
||||
make DATE="$$d" release-readme; \
|
||||
fi
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
##### ------------------------------------
|
||||
##### OLD release recipe: all in one
|
||||
##### ------------------------------------
|
||||
|
||||
|
||||
# ${DIST_PACKAGE}: ${MODEL_FINAL}
|
||||
# ifneq (${SKIP_DIST_EVAL},1)
|
||||
# @${MAKE} $(TEST_EVALUATION)
|
||||
# @${MAKE} $(TEST_COMPARISON)
|
||||
# endif
|
||||
# @mkdir -p ${dir $@}
|
||||
# @touch ${WORKDIR}/source.tcmodel
|
||||
# @cp ${PREPROCESS_SRCMODEL} ${WORKDIR}/source.${SUBWORD_TYPE}
|
||||
# @cp ${PREPROCESS_TRGMODEL} ${WORKDIR}/target.${SUBWORD_TYPE}
|
||||
# @cp ${PREPROCESS_SCRIPT} ${WORKDIR}/preprocess.sh
|
||||
# @cp ${POSTPROCESS_SCRIPT} ${WORKDIR}/postprocess.sh
|
||||
# ##-----------------------------
|
||||
# ## create YML file
|
||||
# ##-----------------------------
|
||||
# @echo "release: ${LANGPAIRSTR}/$(notdir ${@:.zip=})-${DATE}.zip" > ${@:.zip=}-${DATE}.yml
|
||||
# @echo "release-date: $(DATE)" >> ${@:.zip=}-${DATE}.yml
|
||||
# @echo "dataset-name: $(DATASET)" >> ${@:.zip=}-${DATE}.yml
|
||||
# @echo "modeltype: $(MODELTYPE)" >> ${@:.zip=}-${DATE}.yml
|
||||
# @echo "pre-processing: ${PREPROCESS_DESCRIPTION}" >> ${@:.zip=}-${DATE}.yml
|
||||
# @echo "subwords:" >> ${@:.zip=}-${DATE}.yml
|
||||
# @echo " - source: ${PRE_SRC}" >> ${@:.zip=}-${DATE}.yml
|
||||
# @echo " - target: ${PRE_TRG}" >> ${@:.zip=}-${DATE}.yml
|
||||
# @echo "subword-models:" >> ${@:.zip=}-${DATE}.yml
|
||||
# @echo " - source: source.${SUBWORD_TYPE}" >> ${@:.zip=}-${DATE}.yml
|
||||
# @echo " - target: target.${SUBWORD_TYPE}" >> ${@:.zip=}-${DATE}.yml
|
||||
# ifdef USE_TARGET_LABELS
|
||||
# @echo "use-target-labels:" >> ${@:.zip=}-${DATE}.yml
|
||||
# @for t in ${TRGLANGS}; do \
|
||||
# echo " - >>$$t<<" >> ${@:.zip=}-${DATE}.yml; \
|
||||
# done
|
||||
# endif
|
||||
# @echo "source-languages:" >> ${@:.zip=}-${DATE}.yml
|
||||
# @for s in ${RAWSRCLANGS}; do\
|
||||
# echo " - $$s" >> ${@:.zip=}-${DATE}.yml; \
|
||||
# done
|
||||
# @echo "target-languages:" >> ${@:.zip=}-${DATE}.yml
|
||||
# @for t in ${RAWTRGLANGS}; do\
|
||||
# echo " - $$t" >> ${@:.zip=}-${DATE}.yml; \
|
||||
# done
|
||||
# ifneq ("$(wildcard ${WORKDIR}/train/README.md)","")
|
||||
# @echo "training-data:" >> ${@:.zip=}-${DATE}.yml
|
||||
# @tr "\n" "~" < ${WORKDIR}/train/README.md |\
|
||||
# tr "#" "\n" | grep '^ ${DATASET}~' | \
|
||||
# tail -1 | tr "~" "\n" | grep '^\* ' | \
|
||||
# grep -v ': *$$' | grep -v ' 0$$' | \
|
||||
# grep -v 'total size' | sed 's/^\* / - /' >> ${@:.zip=}-${DATE}.yml
|
||||
# endif
|
||||
# ifneq ("$(wildcard ${WORKDIR}/val/README.md)","")
|
||||
# @echo "validation-data:" >> ${@:.zip=}-${DATE}.yml
|
||||
# grep '^\* ' ${WORKDIR}/val/README.md | \
|
||||
# grep -v ' 0$$' | \
|
||||
# sed 's/^\* / - /' >> ${@:.zip=}-${DATE}.yml
|
||||
# endif
|
||||
# ##-----------------------------
|
||||
# ## create README-file
|
||||
# ##-----------------------------
|
||||
# @echo "# $(notdir ${@:.zip=})-${DATE}.zip" > ${WORKDIR}/README.md
|
||||
# @echo '' >> ${WORKDIR}/README.md
|
||||
# @echo "* dataset: ${DATASET}" >> ${WORKDIR}/README.md
|
||||
# @echo "* model: ${MODELTYPE}" >> ${WORKDIR}/README.md
|
||||
# @echo "* source language(s): ${RAWSRCLANGS}" >> ${WORKDIR}/README.md
|
||||
# @echo "* target language(s): ${RAWTRGLANGS}" >> ${WORKDIR}/README.md
|
||||
# @echo "* model: ${MODELTYPE}" >> ${WORKDIR}/README.md
|
||||
# @echo "* pre-processing: ${PREPROCESS_DESCRIPTION}" >> ${WORKDIR}/README.md
|
||||
# ifdef USE_TARGET_LABELS
|
||||
# echo '* a sentence initial language token is required in the form of `>>id<<` (id = valid target language ID)' >> ${WORKDIR}/README.md
|
||||
# @echo "* valid language labels: ${LANGUAGELABELS}" >> ${WORKDIR}/README.md
|
||||
# endif
|
||||
# @echo "* download: [$(notdir ${@:.zip=})-${DATE}.zip](${MODELS_URL}/${LANGPAIRSTR}/$(notdir ${@:.zip=})-${DATE}.zip)" >> ${WORKDIR}/README.md
|
||||
# ifneq (${SKIP_DATA_DETAILS},1)
|
||||
# ifneq ("$(wildcard ${WORKDIR}/train/README.md)","")
|
||||
# @echo -n "## Training data: " >> ${WORKDIR}/README.md
|
||||
# @tr "\n" "~" < ${WORKDIR}/train/README.md |\
|
||||
# tr "#" "\n" | grep '${DATASET}' | \
|
||||
# tail -1 | tr "~" "\n" >> ${WORKDIR}/README.md
|
||||
# @echo '' >> ${WORKDIR}/README.md
|
||||
# endif
|
||||
# ifneq ("$(wildcard ${WORKDIR}/val/README.md)","")
|
||||
# @echo -n "#" >> ${WORKDIR}/README.md
|
||||
# @cat ${WORKDIR}/val/README.md >> ${WORKDIR}/README.md
|
||||
# @echo '' >> ${WORKDIR}/README.md
|
||||
# endif
|
||||
# endif
|
||||
# ##-----------------------------
|
||||
# ## add benchmark results
|
||||
# ##-----------------------------
|
||||
# ifneq ("$(wildcard ${TEST_EVALUATION})","")
|
||||
# @echo "* test set translations: [$(notdir ${@:.zip=})-${DATE}.test.txt](${MODELS_URL}/${LANGPAIRSTR}/$(notdir ${@:.zip=})-${DATE}.test.txt)" >> ${WORKDIR}/README.md
|
||||
# @echo "* test set scores: [$(notdir ${@:.zip=})-${DATE}.eval.txt](${MODELS_URL}/${LANGPAIRSTR}/$(notdir ${@:.zip=})-${DATE}.eval.txt)" >> ${WORKDIR}/README.md
|
||||
# @echo '' >> ${WORKDIR}/README.md
|
||||
# @echo '## Benchmarks' >> ${WORKDIR}/README.md
|
||||
# @echo '' >> ${WORKDIR}/README.md
|
||||
# ## grep and normalise test set names
|
||||
# ## ugly perl script that does some tansformation of language codes
|
||||
# @grep -H BLEU ${WORKDIR}/*.${DATASET}.${PRE_SRC}-${PRE_TRG}${NR}.${MODELTYPE}.*.eval | \
|
||||
# sed 's#^${WORKDIR}/\(.*\)\.${DATASET}.${PRE_SRC}-${PRE_TRG}${NR}.${MODELTYPE}\.\(.*\)\.eval:.*$$#\1.\2#' | \
|
||||
# perl -pe 'if (/\.([^\.]+)\.([^\.\s]+)$$/){$$s=$$1;$$t=$$2;s/[\-\.]$$s?\-?$$t\.$$s\.$$t?$$/.$$s.$$t/;s/\.$$s\.$$t$$/.$$s-$$t/}' > $@.1
|
||||
# @grep BLEU ${WORKDIR}/*.${DATASET}.${PRE_SRC}-${PRE_TRG}${NR}.${MODELTYPE}.*.eval | \
|
||||
# cut -f3 -d ' ' > $@.2
|
||||
# @grep chrF ${WORKDIR}/*.${DATASET}.${PRE_SRC}-${PRE_TRG}${NR}.${MODELTYPE}.*.eval | \
|
||||
# cut -f3 -d ' ' > $@.3
|
||||
# @ls ${WORKDIR}/*.${DATASET}.${PRE_SRC}-${PRE_TRG}${NR}.${MODELTYPE}.*.eval | \
|
||||
# sed 's/\.eval//' | xargs wc -l | grep -v total | sed 's/^ *//' | cut -f1 -d' ' > $@.4
|
||||
# @grep BLEU ${WORKDIR}/*.${DATASET}.${PRE_SRC}-${PRE_TRG}${NR}.${MODELTYPE}.*.eval | \
|
||||
# cut -f16 -d ' ' | sed 's/)//' > $@.5
|
||||
# @grep BLEU ${WORKDIR}/*.${DATASET}.${PRE_SRC}-${PRE_TRG}${NR}.${MODELTYPE}.*.eval | \
|
||||
# cut -f7 -d ' ' > $@.6
|
||||
# @paste -d '/' $@.4 $@.5 > $@.7
|
||||
# @echo '| testset | BLEU | chr-F | #sent | #words | BP |' >> ${WORKDIR}/README.md
|
||||
# @echo '|---------|-------|-------|-------|--------|----|' >> ${WORKDIR}/README.md
|
||||
# @paste $@.1 $@.2 $@.3 $@.4 $@.5 $@.6 | \
|
||||
# sed "s/\t/ | /g;s/^/| /;s/$$/ |/" | \
|
||||
# sort | uniq >> ${WORKDIR}/README.md
|
||||
# @echo "test-data:" >> ${@:.zip=}-${DATE}.yml
|
||||
# @paste -d' ' $@.1 $@.7 | sed 's/ /: /;s/^/ - /;' >> ${@:.zip=}-${DATE}.yml
|
||||
# @echo "BLEU-scores:" >> ${@:.zip=}-${DATE}.yml
|
||||
# @paste -d' ' $@.1 $@.2 | sed 's/ /: /;s/^/ - /' >> ${@:.zip=}-${DATE}.yml
|
||||
# @echo "chr-F-scores:" >> ${@:.zip=}-${DATE}.yml
|
||||
# @paste -d' ' $@.1 $@.3 | sed 's/ /: /;s/^/ - /' >> ${@:.zip=}-${DATE}.yml
|
||||
# @rm -f $@.1 $@.2 $@.3 $@.4 $@.5 $@.6 $@.7 $@.testsize $@.testset
|
||||
# endif
|
||||
# ##-----------------------------
|
||||
# ## create the package
|
||||
# ##-----------------------------
|
||||
# @cat ${WORKDIR}/README.md >> ${dir $@}README.md
|
||||
# @echo '' >> ${dir $@}README.md
|
||||
# @cp models/LICENSE ${WORKDIR}/
|
||||
# @chmod +x ${WORKDIR}/preprocess.sh
|
||||
# @sed -e 's# - .*/\([^/]*\)$$# - \1#' \
|
||||
# -e 's/beam-size: [0-9]*$$/beam-size: 6/' \
|
||||
# -e 's/mini-batch: [0-9]*$$/mini-batch: 1/' \
|
||||
# -e 's/maxi-batch: [0-9]*$$/maxi-batch: 1/' \
|
||||
# -e 's/relative-paths: false/relative-paths: true/' \
|
||||
# < ${MODEL_DECODER} > ${WORKDIR}/decoder.yml
|
||||
# cd ${WORKDIR} && zip ${notdir $@} \
|
||||
# README.md LICENSE \
|
||||
# ${notdir ${MODEL_FINAL}} \
|
||||
# ${notdir ${MODEL_SRCVOCAB}} \
|
||||
# ${notdir ${MODEL_TRGVOCAB}} \
|
||||
# ${notdir ${MODEL_VALIDLOG}} \
|
||||
# ${notdir ${MODEL_TRAINLOG}} \
|
||||
# source.* target.* decoder.yml \
|
||||
# preprocess.sh postprocess.sh
|
||||
# ifneq ("$(wildcard ${WORKDIR}/${MODELCONFIG})","")
|
||||
# @cd ${WORKDIR} && zip -u ${notdir $@} ${MODELCONFIG}
|
||||
# endif
|
||||
# ##-----------------------------
|
||||
# ## move files to release dir and cleanup
|
||||
# ##-----------------------------
|
||||
# @mkdir -p ${dir $@}
|
||||
# @mv -f ${WORKDIR}/${notdir $@} ${@:.zip=}-${DATE}.zip
|
||||
# @cd ${dir $@} && zip -u ${notdir ${@:.zip=}-${DATE}.zip} ${notdir ${@:.zip=}-${DATE}.yml}
|
||||
# ifneq ("$(wildcard ${TEST_EVALUATION})","")
|
||||
# @cp $(TEST_EVALUATION) ${@:.zip=}-${DATE}.eval.txt
|
||||
# @cp $(TEST_COMPARISON) ${@:.zip=}-${DATE}.test.txt
|
||||
# endif
|
||||
# @rm -f $@
|
||||
# @cd ${dir $@} && ln -s $(notdir ${@:.zip=})-${DATE}.zip ${notdir $@}
|
||||
# @rm -f ${WORKDIR}/decoder.yml ${WORKDIR}/source.* ${WORKDIR}/target.*
|
||||
# @rm -f ${WORKDIR}/preprocess.sh ${WORKDIR}/postprocess.sh
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
## do this only if the flag is set
|
||||
|
@ -247,6 +247,9 @@ BT_MODEL_BASE = ${BT_MODEL}.${MODELTYPE}.model${NR}
|
||||
BT_MODEL_START = ${WORKDIR}/${BT_MODEL_BASE}.npz
|
||||
BT_MODEL_VOCAB = ${WORKDIR}/${BT_MODEL}.vocab.yml
|
||||
|
||||
BT_MARIAN_EARLY_STOPPING = 15
|
||||
|
||||
|
||||
# %-add-backtranslations:
|
||||
%-bt:
|
||||
ifneq (${wildcard ${MODEL_FINAL}},)
|
||||
@ -256,8 +259,11 @@ ifeq (${wildcard ${BT_MODEL_START}},)
|
||||
endif
|
||||
endif
|
||||
rm -f ${WORKHOME}/${LANGPAIRSTR}/train.submit
|
||||
${MAKE} DATASET=${DATASET}+bt USE_BACKTRANS=1 \
|
||||
MARIAN_EARLY_STOPPING=15 \
|
||||
${MAKE} DATASET=${DATASET}+bt \
|
||||
USE_BACKTRANS=1 \
|
||||
CONTINUE_EXISTING=1 \
|
||||
MODELCONFIG=config-bt.mk \
|
||||
MARIAN_EARLY_STOPPING=${BT_MARIAN_EARLY_STOPPING} \
|
||||
${@:-bt=}
|
||||
|
||||
# CLEAN_TRAIN_SRC="${CLEAN_TRAIN_SRC} ${BACKTRANS_SRC}" \
|
||||
@ -280,6 +286,7 @@ endif
|
||||
rm -f ${WORKHOME}/${LANGPAIRSTR}/train.submit
|
||||
${MAKE} DATASET=${DATASET}+pivot \
|
||||
USE_PIVOTING=1 \
|
||||
CONTINUE_EXISTING=1 \
|
||||
MARIAN_EARLY_STOPPING=10 \
|
||||
${@:-pivot=}
|
||||
|
||||
|
@ -1,6 +1,64 @@
|
||||
# -*-makefile-*-
|
||||
|
||||
|
||||
|
||||
FIU2XXX = $(wildcard models-tatoeba/fiu-???)
|
||||
XXX2FIU = $(wildcard models-tatoeba/???-fiu)
|
||||
|
||||
|
||||
# Print LaTeX result tables for all fiu->xxx multilingual Tatoeba models.
# For every model directory matched by ${FIU2XXX}, take the last section of
# its README.md, keep the Tatoeba-test score lines for test sets with at
# least 100 sentences, and wrap them into a LaTeX table skeleton on stdout.
fiu2xxx-print-results:
	@for dir in ${FIU2XXX}; do \
	  src='fiu';\
	  trg=`echo $$dir | cut -f3 -d'-'`;\
	  echo '\begin{table}[]'; \
	  echo '\centering'; \
	  echo '\begin{tabular}{|c|cc|}'; \
	  echo '\hline'; \
	  echo "$$src-$$trg & chr-F2 & BLEU \\\\"; \
	  echo '\hline'; \
	  cat $$dir/README.md |\
	  tr "\n#" "~\n" | tail -1 | tr '~' "\n" |\
	  grep 'Tatoeba-test' | \
	  sed 's/Tatoeba-test\.//' |\
	  perl -e 'while (<>){@a=split(/\s*\|\s*/);print if ($$a[4]>=100);}' |\
	  cut -f2-4 -d'|' | tr '|' '&' | sed 's/$$/\\\\/'; \
	  echo '\end{tabular}'; \
	  echo -n '\caption{Results from the multilingual translation model between Finno-Ugric languages and '; \
	  iso639 $$trg | tr '"' ' '; \
	  echo 'measured on the Tatoeba test set.}'; \
	  echo '\label{tab:my_label}'; \
	  echo '\end{table}'; \
	  echo ""; \
	done
|
||||
|
||||
|
||||
# Print LaTeX result tables for all xxx->fiu multilingual Tatoeba models.
# Mirror image of fiu2xxx-print-results: iterates over ${XXX2FIU} model
# directories, extracts Tatoeba-test scores (test sets with >= 100
# sentences) from the last README.md section, and emits LaTeX tables.
xxx2fiu-print-results:
	@for dir in ${XXX2FIU}; do \
	  trg='fiu';\
	  src=`echo $$dir | cut -f2 -d'/' | cut -f1 -d'-'`;\
	  echo '\begin{table}[]'; \
	  echo '\centering'; \
	  echo '\begin{tabular}{|c|cc|}'; \
	  echo '\hline'; \
	  echo "$$src-$$trg & chr-F2 & BLEU \\\\"; \
	  echo '\hline'; \
	  cat $$dir/README.md |\
	  tr "\n#" "~\n" | tail -1 | tr '~' "\n" |\
	  grep 'Tatoeba-test' | \
	  sed 's/Tatoeba-test\.//' |\
	  perl -e 'while (<>){@a=split(/\s*\|\s*/);print if ($$a[4]>=100);}' |\
	  cut -f2-4 -d'|' | tr '|' '&' | sed 's/$$/\\\\/'; \
	  echo '\end{tabular}'; \
	  echo -n '\caption{Results from the multilingual translation model between '; \
	  iso639 $$src | tr '"' ' '; \
	  echo 'and Finno-Ugric languages measured on the Tatoeba test set.}'; \
	  echo '\label{tab:my_label}'; \
	  echo '\end{table}'; \
	  echo ""; \
	done
|
||||
|
||||
|
||||
|
||||
# FIU_DATASIZE = -1m
|
||||
|
||||
# Meta-target: run both cross-group Tatoeba trainings involving the fiu
# (Finno-Ugric) language group — other-group->fiu and fiu->other-group.
# NOTE(review): the two prerequisite recipes are defined elsewhere; confirm
# they exist in the included lib/*.mk files.
train-tatoeba-crossfiu: train-tatoeba-group2fiu train-tatoeba-fiu2group
|
||||
|
File diff suppressed because it is too large
Load Diff
87
scripts/pivot-bt.pl
Normal file
87
scripts/pivot-bt.pl
Normal file
@ -0,0 +1,87 @@
|
||||
#!/usr/bin/env perl
#
# pivot-bt.pl - pair up two bitexts through a shared pivot language.
#
# Pass 1 reads parallel (source, source-pivot) file pairs and indexes each
# source sentence under a normalised key of its pivot translation.
# Pass 2 reads parallel (target-pivot, target) file pairs and, whenever a
# pivot sentence matches an indexed key, prints "source<TAB>target" to
# stdout. All input files are gzip-compressed, one sentence per line.
#
# Usage:
#   pivot-bt.pl -s src.gz ... -p1 srcpivot.gz ... \
#               -p2 trgpivot.gz ... -t trg.gz ...  > bitext.tsv
#
# Options: -a keep alphabetic chars only, -w keep word chars only (default),
#          -l lower-case keys (default), -v verbose matching on stderr.

use strict;
use warnings;
use open qw/:std :utf8/;
use Getopt::Long;

# key-normalisation switches (see make_key at the bottom)
my $AlphaOnly = 0;
my $WordOnly  = 1;
my $LowerCase = 1;
my $verbose   = 0;

my @SrcFiles      = ();
my @SrcPivotFiles = ();
my @TrgPivotFiles = ();
my @TrgFiles      = ();

GetOptions(
    "srcfiles|s=s{,}"       => \@SrcFiles,
    "srcpivotfiles|p1=s{,}" => \@SrcPivotFiles,
    "trgpivotfiles|p2=s{,}" => \@TrgPivotFiles,
    "trgfiles|t=s{,}"       => \@TrgFiles,
    "alpha|a"               => \$AlphaOnly,
    "word|w"                => \$WordOnly,
    "lower-case|l"          => \$LowerCase,
    "verbose|v"             => \$verbose );


## normalised pivot sentence -> source sentence
## (later files overwrite earlier entries for the same key)
my %pivot2src = ();

## pass 1: index source sentences by their pivot translation
while (@SrcFiles){
    my $srcfile  = shift(@SrcFiles);
    my $srcpivot = shift(@SrcPivotFiles);
    defined $srcpivot
	or die "no pivot file given for source file $srcfile\n";

    print STDERR "read $srcfile $srcpivot ...\n";

    ## list-form open avoids passing file names through a shell;
    ## the original  open S,"... |" || die ...  never died because
    ## high-precedence || bound to the (always true) command string
    open my $sfh, '-|', 'gzip', '-cd', $srcfile
	or die "cannot read from $srcfile: $!";
    open my $pfh, '-|', 'gzip', '-cd', $srcpivot
	or die "cannot read from $srcpivot: $!";

    while (my $src = <$sfh>){
	chomp $src;
	my $pivot = <$pfh>;
	last unless (defined $pivot);   # pivot file ran out of lines
	chomp $pivot;
	my $key = make_key($pivot);
	$pivot2src{$key} = $src if ($key);
    }
    close $sfh;
    close $pfh;
}


## pass 2: stream target sentences, print pairs for matching pivot keys
while (@TrgFiles){
    my $trgfile  = shift(@TrgFiles);
    my $trgpivot = shift(@TrgPivotFiles);
    defined $trgpivot
	or die "no pivot file given for target file $trgfile\n";

    print STDERR "checking $trgfile $trgpivot ...\n";

    open my $pfh, '-|', 'gzip', '-cd', $trgpivot
	or die "cannot read from $trgpivot: $!";
    open my $tfh, '-|', 'gzip', '-cd', $trgfile
	or die "cannot read from $trgfile: $!";

    while (my $pivot = <$pfh>){
	chomp $pivot;
	my $trg = <$tfh>;
	last unless (defined $trg);     # target file ran out of lines
	chomp $trg;
	my $key = make_key($pivot);
	next unless ($key);
	if (exists $pivot2src{$key}){
	    print $pivot2src{$key},"\t",$trg,"\n";
	    print STDERR "matching key '$key'\n" if ($verbose);
	}
    }
    close $pfh;
    close $tfh;
}


## normalise a sentence into a lookup key according to the global
## $AlphaOnly / $WordOnly / $LowerCase switches; an empty result is
## treated by the callers as "no usable key"
sub make_key{
    my $string = shift;
    return '' unless (defined $string);
    if ($AlphaOnly){
	$string =~ s/\P{IsAlpha}//gs;
    }
    if ($WordOnly){
	$string =~ s/\P{IsWord}//gs;
    }
    if ($LowerCase){
	$string = lc($string);
    }
    return $string;
}
|
1
testsets/en-de/newstest2014-deen.deu.gz
Symbolic link
1
testsets/en-de/newstest2014-deen.deu.gz
Symbolic link
@ -0,0 +1 @@
|
||||
newstest2014-deen.de.gz
|
1
testsets/en-de/newstest2014-deen.eng.gz
Symbolic link
1
testsets/en-de/newstest2014-deen.eng.gz
Symbolic link
@ -0,0 +1 @@
|
||||
newstest2014-deen.en.gz
|
Loading…
Reference in New Issue
Block a user