From 596cae8922e92b6e798330ab089064d6f5af94b9 Mon Sep 17 00:00:00 2001 From: Joerg Tiedemann Date: Sat, 18 Jan 2020 20:37:01 +0200 Subject: [PATCH] train with backtranslations --- Makefile | 218 ++------------------------------- Makefile.config | 5 + Makefile.data | 4 + Makefile.dist | 6 - Makefile.doclevel | 43 +++++++ Makefile.generic | 174 ++++++++++++++++++++++++++ html/index.php | 111 +++++++++++++++++ preprocess-bpe-multi-target.sh | 0 preprocess-spm-multi-target.sh | 0 work-spm/eval/scores.txt | 105 +++++++++++++++- 10 files changed, 448 insertions(+), 218 deletions(-) create mode 100644 Makefile.generic create mode 100644 html/index.php mode change 100644 => 100755 preprocess-bpe-multi-target.sh mode change 100644 => 100755 preprocess-spm-multi-target.sh diff --git a/Makefile b/Makefile index 7a5ea054..c11d92ff 100644 --- a/Makefile +++ b/Makefile @@ -91,6 +91,7 @@ include Makefile.dist include Makefile.tasks include Makefile.data include Makefile.doclevel +include Makefile.generic include Makefile.slurm @@ -133,17 +134,6 @@ translate-ensemble: ${WORKDIR}/${TESTSET}.${MODEL}${NR}.${MODELTYPE}.ensemble.${ eval-ensemble: ${WORKDIR}/${TESTSET}.${MODEL}${NR}.${MODELTYPE}.ensemble.${SRC}.${TRG}.eval -## resume training on an existing model -resume: - if [ -e ${WORKDIR}/${MODEL}.${MODELTYPE}.model${NR}.npz.best-perplexity.npz ]; then \ - cp ${WORKDIR}/${MODEL}.${MODELTYPE}.model${NR}.npz.best-perplexity.npz \ - ${WORKDIR}/${MODEL}.${MODELTYPE}.model${NR}.npz; \ - fi - sleep 1 - rm -f ${WORKDIR}/${MODEL}.${MODELTYPE}.model${NR}.done - ${MAKE} train - - #------------------------------------------------------------------------ # translate and evaluate all test sets in testsets/ #------------------------------------------------------------------------ @@ -152,7 +142,6 @@ resume: ## and all trokenized test sets that can be found in that directory TESTSET_HOME = ${PWD}/testsets TESTSET_DIR = ${TESTSET_HOME}/${SRC}-${TRG} -# TESTSETS = $(patsubst ${TESTSET_DIR}/%.${SRC}.tok.gz,%,${wildcard ${TESTSET_DIR}/*.${SRC}.tok.gz}) TESTSETS = $(patsubst ${TESTSET_DIR}/%.${SRC}.gz,%,${wildcard ${TESTSET_DIR}/*.${SRC}.gz}) TESTSETS_PRESRC = $(patsubst %.gz,%.${PRE}.gz,${sort $(subst .${PRE},,${wildcard ${TESTSET_DIR}/*.${SRC}.gz})}) TESTSETS_PRETRG = $(patsubst %.gz,%.${PRE}.gz,${sort $(subst .${PRE},,${wildcard ${TESTSET_DIR}/*.${TRG}.gz})}) @@ -190,197 +179,17 @@ finished: fi -## extension -all: run something over all language pairs, e.g. 
-## make wordalign-all -## this goes sequentially over all language pairs -## for the parallelizable version of this: look at %-all-parallel -%-all: - for l in ${ALL_LANG_PAIRS}; do \ - ${MAKE} SRCLANGS="`echo $$l | cut -f1 -d'-' | sed 's/\\+/ /g'`" \ - TRGLANGS="`echo $$l | cut -f2 -d'-' | sed 's/\\+/ /g'`" ${@:-all=}; \ - done - -# run something over all language pairs that have trained models -## - make eval-allmodels -## - make dist-allmodels -%-allmodels: - for l in ${ALL_LANG_PAIRS}; do \ - if [ `find ${WORKHOME}/$$l -name '*.${PRE_SRC}-${PRE_TRG}.*.npz' | wc -l` -gt 0 ]; then \ - ${MAKE} SRCLANGS="`echo $$l | cut -f1 -d'-' | sed 's/\\+/ /g'`" \ - TRGLANGS="`echo $$l | cut -f2 -d'-' | sed 's/\\+/ /g'`" ${@:-allmodels=}; \ - fi \ - done - -## only bilingual models -%-allbilingual: - for l in ${ALL_BILINGUAL_MODELS}; do \ - if [ `find ${WORKHOME}/$$l -name '*.${PRE_SRC}-${PRE_TRG}.*.npz' | wc -l` -gt 0 ]; then \ - ${MAKE} SRCLANGS="`echo $$l | cut -f1 -d'-' | sed 's/\\+/ /g'`" \ - TRGLANGS="`echo $$l | cut -f2 -d'-' | sed 's/\\+/ /g'`" ${@:-allbilingual=}; \ - fi \ - done - -## only bilingual models -%-allmultilingual: - for l in ${ALL_MULTILINGUAL_MODELS}; do \ - if [ `find ${WORKHOME}/$$l -name '*.${PRE_SRC}-${PRE_TRG}.*.npz' | wc -l` -gt 0 ]; then \ - ${MAKE} SRCLANGS="`echo $$l | cut -f1 -d'-' | sed 's/\\+/ /g'`" \ - TRGLANGS="`echo $$l | cut -f2 -d'-' | sed 's/\\+/ /g'`" ${@:-allmultilingual=}; \ - fi \ - done - - -## run something over all language pairs but make it possible to do it in parallel, for example -## - make dist-all-parallel -%-all-parallel: - ${MAKE} $(subst -all-parallel,,${patsubst %,$@__%-run-for-langpair,${ALL_LANG_PAIRS}}) - -## run a command that includes the langpair, for example -## make wordalign__en-da+sv-run-for-langpair ...... runs wordalign with SRCLANGS="en" TRGLANGS="da sv" -## What is this good for? -## ---> can run many lang-pairs in parallel instead of having a for loop and run sequencetially -%-run-for-langpair: - ${MAKE} SRCLANGS='$(subst +, ,$(firstword $(subst -, ,${lastword ${subst __, ,${@:-run-for-langpair=}}})))' \ - TRGLANGS='$(subst +, ,$(lastword $(subst -, ,${lastword ${subst __, ,${@:-run-for-langpair=}}})))' \ - ${shell echo $@ | sed 's/__.*$$//'} - - -## right-to-left model -%-RL: - ${MAKE} MODEL=${MODEL}-RL \ - MARIAN_EXTRA="${MARIAN_EXTRA} --right-left" \ - ${@:-RL=} - - -## run a multigpu job (2 or 4 GPUs) - -%-multigpu %-0123: - ${MAKE} NR_GPUS=4 MARIAN_GPUS='0 1 2 3' $(subst -gpu0123,,${@:-multigpu=}) - -%-twogpu %-gpu01: - ${MAKE} NR_GPUS=2 MARIAN_GPUS='0 1' $(subst -gpu01,,${@:-twogpu=}) - -%-gpu23: - ${MAKE} NR_GPUS=2 MARIAN_GPUS='2 3' ${@:-gpu23=} - - -## run on CPUs (translate-cpu, eval-cpu, translate-ensemble-cpu, ...) 
-%-cpu: - ${MAKE} MARIAN=${MARIANCPU} \ - LOADMODS='${LOADCPU}' \ - MARIAN_DECODER_FLAGS="${MARIAN_DECODER_CPU}" \ - ${@:-cpu=} - - -## document level models -%-doc: - ${MAKE} WORKHOME=${shell realpath ${PWD}/work-spm} \ - PRE=norm \ - PRE_SRC=spm${SRCBPESIZE:000=}k.doc${CONTEXT_SIZE} \ - PRE_TRG=spm${TRGBPESIZE:000=}k.doc${CONTEXT_SIZE} \ - ${@:-doc=} - - -## sentence-piece models -%-spm: - ${MAKE} WORKHOME=${shell realpath ${PWD}/work-spm} \ - PRE=norm \ - PRE_SRC=spm${SRCBPESIZE:000=}k \ - PRE_TRG=spm${TRGBPESIZE:000=}k \ - ${@:-spm=} - -%-spm-noalign: - ${MAKE} WORKHOME=${shell realpath ${PWD}/work-spm-noalign} \ - MODELTYPE=transformer \ - PRE=norm \ - PRE_SRC=spm${SRCBPESIZE:000=}k \ - PRE_TRG=spm${TRGBPESIZE:000=}k \ - ${@:-spm-noalign=} - - - -## BPE models -%-bpe: - ${MAKE} WORKHOME=${shell realpath ${PWD}/work-bpe} \ - PRE=tok \ - MODELTYPE=transformer \ - PRE_SRC=bpe${SRCBPESIZE:000=}k \ - PRE_TRG=bpe${TRGBPESIZE:000=}k \ - ${@:-bpe=} - -%-bpe-align: - ${MAKE} WORKHOME=${shell realpath ${PWD}/work-bpe-align} \ - PRE=tok \ - PRE_SRC=bpe${SRCBPESIZE:000=}k \ - PRE_TRG=bpe${TRGBPESIZE:000=}k \ - ${@:-bpe-align=} - -%-bpe-memad: - ${MAKE} WORKHOME=${shell realpath ${PWD}/work-bpe-memad} \ - PRE=tok \ - MODELTYPE=transformer \ - PRE_SRC=bpe${SRCBPESIZE:000=}k \ - PRE_TRG=bpe${TRGBPESIZE:000=}k \ - ${@:-bpe-memad=} - -%-bpe-old: - ${MAKE} WORKHOME=${shell realpath ${PWD}/work-bpe-old} \ - PRE=tok \ - MODELTYPE=transformer \ - PRE_SRC=bpe${SRCBPESIZE:000=}k \ - PRE_TRG=bpe${TRGBPESIZE:000=}k \ - ${@:-bpe-old=} - - -## for the inbuilt sentence-piece segmentation: -# PRE_SRC=txt PRE_TRG=txt -# MARIAN=${MARIAN}-spm -# MODEL_VOCABTYPE=spm - - - - - - - - -## continue document-level training with a new context size - -ifndef NEW_CONTEXT - NEW_CONTEXT = $$(($(CONTEXT_SIZE) + $(CONTEXT_SIZE))) -endif - -continue-doctrain: - mkdir -p ${WORKDIR}/${MODEL} - cp ${MODEL_VOCAB} ${WORKDIR}/${MODEL}/$(subst .doc${CONTEXT_SIZE},.doc${NEW_CONTEXT},${notdir ${MODEL_VOCAB}}) - cp ${MODEL_FINAL} ${WORKDIR}/${MODEL}/$(subst .doc${CONTEXT_SIZE},.doc${NEW_CONTEXT},$(notdir ${MODEL_BASENAME})).npz - ${MAKE} MODEL_SUBDIR=${MODEL}/ CONTEXT_SIZE=$(NEW_CONTEXT) train-doc - - - - -## continue training with a new dataset - -ifndef NEW_DATASET - NEW_DATASET = OpenSubtitles -endif - -continue-datatrain: - mkdir -p ${WORKDIR}/${MODEL} - cp ${MODEL_VOCAB} ${WORKDIR}/${MODEL}/$(patsubst ${DATASET}%,${NEW_DATASET}%,${notdir ${MODEL_VOCAB}}) - cp ${MODEL_FINAL} ${WORKDIR}/${MODEL}/$(patsubst ${DATASET}%,${NEW_DATASET}%,${MODEL_BASENAME}).npz - if [ -e ${BPESRCMODEL} ]; then \ - cp ${BPESRCMODEL} $(patsubst ${WORKDIR}/train/${DATASET}%,${WORKDIR}/train/${NEW_DATASET}%,${BPESRCMODEL}); \ - cp ${BPETRGMODEL} $(patsubst ${WORKDIR}/train/${DATASET}%,${WORKDIR}/train/${NEW_DATASET}%,${BPETRGMODEL}); \ +## resume training on an existing model +resume: + if [ -e ${WORKDIR}/${MODEL}.${MODELTYPE}.model${NR}.npz.best-perplexity.npz ]; then \ + cp ${WORKDIR}/${MODEL}.${MODELTYPE}.model${NR}.npz.best-perplexity.npz \ + ${WORKDIR}/${MODEL}.${MODELTYPE}.model${NR}.npz; \ fi - if [ -e ${SPMSRCMODEL} ]; then \ - cp ${SPMSRCMODEL} $(patsubst ${WORKDIR}/train/${DATASET}%,${WORKDIR}/train/${NEW_DATASET}%,${SPMSRCMODEL}); \ - cp ${SPMTRGMODEL} $(patsubst ${WORKDIR}/train/${DATASET}%,${WORKDIR}/train/${NEW_DATASET}%,${SPMTRGMODEL}); \ - fi - ${MAKE} MODEL_SUBDIR=${MODEL}/ DATASET=$(NEW_DATASET) train + sleep 1 + rm -f ${WORKDIR}/${MODEL}.${MODELTYPE}.model${NR}.done + ${MAKE} train -# MARIAN_EXTRA="${MARIAN_EXTRA} --no-restore-corpus" @@ 
-548,13 +357,6 @@ endif rm -f $@.input $@.output -# %.eval: % ${TEST_TRG} -# grep . ${TEST_TRG} > $@.ref -# grep . $< > $@.sys -# cat $@.sys | sacrebleu $@.ref > $@ -# cat $@.sys | sacrebleu --metrics=chrf --width=3 $@.ref >> $@ -# rm -f $@.ref $@.sys - %.eval: % ${TEST_TRG} paste ${TEST_SRC}.${PRE_SRC} ${TEST_TRG} | grep $$'.\t' | cut -f2 > $@.ref @@ -575,5 +377,3 @@ endif -e 's/&/&/g' |\ sed 'n;n;G;' > $@ rm -f $@.1 $@.2 $@.3 - -# paste -d "\n" ${TEST_SRC} ${TEST_TRG} ${<:.eval=} |\ diff --git a/Makefile.config b/Makefile.config index 9857f584..74a9a383 100644 --- a/Makefile.config +++ b/Makefile.config @@ -215,6 +215,11 @@ MODEL_VOCABTYPE = yml MODEL_VOCAB = ${WORKDIR}/${MODEL}.vocab.${MODEL_VOCABTYPE} MODEL_DECODER = ${MODEL_FINAL}.decoder.yml +## OPUS model (in case we want to continue training with other data) +OPUSMODEL = ${MODEL_SUBDIR}opus${TRAINSIZE}.${PRE_SRC}-${PRE_TRG} +OPUSMODEL_BASE = ${OPUSMODEL}.${MODELTYPE}.model${NR} +OPUSMODEL_FINAL = ${WORKDIR}/${OPUSMODEL_BASE}.npz.best-perplexity.npz + ## test set translation and scores diff --git a/Makefile.data b/Makefile.data index f8020bba..ac944d33 100644 --- a/Makefile.data +++ b/Makefile.data @@ -31,6 +31,10 @@ DATA_TRG := ${sort ${CLEAN_TRAIN_TRG} ${CLEAN_TUNE_TRG} ${CLEAN_DEV_TRG} ${CLEAN +BACKTRANS_DIR = backtranslate/${TRG}-${SRC} + +BACKTRANS_SRC = ${sort ${wildcard ${BACKTRANS_DIR}/*.${SRC}.gz}} +BACKTRANS_TRG = ${patsubst %.${SRC}.gz,%.${TRG}.gz,${BACKTRANS_SRC}} ## make data in reverse direction without re-doing word alignment etc ... ## ---> this is dangerous when things run in parallel diff --git a/Makefile.dist b/Makefile.dist index 7bc77781..8c89453f 100644 --- a/Makefile.dist +++ b/Makefile.dist @@ -125,12 +125,6 @@ endif endif -ttt: - @echo ${PREPROCESS_SRCMODEL} - @echo ${PREPROCESS_TRGMODEL} - @echo ${PREPROCESS_SCRIPT} - @echo ${POSTPROCESS_SCRIPT} - ${DIST_PACKAGE}: ${MODEL_FINAL} ifneq (${SKIP_DIST_EVAL},1) diff --git a/Makefile.doclevel b/Makefile.doclevel index 376ec628..dffeca55 100644 --- a/Makefile.doclevel +++ b/Makefile.doclevel @@ -4,6 +4,49 @@ DOCLEVEL_BENCHMARK_DATA = https://zenodo.org/record/3525366/files/doclevel-MT-benchmark-discomt2019.zip + +## continue document-level training with a new context size + +ifndef NEW_CONTEXT + NEW_CONTEXT = $$(($(CONTEXT_SIZE) + $(CONTEXT_SIZE))) +endif + +continue-doctrain: + mkdir -p ${WORKDIR}/${MODEL} + cp ${MODEL_VOCAB} ${WORKDIR}/${MODEL}/$(subst .doc${CONTEXT_SIZE},.doc${NEW_CONTEXT},${notdir ${MODEL_VOCAB}}) + cp ${MODEL_FINAL} ${WORKDIR}/${MODEL}/$(subst .doc${CONTEXT_SIZE},.doc${NEW_CONTEXT},$(notdir ${MODEL_BASENAME})).npz + ${MAKE} MODEL_SUBDIR=${MODEL}/ CONTEXT_SIZE=$(NEW_CONTEXT) train-doc + + + + +## continue training with a new dataset + +ifndef NEW_DATASET + NEW_DATASET = OpenSubtitles +endif + +continue-datatrain: + mkdir -p ${WORKDIR}/${MODEL} + cp ${MODEL_VOCAB} ${WORKDIR}/${MODEL}/$(patsubst ${DATASET}%,${NEW_DATASET}%,${notdir ${MODEL_VOCAB}}) + cp ${MODEL_FINAL} ${WORKDIR}/${MODEL}/$(patsubst ${DATASET}%,${NEW_DATASET}%,${MODEL_BASENAME}).npz + if [ -e ${BPESRCMODEL} ]; then \ + cp ${BPESRCMODEL} $(patsubst ${WORKDIR}/train/${DATASET}%,${WORKDIR}/train/${NEW_DATASET}%,${BPESRCMODEL}); \ + cp ${BPETRGMODEL} $(patsubst ${WORKDIR}/train/${DATASET}%,${WORKDIR}/train/${NEW_DATASET}%,${BPETRGMODEL}); \ + fi + if [ -e ${SPMSRCMODEL} ]; then \ + cp ${SPMSRCMODEL} $(patsubst ${WORKDIR}/train/${DATASET}%,${WORKDIR}/train/${NEW_DATASET}%,${SPMSRCMODEL}); \ + cp ${SPMTRGMODEL} $(patsubst 
${WORKDIR}/train/${DATASET}%,${WORKDIR}/train/${NEW_DATASET}%,${SPMTRGMODEL}); \
+	fi
+	${MAKE} MODEL_SUBDIR=${MODEL}/ DATASET=$(NEW_DATASET) train
+
+
+# MARIAN_EXTRA="${MARIAN_EXTRA} --no-restore-corpus"
+
+
+
+
 ## use the doclevel benchmark data sets
 %-ost:
 	${MAKE} ost-datasets
diff --git a/Makefile.generic b/Makefile.generic
new file mode 100644
index 00000000..29f1c19e
--- /dev/null
+++ b/Makefile.generic
@@ -0,0 +1,174 @@
+# -*-makefile-*-
+#
+# generic implicit targets that make our life a bit easier
+
+
+
+
+## extension -all: run something over all language pairs, e.g.
+## make wordalign-all
+## this goes sequentially over all language pairs
+## for the parallelizable version of this: look at %-all-parallel
+%-all:
+	for l in ${ALL_LANG_PAIRS}; do \
+	  ${MAKE} SRCLANGS="`echo $$l | cut -f1 -d'-' | sed 's/\\+/ /g'`" \
+		  TRGLANGS="`echo $$l | cut -f2 -d'-' | sed 's/\\+/ /g'`" ${@:-all=}; \
+	done
+
+# run something over all language pairs that have trained models
+## - make eval-allmodels
+## - make dist-allmodels
+%-allmodels:
+	for l in ${ALL_LANG_PAIRS}; do \
+	  if [ `find ${WORKHOME}/$$l -name '*.${PRE_SRC}-${PRE_TRG}.*.npz' | wc -l` -gt 0 ]; then \
+	    ${MAKE} SRCLANGS="`echo $$l | cut -f1 -d'-' | sed 's/\\+/ /g'`" \
+		    TRGLANGS="`echo $$l | cut -f2 -d'-' | sed 's/\\+/ /g'`" ${@:-allmodels=}; \
+	  fi \
+	done
+
+## only bilingual models
+%-allbilingual:
+	for l in ${ALL_BILINGUAL_MODELS}; do \
+	  if [ `find ${WORKHOME}/$$l -name '*.${PRE_SRC}-${PRE_TRG}.*.npz' | wc -l` -gt 0 ]; then \
+	    ${MAKE} SRCLANGS="`echo $$l | cut -f1 -d'-' | sed 's/\\+/ /g'`" \
+		    TRGLANGS="`echo $$l | cut -f2 -d'-' | sed 's/\\+/ /g'`" ${@:-allbilingual=}; \
+	  fi \
+	done
+
+## only multilingual models
+%-allmultilingual:
+	for l in ${ALL_MULTILINGUAL_MODELS}; do \
+	  if [ `find ${WORKHOME}/$$l -name '*.${PRE_SRC}-${PRE_TRG}.*.npz' | wc -l` -gt 0 ]; then \
+	    ${MAKE} SRCLANGS="`echo $$l | cut -f1 -d'-' | sed 's/\\+/ /g'`" \
+		    TRGLANGS="`echo $$l | cut -f2 -d'-' | sed 's/\\+/ /g'`" ${@:-allmultilingual=}; \
+	  fi \
+	done
+
+
+## run something over all language pairs but make it possible to do it in parallel, for example
+## - make dist-all-parallel
+%-all-parallel:
+	${MAKE} $(subst -all-parallel,,${patsubst %,$@__%-run-for-langpair,${ALL_LANG_PAIRS}})
+
+## run a command that includes the langpair, for example
+## make wordalign__en-da+sv-run-for-langpair ...... runs wordalign with SRCLANGS="en" TRGLANGS="da sv"
+## What is this good for?
+## ---> can run many lang-pairs in parallel instead of having a for loop and running sequentially
+%-run-for-langpair:
+	${MAKE} SRCLANGS='$(subst +, ,$(firstword $(subst -, ,${lastword ${subst __, ,${@:-run-for-langpair=}}})))' \
+		TRGLANGS='$(subst +, ,$(lastword $(subst -, ,${lastword ${subst __, ,${@:-run-for-langpair=}}})))' \
+	${shell echo $@ | sed 's/__.*$$//'}
+
+
+## right-to-left model
+%-RL:
+	${MAKE} MODEL=${MODEL}-RL \
+		MARIAN_EXTRA="${MARIAN_EXTRA} --right-left" \
+	${@:-RL=}
+
+
+
+## include all backtranslation data as well in training
+## start from the pre-trained opus model if it exists
+
+%-add-backtranslations:
+ifneq (${wildcard ${OPUSMODEL_FINAL}},)
+	cp ${OPUSMODEL_FINAL} ${MODEL_BASENAME}.gz
+endif
+	${MAKE} DATASET=opus+bt \
+		CLEAN_TRAIN_SRC="${CLEAN_TRAIN_SRC} ${BACKTRANS_SRC}" \
+		CLEAN_TRAIN_TRG="${CLEAN_TRAIN_TRG} ${BACKTRANS_TRG}" \
+	${@:-add-backtranslations=}
+
+
+
+
+
+## run a multigpu job (2 or 4 GPUs)
+
+%-multigpu %-0123:
+	${MAKE} NR_GPUS=4 MARIAN_GPUS='0 1 2 3' $(subst -gpu0123,,${@:-multigpu=})
+
+%-twogpu %-gpu01:
+	${MAKE} NR_GPUS=2 MARIAN_GPUS='0 1' $(subst -gpu01,,${@:-twogpu=})
+
+%-gpu23:
+	${MAKE} NR_GPUS=2 MARIAN_GPUS='2 3' ${@:-gpu23=}
+
+
+## run on CPUs (translate-cpu, eval-cpu, translate-ensemble-cpu, ...)
+%-cpu:
+	${MAKE} MARIAN=${MARIANCPU} \
+		LOADMODS='${LOADCPU}' \
+		MARIAN_DECODER_FLAGS="${MARIAN_DECODER_CPU}" \
+	${@:-cpu=}
+
+
+## document level models
+%-doc:
+	${MAKE} WORKHOME=${shell realpath ${PWD}/work-spm} \
+		PRE=norm \
+		PRE_SRC=spm${SRCBPESIZE:000=}k.doc${CONTEXT_SIZE} \
+		PRE_TRG=spm${TRGBPESIZE:000=}k.doc${CONTEXT_SIZE} \
+	${@:-doc=}
+
+
+## sentence-piece models
+%-spm:
+	${MAKE} WORKHOME=${shell realpath ${PWD}/work-spm} \
+		PRE=norm \
+		PRE_SRC=spm${SRCBPESIZE:000=}k \
+		PRE_TRG=spm${TRGBPESIZE:000=}k \
+	${@:-spm=}
+
+%-spm-noalign:
+	${MAKE} WORKHOME=${shell realpath ${PWD}/work-spm-noalign} \
+		MODELTYPE=transformer \
+		PRE=norm \
+		PRE_SRC=spm${SRCBPESIZE:000=}k \
+		PRE_TRG=spm${TRGBPESIZE:000=}k \
+	${@:-spm-noalign=}
+
+
+
+## BPE models
+%-bpe:
+	${MAKE} WORKHOME=${shell realpath ${PWD}/work-bpe} \
+		PRE=tok \
+		MODELTYPE=transformer \
+		PRE_SRC=bpe${SRCBPESIZE:000=}k \
+		PRE_TRG=bpe${TRGBPESIZE:000=}k \
+	${@:-bpe=}
+
+%-bpe-align:
+	${MAKE} WORKHOME=${shell realpath ${PWD}/work-bpe-align} \
+		PRE=tok \
+		PRE_SRC=bpe${SRCBPESIZE:000=}k \
+		PRE_TRG=bpe${TRGBPESIZE:000=}k \
+	${@:-bpe-align=}
+
+%-bpe-memad:
+	${MAKE} WORKHOME=${shell realpath ${PWD}/work-bpe-memad} \
+		PRE=tok \
+		MODELTYPE=transformer \
+		PRE_SRC=bpe${SRCBPESIZE:000=}k \
+		PRE_TRG=bpe${TRGBPESIZE:000=}k \
+	${@:-bpe-memad=}
+
+%-bpe-old:
+	${MAKE} WORKHOME=${shell realpath ${PWD}/work-bpe-old} \
+		PRE=tok \
+		MODELTYPE=transformer \
+		PRE_SRC=bpe${SRCBPESIZE:000=}k \
+		PRE_TRG=bpe${TRGBPESIZE:000=}k \
+	${@:-bpe-old=}
+
+
+## for the inbuilt sentence-piece segmentation:
+# PRE_SRC=txt PRE_TRG=txt
+# MARIAN=${MARIAN}-spm
+# MODEL_VOCABTYPE=spm
+
+
+
+
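The generic suffix rules above compose by stripping one suffix at a time and re-invoking make, so the new %-add-backtranslations rule can be chained with the existing modifiers. A rough usage sketch (untested; the language pair and the chaining order are illustrative only, not prescribed by the patch):

	## fine-tune a SentencePiece fi->en model on OPUS data plus back-translations
	make SRCLANGS=fi TRGLANGS=en train-spm-add-backtranslations
	make SRCLANGS=fi TRGLANGS=en eval-spm

The first call re-runs "make train" with DATASET=opus+bt, appends ${BACKTRANS_SRC} / ${BACKTRANS_TRG} to the clean training data, and, per the comment in the rule, starts from the pre-trained opus model (${OPUSMODEL_FINAL}) if one exists.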

diff --git a/html/index.php b/html/index.php
new file mode 100644
index 00000000..80fe206b
--- /dev/null
+++ b/html/index.php
@@ -0,0 +1,111 @@
[body of the new html/index.php (111 added lines) not recoverable from this copy of the patch:
 the PHP echo() calls and the HTML they emit were stripped during extraction; the only surviving
 content is the page's section headings "Pre-trained Opus-MT Models", "Multilingual models"
 and "Language pairs"]
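For reference, the BACKTRANS_* variables added to Makefile.data only expect gzipped file pairs with matching basenames under backtranslate/${TRG}-${SRC}. A sketch of the assumed layout for a fi->en training run (directory and file names are illustrative, not taken from the patch):

	backtranslate/en-fi/news2019.fi.gz   # synthetic Finnish, presumably produced with a model for the opposite (en->fi) direction
	backtranslate/en-fi/news2019.en.gz   # original monolingual English

With SRCLANGS=fi TRGLANGS=en these files are picked up as BACKTRANS_SRC and BACKTRANS_TRG and appended to CLEAN_TRAIN_SRC / CLEAN_TRAIN_TRG by the %-add-backtranslations rule in Makefile.generic.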