started tutorial and fixes to backtranslate makefile

This commit is contained in:
Jörg Tiedemann 2020-09-05 00:16:22 +03:00
parent d11f74ce41
commit ad828c3124
9 changed files with 146 additions and 51 deletions


@@ -370,8 +370,6 @@ translate-ensemble: ${WORKDIR}/${TESTSET_NAME}.${MODEL}${NR}.${MODELTYPE}.ensemb
eval-ensemble: ${WORKDIR}/${TESTSET_NAME}.${MODEL}${NR}.${MODELTYPE}.ensemble.${SRC}.${TRG}.eval
## combined tasks:


@@ -4,6 +4,9 @@
# only works with sentencepiece models!
#
PWD := ${shell pwd}
TOOLSDIR := ${PWD}/../tools
include ../lib/env.mk
include ../lib/config.mk
include ../lib/slurm.mk
@@ -41,7 +44,7 @@ MODELZIP = ${lastword ${shell ls ${MODELHOME}/*-20*.zip | LANG=en_US.UTF-8 sort
MODELNAME = ${patsubst %.zip,%,${notdir ${MODELZIP}}}
ifeq (${MODELNAME},)
MODELHOME = ../work-langid/models/${LANGPAIR}
MODELHOME = ../${notdir ${WORKHOME}}/models/${LANGPAIR}
# MODELZIP = ${lastword ${sort ${wildcard ${MODELHOME}/*-20*.zip}}}
MODELZIP = ${lastword ${sort ${wildcard ${MODELHOME}/opus-20*.zip}}}
MODELNAME = ${patsubst %.zip,%,${notdir ${MODELZIP}}}
@@ -61,12 +64,15 @@ ifdef LOCAL_SCRATCH
endif
LOAD_MODULES = module use -a /projappl/nlpl/software/modules/etc/ && \
module load nlpl-udpipe nlpl-opus &&
ifeq (${shell hostname --domain 2>/dev/null},bullx)
LOAD_MODULES = module use -a /projappl/nlpl/software/modules/etc/ && \
module load nlpl-udpipe nlpl-opus &&
endif
WIKILANGS = ${sort $(patsubst >%${WIKISOURCE}-,%,${shell grep -o '>..${WIKISOURCE}-' index.html})} \
ifneq (${wildcard index.html},)
WIKILANGS = ${sort $(patsubst >%${WIKISOURCE}-,%,${shell grep -o '>..${WIKISOURCE}-' index.html})} \
${sort $(patsubst >%${WIKISOURCE}-,%,${shell grep -o '>...${WIKISOURCE}-' index.html})}
endif
@@ -93,7 +99,9 @@ PARTS = ${sort ${patsubst ${WIKI_DIR}/${WIKISOURCE}.${LANGID}.%.gz,%,${wildcard
.PRECIOUS: ${WIKI_TRG}
## find wiki downloads
WIKI_JSON = $(shell grep -o '${LANGID}${WIKISOURCE}-[0-9]*-cirrussearch-content.json.gz' index.html | head -1)
ifneq (${wildcard index.html},)
WIKI_JSON = $(shell grep -o '${LANGID}${WIKISOURCE}-[0-9]*-cirrussearch-content.json.gz' index.html | head -1)
endif
## we don't need to keep the json file
.INTERMEDIATE: ${WIKI_JSON} ${WIKI_PRE}
@@ -104,15 +112,16 @@ ifndef UDPIPE_MODELS
UDPIPE_MODELS=/projappl/nlpl/software/modules/udpipe/1.2.1-devel/models
endif
LANGNAME = ${shell ${LOAD_MODULES} opus-iso639 -e ${LANGID} | \
LANGNAME = ${shell ${LOAD_MODULES} ${ISO639} -n ${LANGID} | sed 's/"//g' | \
cut -f1 -d';' | tr ' ' '-' | tr '[:upper:]' '[:lower:]'}
ifeq (${LANGNAME},)
LANGNAME = xx
endif
UDPIPE_MODEL = ${notdir $(shell ${LOAD_MODULES} find ${UDPIPE_MODELS}/ -name "${LANGNAME}*.udpipe" | head -1)}
ifneq (${wildcard ${UDPIPE_MODELS}},)
UDPIPE_MODEL = ${notdir $(shell ${LOAD_MODULES} find ${UDPIPE_MODELS}/ -name "${LANGNAME}*.udpipe" | head -1)}
endif
all: index.html
@@ -120,6 +129,18 @@ all: index.html
${MAKE} ${WIKI_LATEST_SRC}
fetch-wiki fetch:
mkdir -p wiki
wget -O wiki/${SRC}.tar https://object.pouta.csc.fi/OPUS-MT-bt-wiki/${SRC}.tar
tar -C wiki -xf wiki/${SRC}.tar
rm -f wiki/${SRC}.tar
fetch-wikidoc:
mkdir -p wikidoc
wget -O wikidoc/${SRC}.tar https://object.pouta.csc.fi/OPUS-MT-bt-wikidoc/${SRC}.tar
tar -C wikidoc -xf wikidoc/${SRC}.tar
rm -f wikidoc/${SRC}.tar
## tatoeba = tatoeba-monolingual data and tatoeba-models
## TODO: should we loop over all labels?

backtranslate/marian-dev Symbolic link

@@ -0,0 +1 @@
../tools/marian-dev

backtranslate/mosesdecoder Symbolic link

@@ -0,0 +1 @@
../tools/moses-scripts


@@ -1,12 +1,6 @@
# OPUS-MT-train tutorial
This tutorial goes through some common tasks with the example of training models to translate from Breton to English. First of all, clone the repository from github:
```
git clone git@github.com:Helsinki-NLP/OPUS-MT-train.git
cd OPUS-MT-train
```
This tutorial goes through some common tasks with the example of training models to translate from English to Breton. We assume that you have a working setup of all required tools. Check the [installation documentation](../Setup.md) for further information.
## Basic configuration and data sets
@@ -15,23 +9,98 @@ cd OPUS-MT-train
* create a local configuration file with language-specific settings
```
make SRCLANGS=br TRGLANGS=en local-config
make SRCLANGS=en TRGLANGS=br local-config
```
* create data sets, subword segmentation models, word alignments and model vocabulary
* create data sets, subword segmentation models and NMT vocabulary
```
make SRCLANGS=br TRGLANGS=en data
make SRCLANGS=en TRGLANGS=br data
```
This will also download the necessary files if they don't exist on the local file system. It will train SentencePiece models for each language separately and apply them to all data sets. Finally, it also creates the vocabulary file from the segmented training data.
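A quick way to check the outcome is to list the work directory; a rough sketch of what to expect (the exact file names depend on the configuration, compare the test set files shown further below):
```
ls work/en-br/
# e.g. train/ val/ test/ config.mk and a vocabulary file
# such as opus.spm4k-spm4k.vocab.yml
```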
## Train the model
Training the model requires a GPU. Run this directly on a machine with appropriate hardware and CUDA libraries installed, or submit a job to GPU nodes on a cluster.
```
make SRCLANGS=en TRGLANGS=br train
```
Depending on the size of the data, this will take hours, days or weeks to finish. The stopping criterion is set to 10 subsequent validation steps without improvement on the validation data. For en-br this will take 1-2 hours. For submitting jobs, have a look at the [documentation for batch jobs](BatchJobs.md).
Training can always be resumed with the same command as above in case the process crashes for some reason.
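On a SLURM cluster, the same target can be submitted as a batch job; a minimal sketch, assuming the generic `.submit` suffix rule from `lib/slurm.mk` works with your local setup:
```
make SRCLANGS=en TRGLANGS=br train.submit
```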
## Evaluate the model
Evaluation can be done whenever a model from one of the validation steps exists, or with the final model after convergence. Running the translation and evaluation of the given test set is done by calling:
```
make SRCLANGS=en TRGLANGS=br translate
make SRCLANGS=en TRGLANGS=br eval
```
Translation runs, naturally, faster on a GPU but can also be done in reasonable time on CPU cores. You can add cores by setting the `THREADS` variable. Evaluation is done using sacrebleu, and translations as well as BLEU/chrF2 scores are stored in the work directory, in this case in
```
work/en-br/Tatoeba.opus.spm4k-spm4k1.transformer.en.br
work/en-br/Tatoeba.opus.spm4k-spm4k1.transformer.en.br.eval
```
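For CPU translation, adding threads can speed things up considerably; for example (the actual gain depends on your hardware):
```
make SRCLANGS=en TRGLANGS=br THREADS=8 translate
```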
There is also a recipe for merging input, reference translation and system output into one file for better readability:
```
make SRCLANGS=en TRGLANGS=br compare
less work/en-br/Tatoeba.opus.spm4k-spm4k1.transformer.en.br.compare
```
Translation and evaluation files will be overwritten when a new model appears and the evaluation target is called again.
## Generate back-translations
Back-translation requires a model in the opposite direction. The first thing to do is to reverse the data; this can be done without generating them from scratch:
```
make SRCLANGS=en TRGLANGS=br reverse-data
```
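Under the hood this links existing data files into the reversed work directory and copies `config.mk` with SRC and TRG swapped (see the `sed` trick in one of the hunks further below in this commit).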
Now train a new model, this time in the opposite direction:
```
make SRCLANGS=br TRGLANGS=en train
```
After training, we need to create a model package to be used for back-translation. Run this to create a package in `work/models/br-en`:
```
make SRCLANGS=br TRGLANGS=en dist
```
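The back-translation makefile picks up packages matching `opus-20*.zip` (see the `MODELZIP` assignment in the hunk above), so it is worth verifying that such a file exists; the date in the file name below is only an example:
```
ls work/models/br-en/
# opus-2020-09-05.zip
```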
The next step is to fetch some monolingual data to be back-translated. OPUS-MT is prepared to use Wiki data from various Wikimedia wikis (Wikipedia, Wikiquote, Wikisource, Wikibooks, Wikinews). You can fetch the prepared data sets by running:
```
make -C backtranslate SRC=br fetch-wikidoc
```
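This downloads a tar file from the OPUS storage and unpacks it into a `wikidoc/` subdirectory (see the `fetch-wikidoc` recipe in the backtranslate makefile hunk above).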
Finally, we can translate the Breton Wikipedia to English using the br-en model we have trained above (run this on a GPU machine):
```
make -C backtranslate SRC=br translate
```
The translations are most probably really bad as the back-translation model is very poor (around 4 BLEU).
## Generate pivot-based translations
## Re-train using back-translations and pivot translations


@@ -183,8 +183,6 @@ url-exists = ${shell if [ "${call url-status,${1}}" == "HTTP/1.1 200 OK" ]; then
resource-url = ${shell echo "${OPUS_STORE}${3}/${call get-opus-version,${1},${2},${3}}/moses/${1}-${2}.txt.zip"}
## exclude certain data sets
# EXCLUDE_CORPORA ?= WMT-News MPC1 ${call get-elra-bitexts,${SRC},${TRG}}
EXCLUDE_CORPORA ?= WMT-News MPC1


@@ -144,9 +144,15 @@ ifeq (${words ${TRGLANGS}},1)
-if [ -e ${MODEL_VOCAB} ]; then \
ln -s ${MODEL_VOCAB} ${REV_WORKDIR}/${notdir ${MODEL_VOCAB}}; \
fi
##
## this is a bit dangerous: it uses a sed trick to
## swap parameters between SRC and TRG
##
-if [ -e ${WORKDIR}/config.mk ]; then \
if [ ! -e ${REV_WORKDIR}/config.mk ]; then \
cp ${WORKDIR}/config.mk ${REV_WORKDIR}/config.mk; \
cat ${WORKDIR}/config.mk |\
sed -e 's/SRC/TTT/g;s/TRG/SRC/g;s/TTT/TRG/' |\
grep -v LANGPAIRSTR > ${REV_WORKDIR}/config.mk; \
fi \
fi
endif
@@ -155,6 +161,8 @@ endif
clean-data:
for s in ${SRCLANGS}; do \
for t in ${TRGLANGS}; do \


@@ -5,6 +5,7 @@
# - system-specific settings
#
SHELL := /bin/bash
## modules to be loaded in sbatch scripts
@@ -52,6 +53,7 @@ WORKHOME = ${PWD}/work
ifeq (${shell hostname},dx6-ibs-p2)
GPU = pascal
APPLHOME = /opt/tools
WORKHOME = ${shell realpath ${PWD}/work}
# OPUSHOME = tiedeman@taito.csc.fi:/proj/nlpl/data/OPUS/
@@ -61,18 +63,13 @@ ifeq (${shell hostname},dx6-ibs-p2)
# MARIAN = ${APPLHOME}/marian/build
# SUBWORD_HOME = ${APPLHOME}/subword-nmt/subword_nmt
else ifeq (${shell hostname},dx7-nkiel-4gpu)
GPU = pascal
APPLHOME = /opt/tools
WORKHOME = ${shell realpath ${PWD}/work}
MARIAN_BUILD_OPTIONS += -DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-9.2 \
-DPROTOBUF_LIBRARY=${PWD}/tools/protobuf/lib/libprotobuf.so \
-DPROTOBUF_INCLUDE_DIR=${PWD}/tools/protobuf/include/google/protobuf \
-DPROTOBUF_PROTOC_EXECUTABLE=${PWD}/tools/protobuf/bin/protoc
MARIAN_BUILD_OPTIONS += -DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-9.2
# -DPROTOBUF_LIBRARY=/usr/lib/x86_64-linux-gnu/libprotobuf.so.9 \
# -DPROTOBUF_INCLUDE_DIR=/usr/include/google/protobuf \
# -DPROTOBUF_PROTOC_EXECUTABLE=${PWD}/tools/protobuf/src/protoc
# -DPROTOBUF_LIBRARY=${PWD}/tools/protobuf/src/libprotobuf.la \
# -DPROTOBUF_INCLUDE_DIR=${PWD}/tools/protobuf/src/google/ \
# -DPROTOBUF_PROTOC_EXECUTABLE=/usr/bin/protoc
# OPUSHOME = tiedeman@taito.csc.fi:/proj/nlpl/data/OPUS/
# MOSESHOME = ${APPLHOME}/mosesdecoder
# MOSESSCRIPTS = ${MOSESHOME}/scripts
@@ -117,23 +114,24 @@ TMPDIR ?= /tmp
## tools and their locations
SCRIPTDIR ?= ${PWD}/scripts
TOOLSDIR ?= ${PWD}/tools
ISO639 ?= ${shell which iso639 || echo 'perl ${PWD}/tools/LanguageCodes/ISO-639-3/bin/iso639'}
PIGZ ?= ${shell which pigz || echo ${PWD}/tools/pigz/pigz}
TERASHUF ?= ${shell which terashuf || echo ${PWD}/tools/terashuf/terashuf}
JQ ?= ${shell which jq || echo ${PWD}/tools/jq/jq}
PROTOC ?= ${shell which protoc || echo ${PWD}/tools/protobuf/bin/protoc}
MARIAN ?= ${shell which marian || echo ${PWD}/tools/marian-dev/build/marian}
ISO639 ?= ${shell which iso639 || echo 'perl ${TOOLSDIR}/LanguageCodes/ISO-639-3/bin/iso639'}
PIGZ ?= ${shell which pigz || echo ${TOOLSDIR}/pigz/pigz}
TERASHUF ?= ${shell which terashuf || echo ${TOOLSDIR}/terashuf/terashuf}
JQ ?= ${shell which jq || echo ${TOOLSDIR}/jq/jq}
PROTOC ?= ${shell which protoc || echo ${TOOLSDIR}/protobuf/bin/protoc}
MARIAN ?= ${shell which marian || echo ${TOOLSDIR}/marian-dev/build/marian}
MARIAN_HOME ?= $(dir ${MARIAN})
SPM_HOME ?= ${dir ${MARIAN}}
FASTALIGN ?= ${shell which fast_align || echo ${PWD}/tools/fast_align/build/fast_align}
FASTALIGN ?= ${shell which fast_align || echo ${TOOLSDIR}/fast_align/build/fast_align}
FASTALIGN_HOME ?= ${dir ${FASTALIGN}}
ATOOLS ?= ${FASTALIGN_HOME}atools
EFLOMAL ?= ${shell which eflomal || echo ${PWD}/tools/eflomal/eflomal}
EFLOMAL ?= ${shell which eflomal || echo ${TOOLSDIR}/eflomal/eflomal}
EFLOMAL_HOME ?= ${dir ${EFLOMAL}}
WORDALIGN ?= ${EFLOMAL_HOME}align.py
EFLOMAL ?= ${EFLOMAL_HOME}eflomal
MOSESSCRIPTS ?= ${PWD}/tools/moses-scripts/scripts
MOSESSCRIPTS ?= ${TOOLSDIR}/moses-scripts/scripts
## marian-nmt binaries
@@ -149,7 +147,7 @@ TOKENIZER = ${MOSESSCRIPTS}/tokenizer
## BPE
SUBWORD_BPE ?= ${shell which subword-nmt || echo ${PWD}/tools/subword-nmt/subword_nmt/subword_nmt.py}
SUBWORD_BPE ?= ${shell which subword-nmt || echo ${TOOLSDIR}/subword-nmt/subword_nmt/subword_nmt.py}
SUBWORD_HOME ?= ${dir ${SUBWORD_BPE}}
ifeq (${shell which subword-nmt},)
BPE_LEARN ?= python3 ${SUBWORD_HOME}/learn_bpe.py
@@ -209,24 +207,24 @@ install-prerequisites install-prereq install-requirements:
${MAKE} ${PREREQ_TOOLS}
${PWD}/tools/LanguageCodes/ISO-639-3/bin/iso639:
${TOOLSDIR}/LanguageCodes/ISO-639-3/bin/iso639:
${MAKE} tools/LanguageCodes/ISO-639-5/lib/ISO/639/5.pm
${PWD}/tools/LanguageCodes/ISO-639-5/lib/ISO/639/5.pm:
${TOOLSDIR}/LanguageCodes/ISO-639-5/lib/ISO/639/5.pm:
${MAKE} -C tools/LanguageCodes all
${PWD}/tools/fast_align/build/atools:
${TOOLSDIR}/fast_align/build/atools:
mkdir -p ${dir $@}
cd ${dir $@} && cmake ..
${MAKE} -C ${dir $@}
${PWD}/tools/pigz/pigz:
${TOOLSDIR}/pigz/pigz:
${MAKE} -C ${dir $@}
${PWD}/tools/terashuf/terashuf:
${TOOLSDIR}/terashuf/terashuf:
${MAKE} -C ${dir $@}
${PWD}/tools/jq/jq:
${TOOLSDIR}/jq/jq:
cd ${dir $@} && git submodule update --init
cd ${dir $@} && autoreconf -fi
cd ${dir $@} && ./configure --with-oniguruma=builtin
@@ -237,16 +235,16 @@ ${PWD}/tools/jq/jq:
## - install MKL (especially for cpu use):
## file:///opt/intel/documentation_2020/en/mkl/ps2020/get_started.htm
${PWD}/tools/marian-dev/build/marian: ${PROTOC}
${TOOLSDIR}/marian-dev/build/marian: ${PROTOC}
mkdir -p ${dir $@}
cd ${dir $@} && cmake -DUSE_SENTENCEPIECE=on ${MARIAN_BUILD_OPTIONS} ..
${MAKE} -C ${dir $@} -j
${PWD}/tools/protobuf/bin/protoc:
${TOOLSDIR}/protobuf/bin/protoc:
cd tools && git clone https://github.com/protocolbuffers/protobuf.git
cd tools/protobuf && git submodule update --init --recursive
cd tools/protobuf && ./autogen.sh
cd tools/protobuf && ./configure --prefix=${PWD}/tools/protobuf
cd tools/protobuf && ./configure --prefix=${TOOLSDIR}/protobuf
${MAKE} -C tools/protobuf
## for Mac users: use gcc to compile eflomal
@@ -262,7 +260,7 @@ ${PWD}/tools/protobuf/bin/protoc:
## cd tools/efmoral
## sudo env python3 setup.py install
${PWD}/tools/eflomal/eflomal:
${TOOLSDIR}/eflomal/eflomal:
${MAKE} -C ${dir $@} all
cd ${dir $@} && python3 setup.py install
# python3 setup.py install --install-dir ${HOME}/.local


@@ -2,3 +2,4 @@ pycld2
iso-639
opustools
subword-nmt
sacrebleu