mirror of
https://github.com/Helsinki-NLP/OPUS-MT-train.git
synced 2024-10-05 16:47:21 +03:00
started tutorial and fixes to backtranslate makefile
This commit is contained in:
parent
d11f74ce41
commit
ad828c3124
2
Makefile
2
Makefile
@ -370,8 +370,6 @@ translate-ensemble: ${WORKDIR}/${TESTSET_NAME}.${MODEL}${NR}.${MODELTYPE}.ensemb
|
||||
eval-ensemble: ${WORKDIR}/${TESTSET_NAME}.${MODEL}${NR}.${MODELTYPE}.ensemble.${SRC}.${TRG}.eval
|
||||
|
||||
|
||||
|
||||
|
||||
## combined tasks:
|
||||
|
||||
|
||||
|
@ -4,6 +4,9 @@
|
||||
# only works with sentencepiece models!
|
||||
#
|
||||
|
||||
PWD := ${shell pwd}
|
||||
TOOLSDIR := ${PWD}/../tools
|
||||
|
||||
include ../lib/env.mk
|
||||
include ../lib/config.mk
|
||||
include ../lib/slurm.mk
|
||||
@ -41,7 +44,7 @@ MODELZIP = ${lastword ${shell ls ${MODELHOME}/*-20*.zip | LANG=en_US.UTF-8 sort
|
||||
MODELNAME = ${patsubst %.zip,%,${notdir ${MODELZIP}}}
|
||||
|
||||
ifeq (${MODELNAME},)
|
||||
MODELHOME = ../work-langid/models/${LANGPAIR}
|
||||
MODELHOME = ../${notdir ${WORKHOME}}/models/${LANGPAIR}
|
||||
# MODELZIP = ${lastword ${sort ${wildcard ${MODELHOME}/*-20*.zip}}}
|
||||
MODELZIP = ${lastword ${sort ${wildcard ${MODELHOME}/opus-20*.zip}}}
|
||||
MODELNAME = ${patsubst %.zip,%,${notdir ${MODELZIP}}}
|
||||
@ -61,12 +64,15 @@ ifdef LOCAL_SCRATCH
|
||||
endif
|
||||
|
||||
|
||||
LOAD_MODULES = module use -a /projappl/nlpl/software/modules/etc/ && \
|
||||
module load nlpl-udpipe nlpl-opus &&
|
||||
ifeq (${shell hostname --domain 2>/dev/null},bullx)
|
||||
LOAD_MODULES = module use -a /projappl/nlpl/software/modules/etc/ && \
|
||||
module load nlpl-udpipe nlpl-opus &&
|
||||
endif
|
||||
|
||||
WIKILANGS = ${sort $(patsubst >%${WIKISOURCE}-,%,${shell grep -o '>..${WIKISOURCE}-' index.html})} \
|
||||
ifneq (${wildcard index.html},)
|
||||
WIKILANGS = ${sort $(patsubst >%${WIKISOURCE}-,%,${shell grep -o '>..${WIKISOURCE}-' index.html})} \
|
||||
${sort $(patsubst >%${WIKISOURCE}-,%,${shell grep -o '>...${WIKISOURCE}-' index.html})}
|
||||
|
||||
endif
|
||||
|
||||
|
||||
|
||||
@ -93,7 +99,9 @@ PARTS = ${sort ${patsubst ${WIKI_DIR}/${WIKISOURCE}.${LANGID}.%.gz,%,${wildcard
|
||||
.PRECIOUS: ${WIKI_TRG}
|
||||
|
||||
## find wiki downloads
|
||||
WIKI_JSON = $(shell grep -o '${LANGID}${WIKISOURCE}-[0-9]*-cirrussearch-content.json.gz' index.html | head -1)
|
||||
ifneq (${wildcard index.html},)
|
||||
WIKI_JSON = $(shell grep -o '${LANGID}${WIKISOURCE}-[0-9]*-cirrussearch-content.json.gz' index.html | head -1)
|
||||
endif
|
||||
|
||||
## we don't need to keep the json file
|
||||
.INTERMEDIATE: ${WIKI_JSON} ${WIKI_PRE}
|
||||
@ -104,15 +112,16 @@ ifndef UDPIPE_MODELS
|
||||
UDPIPE_MODELS=/projappl/nlpl/software/modules/udpipe/1.2.1-devel/models
|
||||
endif
|
||||
|
||||
LANGNAME = ${shell ${LOAD_MODULES} opus-iso639 -e ${LANGID} | \
|
||||
LANGNAME = ${shell ${LOAD_MODULES} ${ISO639} -n ${LANGID} | sed 's/"//g' | \
|
||||
cut -f1 -d';' | tr ' ' '-' | tr '[:upper:]' '[:lower:]'}
|
||||
|
||||
ifeq (${LANGNAME},)
|
||||
LANGNAME = xx
|
||||
endif
|
||||
|
||||
UDPIPE_MODEL = ${notdir $(shell ${LOAD_MODULES} find ${UDPIPE_MODELS}/ -name "${LANGNAME}*.udpipe" | head -1)}
|
||||
|
||||
ifneq (${wildcard ${UDPIPE_MODELS}},)
|
||||
UDPIPE_MODEL = ${notdir $(shell ${LOAD_MODULES} find ${UDPIPE_MODELS}/ -name "${LANGNAME}*.udpipe" | head -1)}
|
||||
endif
|
||||
|
||||
|
||||
all: index.html
|
||||
@ -120,6 +129,18 @@ all: index.html
|
||||
${MAKE} ${WIKI_LATEST_SRC}
|
||||
|
||||
|
||||
fetch-wiki fetch:
|
||||
mkdir -p wiki
|
||||
wget -O wiki/${SRC}.tar https://object.pouta.csc.fi/OPUS-MT-bt-wiki/${SRC}.tar
|
||||
tar -C wiki -xf wiki/${SRC}.tar
|
||||
rm -f wiki/${SRC}.tar
|
||||
|
||||
fetch-wikidoc:
|
||||
mkdir -p wikidoc
|
||||
wget -O wikidoc/${SRC}.tar https://object.pouta.csc.fi/OPUS-MT-bt-wikidoc/${SRC}.tar
|
||||
tar -C wikidoc -xf wikidoc/${SRC}.tar
|
||||
rm -f wikidoc/${SRC}.tar
|
||||
|
||||
## tatoeba = tatoeba-monolingual data and tatoeba-models
|
||||
## TODO: should we loop over all labels?
|
||||
|
||||
|
1
backtranslate/marian-dev
Symbolic link
1
backtranslate/marian-dev
Symbolic link
@ -0,0 +1 @@
|
||||
../tools/marian-dev
|
1
backtranslate/mosesdecoder
Symbolic link
1
backtranslate/mosesdecoder
Symbolic link
@ -0,0 +1 @@
|
||||
../tools/moses-scripts
|
@ -1,12 +1,6 @@
|
||||
# OPUS-MT-train tutorial
|
||||
|
||||
This tutorial goes through some common tasks with the example of training models to translate from Breton to English. First of all, clone the repository from github:
|
||||
|
||||
|
||||
```
|
||||
git clone git@github.com:Helsinki-NLP/OPUS-MT-train.git
|
||||
cd OPUS-MT-train
|
||||
```
|
||||
This tutorial goes through some common tasks with the example of training models to translate from English to Breton. We assume that you have a working setup of all tools required. Check the [installation documentation](../Setup.md) for further information.
|
||||
|
||||
|
||||
## Basic configuration and data sets
|
||||
@ -15,23 +9,98 @@ cd OPUS-MT-train
|
||||
* create a local configuration file with language-specific settings
|
||||
|
||||
```
|
||||
make SRCLANGS=br TRGLANGS=en local-config
|
||||
make SRCLANGS=en TRGLANGS=br local-config
|
||||
```
|
||||
|
||||
|
||||
* create data sets, subword segmentation models, word alignments and model vocabulary
|
||||
* create data sets, subword segmentation models and NMT vocabulary
|
||||
|
||||
```
|
||||
make SRCLANGS=br TRGLANGS=en data
|
||||
make SRCLANGS=en TRGLANGS=br data
|
||||
```
|
||||
|
||||
This will also download the necessary files if they don't exist on the local file system. It will train sentence piece models for each language separately and apply the model to all data sets. Finally, it also creates the vocabulary file from the segmented training data.
|
||||
|
||||
|
||||
## Train the model
|
||||
|
||||
Training the model requires a GPU. Run this directly on a machine with appropriate hardware and CUDA libraries installed or submit a job to some GPU nodes on a cluster.
|
||||
|
||||
```
|
||||
make SRCLANGS=en TRGLANGS=br train
|
||||
```
|
||||
|
||||
Depending on the size of the data this will take hours, days or weeks to finish. The stopping criterion is set to 10 subsequent non-improved validation scores on validation data. For en-br this will take 1-2 hours. For submitting jobs have a look at the [documentation for batch jobs](BatchJobs.md).
|
||||
|
||||
Training can always be resumed in case the process crashes for some reason, using the same command as above.
|
||||
|
||||
|
||||
|
||||
## Evaluate the model
|
||||
|
||||
Evaluation can be done at any time there is a model from one of the validation steps or the final model after convergence. Running the translation and evaluation of the given test set is done by calling:
|
||||
|
||||
```
|
||||
make SRCLANGS=en TRGLANGS=br translate
|
||||
make SRCLANGS=en TRGLANGS=br eval
|
||||
```
|
||||
|
||||
Translation runs, naturally, faster on a GPU but can also be done in reasonable time on CPU cores. You can add cores by setting the `THREADS` variable. Evaluation is done using sacrebleu, and translations as well as BLEU/chrF2 scores are stored in the work directory, in this case in
|
||||
|
||||
```
|
||||
work/en-br/Tatoeba.opus.spm4k-spm4k1.transformer.en.br
|
||||
work/en-br/Tatoeba.opus.spm4k-spm4k1.transformer.en.br.eval
|
||||
```
|
||||
|
||||
There is also a recipe for merging input, reference translation and system output into one file for better readability:
|
||||
|
||||
```
|
||||
make SRCLANGS=en TRGLANGS=br compare
|
||||
less work/en-br/Tatoeba.opus.spm4k-spm4k1.transformer.en.br.compare
|
||||
```
|
||||
|
||||
Translation and evaluation files will be overwritten when a new model appears and the evaluation target is called again.
|
||||
|
||||
|
||||
|
||||
## Generate back-translations
|
||||
|
||||
Back-translation requires a model in the opposite direction. The first thing to do is to reverse the data. This can be done without generating it from scratch:
|
||||
|
||||
```
|
||||
make SRCLANGS=en TRGLANGS=br reverse-data
|
||||
```
|
||||
|
||||
Now train a new model but in the opposite direction:
|
||||
|
||||
```
|
||||
make SRCLANGS=br TRGLANGS=en train
|
||||
```
|
||||
|
||||
After training we need to create a package to be used by back-translation. Run this to create a package in `work/models/br-en`:
|
||||
|
||||
```
|
||||
make SRCLANGS=br TRGLANGS=en dist
|
||||
```
|
||||
|
||||
|
||||
The next step is to fetch some monolingual data to be back-translated. OPUS-MT is prepared to use Wiki data from various Wikimedia wikis (Wikipedia, Wikiquote, Wikisource, Wikibooks, Wikinews). You can fetch the prepared data sets by running:
|
||||
|
||||
```
|
||||
make -C backtranslation SRC=br fetch-wikidoc
|
||||
```
|
||||
|
||||
Finally, we can translate the Breton Wikipedia to English using the br-en model we have trained above (run this on a GPU machine):
|
||||
|
||||
```
|
||||
make -C backtranslation SRC=br translate
|
||||
```
|
||||
|
||||
The translations are most probably really bad, as the back-translation model is very poor (around 4 BLEU).
|
||||
|
||||
|
||||
|
||||
|
||||
## Generate pivot-based translations
|
||||
|
||||
## Re-train using back-translations and pivot translations
|
||||
|
@ -183,8 +183,6 @@ url-exists = ${shell if [ "${call url-status,${1}}" == "HTTP/1.1 200 OK" ]; then
|
||||
resource-url = ${shell echo "${OPUS_STORE}${3}/${call get-opus-version,${1},${2},${3}}/moses/${1}-${2}.txt.zip"}
|
||||
|
||||
|
||||
|
||||
|
||||
## exclude certain data sets
|
||||
# EXCLUDE_CORPORA ?= WMT-News MPC1 ${call get-elra-bitexts,${SRC},${TRG}}
|
||||
EXCLUDE_CORPORA ?= WMT-News MPC1
|
||||
|
10
lib/data.mk
10
lib/data.mk
@ -144,9 +144,15 @@ ifeq (${words ${TRGLANGS}},1)
|
||||
-if [ -e ${MODEL_VOCAB} ]; then \
|
||||
ln -s ${MODEL_VOCAB} ${REV_WORKDIR}/${notdir ${MODEL_VOCAB}}; \
|
||||
fi
|
||||
##
|
||||
## this is a bit dangerous with some trick to
|
||||
## swap parameters between SRC and TRG
|
||||
##
|
||||
-if [ -e ${WORKDIR}/config.mk ]; then \
|
||||
if [ ! -e ${REV_WORKDIR}/config.mk ]; then \
|
||||
cp ${WORKDIR}/config.mk ${REV_WORKDIR}/config.mk; \
|
||||
cat ${WORKDIR}/config.mk |\
|
||||
sed -e 's/SRC/TTT/g;s/TRG/SRC/g;s/TTT/TRG/' |\
|
||||
grep -v LANGPAIRSTR > ${REV_WORKDIR}/config.mk; \
|
||||
fi \
|
||||
fi
|
||||
endif
|
||||
@ -155,6 +161,8 @@ endif
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
clean-data:
|
||||
for s in ${SRCLANGS}; do \
|
||||
for t in ${TRGLANGS}; do \
|
||||
|
52
lib/env.mk
52
lib/env.mk
@ -5,6 +5,7 @@
|
||||
# - system-specific settings
|
||||
#
|
||||
|
||||
SHELL := /bin/bash
|
||||
|
||||
## modules to be loaded in sbatch scripts
|
||||
|
||||
@ -52,6 +53,7 @@ WORKHOME = ${PWD}/work
|
||||
|
||||
|
||||
ifeq (${shell hostname},dx6-ibs-p2)
|
||||
GPU = pascal
|
||||
APPLHOME = /opt/tools
|
||||
WORKHOME = ${shell realpath ${PWD}/work}
|
||||
# OPUSHOME = tiedeman@taito.csc.fi:/proj/nlpl/data/OPUS/
|
||||
@ -61,18 +63,13 @@ ifeq (${shell hostname},dx6-ibs-p2)
|
||||
# MARIAN = ${APPLHOME}/marian/build
|
||||
# SUBWORD_HOME = ${APPLHOME}/subword-nmt/subword_nmt
|
||||
else ifeq (${shell hostname},dx7-nkiel-4gpu)
|
||||
GPU = pascal
|
||||
APPLHOME = /opt/tools
|
||||
WORKHOME = ${shell realpath ${PWD}/work}
|
||||
MARIAN_BUILD_OPTIONS += -DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-9.2 \
|
||||
-DPROTOBUF_LIBRARY=${PWD}/tools/protobuf/lib/libprotobuf.so \
|
||||
-DPROTOBUF_INCLUDE_DIR=${PWD}/tools/protobuf/include/google/protobuf \
|
||||
-DPROTOBUF_PROTOC_EXECUTABLE=${PWD}/tools/protobuf/bin/protoc
|
||||
MARIAN_BUILD_OPTIONS += -DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-9.2
|
||||
# -DPROTOBUF_LIBRARY=/usr/lib/x86_64-linux-gnu/libprotobuf.so.9 \
|
||||
# -DPROTOBUF_INCLUDE_DIR=/usr/include/google/protobuf \
|
||||
# -DPROTOBUF_PROTOC_EXECUTABLE=${PWD}/tools/protobuf/src/protoc
|
||||
# -DPROTOBUF_LIBRARY=${PWD}/tools/protobuf/src/libprotobuf.la \
|
||||
# -DPROTOBUF_INCLUDE_DIR=${PWD}/tools/protobuf/src/google/ \
|
||||
# -DPROTOBUF_PROTOC_EXECUTABLE=/usr/bin/protoc
|
||||
# OPUSHOME = tiedeman@taito.csc.fi:/proj/nlpl/data/OPUS/
|
||||
# MOSESHOME = ${APPLHOME}/mosesdecoder
|
||||
# MOSESSCRIPTS = ${MOSESHOME}/scripts
|
||||
@ -117,23 +114,24 @@ TMPDIR ?= /tmp
|
||||
## tools and their locations
|
||||
|
||||
SCRIPTDIR ?= ${PWD}/scripts
|
||||
TOOLSDIR ?= ${PWD}/tools
|
||||
|
||||
ISO639 ?= ${shell which iso639 || echo 'perl ${PWD}/tools/LanguageCodes/ISO-639-3/bin/iso639'}
|
||||
PIGZ ?= ${shell which pigz || echo ${PWD}/tools/pigz/pigz}
|
||||
TERASHUF ?= ${shell which terashuf || echo ${PWD}/tools/terashuf/terashuf}
|
||||
JQ ?= ${shell which jq || echo ${PWD}/tools/jq/jq}
|
||||
PROTOC ?= ${shell which protoc || echo ${PWD}/tools/protobuf/bin/protoc}
|
||||
MARIAN ?= ${shell which marian || echo ${PWD}/tools/marian-dev/build/marian}
|
||||
ISO639 ?= ${shell which iso639 || echo 'perl ${TOOLSDIR}/LanguageCodes/ISO-639-3/bin/iso639'}
|
||||
PIGZ ?= ${shell which pigz || echo ${TOOLSDIR}/pigz/pigz}
|
||||
TERASHUF ?= ${shell which terashuf || echo ${TOOLSDIR}/terashuf/terashuf}
|
||||
JQ ?= ${shell which jq || echo ${TOOLSDIR}/jq/jq}
|
||||
PROTOC ?= ${shell which protoc || echo ${TOOLSDIR}/protobuf/bin/protoc}
|
||||
MARIAN ?= ${shell which marian || echo ${TOOLSDIR}/marian-dev/build/marian}
|
||||
MARIAN_HOME ?= $(dir ${MARIAN})
|
||||
SPM_HOME ?= ${dir ${MARIAN}}
|
||||
FASTALIGN ?= ${shell which fast_align || echo ${PWD}/tools/fast_align/build/fast_align}
|
||||
FASTALIGN ?= ${shell which fast_align || echo ${TOOLSDIR}/fast_align/build/fast_align}
|
||||
FASTALIGN_HOME ?= ${dir ${FASTALIGN}}
|
||||
ATOOLS ?= ${FASTALIGN_HOME}atools
|
||||
EFLOMAL ?= ${shell which eflomal || echo ${PWD}/tools/eflomal/eflomal}
|
||||
EFLOMAL ?= ${shell which eflomal || echo ${TOOLSDIR}/eflomal/eflomal}
|
||||
EFLOMAL_HOME ?= ${dir ${EFLOMAL}}
|
||||
WORDALIGN ?= ${EFLOMAL_HOME}align.py
|
||||
EFLOMAL ?= ${EFLOMAL_HOME}eflomal
|
||||
MOSESSCRIPTS ?= ${PWD}/tools/moses-scripts/scripts
|
||||
MOSESSCRIPTS ?= ${TOOLSDIR}/moses-scripts/scripts
|
||||
|
||||
|
||||
## marian-nmt binaries
|
||||
@ -149,7 +147,7 @@ TOKENIZER = ${MOSESSCRIPTS}/tokenizer
|
||||
|
||||
|
||||
## BPE
|
||||
SUBWORD_BPE ?= ${shell which subword-nmt || echo ${PWD}/tools/subword-nmt/subword_nmt/subword_nmt.py}
|
||||
SUBWORD_BPE ?= ${shell which subword-nmt || echo ${TOOLSDIR}/subword-nmt/subword_nmt/subword_nmt.py}
|
||||
SUBWORD_HOME ?= ${dir ${SUBWORD_BPE}}
|
||||
ifeq (${shell which subword-nmt},)
|
||||
BPE_LEARN ?= python3 ${SUBWORD_HOME}/learn_bpe.py
|
||||
@ -209,24 +207,24 @@ install-prerequisites install-prereq install-requirements:
|
||||
${MAKE} ${PREREQ_TOOLS}
|
||||
|
||||
|
||||
${PWD}/tools/LanguageCodes/ISO-639-3/bin/iso639:
|
||||
${TOOLSDIR}/LanguageCodes/ISO-639-3/bin/iso639:
|
||||
${MAKE} tools/LanguageCodes/ISO-639-5/lib/ISO/639/5.pm
|
||||
|
||||
${PWD}/tools/LanguageCodes/ISO-639-5/lib/ISO/639/5.pm:
|
||||
${TOOLSDIR}/LanguageCodes/ISO-639-5/lib/ISO/639/5.pm:
|
||||
${MAKE} -C tools/LanguageCodes all
|
||||
|
||||
${PWD}/tools/fast_align/build/atools:
|
||||
${TOOLSDIR}/fast_align/build/atools:
|
||||
mkdir -p ${dir $@}
|
||||
cd ${dir $@} && cmake ..
|
||||
${MAKE} -C ${dir $@}
|
||||
|
||||
${PWD}/tools/pigz/pigz:
|
||||
${TOOLSDIR}/pigz/pigz:
|
||||
${MAKE} -C ${dir $@}
|
||||
|
||||
${PWD}/tools/terashuf/terashuf:
|
||||
${TOOLSDIR}/terashuf/terashuf:
|
||||
${MAKE} -C ${dir $@}
|
||||
|
||||
${PWD}/tools/jq/jq:
|
||||
${TOOLSDIR}/jq/jq:
|
||||
cd ${dir $@} && git submodule update --init
|
||||
cd ${dir $@} && autoreconf -fi
|
||||
cd ${dir $@} && ./configure --with-oniguruma=builtin
|
||||
@ -237,16 +235,16 @@ ${PWD}/tools/jq/jq:
|
||||
## - install MKL (especially for cpu use):
|
||||
## file:///opt/intel/documentation_2020/en/mkl/ps2020/get_started.htm
|
||||
|
||||
${PWD}/tools/marian-dev/build/marian: ${PROTOC}
|
||||
${TOOLSDIR}/marian-dev/build/marian: ${PROTOC}
|
||||
mkdir -p ${dir $@}
|
||||
cd ${dir $@} && cmake -DUSE_SENTENCEPIECE=on ${MARIAN_BUILD_OPTIONS} ..
|
||||
${MAKE} -C ${dir $@} -j
|
||||
|
||||
${PWD}/tools/protobuf/bin/protoc:
|
||||
${TOOLSDIR}/protobuf/bin/protoc:
|
||||
cd tools && git clone https://github.com/protocolbuffers/protobuf.git
|
||||
cd tools/protobuf && git submodule update --init --recursive
|
||||
cd tools/protobuf && ./autogen.sh
|
||||
cd tools/protobuf && ./configure --prefix=${PWD}/tools/protobuf
|
||||
cd tools/protobuf && ./configure --prefix=${TOOLSDIR}/protobuf
|
||||
${MAKE} -C tools/protobuf
|
||||
|
||||
## for Mac users: use gcc to compile eflomal
|
||||
@ -262,7 +260,7 @@ ${PWD}/tools/protobuf/bin/protoc:
|
||||
## cd tools/efmoral
|
||||
## sudo env python3 setup.py install
|
||||
|
||||
${PWD}/tools/eflomal/eflomal:
|
||||
${TOOLSDIR}/eflomal/eflomal:
|
||||
${MAKE} -C ${dir $@} all
|
||||
cd ${dir $@} && python3 setup.py install
|
||||
# python3 setup.py install --install-dir ${HOME}/.local
|
||||
|
@ -2,3 +2,4 @@ pycld2
|
||||
iso-639
|
||||
opustools
|
||||
subword-nmt
|
||||
sacrebleu
|
||||
|
Loading…
Reference in New Issue
Block a user