# OPUS-MT-train/backtranslate/Makefile
#
# backtranslate wiki data
#
# only works with sentencepiece models!
#
## settings shared with the rest of the project (read in this order)
include ../Makefile.env ../Makefile.config ../Makefile.slurm

## language pair: wiki text in language SRC is translated into TRG
SRC = af
TRG = en
## various sources are available
## can be general wikipedia, wikinews, wikibooks, ...
WIKISOURCE = wiki

## MAX_LENGTH:    maximum input length (number of SentencePiece segments per line)
## MAX_SENTENCES: maximum number of sentences to be translated (top N lines);
##                also the number of lines per part of the extracted wiki text
MAX_LENGTH    = 100
MAX_SENTENCES = 1000000

## which part of the extracted wiki text to work on (suffix of the part file)
PART = aa

## the model to translate with: the zip file in MODELHOME that comes last
## in sort order among those matching *-20*.zip (presumably the newest release)
LANGPAIR  = ${SRC}-${TRG}
MODELHOME = ../models/${LANGPAIR}
MODELZIP  = ${lastword ${sort ${wildcard ${MODELHOME}/*-20*.zip}}}
MODELNAME = ${patsubst %.zip,%,${notdir ${MODELZIP}}}
## no model zip found in the default MODELHOME:
## look in ../work-langid/models instead (only opus-20*.zip files are considered)
ifeq (${MODELNAME},)
  MODELHOME = ../work-langid/models/${LANGPAIR}
# MODELZIP = ${lastword ${sort ${wildcard ${MODELHOME}/*-20*.zip}}}
  MODELZIP  = ${lastword ${sort ${wildcard ${MODELHOME}/opus-20*.zip}}}
  MODELNAME = ${patsubst %.zip,%,${notdir ${MODELZIP}}}
endif
## shell prefix that loads the NLPL modules; it ends in '&&' so that
## the command that needs the modules can be appended directly
LOAD_MODULES = module use -a /projappl/nlpl/software/modules/etc/ && \
               module load nlpl-udpipe nlpl-opus &&

## language IDs of all ${WIKISOURCE} dumps listed in index.html
## (two-letter IDs first, then three-letter IDs)
WIKILANGS = ${sort $(patsubst >%${WIKISOURCE}-,%,${shell grep -o '>..${WIKISOURCE}-' index.html})} \
            ${sort $(patsubst >%${WIKISOURCE}-,%,${shell grep -o '>...${WIKISOURCE}-' index.html})}
## language ID of the wiki to be processed
LANGID = ${SRC}

## WIKI_TXT: extracted wiki text (one part)
## WIKI_PRE: the same part, pre-processed and segmented for the model
## WIKI_SRC: source side of the backtranslation (segments merged again)
## WIKI_TRG: translation of WIKI_PRE
WIKI_DIR = wiki/${LANGID}
WIKI_TXT = ${WIKI_DIR}/${WIKISOURCE}.${LANGID}.${PART}.gz
WIKI_SRC = ${LANGPAIR}/${WIKISOURCE}.${PART}_${MODELNAME}.${LANGPAIR}.${SRC}.gz
WIKI_PRE = ${LANGPAIR}/${WIKISOURCE}.${PART}_${MODELNAME}.${LANGPAIR}.${SRC}.spm.gz
WIKI_TRG = ${LANGPAIR}/${WIKISOURCE}.${PART}_${MODELNAME}.${LANGPAIR}.${TRG}.gz

## all parts of this wiki
PARTS = ${sort ${patsubst ${WIKI_DIR}/${WIKISOURCE}.${LANGID}.%.gz,%,${wildcard ${WIKI_DIR}/${WIKISOURCE}.${LANGID}.*.gz}}}
## don't delete translated text if the process crashes
.PRECIOUS: ${WIKI_TRG}

## find wiki downloads:
## first content dump for ${LANGID}${WIKISOURCE} that is listed in index.html
## (empty as long as index.html has not been downloaded)
WIKI_JSON = $(shell grep -o '${LANGID}${WIKISOURCE}-[0-9]*-cirrussearch-content.json.gz' index.html | head -1)

## we don't need to keep the json file (nor the segmented translation input)
.INTERMEDIATE: ${WIKI_JSON} ${WIKI_PRE}
## find UDPipe model
## - UDPIPE_MODELS: directory with the models (can be set from outside)
## - LANGNAME: first ';'-separated field of the opus-iso639 output for LANGID,
##   lower-cased and with spaces replaced by '-'
## - UDPIPE_MODEL: first model file whose name starts with LANGNAME
##   (empty if there is none)
ifndef UDPIPE_MODELS
  UDPIPE_MODELS=/projappl/nlpl/software/modules/udpipe/1.2.1-devel/models
endif

LANGNAME = ${shell ${LOAD_MODULES} opus-iso639 -e ${LANGID} | \
                   cut -f1 -d';' | tr ' ' '-' | tr '[:upper:]' '[:lower:]'}

UDPIPE_MODEL = ${notdir $(shell ${LOAD_MODULES} find ${UDPIPE_MODELS}/ -name "${LANGNAME}*.udpipe" | head -1)}
## default target: make sure the download index exists, then create the
## source and translated side for ${WIKISOURCE} part ${PART} in a sub-make
## (presumably so that variables read from index.html are evaluated again
## once the file exists)
.PHONY: all
all: index.html
	${MAKE} ${WIKI_SRC} ${WIKI_TRG}
# WIKISOURCES = wiki wikibooks wikinews wikiquote wikisource wiktionary
WIKISOURCES = wiki wikibooks wikinews wikiquote wikisource

## for every wiki source: extract the text and, if part ${PART} of it exists,
## submit a translation job (translate.submit is not defined in this file;
## presumably it comes from ../Makefile.slurm)
.PHONY: all-wikis
all-wikis: ${LANGPAIR}/${MODELNAME}/decoder.yml
	for w in ${WIKISOURCES}; do \
	  ${MAKE} WIKISOURCE=$$w extract-text; \
	  echo "find ${WIKI_DIR} -name '$$w.${LANGID}.${PART}.gz'"; \
	  if [ `find ${WIKI_DIR} -name "$$w.${LANGID}.${PART}.gz" | wc -l` -gt 0 ]; then \
	    echo "${MAKE} WIKISOURCE=$$w HPC_MEM=4g HPC_CORES=1 WALLTIME=72 translate.submit"; \
	    ${MAKE} WIKISOURCE=$$w HPC_MEM=4g HPC_CORES=1 WALLTIME=72 translate.submit; \
	  fi \
	done
## extract the text of every wiki source in ${WIKISOURCES} (no translation)
.PHONY: all-wikitext
all-wikitext:
	for w in ${WIKISOURCES}; do \
	  ${MAKE} WIKISOURCE=$$w extract-text; \
	done
## extract the text for every language that index.html lists for ${WIKISOURCE}
.PHONY: all-wikilangs
all-wikilangs: index.html
	for l in ${WIKILANGS}; do \
	  ${MAKE} LANGID=$$l extract-text; \
	done
.PHONY: breton celtic

## for Breton: use the multilingual celtic model to backtranslate
## (MODELHOME is given on the command line, so the model is looked up there
##  instead of ../models/br-en)
breton:
	${MAKE} SRC=br TRG=en MODELHOME=../models/ga+cy+br+gd+kv+gv-en all-wikis

## do the same for all Celtic languages in the model
celtic:
	for l in ga cy br gd kv gv; do \
	  ${MAKE} SRC=$$l TRG=en MODELHOME=../models/ga+cy+br+gd+kv+gv-en all-wikis; \
	done
# Tagalog (tl, tgl), Central Bikol (bcl), Malayalam (ml, mal), Bengali (bn, ben), and Mongolian (mn, mon)
## run all-wikis for each of these languages, translating into English
.PHONY: focus-wikis
focus-wikis:
	for l in tl bcl ml bn mn; do \
	  ${MAKE} SRC=$$l TRG=en all-wikis; \
	done
## short names for the individual steps of the pipeline
.PHONY: get-data extract-text prepare-model prepare-data translate
get-data:      ${WIKI_JSON}
extract-text:  ${WIKI_TXT}
prepare-model: ${LANGPAIR}/${MODELNAME}/decoder.yml
prepare-data:  ${WIKI_PRE}
translate:     ${WIKI_SRC} ${WIKI_TRG}
.PHONY: translate-all-parts submit-translate-all-parts

## translate all parts
## (one sub-make per part file found in ${WIKI_DIR}, one after the other)
translate-all-parts: ${LANGPAIR}/${MODELNAME}/decoder.yml
	for p in ${PARTS}; do \
	  ${MAKE} PART=$$p translate; \
	done

## create jobs for translating all parts
## (translate.submit is not defined in this file; presumably it comes
##  from ../Makefile.slurm)
submit-translate-all-parts: ${LANGPAIR}/${MODELNAME}/decoder.yml
	for p in ${PARTS}; do \
	  ${MAKE} PART=$$p translate.submit; \
	done
## debugging aid: show the values derived for the current language/model
.PHONY: print-names
print-names:
	echo ${LANGNAME}
	echo ${UDPIPE_MODEL}
	echo ${WIKI_JSON}
	echo ${MODELNAME}
## fetch the latest model
## ---> TODO: should we fetch from ObjectStorage instead?
##
## Copies the model zip into ${LANGPAIR}/${MODELNAME} and unpacks it there.
## - only the archive that was just copied is unpacked: with 'unzip *.zip' any
##   further zip file in the directory would be taken as a member name
## - '-o' overwrites existing files without asking, so the rule cannot hang
##   on a prompt in a batch job
## Nothing is done if no model zip was found.
${LANGPAIR}/${MODELNAME}/decoder.yml:
ifneq (${MODELZIP},)
	mkdir -p ${dir $@}
	cp ${MODELZIP} ${dir $@}
	cd ${dir $@} && unzip -o ${notdir ${MODELZIP}}
endif
## pre-process data
## ---> TODO: does that work for multilingual data that need prefix?
##
## - drop lines that contain any of < > { }
## - run the preprocess.sh script of the unpacked model with its source.spm
## - drop lines with more than ${MAX_LENGTH} whitespace-separated tokens
## - keep the first ${MAX_SENTENCES} lines
## Nothing is done if no model zip was found.
${LANGPAIR}/%.${PART}_${MODELNAME}.${LANGPAIR}.${SRC}.spm.gz: ${WIKI_DIR}/%.${SRC}.${PART}.gz
ifneq (${MODELZIP},)
	mkdir -p ${dir $@}
	${MAKE} ${LANGPAIR}/${MODELNAME}/decoder.yml
	zcat $< |\
	grep -v '[<>{}]' |\
	${LANGPAIR}/${MODELNAME}/preprocess.sh ${SRC} ${LANGPAIR}/${MODELNAME}/source.spm |\
	perl -e 'while (<>){next if (split(/\s+/)>${MAX_LENGTH});print;}' |\
	head -n ${MAX_SENTENCES} |\
	gzip -c > $@
endif
## merge SentencePiece segments in the source text
## (Why? because we filter out some data from the original wiki text, see above)
##
## removes all spaces, turns every '▁' into a space and
## strips leading/trailing spaces
${WIKI_SRC}: ${WIKI_PRE}
ifneq (${MODELZIP},)
	mkdir -p ${dir $@}
	zcat $< |\
	sed 's/ //g;s/▁/ /g' | \
	sed 's/^ *//;s/ *$$//' |\
	gzip -c > $@
endif
## translate
##
## - makes sure the model is unpacked, then runs marian-decoder inside the
##   model directory; input and output are therefore given as absolute paths
##   (${CURDIR} is set by make itself, unlike the inherited PWD variable)
## - the decoder output is de-segmented in the same way as the source text
## - the unpacked model is removed again afterwards
## LOADMODS, MARIAN, MARIAN_GPUS and MARIAN_DECODER_FLAGS are not defined in
## this file (presumably they come from the included makefiles).
## Nothing is done if no model zip was found.
%.${LANGPAIR}.${TRG}.gz: %.${LANGPAIR}.${SRC}.spm.gz
ifneq (${MODELZIP},)
	mkdir -p ${dir $@}
	${MAKE} ${LANGPAIR}/${MODELNAME}/decoder.yml
	${LOADMODS} && cd ${LANGPAIR}/${MODELNAME} && ${MARIAN}/marian-decoder \
		-i ${CURDIR}/$< \
		-c decoder.yml \
		-d ${MARIAN_GPUS} \
		${MARIAN_DECODER_FLAGS} |\
	sed 's/ //g;s/▁/ /g' | sed 's/^ *//;s/ *$$//' |\
	gzip -c > ${CURDIR}/$@
ifneq (${LANGPAIR},)
ifneq (${MODELNAME},)
	rm -fr ${LANGPAIR}/${MODELNAME}
endif
endif
endif
## index of all downloadable files
## (downloaded to a temporary name first: a failed download must not leave
##  an empty index.html behind that would be considered up to date)
index.html:
	wget -nv -O $@.tmp https://dumps.wikimedia.org/other/cirrussearch/current
	mv -f $@.tmp $@
## wiki in json format
## (downloaded to a temporary name first, so that an interrupted download
##  is not mistaken for the complete dump)
${WIKI_JSON}:
	wget -nv -O $@.tmp https://dumps.wikimedia.org/other/cirrussearch/current/$@
	mv -f $@.tmp $@
## check whether there is a UDPipe model
## backoff to moses tools
##
## with UDPipe: tokenize and keep only the text of the '# text = ...' lines
## ('#' is escaped because it would otherwise start a make comment)
ifneq (${UDPIPE_MODEL},)
  SENTSPLITTER = udpipe --input=horizontal --tokenize \
                 ${UDPIPE_MODELS}/${UDPIPE_MODEL} |\
                 grep '^\# *text *= *' |\
                 sed 's/^\# *text *= *//'
else
  SENTSPLITTER = ${MOSESSCRIPTS}/ems/support/split-sentences.perl -l ${LANGID}
endif
## extract sentences and normalize
## - requires jq, udpipe, and moses-scripts
## - takes the '.text' field of every json record, drops 'null' lines and
##   lines that contain any of < > { }, splits into sentences, cleans up
##   punctuation/non-printing characters and whitespace, and filters the
##   lines with mono-match-lang.py for ${LANGID}
## - the result is split into parts of ${MAX_SENTENCES} lines each, named
##   like the target with ${PART} replaced by the suffix 'split' generates,
##   and all parts are compressed
## - $(TOKENIZER)/normalize-punctuation.perl is currently not applied
${WIKI_TXT}: ${WIKI_JSON}
	mkdir -p ${dir $@}
	${LOAD_MODULES} \
	zcat $< | jq -r '.text' | \
	grep -v 'null' |\
	grep -v '[<>{}]' |\
	${SENTSPLITTER} |\
	$(TOKENIZER)/replace-unicode-punctuation.perl |\
	$(TOKENIZER)/remove-non-printing-char.perl |\
	sed 's/  */ /g;s/^ *//g;s/ *$$//g' |\
	python3 ../scripts/filter/mono-match-lang.py -l ${LANGID} |\
	split -l ${MAX_SENTENCES} - ${patsubst %${PART}.gz,%,$@}
	gzip -f ${patsubst %${PART}.gz,%,$@}*
## OLD: without splitting into parts (commented out, kept for reference)
#
# ${WIKI_TXT}: ${WIKI_JSON}
# ${LOAD_MODULES} \
# zcat $< | jq -r '.text' | \
# grep -v 'null' |\
# ${SENTSPLITTER} |\
# $(TOKENIZER)/replace-unicode-punctuation.perl |\
# $(TOKENIZER)/remove-non-printing-char.perl |\
# $(TOKENIZER)/normalize-punctuation.perl |\
# sed 's/ */ /g;s/^ *//g;s/ *$$//g' |\
# gzip -c > $@

## UDPipe model files (UD 2.4); some are annotated with a language ID
#
# afrikaans-afribooms-ud-2.4-190531.udpipe af
# ancient_greek-perseus-ud-2.4-190531.udpipe
# ancient_greek-proiel-ud-2.4-190531.udpipe
# arabic-padt-ud-2.4-190531.udpipe ar
# armenian-armtdp-ud-2.4-190531.udpipe
# basque-bdt-ud-2.4-190531.udpipe eu
# belarusian-hse-ud-2.4-190531.udpipe
# bulgarian-btb-ud-2.4-190531.udpipe bg
# catalan-ancora-ud-2.4-190531.udpipe ca
# chinese-gsd-ud-2.4-190531.udpipe zh
# classical_chinese-kyoto-ud-2.4-190531.udpipe zh_tw
# coptic-scriptorium-ud-2.4-190531.udpipe
# croatian-set-ud-2.4-190531.udpipe hr
# czech-cac-ud-2.4-190531.udpipe cs
# czech-cltt-ud-2.4-190531.udpipe cs
# czech-fictree-ud-2.4-190531.udpipe cs
# czech-pdt-ud-2.4-190531.udpipe cs
# danish-ddt-ud-2.4-190531.udpipe da
# dutch-alpino-ud-2.4-190531.udpipe nl
# dutch-lassysmall-ud-2.4-190531.udpipe nl
# english-ewt-ud-2.4-190531.udpipe en
# english-gum-ud-2.4-190531.udpipe en
# english-lines-ud-2.4-190531.udpipe en
# english-partut-ud-2.4-190531.udpipe en
# estonian-edt-ud-2.4-190531.udpipe et
# estonian-ewt-ud-2.4-190531.udpipe et
# finnish-ftb-ud-2.4-190531.udpipe fi
# finnish-tdt-ud-2.4-190531.udpipe fi
# french-gsd-ud-2.4-190531.udpipe fr
# french-partut-ud-2.4-190531.udpipe fr
# french-sequoia-ud-2.4-190531.udpipe fr
# french-spoken-ud-2.4-190531.udpipe fr
# galician-ctg-ud-2.4-190531.udpipe gl
# galician-treegal-ud-2.4-190531.udpipe gl
# german-gsd-ud-2.4-190531.udpipe de
# gothic-proiel-ud-2.4-190531.udpipe
# greek-gdt-ud-2.4-190531.udpipe el
# hebrew-htb-ud-2.4-190531.udpipe he
# hindi-hdtb-ud-2.4-190531.udpipe hi
# hungarian-szeged-ud-2.4-190531.udpipe hu
# indonesian-gsd-ud-2.4-190531.udpipe id
# irish-idt-ud-2.4-190531.udpipe ga
# italian-isdt-ud-2.4-190531.udpipe it
# italian-partut-ud-2.4-190531.udpipe it
# italian-postwita-ud-2.4-190531.udpipe it
# italian-vit-ud-2.4-190531.udpipe it
# japanese-gsd-ud-2.4-190531.udpipe ja
# korean-gsd-ud-2.4-190531.udpipe ko
# korean-kaist-ud-2.4-190531.udpipe ko
# latin-ittb-ud-2.4-190531.udpipe la
# latin-perseus-ud-2.4-190531.udpipe la
# latin-proiel-ud-2.4-190531.udpipe la
# latvian-lvtb-ud-2.4-190531.udpipe lv
# lithuanian-alksnis-ud-2.4-190531.udpipe lt
# lithuanian-hse-ud-2.4-190531.udpipe lt
# maltese-mudt-ud-2.4-190531.udpipe mt
# marathi-ufal-ud-2.4-190531.udpipe
# north_sami-giella-ud-2.4-190531.udpipe
# norwegian-bokmaal-ud-2.4-190531.udpipe nb
# norwegian-nynorsklia-ud-2.4-190531.udpipe nn
# norwegian-nynorsk-ud-2.4-190531.udpipe nn
# old_church_slavonic-proiel-ud-2.4-190531.udpipe
# old_french-srcmf-ud-2.4-190531.udpipe
# old_russian-torot-ud-2.4-190531.udpipe
# persian-seraji-ud-2.4-190531.udpipe fa
# polish-lfg-ud-2.4-190531.udpipe pl
# polish-pdb-ud-2.4-190531.udpipe pl
# portuguese-bosque-ud-2.4-190531.udpipe pt
# portuguese-gsd-ud-2.4-190531.udpipe pt
# romanian-nonstandard-ud-2.4-190531.udpipe ro
# romanian-rrt-ud-2.4-190531.udpipe ro
# russian-gsd-ud-2.4-190531.udpipe ru
# russian-syntagrus-ud-2.4-190531.udpipe ru
# russian-taiga-ud-2.4-190531.udpipe ru
# serbian-set-ud-2.4-190531.udpipe sr
# slovak-snk-ud-2.4-190531.udpipe sk
# slovenian-ssj-ud-2.4-190531.udpipe sl
# slovenian-sst-ud-2.4-190531.udpipe sl
# spanish-ancora-ud-2.4-190531.udpipe es
# spanish-gsd-ud-2.4-190531.udpipe es
# swedish-lines-ud-2.4-190531.udpipe sv
# swedish-talbanken-ud-2.4-190531.udpipe sv
# tamil-ttb-ud-2.4-190531.udpipe
# telugu-mtg-ud-2.4-190531.udpipe
# turkish-imst-ud-2.4-190531.udpipe tr
# ukrainian-iu-ud-2.4-190531.udpipe
# urdu-udtb-ud-2.4-190531.udpipe
# uyghur-udt-ud-2.4-190531.udpipe
# vietnamese-vtb-ud-2.4-190531.udpipe vi
# wolof-wtb-ud-2.4-190531.udpipe
# https://dumps.wikimedia.org/other/cirrussearch/current/afwiki-20191209-cirrussearch-content.json.gz
# https://dumps.wikimedia.org/other/cirrussearch/current/afwiki-20191209-cirrussearch-general.json.gz
# https://dumps.wikimedia.org/other/cirrussearch/current/afwikibooks-20191209-cirrussearch-content.json.gz
# https://dumps.wikimedia.org/other/cirrussearch/current/afwikibooks-20191209-cirrussearch-general.json.gz
# https://dumps.wikimedia.org/other/cirrussearch/current/afwikiquote-20191209-cirrussearch-content.json.gz
# https://dumps.wikimedia.org/other/cirrussearch/current/afwikiquote-20191209-cirrussearch-general.json.gz
# https://dumps.wikimedia.org/other/cirrussearch/current/afwiktionary-20191209-cirrussearch-content.json.gz
# https://dumps.wikimedia.org/other/cirrussearch/current/afwiktionary-20191209-cirrussearch-general.json.gz
# enwiki-20191209-cirrussearch-content.json.gz 10-Dec-2019 11:04 22622822308
# enwiki-20191209-cirrussearch-content.json.gz.tmp 10-Dec-2019 10:57 21460369408
# enwiki-20191209-cirrussearch-general.json.gz 10-Dec-2019 16:22 50406322974
# enwiki-20191209-cirrussearch-general.json.gz.tmp 10-Dec-2019 15:50 44720914432
# enwikibooks-20191209-cirrussearch-content.json.gz 10-Dec-2019 16:24 319454731
# enwikibooks-20191209-cirrussearch-general.json.gz 10-Dec-2019 16:25 97206925
# enwikinews-20191209-cirrussearch-content.json.gz 10-Dec-2019 16:26 53746769
# enwikinews-20191209-cirrussearch-general.json.gz 10-Dec-2019 16:36 364098656
# enwikiquote-20191209-cirrussearch-content.json.gz 10-Dec-2019 16:38 234637326
# enwikiquote-20191209-cirrussearch-general.json.gz 10-Dec-2019 16:38 66848855
# enwikisource-20191209-cirrussearch-content.json.gz 10-Dec-2019 17:09 5236203374
# enwikisource-20191209-cirrussearch-content.json..> 10-Dec-2019 17:06 4597481472
# enwikisource-20191209-cirrussearch-general.json.gz 10-Dec-2019 17:11 152492247
# enwikiversity-20191209-cirrussearch-content.jso..> 10-Dec-2019 17:12 145288148
# enwikiversity-20191209-cirrussearch-general.jso..> 10-Dec-2019 17:13 193051475
# enwikivoyage-20191209-cirrussearch-content.json.gz 10-Dec-2019 17:14 179134384
# enwikivoyage-20191209-cirrussearch-general.json.gz 10-Dec-2019 17:15 99357806
# enwiktionary-20191209-cirrussearch-content.json.gz 10-Dec-2019 17:36 2319801836
# enwiktionary-20191209-cirrussearch-content.json..> 10-Dec-2019 17:23 918503424
# enwiktionary-20191209-cirrussearch-general.json.gz 10-Dec-2019 17:42 848846623
# enwiktionary-20191209-cirrussearch-general.json..> 10-Dec-2019 17:40 661585920