#
# backtranslate wiki data
#
# only works with sentencepiece models!
#

PWD      := ${shell pwd}
REPOHOME := ${PWD}/../
TOOLSDIR := ${REPOHOME}tools

include ${REPOHOME}lib/env.mk
include ${REPOHOME}lib/config.mk
include ${REPOHOME}lib/slurm.mk

SRC ?= af
TRG ?= en

## various sources are available:
## the general wikipedia, wikinews, wikibooks, ...
WIKISOURCE ?= wiki

## split size in number of lines
## default part to be selected = aa
SPLIT_SIZE ?= 1000000
PART       ?= aa

## maximum input length (in SentencePiece segments)
## maximum number of sentences to be translated (top N lines)
MAX_LENGTH    ?= 100
MAX_SENTENCES ?= ${SPLIT_SIZE}

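## example invocation (hypothetical values; all of the variables above can be
## overridden on the command line):
##
##   make SRC=fi TRG=en WIKISOURCE=wikinews PART=ab translate
##
## SRC/TRG select the translation direction, WIKISOURCE the dump flavour, and
## PART one of the ${SPLIT_SIZE}-line pieces produced by 'extract-text'
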
LANGPAIR = ${SRC}-${TRG}

PWD := $(shell pwd)

MODELSDIR ?= ../models
MODELHOME ?= ${MODELSDIR}/${LANGPAIR}

## the standard sort order differs from UTF-8-based sorting;
## we prefer models trained on augmented data sets (names joined with '+')
## and therefore need the UTF-8 sort order
## --> use shell sort with a UTF-8 locale instead of make's ${sort}
# MODELZIP = ${lastword ${sort ${wildcard ${MODELHOME}/*-20*.zip}}}
ifneq (${wildcard ${MODELHOME}},)
  MODELZIP  = ${lastword ${shell ls ${MODELHOME}/*-20*.zip 2>/dev/null | LANG=en_US.UTF-8 sort}}
  MODELNAME = ${patsubst %.zip,%,${notdir ${MODELZIP}}}
endif

ifeq (${MODELNAME},)
ifneq (${wildcard ${WORKHOME}/models/${LANGPAIR}},)
  MODELHOME = ../${notdir ${WORKHOME}}/models/${LANGPAIR}
# MODELZIP  = ${lastword ${sort ${wildcard ${MODELHOME}/*-20*.zip}}}
# MODELZIP  = ${lastword ${sort ${wildcard ${MODELHOME}/opus-20*.zip}}}
  MODELZIP  = ${lastword ${shell ls ${MODELHOME}/*-20*.zip 2>/dev/null | LANG=en_US.UTF-8 sort}}
  MODELNAME = ${patsubst %.zip,%,${notdir ${MODELZIP}}}
endif
endif

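## MODELNAME is the package basename without the .zip extension, e.g. a
## (hypothetical) package ${MODELHOME}/opus-2020-01-01.zip yields
## MODELNAME = opus-2020-01-01, and the model is unpacked into
## ${LANGPAIR}/opus-2020-01-01/ by the rules further below
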
## set to 1 if the model used for backtranslation is a multi-target model
## --> the pre-processing script needs to be called differently
ifeq ($(words $(subst +, ,$(lastword $(subst -, ,$(notdir ${MODELHOME}))))),1)
  MULTI_TARGET_MODEL = 0
else
  MULTI_TARGET_MODEL = 1
endif
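
## illustration (hypothetical model directories):
##   MODELHOME = ../models/af-en           --> target part "en" = 1 token          --> MULTI_TARGET_MODEL = 0
##   MODELHOME = ../models/en-ga+cy+br+gd  --> target part "ga+cy+br+gd" = 4 tokens --> MULTI_TARGET_MODEL = 1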

ifdef LOCAL_SCRATCH
  TMPDIR = ${LOCAL_SCRATCH}
endif


ifeq (${shell hostname --domain 2>/dev/null},bullx)
  LOAD_MODULES = module use -a /projappl/nlpl/software/modules/etc/ && \
                 module load nlpl-udpipe nlpl-opus &&
endif

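## WIKILANGS: all two- and three-letter language codes for which a
## ${WIKISOURCE} dump is listed in the downloaded index.html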
ifneq (${wildcard index.html},)
  WIKILANGS = ${sort $(patsubst >%${WIKISOURCE}-,%,${shell grep -o '>..${WIKISOURCE}-' index.html})} \
              ${sort $(patsubst >%${WIKISOURCE}-,%,${shell grep -o '>...${WIKISOURCE}-' index.html})}
endif


LANGID       = ${SRC}
WIKI_HOME    = wiki
WIKIDOC_HOME = wikidoc
WIKI_DIR     = ${WIKI_HOME}/${LANGID}
OUTPUT_DIR   = ${LANGPAIR}

WIKI_TXT = ${WIKI_DIR}/${WIKISOURCE}.${LANGID}.${PART}.gz
WIKI_DOC = ${WIKIDOC_HOME}/${LANGID}/${WIKISOURCE}.${LANGID}.gz
WIKI_SRC = ${OUTPUT_DIR}/${WIKISOURCE}.${PART}_${MODELNAME}.${LANGPAIR}.${SRC}.gz
WIKI_PRE = ${OUTPUT_DIR}/${WIKISOURCE}.${PART}_${MODELNAME}.${LANGPAIR}.${SRC}.spm.gz
WIKI_TRG = ${OUTPUT_DIR}/${WIKISOURCE}.${PART}_${MODELNAME}.${LANGPAIR}.${TRG}.gz

WIKI_LATEST_SRC = ${OUTPUT_DIR}/latest/${WIKISOURCE}.${PART}.${LANGPAIR}.${SRC}.gz
WIKI_LATEST_TRG = ${OUTPUT_DIR}/latest/${WIKISOURCE}.${PART}.${LANGPAIR}.${TRG}.gz


## all parts of this wiki
PARTS = ${sort ${patsubst ${WIKI_DIR}/${WIKISOURCE}.${LANGID}.%.gz,%,${wildcard ${WIKI_DIR}/${WIKISOURCE}.${LANGID}.*.gz}}}

## don't delete translated text if the process crashes
.PRECIOUS: ${WIKI_TRG}

## find wiki downloads
ifneq (${wildcard index.html},)
  WIKI_JSON = $(shell grep -o '${LANGID}${WIKISOURCE}-[0-9]*-cirrussearch-content.json.gz' index.html | head -1)
endif

## we don't need to keep the json file
.INTERMEDIATE: ${WIKI_JSON} ${WIKI_PRE}


## find UDPipe model
ifndef UDPIPE_MODELS
  UDPIPE_MODELS = /projappl/nlpl/software/modules/udpipe/1.2.1-devel/models
endif

LANGNAME = ${shell ${LOAD_MODULES} ${ISO639} -n ${LANGID} | sed 's/"//g' | \
           cut -f1 -d';' | tr ' ' '-' | tr '[:upper:]' '[:lower:]'}

ifeq (${LANGNAME},)
  LANGNAME = xx
endif

ifneq (${wildcard ${UDPIPE_MODELS}},)
  UDPIPE_MODEL = ${notdir $(shell ${LOAD_MODULES} find ${UDPIPE_MODELS}/ -name "${LANGNAME}*.udpipe" | head -1)}
endif


all: index.html
	${MAKE} ${WIKI_LATEST_TRG}
	${MAKE} ${WIKI_LATEST_SRC}


## store the wiki data on allas to make it accessible for everyone
## (requires configuration for the allas project OPUS-MT)

store-wikidocs:
	cd wikidoc && a-put -b OPUS-MT-bt-wikidoc --nc --follow-links --override *
	swift post OPUS-MT-bt-wikidoc --read-acl ".r:*"

store-wiki:
	cd wiki && a-put -b OPUS-MT-bt-wiki --nc --follow-links --override *
	swift post OPUS-MT-bt-wiki --read-acl ".r:*"

fetch-wiki fetch:
	mkdir -p wiki
	${WGET} -O wiki/${SRC}.tar https://object.pouta.csc.fi/OPUS-MT-bt-wiki/${SRC}.tar
	tar -C wiki -xf wiki/${SRC}.tar
	rm -f wiki/${SRC}.tar

fetch-wikidoc:
	mkdir -p wikidoc
	${WGET} -O wikidoc/${SRC}.tar https://object.pouta.csc.fi/OPUS-MT-bt-wikidoc/${SRC}.tar
	tar -C wikidoc -xf wikidoc/${SRC}.tar
	rm -f wikidoc/${SRC}.tar


## tatoeba = tatoeba-monolingual data and tatoeba-models
## TODO: should we loop over all labels?

%-tatoeba:
	${MAKE} WIKI_HOME=../work-tatoeba/data/mono \
		WIKISOURCES="wikipedia wikibooks wikinews wikiquote wikisource" \
		MODELSDIR=../models-tatoeba \
	${@:-tatoeba=}

# %-tatoeba:
#	${MAKE} WIKI_HOME=wiki-iso639-3 \
#		WIKIDOC_HOME=wikidoc-iso639-3 \
#		MODELSDIR=../models-tatoeba \
#	${@:-tatoeba=}


## create ISO639-3-conformant file links
wiki-iso639:
	for l in ${WIKILANGS}; do \
	  i=`iso639 -3 -n $$l`; \
	  mkdir -p wiki-iso639-3/$$i; \
	  for d in `ls wiki/$$l/*.gz`; do \
	    ln -s ${PWD}/$$d wiki-iso639-3/$$i/`basename $$d | sed "s/\.$$l\.\(..\.gz\)/.$$i.\1/"`; \
	  done \
	done

wiki-iso639-doc:
	for l in ${WIKILANGS}; do \
	  i=`iso639 -3 -n $$l`; \
	  mkdir -p wikidoc-iso639-3/$$i; \
	  for d in `ls wikidoc/$$l/*.gz`; do \
	    ln -s ${PWD}/$$d wikidoc-iso639-3/$$i/`basename $$d | sed "s/\.$$l\.\(..\.gz\)/.$$i.\1/"`; \
	  done \
	done


# WIKISOURCES = wiki wikibooks wikinews wikiquote wikisource wiktionary
WIKISOURCES = wiki wikibooks wikinews wikiquote wikisource

.PHONY: translate-all-wikis
translate-all-wikis: ${LANGPAIR}/${MODELNAME}/decoder.yml
	for w in ${WIKISOURCES}; do \
	  ${MAKE} WIKISOURCE=$$w extract-text; \
	  echo "find ${WIKI_DIR} -name '$$w.${LANGID}.${PART}.gz'"; \
	  if [ `find ${WIKI_DIR} -name "$$w.${LANGID}.${PART}.gz" | wc -l` -gt 0 ]; then \
	    echo "${MAKE} WIKISOURCE=$$w translate"; \
	    ${MAKE} WIKISOURCE=$$w translate; \
	  fi \
	done

.PHONY: translate-all-wikiparts
translate-all-wikiparts: ${LANGPAIR}/${MODELNAME}/decoder.yml
	for w in ${WIKISOURCES}; do \
	  ${MAKE} WIKISOURCE=$$w extract-text; \
	  echo "find ${WIKI_DIR} -name '$$w.${LANGID}.${PART}.gz'"; \
	  if [ `find ${WIKI_DIR} -name "$$w.${LANGID}.${PART}.gz" | wc -l` -gt 0 ]; then \
	    echo "${MAKE} WIKISOURCE=$$w translate-all-parts"; \
	    ${MAKE} WIKISOURCE=$$w translate-all-parts; \
	  fi \
	done


translate-all-wikis-jobs: ${LANGPAIR}/${MODELNAME}/decoder.yml
	for w in ${WIKISOURCES}; do \
	  ${MAKE} WIKISOURCE=$$w extract-text; \
	  echo "find ${WIKI_DIR} -name '$$w.${LANGID}.${PART}.gz'"; \
	  if [ `find ${WIKI_DIR} -name "$$w.${LANGID}.${PART}.gz" | wc -l` -gt 0 ]; then \
	    echo "${MAKE} WIKISOURCE=$$w translate"; \
	    ${MAKE} WIKISOURCE=$$w HPC_MEM=4g HPC_CORES=1 WALLTIME=72 translate.submit; \
	  fi \
	done

translate-all-wikiparts-jobs: ${LANGPAIR}/${MODELNAME}/decoder.yml
	for w in ${WIKISOURCES}; do \
	  ${MAKE} WIKISOURCE=$$w extract-text; \
	  echo "find ${WIKI_DIR} -name '$$w.${LANGID}.${PART}.gz'"; \
	  if [ `find ${WIKI_DIR} -name "$$w.${LANGID}.${PART}.gz" | wc -l` -gt 0 ]; then \
	    echo "${MAKE} WIKISOURCE=$$w translate-all-parts"; \
	    ${MAKE} WIKISOURCE=$$w HPC_MEM=4g HPC_CORES=1 WALLTIME=72 translate-all-parts-jobs; \
	  fi \
	done


all-wikitext:
	for w in ${WIKISOURCES}; do \
	  ${MAKE} WIKISOURCE=$$w extract-text; \
	done

all-wikilangs: index.html
	for l in ${WIKILANGS}; do \
	  ${MAKE} LANGID=$$l extract-text; \
	done

all-wikilangs-fast: index.html
	for l in ${WIKILANGS}; do \
	  ${MAKE} SENTSPLITTER="${MOSESSCRIPTS}/ems/support/split-sentences.perl -l $$l" \
		LANGID=$$l extract-text; \
	done

all-wikis-all-langs: index.html
	for l in ${WIKILANGS}; do \
	  for w in ${WIKISOURCES}; do \
	    ${MAKE} WIKISOURCE=$$w LANGID=$$l extract-text; \
	  done \
	done


## auxiliary target to print the selected model name
.PHONY: print-modelname
print-modelname:
	@echo ${MODELNAME}
	@echo ${MODELZIP}
	@echo "${sort ${wildcard ${MODELHOME}/*-20*.zip}}"


all-wikidocs-all-langs: index.html
	for l in ${WIKILANGS}; do \
	  for w in ${WIKISOURCES}; do \
	    ${MAKE} SRC=$$l WIKISOURCE=$$w extract-doc; \
	  done \
	done


## nordic language wikis

all-nordic-wikidocs:
	for l in da et fi fo is nn no sv; do \
	  for w in ${WIKISOURCES}; do \
	    ${MAKE} SRC=$$l WIKISOURCE=$$w extract-doc; \
	  done \
	done


# ar: Arabic
# bg: Bulgarian
# de: German
# el: Greek
# en: English
# es: Spanish
# fr: French
# hi: Hindi
# ru: Russian
# sw: Swahili
# th: Thai
# tr: Turkish
# ur: Urdu
# vi: Vietnamese
# zh: Chinese (Simplified)

xnli-wikidocs:
	for l in ar bg de el en es fr hi ru sw th tr ur vi zh; do \
	  for w in ${WIKISOURCES}; do \
	    ${MAKE} SRC=$$l WIKISOURCE=$$w extract-doc; \
	  done \
	done

## en and es are too big to run through udpipe ...
big-wikidocs:
	for l in en es; do \
	  ${MAKE} SENTSPLITTER="${MOSESSCRIPTS}/ems/support/split-sentences.perl -l $$l" \
		SRC=$$l WIKISOURCE=wiki extract-doc; \
	done

big-fr-wikidocs:
	for l in fr; do \
	  ${MAKE} SENTSPLITTER="${MOSESSCRIPTS}/ems/support/split-sentences.perl -l $$l" \
		SRC=$$l WIKISOURCE=wiki extract-doc; \
	done


#big-wikidocs:
#	for l in ca cs el en es; do \
#	  ${MAKE} SENTSPLITTER="${MOSESSCRIPTS}/ems/support/split-sentences.perl -l $$l" \
#		SRC=$$l WIKISOURCE=wiki extract-doc; \
#	done


translate-thl:
	${MAKE} WIKI_DIR=thl/${SRC} \
		OUTPUT_DIR=thl/${SRC}-${TRG} \
		WIKISOURCE=thl \
		HPC_MEM=4g HPC_CORES=1 WALLTIME=72 \
	translate.submit


fetch-celtic:
	for l in ga cy br gd kw gv; do \
	  ${MAKE} SRC=$$l TRG=en MODELHOME=../models/ga+cy+br+gd+kw+gv-en all-wikitext; \
	done

## translate celtic languages using our multilingual model
## in both directions
translate-celtic-english:
	for l in ga cy br gd kw gv; do \
	  ${MAKE} SRC=$$l TRG=en MODELHOME=../models/ga+cy+br+gd+kw+gv-en all-wikis; \
	done

translate-english-celtic:
	for l in ga cy br gd kw gv; do \
	  ${MAKE} TRG=$$l SRC=en \
		MODELHOME=../models/en-ga+cy+br+gd+kw+gv \
		MULTI_TARGET_MODEL=1 \
		HPC_MEM=4g HPC_CORES=1 WALLTIME=72 translate.submit; \
	done

translate-english-celtic-missing:
	for l in gd; do \
	  ${MAKE} TRG=$$l SRC=en \
		MODELHOME=../models/en-ga+cy+br+gd+kw+gv \
		MULTI_TARGET_MODEL=1 \
		HPC_MEM=4g HPC_CORES=1 WALLTIME=72 translate.submit; \
	done


# test-celtic:
#	for l in ga cy br gd kw gv; do \
#	  ${MAKE} SRC=$$l TRG=en MODELHOME=../models/ga+cy+br+gd+kw+gv-en print-modelname; \
#	done


## for Breton: use the multilingual celtic model to backtranslate
breton:
	${MAKE} SRC=br TRG=en MODELHOME=../models/ga+cy+br+gd+kw+gv-en all-wikis


assamese-english:
	${MAKE} SRC=as TRG=en MODELHOME=${HOME}/research/Opus-MT-train/work/models/as-en all-wikis

english-assamese:
	${MAKE} SRC=en TRG=as MODELHOME=${HOME}/research/Opus-MT-train/work/models/en-as translate.submit


small-romance:
	for l in wa frp oc ca rm lld fur lij lmo gl lad an mwl co nap scn vec sc la; do \
	  ${MAKE} SRC=$$l TRG=en MODELHOME=../models/fr+fr_BE+fr_CA+fr_FR+wa+frp+oc+ca+rm+lld+fur+lij+lmo+es+es_AR+es_CL+es_CO+es_CR+es_DO+es_EC+es_ES+es_GT+es_HN+es_MX+es_NI+es_PA+es_PE+es_PR+es_SV+es_UY+es_VE+pt+pt_br+pt_BR+pt_PT+gl+lad+an+mwl+it+it_IT+co+nap+scn+vec+sc+ro+la-en all-wikis; \
	done

# Tagalog (tl, tgl), Central Bikol (bcl), Malayalam (ml, mal), Bengali (bn, ben), and Mongolian (mn, mon)
wikimedia-focus-wikis:
	for l in tl bcl ml bn mn; do \
	  ${MAKE} SRC=$$l TRG=en all-wikis; \
	done

finland-focus-wikis:
	for l in ru et so ku fa sq vi th pl tr es ar; do \
	  ${MAKE} SRC=$$l TRG=fi all-wikitext; \
	done


uralic-wiki-texts:
	for l in se kv vep; do \
	  ${MAKE} SRC=$$l TRG=en all-wikitext; \
	done


# should be included: vep

uralic-wikis:
	for s in se kv vep; do \
	  for t in se sma smn sms smj vep et fi kv krl nb no nn ru sv en; do \
	    if [ "$$s" != "$$t" ]; then \
	      ${MAKE} SRC=$$s TRG=$$t \
		MULTI_TARGET_MODEL=1 \
		MODELHOME=${HOME}/research/Opus-MT-train/work/models/se+sma+smj+smn+sms+vep+et+fi+kv+krl+nb+no+nn+ru+sv+en-se+sma+smj+smn+sms+vep++et+fi+kv+krl+nb+no+nn+ru+sv+en \
		all-wikis; \
	    fi \
	  done \
	done


# fetch sami corpora from giellatekno

sami-corp:
	for l in sme sma smn sms smj; do \
	  ${MAKE} SRC=$$l giellatekno/$$l/corp.$$l.aa.gz; \
	done

giellatekno/${SRC}/corp.${SRC}.aa.gz:
	${MAKE} victorio.uit.no/biggies/trunk/langs/${SRC}
	mkdir -p ${dir $@}
	find victorio.uit.no/biggies/trunk/langs/${SRC}/corp -type f -regex '.*/[^.]*.txt' |\
	xargs cat | grep . | sed 's/ ¶//' |\
	$(TOKENIZER)/detokenizer.perl -l fi | \
	split -l ${SPLIT_SIZE} - giellatekno/${SRC}/corp.${SRC}.
	gzip -f giellatekno/${SRC}/corp.${SRC}.*

victorio.uit.no/biggies/trunk/langs/${SRC}:
	${WGET} -r -np https://victorio.uit.no/biggies/trunk/langs/${SRC}/corp

giellatekno/se: giellatekno/sme
	-cd giellatekno && ln -s sme se
	-cd giellatekno/sme && ln -s corp.sme.aa.gz corp.se.aa.gz

# cleanup-uralic:
#	for s in se sma smn sms smj kv krl vep; do \
#	  rm -fr $$s-$$s; \
#	done


translate-sami: translate-sami-wiki translate-sami-corp

translate-sami-corp: sami-corp giellatekno/se
	for s in se sma smn sms smj; do \
	  for t in se sma smn sms smj et fi kv krl nb no nn ru sv en; do \
	    if [ "$$s" != "$$t" ]; then \
	      ${MAKE} SRC=$$s TRG=$$t \
		WIKI_DIR=giellatekno/$$s \
		WIKISOURCE=corp \
		MULTI_TARGET_MODEL=1 \
		MODELHOME=${HOME}/research/Opus-MT-train/work/models/se+sma+smj+smn+sms+vep+et+fi+kv+krl+nb+no+nn+ru+sv+en-se+sma+smj+smn+sms+vep+et+fi+kv+krl+nb+no+nn+ru+sv+en \
		HPC_MEM=4g HPC_CORES=1 WALLTIME=72 \
		translate.submit; \
	    fi \
	  done \
	done

translate-sami-wiki:
	for s in se; do \
	  for t in se sma smn sms smj vep et fi kv krl nb no nn ru sv en; do \
	    if [ "$$s" != "$$t" ]; then \
	      ${MAKE} SRC=$$s TRG=$$t \
		MULTI_TARGET_MODEL=1 \
		MODELHOME=${HOME}/research/Opus-MT-train/work/models/se+sma+smj+smn+sms+vep+et+fi+kv+krl+nb+no+nn+ru+sv+en-se+sma+smj+smn+sms+vep+et+fi+kv+krl+nb+no+nn+ru+sv+en \
		HPC_MEM=4g HPC_CORES=1 WALLTIME=72 \
		translate.submit; \
	    fi \
	  done \
	done
	for s in no nn ru sv en; do \
	  for t in se sma smn sms smj; do \
	    if [ "$$s" != "$$t" ]; then \
	      ${MAKE} SRC=$$s TRG=$$t \
		MULTI_TARGET_MODEL=1 \
		MODELHOME=${HOME}/research/Opus-MT-train/work/models/se+sma+smj+smn+sms+vep+et+fi+kv+krl+nb+no+nn+ru+sv+en-se+sma+smj+smn+sms+vep+et+fi+kv+krl+nb+no+nn+ru+sv+en \
		HPC_MEM=4g HPC_CORES=1 WALLTIME=72 \
		translate.submit; \
	    fi \
	  done \
	done


### NEWNEWNEW

translate-sami-xx-wiki:
	for s in se; do \
	  for t in sma smn sms smj fi no sv; do \
	    ${MAKE} SRC=$$s TRG=$$t \
		MULTI_TARGET_MODEL=1 \
		MODELHOME=${HOME}/research/Opus-MT-train/models/se+sma+smj+smn+sms-fi+nb+no+nn+ru+sv+en \
		HPC_MEM=4g HPC_CORES=1 WALLTIME=72 \
		translate.submit; \
	  done \
	done


translate-sami-xx-corp: sami-corp giellatekno/se
	for s in se sma smn sms smj; do \
	  for t in fi no sv; do \
	    if [ "$$s" != "$$t" ]; then \
	      ${MAKE} SRC=$$s TRG=$$t \
		WIKI_DIR=giellatekno/$$s \
		WIKISOURCE=corp \
		MULTI_TARGET_MODEL=1 \
		MODELHOME=${HOME}/research/Opus-MT-train/models/se+sma+smj+smn+sms-fi+nb+no+nn+ru+sv+en \
		HPC_MEM=4g HPC_CORES=1 WALLTIME=72 \
		translate.submit; \
	    fi \
	  done \
	done

translate-xx-sami-wiki:
	for s in fi no nn sv; do \
	  for t in se sma smn sms smj; do \
	    ${MAKE} SRC=$$s TRG=$$t \
		MULTI_TARGET_MODEL=1 \
		MODELHOME=${HOME}/research/Opus-MT-train/models/fi+nb+no+nn+ru+sv+en-se+sma+smj+smn+sms \
		HPC_MEM=4g HPC_CORES=1 WALLTIME=72 \
		translate.submit; \
	  done \
	done


get-data:      ${WIKI_JSON}
extract-text:  ${WIKI_TXT}
extract-doc:   ${WIKI_DOC}
prepare-model: ${LANGPAIR}/${MODELNAME}/decoder.yml
prepare-data:  ${WIKI_PRE}

translate: ${WIKI_LATEST_TRG}
	${MAKE} ${WIKI_LATEST_SRC}

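## typical end-to-end sequence for one language pair and one part
## (example values only; 'make all' and the aggregate targets above bundle the same steps):
##
##   make index.html
##   make SRC=af TRG=en get-data extract-text
##   make SRC=af TRG=en prepare-model prepare-data
##   make SRC=af TRG=en translate
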
## translate all parts
translate-all-parts: ${LANGPAIR}/${MODELNAME}/decoder.yml
	for p in ${PARTS}; do \
	  ${MAKE} PART=$$p translate; \
	done

## create jobs for translating all parts
translate-all-parts-jobs: ${LANGPAIR}/${MODELNAME}/decoder.yml
	for p in ${PARTS}; do \
	  ${MAKE} PART=$$p translate.submit; \
	done

print-names:
	echo ${LANGNAME}
	echo ${UDPIPE_MODEL}
	echo ${WIKI_JSON}
	echo ${MODELNAME}


## fetch the latest model
## ---> TODO: should we fetch from ObjectStorage instead?

${LANGPAIR}/${MODELNAME}/decoder.yml:
ifneq (${MODELZIP},)
	mkdir -p ${dir $@}
	cp ${MODELZIP} ${dir $@}
	cd ${dir $@} && unzip *.zip
endif
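
## the unpacked model package is expected to provide at least decoder.yml,
## source.spm and preprocess.sh, which the rules below rely on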

## pre-process data
## ---> TODO: does that work for multilingual data that need a prefix?

ifeq (${MULTI_TARGET_MODEL},1)
  PREPROCESS_ARGS = ${SRC} ${TRG} ${LANGPAIR}/${MODELNAME}/source.spm
else
  PREPROCESS_ARGS = ${SRC} ${LANGPAIR}/${MODELNAME}/source.spm
endif
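
## i.e. the model's preprocess.sh is called as (schematic, with SRC=af TRG=en):
##   single-target model: preprocess.sh af    af-en/${MODELNAME}/source.spm
##   multi-target model:  preprocess.sh af en af-en/${MODELNAME}/source.spm
## for multi-target models this presumably also inserts the >>en<< target-language
## tag that is stripped off again further below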

${OUTPUT_DIR}/%.${PART}_${MODELNAME}.${LANGPAIR}.${SRC}.spm.gz: ${WIKI_DIR}/%.${SRC}.${PART}.gz
ifneq (${MODELZIP},)
	mkdir -p ${dir $@}
	${MAKE} ${LANGPAIR}/${MODELNAME}/decoder.yml
	${GZCAT} $< |\
	grep -v '[<>{}]' |\
	${LANGPAIR}/${MODELNAME}/preprocess.sh ${PREPROCESS_ARGS} |\
	perl -e 'while (<>){next if (split(/\s+/)>${MAX_LENGTH});print;}' |\
	head -${MAX_SENTENCES} |\
	gzip -c > $@
endif
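
## the perl one-liner above drops every line with more than ${MAX_LENGTH}
## whitespace-separated SentencePiece tokens; head -${MAX_SENTENCES} then keeps
## only the top N lines of the segmented input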


## merge SentencePiece segments back into plain text on the source side
## (why? because some lines of the original wiki text are filtered out above)

${WIKI_SRC}: ${WIKI_PRE}
ifneq (${MODELZIP},)
	mkdir -p ${dir $@}
	${GZCAT} $< |\
	sed 's/ //g;s/▁/ /g' | \
	sed 's/^ *//;s/ *$$//' |\
	sed 's/^>>[a-z]*<< //' |\
	gzip -c > $@
endif
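
## the sed pipeline above reverses the SentencePiece segmentation and drops the
## target-language tag, e.g. (schematic): ">>en<< ▁Dit ▁is ▁' n ▁sin" --> "Dit is 'n sin"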


## remove the target-language prefix from the latest translation files

ALL_LATEST = ${wildcard */latest/*.gz}

fix-prefix:
	for d in ${ALL_LATEST}; do \
	  echo "fix $$d"; \
	  ${ZCAT} $$d | sed 's/^>>[a-z]*<< //' > $$d.fixed; \
	  cat $$d.fixed | gzip -c > $$d; \
	  rm -f $$d.fixed; \
	done


## overwrite the file with the latest translations
## --> this allows multiple translation iterations
##     without duplicating the data we want to use in MT training

${WIKI_LATEST_SRC}: ${WIKI_SRC}
	mkdir -p ${dir $@}
	cp $< $@

${WIKI_LATEST_TRG}: ${WIKI_TRG}
	mkdir -p ${dir $@}
	cp $< $@


## translate

%.${LANGPAIR}.${TRG}.gz: %.${LANGPAIR}.${SRC}.spm.gz
ifneq (${MODELZIP},)
	mkdir -p ${dir $@}
	${MAKE} ${LANGPAIR}/${MODELNAME}/decoder.yml
	${LOAD_ENV} && cd ${LANGPAIR}/${MODELNAME} && ${MARIAN_DECODER} \
		-i ${PWD}/$< \
		-c decoder.yml \
		-d ${MARIAN_GPUS} \
		${MARIAN_DECODER_FLAGS} |\
	sed 's/ //g;s/▁/ /g' | sed 's/^ *//;s/ *$$//' |\
	gzip -c > ${PWD}/$@
#ifneq (${LANGPAIR},)
#ifneq (${MODELNAME},)
#	rm -fr ${LANGPAIR}/${MODELNAME}
#endif
#endif
endif


## index of all downloadable files
index.html:
	${WGET} -nv -O $@ https://dumps.wikimedia.org/other/cirrussearch/current

## wiki in json format
${WIKI_JSON}:
	${WGET} -nv https://dumps.wikimedia.org/other/cirrussearch/current/${WIKI_JSON}


## languages with nonbreaking prefix files
## (i.e. support for the Moses sentence splitter)
# ca cs de el en es fi fr ga hu is it lt lv nl pl pt ro ru sk sl sv ta yue zh

# MOSES_LANGS = ca cs de el en es fi fr ga hu is it lt lv nl pl pt ro ru sk sl sv ta yue zh
MOSES_LANGS = ca cs de el en es fi fr ga hu is it lt lv nl pl pt ro ru sk sl sv ta

## use UDPipe only if there is a UDPipe model
## and LANGID is not supported by the Moses tools (the Moses tools are much faster!)

ifneq (${UDPIPE_MODEL},)
ifneq ($(filter-out ${MOSES_LANGS},${LANGID}),)
  SENTSPLITTER = udpipe --input=horizontal --tokenize \
		${UDPIPE_MODELS}/${UDPIPE_MODEL} |\
		grep '^\# *text *= *\|\# newpar' |\
		sed 's/^\# *text *= *//'
endif
endif

## fallback = moses tools
SENTSPLITTER ?= sed 's/^ *$$/\# newpar/' | \
		${MOSESSCRIPTS}/ems/support/split-sentences.perl -l ${LANGID} |\
		sed -e "s/\# newpar/\n\# newpar\n/g"
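
## example of how the splitter is chosen: with LANGID=af (not in MOSES_LANGS) and an
## Afrikaans UDPipe model installed, the UDPipe splitter above is used; with LANGID=fi
## the filter-out call returns nothing and the Moses fallback applies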


## extract sentences and normalize
## - requires jq, udpipe, and moses-scripts

${WIKI_TXT}: ${WIKI_JSON}
	mkdir -p ${dir $@}
	${LOAD_MODULES} \
	${GZCAT} $< | ${JQ} -r '.text' | \
	grep -v 'null' |\
	grep -v '[<>{}]' |\
	${SENTSPLITTER} |\
	$(TOKENIZER)/replace-unicode-punctuation.perl |\
	$(TOKENIZER)/remove-non-printing-char.perl |\
	sed 's/  */ /g;s/^ *//g;s/ *$$//g' |\
	python3 ../scripts/filter/mono-match-lang.py -l ${LANGID} |\
	${SORT} -u | ${SHUFFLE} |\
	split -l ${SPLIT_SIZE} - ${patsubst %${PART}.gz,%,$@}
	gzip -f ${patsubst %${PART}.gz,%,$@}*
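
## the split files are written as ${WIKI_DIR}/${WIKISOURCE}.${LANGID}.aa.gz,
## .ab.gz, ... (suffixes assigned by 'split'); PART selects one of them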


# $(TOKENIZER)/normalize-punctuation.perl |\


${WIKI_DOC}: ${WIKI_JSON}
	mkdir -p ${dir $@}
	${LOAD_MODULES} \
	${GZCAT} $< | ${JQ} -r '.text' | \
	sed 's/^ *null *$$//' |\
	grep -v '[<>{}]' |\
	${SENTSPLITTER} |\
	$(TOKENIZER)/replace-unicode-punctuation.perl |\
	$(TOKENIZER)/remove-non-printing-char.perl |\
	sed 's/  */ /g;s/^ *//g;s/ *$$//g' |\
	sed 's/^# newpar$$//' |\
	python3 ../scripts/filter/mono-match-lang.py -l ${LANGID} |\
	gzip -c > $@



check-length:
	for d in `find . -maxdepth 1 -type d -name '*-*' -printf "%f "`; do \
	  s=`echo $$d | cut -f1 -d'-'`; \
	  t=`echo $$d | cut -f2 -d'-'`; \
	  echo "check $$d"; \
	  for S in `ls $$d/*.$$s.gz`; do \
	    T=`echo $$S | sed "s/\.$$s\.gz/.$$t.gz/"`; \
	    echo "$$S -- $$T"; \
	    ${GZCAT} $$S | wc -l; \
	    ${GZCAT} $$T | wc -l; \
	    if [ `${GZCAT} $$S | wc -l` != `${GZCAT} $$T | wc -l` ]; then \
	      echo "$$S != $$T"; \
	    fi \
	  done \
	done
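
## source and target files of a translated pair should have the same number of
## lines; a mismatch usually points to an interrupted or failed translation job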


## OLD: without splitting into parts
#
# ${WIKI_TXT}: ${WIKI_JSON}
#	${LOAD_MODULES} \
#	${ZCAT} $< | ${JQ} -r '.text' | \
#	grep -v 'null' |\
#	${SENTSPLITTER} |\
#	$(TOKENIZER)/replace-unicode-punctuation.perl |\
#	$(TOKENIZER)/remove-non-printing-char.perl |\
#	$(TOKENIZER)/normalize-punctuation.perl |\
#	sed 's/  */ /g;s/^ *//g;s/ *$$//g' |\
#	gzip -c > $@


# afrikaans-afribooms-ud-2.4-190531.udpipe af
# ancient_greek-perseus-ud-2.4-190531.udpipe
# ancient_greek-proiel-ud-2.4-190531.udpipe
# arabic-padt-ud-2.4-190531.udpipe ar
# armenian-armtdp-ud-2.4-190531.udpipe
# basque-bdt-ud-2.4-190531.udpipe eu
# belarusian-hse-ud-2.4-190531.udpipe
# bulgarian-btb-ud-2.4-190531.udpipe bg
# catalan-ancora-ud-2.4-190531.udpipe ca
# chinese-gsd-ud-2.4-190531.udpipe zh
# classical_chinese-kyoto-ud-2.4-190531.udpipe zh_tw
# coptic-scriptorium-ud-2.4-190531.udpipe
# croatian-set-ud-2.4-190531.udpipe hr
# czech-cac-ud-2.4-190531.udpipe cs
# czech-cltt-ud-2.4-190531.udpipe cs
# czech-fictree-ud-2.4-190531.udpipe cs
# czech-pdt-ud-2.4-190531.udpipe cs
# danish-ddt-ud-2.4-190531.udpipe da
# dutch-alpino-ud-2.4-190531.udpipe nl
# dutch-lassysmall-ud-2.4-190531.udpipe nl
# english-ewt-ud-2.4-190531.udpipe en
# english-gum-ud-2.4-190531.udpipe en
# english-lines-ud-2.4-190531.udpipe en
# english-partut-ud-2.4-190531.udpipe en
# estonian-edt-ud-2.4-190531.udpipe et
# estonian-ewt-ud-2.4-190531.udpipe et
# finnish-ftb-ud-2.4-190531.udpipe fi
# finnish-tdt-ud-2.4-190531.udpipe fi
# french-gsd-ud-2.4-190531.udpipe fr
# french-partut-ud-2.4-190531.udpipe fr
# french-sequoia-ud-2.4-190531.udpipe fr
# french-spoken-ud-2.4-190531.udpipe fr
# galician-ctg-ud-2.4-190531.udpipe gl
# galician-treegal-ud-2.4-190531.udpipe gl
# german-gsd-ud-2.4-190531.udpipe de
# gothic-proiel-ud-2.4-190531.udpipe
# greek-gdt-ud-2.4-190531.udpipe el
# hebrew-htb-ud-2.4-190531.udpipe he
# hindi-hdtb-ud-2.4-190531.udpipe hi
# hungarian-szeged-ud-2.4-190531.udpipe hu
# indonesian-gsd-ud-2.4-190531.udpipe id
# irish-idt-ud-2.4-190531.udpipe ga
# italian-isdt-ud-2.4-190531.udpipe it
# italian-partut-ud-2.4-190531.udpipe it
# italian-postwita-ud-2.4-190531.udpipe it
# italian-vit-ud-2.4-190531.udpipe it
# japanese-gsd-ud-2.4-190531.udpipe ja
# korean-gsd-ud-2.4-190531.udpipe ko
# korean-kaist-ud-2.4-190531.udpipe ko
# latin-ittb-ud-2.4-190531.udpipe la
# latin-perseus-ud-2.4-190531.udpipe la
# latin-proiel-ud-2.4-190531.udpipe la
# latvian-lvtb-ud-2.4-190531.udpipe lv
# lithuanian-alksnis-ud-2.4-190531.udpipe lt
# lithuanian-hse-ud-2.4-190531.udpipe lt
# maltese-mudt-ud-2.4-190531.udpipe mt
# marathi-ufal-ud-2.4-190531.udpipe
# north_sami-giella-ud-2.4-190531.udpipe
# norwegian-bokmaal-ud-2.4-190531.udpipe nb
# norwegian-nynorsklia-ud-2.4-190531.udpipe nn
# norwegian-nynorsk-ud-2.4-190531.udpipe nn
# old_church_slavonic-proiel-ud-2.4-190531.udpipe
# old_french-srcmf-ud-2.4-190531.udpipe
# old_russian-torot-ud-2.4-190531.udpipe
# persian-seraji-ud-2.4-190531.udpipe fa
# polish-lfg-ud-2.4-190531.udpipe pl
# polish-pdb-ud-2.4-190531.udpipe pl
# portuguese-bosque-ud-2.4-190531.udpipe pt
# portuguese-gsd-ud-2.4-190531.udpipe pt
# romanian-nonstandard-ud-2.4-190531.udpipe ro
# romanian-rrt-ud-2.4-190531.udpipe ro
# russian-gsd-ud-2.4-190531.udpipe ru
# russian-syntagrus-ud-2.4-190531.udpipe ru
# russian-taiga-ud-2.4-190531.udpipe ru
# serbian-set-ud-2.4-190531.udpipe sr
# slovak-snk-ud-2.4-190531.udpipe sk
# slovenian-ssj-ud-2.4-190531.udpipe sl
# slovenian-sst-ud-2.4-190531.udpipe sl
# spanish-ancora-ud-2.4-190531.udpipe es
# spanish-gsd-ud-2.4-190531.udpipe es
# swedish-lines-ud-2.4-190531.udpipe sv
# swedish-talbanken-ud-2.4-190531.udpipe sv
# tamil-ttb-ud-2.4-190531.udpipe
# telugu-mtg-ud-2.4-190531.udpipe
# turkish-imst-ud-2.4-190531.udpipe tr
# ukrainian-iu-ud-2.4-190531.udpipe
# urdu-udtb-ud-2.4-190531.udpipe
# uyghur-udt-ud-2.4-190531.udpipe
# vietnamese-vtb-ud-2.4-190531.udpipe vi
# wolof-wtb-ud-2.4-190531.udpipe


# https://dumps.wikimedia.org/other/cirrussearch/current/afwiki-20191209-cirrussearch-content.json.gz
# https://dumps.wikimedia.org/other/cirrussearch/current/afwiki-20191209-cirrussearch-general.json.gz

# https://dumps.wikimedia.org/other/cirrussearch/current/afwikibooks-20191209-cirrussearch-content.json.gz
# https://dumps.wikimedia.org/other/cirrussearch/current/afwikibooks-20191209-cirrussearch-general.json.gz

# https://dumps.wikimedia.org/other/cirrussearch/current/afwikiquote-20191209-cirrussearch-content.json.gz
# https://dumps.wikimedia.org/other/cirrussearch/current/afwikiquote-20191209-cirrussearch-general.json.gz

# https://dumps.wikimedia.org/other/cirrussearch/current/afwiktionary-20191209-cirrussearch-content.json.gz
# https://dumps.wikimedia.org/other/cirrussearch/current/afwiktionary-20191209-cirrussearch-general.json.gz


# enwiki-20191209-cirrussearch-content.json.gz          10-Dec-2019 11:04  22622822308
# enwiki-20191209-cirrussearch-content.json.gz.tmp      10-Dec-2019 10:57  21460369408
# enwiki-20191209-cirrussearch-general.json.gz          10-Dec-2019 16:22  50406322974
# enwiki-20191209-cirrussearch-general.json.gz.tmp      10-Dec-2019 15:50  44720914432
# enwikibooks-20191209-cirrussearch-content.json.gz     10-Dec-2019 16:24  319454731
# enwikibooks-20191209-cirrussearch-general.json.gz     10-Dec-2019 16:25  97206925
# enwikinews-20191209-cirrussearch-content.json.gz      10-Dec-2019 16:26  53746769
# enwikinews-20191209-cirrussearch-general.json.gz      10-Dec-2019 16:36  364098656
# enwikiquote-20191209-cirrussearch-content.json.gz     10-Dec-2019 16:38  234637326
# enwikiquote-20191209-cirrussearch-general.json.gz     10-Dec-2019 16:38  66848855
# enwikisource-20191209-cirrussearch-content.json.gz    10-Dec-2019 17:09  5236203374
# enwikisource-20191209-cirrussearch-content.json..>    10-Dec-2019 17:06  4597481472
# enwikisource-20191209-cirrussearch-general.json.gz    10-Dec-2019 17:11  152492247
# enwikiversity-20191209-cirrussearch-content.jso..>    10-Dec-2019 17:12  145288148
# enwikiversity-20191209-cirrussearch-general.jso..>    10-Dec-2019 17:13  193051475
# enwikivoyage-20191209-cirrussearch-content.json.gz    10-Dec-2019 17:14  179134384
# enwikivoyage-20191209-cirrussearch-general.json.gz    10-Dec-2019 17:15  99357806
# enwiktionary-20191209-cirrussearch-content.json.gz    10-Dec-2019 17:36  2319801836
# enwiktionary-20191209-cirrussearch-content.json..>    10-Dec-2019 17:23  918503424
# enwiktionary-20191209-cirrussearch-general.json.gz    10-Dec-2019 17:42  848846623
# enwiktionary-20191209-cirrussearch-general.json..>    10-Dec-2019 17:40  661585920