# OPUS-MT-train/backtranslate/Makefile
#
# backtranslate wiki data
#
# only works with sentencepiece models!
#
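##
## typical usage (hypothetical example, assuming a model exists in ${MODELSDIR}/af-en):
##
##   make SRC=af TRG=en fetch-wiki    # download extracted wiki text
##   make SRC=af TRG=en translate     # backtranslate the first part (PART=aa)
##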
PWD := ${shell pwd}
REPOHOME := ${PWD}/../
TOOLSDIR := ${REPOHOME}tools
include ${REPOHOME}lib/env.mk
include ${REPOHOME}lib/config.mk
include ${REPOHOME}lib/slurm.mk
SRC ?= af
TRG ?= en
## various sources are available:
## WIKISOURCE can be the general wikipedia, wikinews, wikibooks, ...
WIKISOURCE ?= wiki
## split size in number of lines
## default part to be selected = aa
SPLIT_SIZE ?= 1000000
PART ?= aa
## maximum input length (number of SentencePiece segments)
## maximum number of sentences to be translated (top N lines)
MAX_LENGTH ?= 100
MAX_SENTENCES ?= ${SPLIT_SIZE}
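## example: the extracted text is chopped up with GNU split, which produces
## parts with suffixes aa, ab, ac, ...; PART selects one of those chunks,
## e.g. the second million lines (hypothetical invocation):
##
##   make SRC=af PART=ab translate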
LANGPAIR = ${SRC}-${TRG}
MODELSDIR ?= ../models
MODELHOME ?= ${MODELSDIR}/${LANGPAIR}
## the standard sort order differs from UTF-8-based sorting;
## we prefer models trained on augmented data sets ('+' in the name),
## and selecting those reliably requires the UTF-8 sort order
## --> use shell sort with a UTF-8 locale instead of make's built-in ${sort}
# MODELZIP = ${lastword ${sort ${wildcard ${MODELHOME}/*-20*.zip}}}
ifneq (${wildcard ${MODELHOME}},)
MODELZIP = ${lastword ${shell ls ${MODELHOME}/*-20*.zip 2>/dev/null | LANG=en_US.UTF-8 sort}}
MODELNAME = ${patsubst %.zip,%,${notdir ${MODELZIP}}}
endif
ifeq (${MODELNAME},)
ifneq (${wildcard ${WORKHOME}/models/${LANGPAIR}},)
MODELHOME = ../${notdir ${WORKHOME}}/models/${LANGPAIR}
# MODELZIP = ${lastword ${sort ${wildcard ${MODELHOME}/*-20*.zip}}}
# MODELZIP = ${lastword ${sort ${wildcard ${MODELHOME}/opus-20*.zip}}}
MODELZIP = ${lastword ${shell ls ${MODELHOME}/*-20*.zip 2>/dev/null | LANG=en_US.UTF-8 sort}}
MODELNAME = ${patsubst %.zip,%,${notdir ${MODELZIP}}}
endif
endif
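## why the locale matters for the model selection above (a sketch with
## made-up file names, not verified for every locale): in the C locale
## '+' (0x2b) sorts before '-' (0x2d), so "opus+bt-2021-02-01.zip" would
## precede "opus-2021-02-01.zip" and ${lastword ...} would pick the plain
## model; with en_US.UTF-8 collation the punctuation is largely ignored
## and the augmented model ends up last:
##
##   ls ${MODELHOME}/*-20*.zip | LANG=en_US.UTF-8 sort | tail -1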
## set to 1 if the model used for backtranslation is a multi-target model
## --> the pre-processing script needs to be called differently (see PREPROCESS_ARGS below)
ifeq ($(words $(subst +, ,$(lastword $(subst -, ,$(notdir ${MODELHOME}))))),1)
MULTI_TARGET_MODEL = 0
else
MULTI_TARGET_MODEL = 1
endif
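## multi-target models expect a target-language token at the start of every
## input line (the same ">>xxx<<" label that fix-prefix strips again below);
## an illustrative, made-up input line for a multi-target en-celtic model:
##
##   >>gle<< ▁this ▁is ▁a ▁test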
ifdef LOCAL_SCRATCH
TMPDIR = ${LOCAL_SCRATCH}
endif
ifeq (${shell hostname --domain 2>/dev/null},bullx)
LOAD_MODULES = module use -a /projappl/nlpl/software/modules/etc/ && \
module load nlpl-udpipe nlpl-opus &&
endif
ifneq (${wildcard index.html},)
WIKILANGS = ${sort $(patsubst >%${WIKISOURCE}-,%,${shell grep -o '>..${WIKISOURCE}-' index.html})} \
${sort $(patsubst >%${WIKISOURCE}-,%,${shell grep -o '>...${WIKISOURCE}-' index.html})}
endif
LANGID = ${SRC}
WIKI_HOME = wiki
WIKIDOC_HOME = wikidoc
WIKI_DIR = ${WIKI_HOME}/${LANGID}
OUTPUT_DIR = ${LANGPAIR}
WIKI_TXT = ${WIKI_DIR}/${WIKISOURCE}.${LANGID}.${PART}.gz
WIKI_DOC = ${WIKIDOC_HOME}/${LANGID}/${WIKISOURCE}.${LANGID}.gz
WIKI_SRC = ${OUTPUT_DIR}/${WIKISOURCE}.${PART}_${MODELNAME}.${LANGPAIR}.${SRC}.gz
WIKI_PRE = ${OUTPUT_DIR}/${WIKISOURCE}.${PART}_${MODELNAME}.${LANGPAIR}.${SRC}.spm.gz
WIKI_TRG = ${OUTPUT_DIR}/${WIKISOURCE}.${PART}_${MODELNAME}.${LANGPAIR}.${TRG}.gz
WIKI_LATEST_SRC = ${OUTPUT_DIR}/latest/${WIKISOURCE}.${PART}.${LANGPAIR}.${SRC}.gz
WIKI_LATEST_TRG = ${OUTPUT_DIR}/latest/${WIKISOURCE}.${PART}.${LANGPAIR}.${TRG}.gz
## all parts of this wiki
PARTS = ${sort ${patsubst ${WIKI_DIR}/${WIKISOURCE}.${LANGID}.%.gz,%,${wildcard ${WIKI_DIR}/${WIKISOURCE}.${LANGID}.*.gz}}}
## don't delete translated text if the process crashes
.PRECIOUS: ${WIKI_TRG}
## find wiki downloads
ifneq (${wildcard index.html},)
WIKI_JSON = $(shell grep -o '${LANGID}${WIKISOURCE}-[0-9]*-cirrussearch-content.json.gz' index.html | head -1)
endif
## we don't need to keep the json file
.INTERMEDIATE: ${WIKI_JSON} ${WIKI_PRE}
## find UDPipe model
ifndef UDPIPE_MODELS
UDPIPE_MODELS=/projappl/nlpl/software/modules/udpipe/1.2.1-devel/models
endif
LANGNAME = ${shell ${LOAD_MODULES} ${ISO639} -n ${LANGID} | sed 's/"//g' | \
cut -f1 -d';' | tr ' ' '-' | tr '[:upper:]' '[:lower:]'}
ifeq (${LANGNAME},)
LANGNAME = xx
endif
ifneq (${wildcard ${UDPIPE_MODELS}},)
UDPIPE_MODEL = ${notdir $(shell ${LOAD_MODULES} find ${UDPIPE_MODELS}/ -name "${LANGNAME}*.udpipe" | head -1)}
endif
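## example of the language-name lookup above (assuming the iso639 tool from
## the nlpl-opus module): 'iso639 -n fi' prints "Finnish", which becomes
## 'finnish' after lower-casing and then matches 'finnish-*.udpipe' models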
all: index.html
${MAKE} ${WIKI_LATEST_TRG}
${MAKE} ${WIKI_LATEST_SRC}
## store wiki data on Allas to make it accessible for everyone
## requires Allas configuration for the OPUS-MT project
store-wikidocs:
cd wikidoc && a-put -b OPUS-MT-bt-wikidoc --nc --follow-links --override *
swift post OPUS-MT-bt-wikidoc --read-acl ".r:*"
store-wiki:
cd wiki && a-put -b OPUS-MT-bt-wiki --nc --follow-links --override *
swift post OPUS-MT-bt-wiki --read-acl ".r:*"
fetch-wiki fetch:
mkdir -p wiki
${WGET} -O wiki/${SRC}.tar https://object.pouta.csc.fi/OPUS-MT-bt-wiki/${SRC}.tar
tar -C wiki -xf wiki/${SRC}.tar
rm -f wiki/${SRC}.tar
fetch-wikidoc:
mkdir -p wikidoc
${WGET} -O wikidoc/${SRC}.tar https://object.pouta.csc.fi/OPUS-MT-bt-wikidoc/${SRC}.tar
tar -C wikidoc -xf wikidoc/${SRC}.tar
rm -f wikidoc/${SRC}.tar
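## the tarballs unpack into wiki/<lang>/ resp. wikidoc/<lang>/, matching the
## WIKI_DIR layout above, e.g. (hypothetical contents):
##
##   wiki/af/wiki.af.aa.gz
##   wiki/af/wikibooks.af.aa.gz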
## tatoeba = tatoeba-monolingual data and tatoeba-models
## TODO: should we loop over all labels?
%-tatoeba:
${MAKE} WIKI_HOME=../work-tatoeba/data/mono \
WIKISOURCES="wikipedia wikibooks wikinews wikiquote wikisource" \
MODELSDIR=../models-tatoeba \
${@:-tatoeba=}
# %-tatoeba:
# ${MAKE} WIKI_HOME=wiki-iso639-3 \
# WIKIDOC_HOME=wikidoc-iso639-3 \
# MODELSDIR=../models-tatoeba \
# ${@:-tatoeba=}
## create ISO639-3-conformant file links
wiki-iso639:
for l in ${WIKILANGS}; do \
i=`iso639 -3 -n $$l`; \
mkdir -p wiki-iso639-3/$$i; \
for d in `ls wiki/$$l/*.gz`; do \
ln -s ${PWD}/$$d wiki-iso639-3/$$i/`basename $$d | sed "s/\.$$l\.\(..\.gz\)/.$$i.\1/"`; \
done \
done
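## example of the renaming done above (iso639 maps fi to fin):
##
##   wiki/fi/wiki.fi.aa.gz  ->  wiki-iso639-3/fin/wiki.fin.aa.gz  (symlink)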
wiki-iso639-doc:
for l in ${WIKILANGS}; do \
i=`iso639 -3 -n $$l`; \
mkdir -p wikidoc-iso639-3/$$i; \
for d in `ls wikidoc/$$l/*.gz`; do \
ln -s ${PWD}/$$d wikidoc-iso639-3/$$i/`basename $$d | sed "s/\.$$l\.\(..\.gz\)/.$$i.\1/"`; \
done \
done
# WIKISOURCES = wiki wikibooks wikinews wikiquote wikisource wiktionary
WIKISOURCES = wiki wikibooks wikinews wikiquote wikisource
.PHONY: translate-all-wikis
translate-all-wikis: ${LANGPAIR}/${MODELNAME}/decoder.yml
for w in ${WIKISOURCES}; do \
${MAKE} WIKISOURCE=$$w extract-text; \
echo "find ${WIKI_DIR} -name '$$w.${LANGID}.${PART}.gz'"; \
if [ `find ${WIKI_DIR} -name "$$w.${LANGID}.${PART}.gz" | wc -l` -gt 0 ]; then \
echo "${MAKE} WIKISOURCE=$$w translate"; \
${MAKE} WIKISOURCE=$$w translate; \
fi \
done
.PHONY: translate-all-wikiparts
translate-all-wikiparts: ${LANGPAIR}/${MODELNAME}/decoder.yml
for w in ${WIKISOURCES}; do \
${MAKE} WIKISOURCE=$$w extract-text; \
echo "find ${WIKI_DIR} -name '$$w.${LANGID}.${PART}.gz'"; \
if [ `find ${WIKI_DIR} -name "$$w.${LANGID}.${PART}.gz" | wc -l` -gt 0 ]; then \
echo "${MAKE} WIKISOURCE=$$w translate-all-parts"; \
${MAKE} WIKISOURCE=$$w translate-all-parts; \
fi \
done
translate-all-wikis-jobs: ${LANGPAIR}/${MODELNAME}/decoder.yml
for w in ${WIKISOURCES}; do \
${MAKE} WIKISOURCE=$$w extract-text; \
echo "find ${WIKI_DIR} -name '$$w.${LANGID}.${PART}.gz'"; \
if [ `find ${WIKI_DIR} -name "$$w.${LANGID}.${PART}.gz" | wc -l` -gt 0 ]; then \
echo "${MAKE} WIKISOURCE=$$w translate"; \
${MAKE} WIKISOURCE=$$w HPC_MEM=4g HPC_CORES=1 WALLTIME=72 translate.submit; \
fi \
done
translate-all-wikiparts-jobs: ${LANGPAIR}/${MODELNAME}/decoder.yml
for w in ${WIKISOURCES}; do \
${MAKE} WIKISOURCE=$$w extract-text; \
echo "find ${WIKI_DIR} -name '$$w.${LANGID}.${PART}.gz'"; \
if [ `find ${WIKI_DIR} -name "$$w.${LANGID}.${PART}.gz" | wc -l` -gt 0 ]; then \
echo "${MAKE} WIKISOURCE=$$w translate-all-parts"; \
${MAKE} WIKISOURCE=$$w HPC_MEM=4g HPC_CORES=1 WALLTIME=72 translate-all-parts-jobs; \
fi \
done
all-wikitext:
for w in ${WIKISOURCES}; do \
${MAKE} WIKISOURCE=$$w extract-text; \
done
all-wikilangs: index.html
for l in ${WIKILANGS}; do \
${MAKE} LANGID=$$l extract-text; \
done
all-wikilangs-fast: index.html
for l in ${WIKILANGS}; do \
${MAKE} SENTSPLITTER="${MOSESSCRIPTS}/ems/support/split-sentences.perl -l $$l" \
LANGID=$$l extract-text; \
done
all-wikis-all-langs: index.html
for l in ${WIKILANGS}; do \
for w in ${WIKISOURCES}; do \
${MAKE} WIKISOURCE=$$w LANGID=$$l extract-text; \
done \
done
## auxiliary target to print the selected model name
.PHONY: print-modelname
print-modelname:
@echo ${MODELNAME}
@echo ${MODELZIP}
@echo "${sort ${wildcard ${MODELHOME}/*-20*.zip}}"
all-wikidocs-all-langs: index.html
for l in ${WIKILANGS}; do \
for w in ${WIKISOURCES}; do \
${MAKE} SRC=$$l WIKISOURCE=$$w extract-doc; \
done \
done
## nordic language wikis
all-nordic-wikidocs:
for l in da et fi fo is nn no sv; do \
for w in ${WIKISOURCES}; do \
${MAKE} SRC=$$l WIKISOURCE=$$w extract-doc; \
done \
done
# ar: Arabic
# bg: Bulgarian
# de: German
# el: Greek
# en: English
# es: Spanish
# fr: French
# hi: Hindi
# ru: Russian
# sw: Swahili
# th: Thai
# tr: Turkish
# ur: Urdu
# vi: Vietnamese
# zh: Chinese (Simplified)
xnli-wikidocs:
for l in ar bg de el en es fr hi ru sw th tr ur vi zh; do \
for w in ${WIKISOURCES}; do \
${MAKE} SRC=$$l WIKISOURCE=$$w extract-doc; \
done \
done
## en and es are too big to run through UDPipe ...
big-wikidocs:
for l in en es; do \
${MAKE} SENTSPLITTER="${MOSESSCRIPTS}/ems/support/split-sentences.perl -l $$l" \
SRC=$$l WIKISOURCE=wiki extract-doc; \
done
big-fr-wikidocs:
for l in fr; do \
${MAKE} SENTSPLITTER="${MOSESSCRIPTS}/ems/support/split-sentences.perl -l $$l" \
SRC=$$l WIKISOURCE=wiki extract-doc; \
done
#big-wikidocs:
# for l in ca cs el en es; do \
# ${MAKE} SENTSPLITTER="${MOSESSCRIPTS}/ems/support/split-sentences.perl -l $$l" \
# SRC=$$l WIKISOURCE=wiki extract-doc; \
# done
translate-thl:
${MAKE} WIKI_DIR=thl/${SRC} \
OUTPUT_DIR=thl/${SRC}-${TRG} \
WIKISOURCE=thl \
HPC_MEM=4g HPC_CORES=1 WALLTIME=72 \
translate.submit
fetch-celtic:
for l in ga cy br gd kw gv; do \
${MAKE} SRC=$$l TRG=en MODELHOME=../models/ga+cy+br+gd+kw+gv-en all-wikitext; \
done
## translate celtic languages using our multilingual model
## in both directions
translate-celtic-english:
for l in ga cy br gd kw gv; do \
${MAKE} SRC=$$l TRG=en MODELHOME=../models/ga+cy+br+gd+kw+gv-en all-wikis; \
done
translate-english-celtic:
for l in ga cy br gd kw gv; do \
${MAKE} TRG=$$l SRC=en \
MODELHOME=../models/en-ga+cy+br+gd+kw+gv \
MULTI_TARGET_MODEL=1 \
HPC_MEM=4g HPC_CORES=1 WALLTIME=72 translate.submit; \
done
translate-english-celtic-missing:
for l in gd; do \
${MAKE} TRG=$$l SRC=en \
MODELHOME=../models/en-ga+cy+br+gd+kw+gv \
MULTI_TARGET_MODEL=1 \
HPC_MEM=4g HPC_CORES=1 WALLTIME=72 translate.submit; \
done
# test-celtic:
# for l in ga cy br gd kw gv; do \
# ${MAKE} SRC=$$l TRG=en MODELHOME=../models/ga+cy+br+gd+kw+gv-en print-modelname; \
# done
## for Breton: use the multilingual celtic model to backtranslate
breton:
${MAKE} SRC=br TRG=en MODELHOME=../models/ga+cy+br+gd+kw+gv-en all-wikis
assamese-english:
${MAKE} SRC=as TRG=en MODELHOME=${HOME}/research/Opus-MT-train/work/models/as-en all-wikis
english-assamese:
${MAKE} SRC=en TRG=as MODELHOME=${HOME}/research/Opus-MT-train/work/models/en-as translate.submit
small-romance:
for l in wa frp oc ca rm lld fur lij lmo gl lad an mwl co nap scn vec sc la; do \
${MAKE} SRC=$$l TRG=en MODELHOME=../models/fr+fr_BE+fr_CA+fr_FR+wa+frp+oc+ca+rm+lld+fur+lij+lmo+es+es_AR+es_CL+es_CO+es_CR+es_DO+es_EC+es_ES+es_GT+es_HN+es_MX+es_NI+es_PA+es_PE+es_PR+es_SV+es_UY+es_VE+pt+pt_br+pt_BR+pt_PT+gl+lad+an+mwl+it+it_IT+co+nap+scn+vec+sc+ro+la-en all-wikis; \
done
# Tagalog (tl, tgl), Central Bikol (bcl), Malayalam (ml, mal), Bengali (bn, ben), and Mongolian (mn, mon)
wikimedia-focus-wikis:
for l in tl bcl ml bn mn; do \
${MAKE} SRC=$$l TRG=en all-wikis; \
done
finland-focus-wikis:
for l in ru et so ku fa sq vi th pl tr es ar; do \
${MAKE} SRC=$$l TRG=fi all-wikitext; \
done
uralic-wiki-texts:
for l in se kv vep; do \
${MAKE} SRC=$$l TRG=en all-wikitext; \
done
# should be included: vep
uralic-wikis:
for s in se kv vep; do \
for t in se sma smn sms smj vep et fi kv krl nb no nn ru sv en; do \
if [ "$$s" != "$$t" ]; then \
${MAKE} SRC=$$s TRG=$$t \
MULTI_TARGET_MODEL=1 \
MODELHOME=${HOME}/research/Opus-MT-train/work/models/se+sma+smj+smn+sms+vep+et+fi+kv+krl+nb+no+nn+ru+sv+en-se+sma+smj+smn+sms+vep+et+fi+kv+krl+nb+no+nn+ru+sv+en \
all-wikis; \
fi \
done \
done
# fetch sami corpora from giellatekno
sami-corp:
for l in sme sma smn sms smj; do \
${MAKE} SRC=$$l giellatekno/$$l/corp.$$l.aa.gz; \
done
giellatekno/${SRC}/corp.${SRC}.aa.gz:
${MAKE} victorio.uit.no/biggies/trunk/langs/${SRC}
mkdir -p ${dir $@}
find victorio.uit.no/biggies/trunk/langs/${SRC}/corp -type f -regex '.*/[^.]*\.txt' |\
xargs cat | grep . | sed 's/ ¶//' |\
$(TOKENIZER)/detokenizer.perl -l fi | \
split -l ${SPLIT_SIZE} - giellatekno/${SRC}/corp.${SRC}.
gzip -f giellatekno/${SRC}/corp.${SRC}.*
victorio.uit.no/biggies/trunk/langs/${SRC}:
${WGET} -r -np https://victorio.uit.no/biggies/trunk/langs/${SRC}/corp
giellatekno/se: giellatekno/sme
-cd giellatekno && ln -s sme se
-cd giellatekno/sme && ln -s corp.sme.aa.gz corp.se.aa.gz
# cleanup-uralic:
# for s in se sma smn sms smj kv krl vep; do \
# rm -fr $$s-$$s; \
# done
translate-sami: translate-sami-wiki translate-sami-corp
translate-sami-corp: sami-corp giellatekno/se
for s in se sma smn sms smj; do \
for t in se sma smn sms smj et fi kv krl nb no nn ru sv en; do \
if [ "$$s" != "$$t" ]; then \
${MAKE} SRC=$$s TRG=$$t \
WIKI_DIR=giellatekno/$$s \
WIKISOURCE=corp \
MULTI_TARGET_MODEL=1 \
MODELHOME=${HOME}/research/Opus-MT-train/work/models/se+sma+smj+smn+sms+vep+et+fi+kv+krl+nb+no+nn+ru+sv+en-se+sma+smj+smn+sms+vep+et+fi+kv+krl+nb+no+nn+ru+sv+en \
HPC_MEM=4g HPC_CORES=1 WALLTIME=72 \
translate.submit; \
fi \
done \
done
translate-sami-wiki:
for s in se; do \
for t in se sma smn sms smj vep et fi kv krl nb no nn ru sv en; do \
if [ "$$s" != "$$t" ]; then \
${MAKE} SRC=$$s TRG=$$t \
MULTI_TARGET_MODEL=1 \
MODELHOME=${HOME}/research/Opus-MT-train/work/models/se+sma+smj+smn+sms+vep+et+fi+kv+krl+nb+no+nn+ru+sv+en-se+sma+smj+smn+sms+vep+et+fi+kv+krl+nb+no+nn+ru+sv+en \
HPC_MEM=4g HPC_CORES=1 WALLTIME=72 \
translate.submit; \
fi \
done \
done
for s in no nn ru sv en; do \
for t in se sma smn sms smj; do \
if [ "$$s" != "$$t" ]; then \
${MAKE} SRC=$$s TRG=$$t \
MULTI_TARGET_MODEL=1 \
MODELHOME=${HOME}/research/Opus-MT-train/work/models/se+sma+smj+smn+sms+vep+et+fi+kv+krl+nb+no+nn+ru+sv+en-se+sma+smj+smn+sms+vep+et+fi+kv+krl+nb+no+nn+ru+sv+en \
HPC_MEM=4g HPC_CORES=1 WALLTIME=72 \
translate.submit; \
fi \
done \
done
### newer Sami targets using the updated multilingual models
translate-sami-xx-wiki:
for s in se; do \
for t in sma smn sms smj fi no sv; do \
${MAKE} SRC=$$s TRG=$$t \
MULTI_TARGET_MODEL=1 \
MODELHOME=${HOME}/research/Opus-MT-train/models/se+sma+smj+smn+sms-fi+nb+no+nn+ru+sv+en \
HPC_MEM=4g HPC_CORES=1 WALLTIME=72 \
translate.submit; \
done \
done
translate-sami-xx-corp: sami-corp giellatekno/se
for s in se sma smn sms smj; do \
for t in fi no sv; do \
if [ "$$s" != "$$t" ]; then \
${MAKE} SRC=$$s TRG=$$t \
WIKI_DIR=giellatekno/$$s \
WIKISOURCE=corp \
MULTI_TARGET_MODEL=1 \
MODELHOME=${HOME}/research/Opus-MT-train/models/se+sma+smj+smn+sms-fi+nb+no+nn+ru+sv+en \
HPC_MEM=4g HPC_CORES=1 WALLTIME=72 \
translate.submit; \
fi \
done \
done
translate-xx-sami-wiki:
for s in fi no nn sv; do \
for t in se sma smn sms smj; do \
${MAKE} SRC=$$s TRG=$$t \
MULTI_TARGET_MODEL=1 \
MODELHOME=${HOME}/research/Opus-MT-train/models/fi+nb+no+nn+ru+sv+en-se+sma+smj+smn+sms \
HPC_MEM=4g HPC_CORES=1 WALLTIME=72 \
translate.submit; \
done \
done
get-data: ${WIKI_JSON}
extract-text: ${WIKI_TXT}
extract-doc: ${WIKI_DOC}
prepare-model: ${LANGPAIR}/${MODELNAME}/decoder.yml
prepare-data: ${WIKI_PRE}
translate: ${WIKI_LATEST_TRG}
${MAKE} ${WIKI_LATEST_SRC}
## translate all parts
translate-all-parts: ${LANGPAIR}/${MODELNAME}/decoder.yml
for p in ${PARTS}; do \
${MAKE} PART=$$p translate; \
done
## create jobs for translating all parts
translate-all-parts-jobs: ${LANGPAIR}/${MODELNAME}/decoder.yml
for p in ${PARTS}; do \
${MAKE} PART=$$p translate.submit; \
done
print-names:
echo ${LANGNAME}
echo ${UDPIPE_MODEL}
echo ${WIKI_JSON}
echo ${MODELNAME}
## fetch the latest model
## ---> TODO: should we fetch from ObjectStorage instead?
${LANGPAIR}/${MODELNAME}/decoder.yml:
ifneq (${MODELZIP},)
mkdir -p ${dir $@}
cp ${MODELZIP} ${dir $@}
cd ${dir $@} && unzip *.zip
endif
## pre-process data
## ---> TODO: does that work for multilingual data that need a prefix?
ifeq (${MULTI_TARGET_MODEL},1)
PREPROCESS_ARGS = ${SRC} ${TRG} ${LANGPAIR}/${MODELNAME}/source.spm
else
PREPROCESS_ARGS = ${SRC} ${LANGPAIR}/${MODELNAME}/source.spm
endif
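## the preprocess script applies the model's SentencePiece segmentation
## (and, for multi-target models, prepends the target-language token);
## an illustrative, made-up before/after:
##
##   Hello world!   ->   ▁Hello ▁world !
##                  ->   >>fi<< ▁Hello ▁world !   (multi-target case)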
${OUTPUT_DIR}/%.${PART}_${MODELNAME}.${LANGPAIR}.${SRC}.spm.gz: ${WIKI_DIR}/%.${SRC}.${PART}.gz
ifneq (${MODELZIP},)
mkdir -p ${dir $@}
${MAKE} ${LANGPAIR}/${MODELNAME}/decoder.yml
${GZCAT} $< |\
grep -v '[<>{}]' |\
${LANGPAIR}/${MODELNAME}/preprocess.sh ${PREPROCESS_ARGS} |\
perl -e 'while (<>){next if (split(/\s+/)>${MAX_LENGTH});print;}' |\
head -${MAX_SENTENCES} |\
gzip -c > $@
endif
## merge SentencePiece segments in the source text
## (Why? because we filter out some data from the original wiki text, see above)
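## illustrative example of the merging below:
##
##   echo '▁Hello ▁world !' | sed 's/ //g;s/▁/ /g'   ->   ' Hello world!'
##
## (the second sed line then trims the leading/trailing spaces)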
${WIKI_SRC}: ${WIKI_PRE}
ifneq (${MODELZIP},)
mkdir -p ${dir $@}
${GZCAT} $< |\
sed 's/ //g;s/▁/ /g' | \
sed 's/^ *//;s/ *$$//' |\
sed 's/^>>[a-z]*<< //' |\
gzip -c > $@
endif
## remove prefix from latest translation files
ALL_LATEST = ${wildcard */latest/*.gz}
fix-prefix:
for d in ${ALL_LATEST}; do \
echo "fix $$d"; \
${ZCAT} $$d | sed 's/^>>[a-z]*<< //' > $$d.fixed; \
cat $$d.fixed | gzip -c > $$d; \
rm -f $$d.fixed; \
done
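## example of the prefix removal (hypothetical label):
##
##   echo '>>gle<< this is a test' | sed 's/^>>[a-z]*<< //'  ->  this is a test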
## overwrite the file with the latest translations
## --> this allows multiple translation iterations
## without duplicating the data we want to use in MT training
${WIKI_LATEST_SRC}: ${WIKI_SRC}
mkdir -p ${dir $@}
cp $< $@
${WIKI_LATEST_TRG}: ${WIKI_TRG}
mkdir -p ${dir $@}
cp $< $@
## translate
%.${LANGPAIR}.${TRG}.gz: %.${LANGPAIR}.${SRC}.spm.gz
ifneq (${MODELZIP},)
mkdir -p ${dir $@}
${MAKE} ${LANGPAIR}/${MODELNAME}/decoder.yml
${LOAD_ENV} && cd ${LANGPAIR}/${MODELNAME} && ${MARIAN_DECODER} \
-i ${PWD}/$< \
-c decoder.yml \
-d ${MARIAN_GPUS} \
${MARIAN_DECODER_FLAGS} |\
sed 's/ //g;s/▁/ /g' | sed 's/^ *//;s/ *$$//' |\
gzip -c > ${PWD}/$@
#ifneq (${LANGPAIR},)
#ifneq (${MODELNAME},)
# rm -fr ${LANGPAIR}/${MODELNAME}
#endif
#endif
endif
## index of all downloadable files
index.html:
${WGET} -nv -O $@ https://dumps.wikimedia.org/other/cirrussearch/current
## wiki in json format
${WIKI_JSON}:
${WGET} -nv https://dumps.wikimedia.org/other/cirrussearch/current/${WIKI_JSON}
## languages with nonbreaking prefix files
## (i.e. support for the Moses sentence splitter)
# ca cs de el en es fi fr ga hu is it lt lv nl pl pt ro ru sk sl sv ta yue zh
# MOSES_LANGS = ca cs de el en es fi fr ga hu is it lt lv nl pl pt ro ru sk sl sv ta yue zh
MOSES_LANGS = ca cs de el en es fi fr ga hu is it lt lv nl pl pt ro ru sk sl sv ta
## use UDPipe for sentence splitting if a model exists
## and LANGID is not supported by the Moses tools (the Moses scripts are much faster!)
ifneq (${UDPIPE_MODEL},)
ifneq ($(filter-out ${MOSES_LANGS},${LANGID}),)
SENTSPLITTER = udpipe --input=horizontal --tokenize \
${UDPIPE_MODELS}/${UDPIPE_MODEL} |\
grep '^\# *text *= *\|\# newpar' |\
sed 's/^\# *text *= *//'
endif
endif
## fallback = moses tools
SENTSPLITTER ?= sed 's/^ *$$/\# newpar/' | \
${MOSESSCRIPTS}/ems/support/split-sentences.perl -l ${LANGID} |\
sed -e "s/\# newpar/\n\# newpar\n/g"
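## both splitter variants emit one sentence per line, with '# newpar' lines
## marking paragraph boundaries; e.g. (illustrative) the input
## "First sentence. Second one." followed by an empty line becomes
##
##   First sentence.
##   Second one.
##   # newpar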
## extract sentences and normalize
## - requires jq, udpipe, and moses-scripts
${WIKI_TXT}: ${WIKI_JSON}
mkdir -p ${dir $@}
${LOAD_MODULES} \
${GZCAT} $< | ${JQ} -r '.text' | \
grep -v 'null' |\
grep -v '[<>{}]' |\
${SENTSPLITTER} |\
$(TOKENIZER)/replace-unicode-punctuation.perl |\
$(TOKENIZER)/remove-non-printing-char.perl |\
sed 's/ */ /g;s/^ *//g;s/ *$$//g' |\
python3 ../scripts/filter/mono-match-lang.py -l ${LANGID} |\
${SORT} -u | ${SHUFFLE} |\
split -l ${SPLIT_SIZE} - ${patsubst %${PART}.gz,%,$@}
gzip -f ${patsubst %${PART}.gz,%,$@}*
# $(TOKENIZER)/normalize-punctuation.perl |\
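## the jq step above extracts the plain-text field from each cirrussearch
## JSON record, e.g. (illustrative):
##
##   echo '{"text":"Hello world"}' | jq -r '.text'   ->   Hello world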
${WIKI_DOC}: ${WIKI_JSON}
mkdir -p ${dir $@}
${LOAD_MODULES} \
${GZCAT} $< | ${JQ} -r '.text' | \
sed 's/^ *null *$$//' |\
grep -v '[<>{}]' |\
${SENTSPLITTER} |\
$(TOKENIZER)/replace-unicode-punctuation.perl |\
$(TOKENIZER)/remove-non-printing-char.perl |\
sed 's/ */ /g;s/^ *//g;s/ *$$//g' |\
sed 's/^# newpar$$//' |\
python3 ../scripts/filter/mono-match-lang.py -l ${LANGID} |\
gzip -c > $@
check-length:
for d in `find . -maxdepth 1 -type d -name '*-*' -printf "%f "`; do \
s=`echo $$d | cut -f1 -d'-'`; \
t=`echo $$d | cut -f2 -d'-'`; \
echo "check $$d"; \
for S in `ls $$d/*.$$s.gz`; do \
T=`echo $$S | sed "s/\.$$s\.gz/.$$t.gz/"`; \
echo "$$S -- $$T"; \
${GZCAT} $$S | wc -l; \
${GZCAT} $$T | wc -l; \
if [ `${GZCAT} $$S | wc -l` != `${GZCAT} $$T | wc -l` ]; then \
echo "$$S != $$T"; \
fi \
done \
done
## OLD: without splitting into parts
#
# ${WIKI_TXT}: ${WIKI_JSON}
# ${LOAD_MODULES} \
# ${ZCAT} $< | ${JQ} -r '.text' | \
# grep -v 'null' |\
# ${SENTSPLITTER} |\
# $(TOKENIZER)/replace-unicode-punctuation.perl |\
# $(TOKENIZER)/remove-non-printing-char.perl |\
# $(TOKENIZER)/normalize-punctuation.perl |\
# sed 's/ */ /g;s/^ *//g;s/ *$$//g' |\
# gzip -c > $@
# afrikaans-afribooms-ud-2.4-190531.udpipe af
# ancient_greek-perseus-ud-2.4-190531.udpipe
# ancient_greek-proiel-ud-2.4-190531.udpipe
# arabic-padt-ud-2.4-190531.udpipe ar
# armenian-armtdp-ud-2.4-190531.udpipe
# basque-bdt-ud-2.4-190531.udpipe eu
# belarusian-hse-ud-2.4-190531.udpipe
# bulgarian-btb-ud-2.4-190531.udpipe bg
# catalan-ancora-ud-2.4-190531.udpipe ca
# chinese-gsd-ud-2.4-190531.udpipe zh
# classical_chinese-kyoto-ud-2.4-190531.udpipe zh_tw
# coptic-scriptorium-ud-2.4-190531.udpipe
# croatian-set-ud-2.4-190531.udpipe hr
# czech-cac-ud-2.4-190531.udpipe cs
# czech-cltt-ud-2.4-190531.udpipe cs
# czech-fictree-ud-2.4-190531.udpipe cs
# czech-pdt-ud-2.4-190531.udpipe cs
# danish-ddt-ud-2.4-190531.udpipe da
# dutch-alpino-ud-2.4-190531.udpipe nl
# dutch-lassysmall-ud-2.4-190531.udpipe nl
# english-ewt-ud-2.4-190531.udpipe en
# english-gum-ud-2.4-190531.udpipe en
# english-lines-ud-2.4-190531.udpipe en
# english-partut-ud-2.4-190531.udpipe en
# estonian-edt-ud-2.4-190531.udpipe et
# estonian-ewt-ud-2.4-190531.udpipe et
# finnish-ftb-ud-2.4-190531.udpipe fi
# finnish-tdt-ud-2.4-190531.udpipe fi
# french-gsd-ud-2.4-190531.udpipe fr
# french-partut-ud-2.4-190531.udpipe fr
# french-sequoia-ud-2.4-190531.udpipe fr
# french-spoken-ud-2.4-190531.udpipe fr
# galician-ctg-ud-2.4-190531.udpipe gl
# galician-treegal-ud-2.4-190531.udpipe gl
# german-gsd-ud-2.4-190531.udpipe de
# gothic-proiel-ud-2.4-190531.udpipe
# greek-gdt-ud-2.4-190531.udpipe el
# hebrew-htb-ud-2.4-190531.udpipe he
# hindi-hdtb-ud-2.4-190531.udpipe hi
# hungarian-szeged-ud-2.4-190531.udpipe hu
# indonesian-gsd-ud-2.4-190531.udpipe id
# irish-idt-ud-2.4-190531.udpipe ga
# italian-isdt-ud-2.4-190531.udpipe it
# italian-partut-ud-2.4-190531.udpipe it
# italian-postwita-ud-2.4-190531.udpipe it
# italian-vit-ud-2.4-190531.udpipe it
# japanese-gsd-ud-2.4-190531.udpipe ja
# korean-gsd-ud-2.4-190531.udpipe ko
# korean-kaist-ud-2.4-190531.udpipe ko
# latin-ittb-ud-2.4-190531.udpipe la
# latin-perseus-ud-2.4-190531.udpipe la
# latin-proiel-ud-2.4-190531.udpipe la
# latvian-lvtb-ud-2.4-190531.udpipe lv
# lithuanian-alksnis-ud-2.4-190531.udpipe lt
# lithuanian-hse-ud-2.4-190531.udpipe lt
# maltese-mudt-ud-2.4-190531.udpipe mt
# marathi-ufal-ud-2.4-190531.udpipe
# north_sami-giella-ud-2.4-190531.udpipe
# norwegian-bokmaal-ud-2.4-190531.udpipe nb
# norwegian-nynorsklia-ud-2.4-190531.udpipe nn
# norwegian-nynorsk-ud-2.4-190531.udpipe nn
# old_church_slavonic-proiel-ud-2.4-190531.udpipe
# old_french-srcmf-ud-2.4-190531.udpipe
# old_russian-torot-ud-2.4-190531.udpipe
# persian-seraji-ud-2.4-190531.udpipe fa
# polish-lfg-ud-2.4-190531.udpipe pl
# polish-pdb-ud-2.4-190531.udpipe pl
# portuguese-bosque-ud-2.4-190531.udpipe pt
# portuguese-gsd-ud-2.4-190531.udpipe pt
# romanian-nonstandard-ud-2.4-190531.udpipe ro
# romanian-rrt-ud-2.4-190531.udpipe ro
# russian-gsd-ud-2.4-190531.udpipe ru
# russian-syntagrus-ud-2.4-190531.udpipe ru
# russian-taiga-ud-2.4-190531.udpipe ru
# serbian-set-ud-2.4-190531.udpipe sr
# slovak-snk-ud-2.4-190531.udpipe sk
# slovenian-ssj-ud-2.4-190531.udpipe sl
# slovenian-sst-ud-2.4-190531.udpipe sl
# spanish-ancora-ud-2.4-190531.udpipe es
# spanish-gsd-ud-2.4-190531.udpipe es
# swedish-lines-ud-2.4-190531.udpipe sv
# swedish-talbanken-ud-2.4-190531.udpipe sv
# tamil-ttb-ud-2.4-190531.udpipe
# telugu-mtg-ud-2.4-190531.udpipe
# turkish-imst-ud-2.4-190531.udpipe tr
# ukrainian-iu-ud-2.4-190531.udpipe
# urdu-udtb-ud-2.4-190531.udpipe
# uyghur-udt-ud-2.4-190531.udpipe
# vietnamese-vtb-ud-2.4-190531.udpipe vi
# wolof-wtb-ud-2.4-190531.udpipe
# https://dumps.wikimedia.org/other/cirrussearch/current/afwiki-20191209-cirrussearch-content.json.gz
# https://dumps.wikimedia.org/other/cirrussearch/current/afwiki-20191209-cirrussearch-general.json.gz
# https://dumps.wikimedia.org/other/cirrussearch/current/afwikibooks-20191209-cirrussearch-content.json.gz
# https://dumps.wikimedia.org/other/cirrussearch/current/afwikibooks-20191209-cirrussearch-general.json.gz
# https://dumps.wikimedia.org/other/cirrussearch/current/afwikiquote-20191209-cirrussearch-content.json.gz
# https://dumps.wikimedia.org/other/cirrussearch/current/afwikiquote-20191209-cirrussearch-general.json.gz
# https://dumps.wikimedia.org/other/cirrussearch/current/afwiktionary-20191209-cirrussearch-content.json.gz
# https://dumps.wikimedia.org/other/cirrussearch/current/afwiktionary-20191209-cirrussearch-general.json.gz
# enwiki-20191209-cirrussearch-content.json.gz 10-Dec-2019 11:04 22622822308
# enwiki-20191209-cirrussearch-content.json.gz.tmp 10-Dec-2019 10:57 21460369408
# enwiki-20191209-cirrussearch-general.json.gz 10-Dec-2019 16:22 50406322974
# enwiki-20191209-cirrussearch-general.json.gz.tmp 10-Dec-2019 15:50 44720914432
# enwikibooks-20191209-cirrussearch-content.json.gz 10-Dec-2019 16:24 319454731
# enwikibooks-20191209-cirrussearch-general.json.gz 10-Dec-2019 16:25 97206925
# enwikinews-20191209-cirrussearch-content.json.gz 10-Dec-2019 16:26 53746769
# enwikinews-20191209-cirrussearch-general.json.gz 10-Dec-2019 16:36 364098656
# enwikiquote-20191209-cirrussearch-content.json.gz 10-Dec-2019 16:38 234637326
# enwikiquote-20191209-cirrussearch-general.json.gz 10-Dec-2019 16:38 66848855
# enwikisource-20191209-cirrussearch-content.json.gz 10-Dec-2019 17:09 5236203374
# enwikisource-20191209-cirrussearch-content.json..> 10-Dec-2019 17:06 4597481472
# enwikisource-20191209-cirrussearch-general.json.gz 10-Dec-2019 17:11 152492247
# enwikiversity-20191209-cirrussearch-content.jso..> 10-Dec-2019 17:12 145288148
# enwikiversity-20191209-cirrussearch-general.jso..> 10-Dec-2019 17:13 193051475
# enwikivoyage-20191209-cirrussearch-content.json.gz 10-Dec-2019 17:14 179134384
# enwikivoyage-20191209-cirrussearch-general.json.gz 10-Dec-2019 17:15 99357806
# enwiktionary-20191209-cirrussearch-content.json.gz 10-Dec-2019 17:36 2319801836
# enwiktionary-20191209-cirrussearch-content.json..> 10-Dec-2019 17:23 918503424
# enwiktionary-20191209-cirrussearch-general.json.gz 10-Dec-2019 17:42 848846623
# enwiktionary-20191209-cirrussearch-general.json..> 10-Dec-2019 17:40 661585920