# OPUS-MT-train/backtranslate/Makefile
#
# backtranslate wiki data
#
# only works with sentencepiece models!
#
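##
## typical usage (hypothetical example, assuming a model exists in ${MODELSDIR}/af-en):
##
##   make SRC=af TRG=en fetch-wiki    # download extracted wiki text
##   make SRC=af TRG=en translate     # backtranslate the first part (PART=aa)
##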
PWD := ${shell pwd}
REPOHOME := ${PWD}/../
TOOLSDIR := ${REPOHOME}tools
include ${REPOHOME}lib/env.mk
include ${REPOHOME}lib/config.mk
include ${REPOHOME}lib/slurm.mk
SRC ?= af
TRG ?= en
## various sources are available:
## WIKISOURCE can be the general wikipedia, wikinews, wikibooks, ...
WIKISOURCE ?= wiki
## split size in number of lines
## default part to be selected = aa
SPLIT_SIZE ?= 1000000
PART ?= aa
## maximum input length (number of SentencePiece segments)
## maximum number of sentences to be translated (top N lines)
MAX_LENGTH ?= 100
MAX_SENTENCES ?= ${SPLIT_SIZE}
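## example: the extracted text is chopped up with GNU split, which produces
## parts with suffixes aa, ab, ac, ...; PART selects one of those chunks,
## e.g. the second million lines (hypothetical invocation):
##
##   make SRC=af PART=ab translate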
LANGPAIR = ${SRC}-${TRG}
MODELSDIR ?= ../models
MODELHOME ?= ${MODELSDIR}/${LANGPAIR}
## the standard sort order differs from UTF-8-based sorting;
## we prefer models trained on augmented data sets ('+' in the name),
## and selecting those reliably requires the UTF-8 sort order
## --> use shell sort with a UTF-8 locale instead of make's built-in ${sort}
# MODELZIP = ${lastword ${sort ${wildcard ${MODELHOME}/*-20*.zip}}}
ifneq (${wildcard ${MODELHOME}},)
MODELZIP = ${lastword ${shell ls ${MODELHOME}/*-20*.zip 2>/dev/null | LANG=en_US.UTF-8 sort}}
MODELNAME = ${patsubst %.zip,%,${notdir ${MODELZIP}}}
endif
ifeq (${MODELNAME},)
ifneq (${wildcard ${WORKHOME}/models/${LANGPAIR}},)
MODELHOME = ../${notdir ${WORKHOME}}/models/${LANGPAIR}
# MODELZIP = ${lastword ${sort ${wildcard ${MODELHOME}/*-20*.zip}}}
# MODELZIP = ${lastword ${sort ${wildcard ${MODELHOME}/opus-20*.zip}}}
MODELZIP = ${lastword ${shell ls ${MODELHOME}/*-20*.zip 2>/dev/null | LANG=en_US.UTF-8 sort}}
MODELNAME = ${patsubst %.zip,%,${notdir ${MODELZIP}}}
endif
endif
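## why the locale matters for the model selection above (a sketch with
## made-up file names, not verified for every locale): in the C locale
## '+' (0x2b) sorts before '-' (0x2d), so "opus+bt-2021-02-01.zip" would
## precede "opus-2021-02-01.zip" and ${lastword ...} would pick the plain
## model; with en_US.UTF-8 collation the punctuation is largely ignored
## and the augmented model ends up last:
##
##   ls ${MODELHOME}/*-20*.zip | LANG=en_US.UTF-8 sort | tail -1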
## set to 1 if the model used for backtranslation is a multi-target model
## --> the pre-processing script needs to be called differently (see PREPROCESS_ARGS below)
ifeq ($(words $(subst +, ,$(lastword $(subst -, ,$(notdir ${MODELHOME}))))),1)
MULTI_TARGET_MODEL = 0
else
MULTI_TARGET_MODEL = 1
endif
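## multi-target models expect a target-language token at the start of every
## input line (the same ">>xxx<<" label that fix-prefix strips again below);
## an illustrative, made-up input line for a multi-target en-celtic model:
##
##   >>gle<< ▁this ▁is ▁a ▁test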
ifdef LOCAL_SCRATCH
TMPDIR = ${LOCAL_SCRATCH}
endif
ifeq (${shell hostname --domain 2>/dev/null},bullx)
LOAD_MODULES = module use -a /projappl/nlpl/software/modules/etc/ && \
module load nlpl-udpipe nlpl-opus &&
endif
ifneq (${wildcard index.html},)
WIKILANGS = ${sort $(patsubst >%${WIKISOURCE}-,%,${shell grep -o '>..${WIKISOURCE}-' index.html})} \
${sort $(patsubst >%${WIKISOURCE}-,%,${shell grep -o '>...${WIKISOURCE}-' index.html})}
endif
LANGID = ${SRC}
WIKI_HOME = wiki
WIKIDOC_HOME = wikidoc
WIKI_DIR = ${WIKI_HOME}/${LANGID}
OUTPUT_DIR = ${LANGPAIR}
WIKI_TXT = ${WIKI_DIR}/${WIKISOURCE}.${LANGID}.${PART}.gz
WIKI_DOC = ${WIKIDOC_HOME}/${LANGID}/${WIKISOURCE}.${LANGID}.gz
WIKI_SRC = ${OUTPUT_DIR}/${WIKISOURCE}.${PART}_${MODELNAME}.${LANGPAIR}.${SRC}.gz
WIKI_PRE = ${OUTPUT_DIR}/${WIKISOURCE}.${PART}_${MODELNAME}.${LANGPAIR}.${SRC}.spm.gz
WIKI_TRG = ${OUTPUT_DIR}/${WIKISOURCE}.${PART}_${MODELNAME}.${LANGPAIR}.${TRG}.gz
WIKI_LATEST_SRC = ${OUTPUT_DIR}/latest/${WIKISOURCE}.${PART}.${LANGPAIR}.${SRC}.gz
WIKI_LATEST_TRG = ${OUTPUT_DIR}/latest/${WIKISOURCE}.${PART}.${LANGPAIR}.${TRG}.gz
## all parts of this wiki
PARTS = ${sort ${patsubst ${WIKI_DIR}/${WIKISOURCE}.${LANGID}.%.gz,%,${wildcard ${WIKI_DIR}/${WIKISOURCE}.${LANGID}.*.gz}}}
## don't delete translated text if the process crashes
.PRECIOUS: ${WIKI_TRG}
## find wiki downloads
ifneq (${wildcard index.html},)
WIKI_JSON = $(shell grep -o '${LANGID}${WIKISOURCE}-[0-9]*-cirrussearch-content.json.gz' index.html | head -1)
endif
## we don't need to keep the json file
.INTERMEDIATE: ${WIKI_JSON} ${WIKI_PRE}
## find UDPipe model
ifndef UDPIPE_MODELS
UDPIPE_MODELS=/projappl/nlpl/software/modules/udpipe/1.2.1-devel/models
endif
LANGNAME = ${shell ${LOAD_MODULES} ${ISO639} -n ${LANGID} | sed 's/"//g' | \
cut -f1 -d';' | tr ' ' '-' | tr '[:upper:]' '[:lower:]'}
ifeq (${LANGNAME},)
LANGNAME = xx
endif
ifneq (${wildcard ${UDPIPE_MODELS}},)
UDPIPE_MODEL = ${notdir $(shell ${LOAD_MODULES} find ${UDPIPE_MODELS}/ -name "${LANGNAME}*.udpipe" | head -1)}
endif
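## example of the language-name lookup above (assuming the iso639 tool from
## the nlpl-opus module): 'iso639 -n fi' prints "Finnish", which becomes
## 'finnish' after lower-casing and then matches 'finnish-*.udpipe' models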
all: index.html
${MAKE} ${WIKI_LATEST_TRG}
${MAKE} ${WIKI_LATEST_SRC}
## store wiki data on Allas to make it accessible for everyone
## requires Allas configuration for the OPUS-MT project
store-wikidocs:
cd wikidoc && a-put -b OPUS-MT-bt-wikidoc --nc --follow-links --override *
swift post OPUS-MT-bt-wikidoc --read-acl ".r:*"
store-wiki:
cd wiki && a-put -b OPUS-MT-bt-wiki --nc --follow-links --override *
swift post OPUS-MT-bt-wiki --read-acl ".r:*"
fetch-wiki fetch:
mkdir -p wiki
${WGET} -O wiki/${SRC}.tar https://object.pouta.csc.fi/OPUS-MT-bt-wiki/${SRC}.tar
tar -C wiki -xf wiki/${SRC}.tar
rm -f wiki/${SRC}.tar
fetch-wikidoc:
mkdir -p wikidoc
${WGET} -O wikidoc/${SRC}.tar https://object.pouta.csc.fi/OPUS-MT-bt-wikidoc/${SRC}.tar
tar -C wikidoc -xf wikidoc/${SRC}.tar
rm -f wikidoc/${SRC}.tar
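## the tarballs unpack into wiki/<lang>/ resp. wikidoc/<lang>/, matching the
## WIKI_DIR layout above, e.g. (hypothetical contents):
##
##   wiki/af/wiki.af.aa.gz
##   wiki/af/wikibooks.af.aa.gz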
## tatoeba = tatoeba-monolingual data and tatoeba-models
## TODO: should we loop over all labels?
%-tatoeba:
${MAKE} WIKI_HOME=../work-tatoeba/data/mono \
WIKISOURCES="wikipedia wikibooks wikinews wikiquote wikisource" \
MODELSDIR=../models-tatoeba \
${@:-tatoeba=}
# %-tatoeba:
# ${MAKE} WIKI_HOME=wiki-iso639-3 \
# WIKIDOC_HOME=wikidoc-iso639-3 \
# MODELSDIR=../models-tatoeba \
# ${@:-tatoeba=}
## create ISO639-3-conformant file links
wiki-iso639:
for l in ${WIKILANGS}; do \
i=`iso639 -3 -n $$l`; \
mkdir -p wiki-iso639-3/$$i; \
for d in `ls wiki/$$l/*.gz`; do \
ln -s ${PWD}/$$d wiki-iso639-3/$$i/`basename $$d | sed "s/\.$$l\.\(..\.gz\)/.$$i.\1/"`; \
done \
done
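## example of the renaming done above (iso639 maps fi to fin):
##
##   wiki/fi/wiki.fi.aa.gz  ->  wiki-iso639-3/fin/wiki.fin.aa.gz  (symlink)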
wiki-iso639-doc:
for l in ${WIKILANGS}; do \
i=`iso639 -3 -n $$l`; \
mkdir -p wikidoc-iso639-3/$$i; \
for d in `ls wikidoc/$$l/*.gz`; do \
ln -s ${PWD}/$$d wikidoc-iso639-3/$$i/`basename $$d | sed "s/\.$$l\.\(..\.gz\)/.$$i.\1/"`; \
done \
done
# WIKISOURCES = wiki wikibooks wikinews wikiquote wikisource wiktionary
WIKISOURCES = wiki wikibooks wikinews wikiquote wikisource
.PHONY: translate-all-wikis
translate-all-wikis: ${LANGPAIR}/${MODELNAME}/decoder.yml
for w in ${WIKISOURCES}; do \
${MAKE} WIKISOURCE=$$w extract-text; \
echo "find ${WIKI_DIR} -name '$$w.${LANGID}.${PART}.gz'"; \
if [ `find ${WIKI_DIR} -name "$$w.${LANGID}.${PART}.gz" | wc -l` -gt 0 ]; then \
echo "${MAKE} WIKISOURCE=$$w translate"; \
${MAKE} WIKISOURCE=$$w translate; \
fi \
done
.PHONY: translate-all-wikiparts
translate-all-wikiparts: ${LANGPAIR}/${MODELNAME}/decoder.yml
for w in ${WIKISOURCES}; do \
${MAKE} WIKISOURCE=$$w extract-text; \
echo "find ${WIKI_DIR} -name '$$w.${LANGID}.${PART}.gz'"; \
if [ `find ${WIKI_DIR} -name "$$w.${LANGID}.${PART}.gz" | wc -l` -gt 0 ]; then \
echo "${MAKE} WIKISOURCE=$$w translate-all-parts"; \
${MAKE} WIKISOURCE=$$w translate-all-parts; \
fi \
done
translate-all-wikis-jobs: ${LANGPAIR}/${MODELNAME}/decoder.yml
for w in ${WIKISOURCES}; do \
${MAKE} WIKISOURCE=$$w extract-text; \
echo "find ${WIKI_DIR} -name '$$w.${LANGID}.${PART}.gz'"; \
if [ `find ${WIKI_DIR} -name "$$w.${LANGID}.${PART}.gz" | wc -l` -gt 0 ]; then \
echo "${MAKE} WIKISOURCE=$$w translate"; \
${MAKE} WIKISOURCE=$$w HPC_MEM=4g HPC_CORES=1 WALLTIME=72 translate.submit; \
fi \
done
translate-all-wikiparts-jobs: ${LANGPAIR}/${MODELNAME}/decoder.yml
for w in ${WIKISOURCES}; do \
${MAKE} WIKISOURCE=$$w extract-text; \
echo "find ${WIKI_DIR} -name '$$w.${LANGID}.${PART}.gz'"; \
if [ `find ${WIKI_DIR} -name "$$w.${LANGID}.${PART}.gz" | wc -l` -gt 0 ]; then \
echo "${MAKE} WIKISOURCE=$$w translate-all-parts"; \
${MAKE} WIKISOURCE=$$w HPC_MEM=4g HPC_CORES=1 WALLTIME=72 translate-all-parts-jobs; \
fi \
done
all-wikitext:
for w in ${WIKISOURCES}; do \
${MAKE} WIKISOURCE=$$w extract-text; \
done
all-wikilangs: index.html
for l in ${WIKILANGS}; do \
${MAKE} LANGID=$$l extract-text; \
done
all-wikilangs-fast: index.html
for l in ${WIKILANGS}; do \
${MAKE} SENTSPLITTER="${MOSESSCRIPTS}/ems/support/split-sentences.perl -l $$l" \
LANGID=$$l extract-text; \
done
all-wikis-all-langs: index.html
for l in ${WIKILANGS}; do \
for w in ${WIKISOURCES}; do \
${MAKE} WIKISOURCE=$$w LANGID=$$l extract-text; \
done \
done
## auxiliary target to print the selected model name
.PHONY: print-modelname
print-modelname:
@echo ${MODELNAME}
@echo ${MODELZIP}
@echo "${sort ${wildcard ${MODELHOME}/*-20*.zip}}"
all-wikidocs-all-langs: index.html
for l in ${WIKILANGS}; do \
for w in ${WIKISOURCES}; do \
${MAKE} SRC=$$l WIKISOURCE=$$w extract-doc; \
done \
done
## nordic language wikis
all-nordic-wikidocs:
for l in da et fi fo is nn no sv; do \
for w in ${WIKISOURCES}; do \
${MAKE} SRC=$$l WIKISOURCE=$$w extract-doc; \
done \
done
# ar: Arabic
# bg: Bulgarian
# de: German
# el: Greek
# en: English
# es: Spanish
# fr: French
# hi: Hindi
# ru: Russian
# sw: Swahili
# th: Thai
# tr: Turkish
# ur: Urdu
# vi: Vietnamese
# zh: Chinese (Simplified)
xnli-wikidocs:
for l in ar bg de el en es fr hi ru sw th tr ur vi zh; do \
for w in ${WIKISOURCES}; do \
${MAKE} SRC=$$l WIKISOURCE=$$w extract-doc; \
done \
done
## en and es are too big to run through UDPipe ...
big-wikidocs:
for l in en es; do \
${MAKE} SENTSPLITTER="${MOSESSCRIPTS}/ems/support/split-sentences.perl -l $$l" \
SRC=$$l WIKISOURCE=wiki extract-doc; \
done
big-fr-wikidocs:
for l in fr; do \
${MAKE} SENTSPLITTER="${MOSESSCRIPTS}/ems/support/split-sentences.perl -l $$l" \
SRC=$$l WIKISOURCE=wiki extract-doc; \
done
#big-wikidocs:
# for l in ca cs el en es; do \
# ${MAKE} SENTSPLITTER="${MOSESSCRIPTS}/ems/support/split-sentences.perl -l $$l" \
# SRC=$$l WIKISOURCE=wiki extract-doc; \
# done
translate-thl:
${MAKE} WIKI_DIR=thl/${SRC} \
OUTPUT_DIR=thl/${SRC}-${TRG} \
WIKISOURCE=thl \
HPC_MEM=4g HPC_CORES=1 WALLTIME=72 \
translate.submit
fetch-celtic:
for l in ga cy br gd kw gv; do \
${MAKE} SRC=$$l TRG=en MODELHOME=../models/ga+cy+br+gd+kw+gv-en all-wikitext; \
done
## translate celtic languages using our multilingual model
## in both directions
translate-celtic-english:
for l in ga cy br gd kw gv; do \
${MAKE} SRC=$$l TRG=en MODELHOME=../models/ga+cy+br+gd+kw+gv-en all-wikis; \
done
translate-english-celtic:
for l in ga cy br gd kw gv; do \
${MAKE} TRG=$$l SRC=en \
MODELHOME=../models/en-ga+cy+br+gd+kw+gv \
MULTI_TARGET_MODEL=1 \
HPC_MEM=4g HPC_CORES=1 WALLTIME=72 translate.submit; \
done
translate-english-celtic-missing:
for l in gd; do \
${MAKE} TRG=$$l SRC=en \
MODELHOME=../models/en-ga+cy+br+gd+kw+gv \
MULTI_TARGET_MODEL=1 \
HPC_MEM=4g HPC_CORES=1 WALLTIME=72 translate.submit; \
done
# test-celtic:
# for l in ga cy br gd kw gv; do \
# ${MAKE} SRC=$$l TRG=en MODELHOME=../models/ga+cy+br+gd+kw+gv-en print-modelname; \
# done
## for Breton: use the multilingual celtic model to backtranslate
breton:
${MAKE} SRC=br TRG=en MODELHOME=../models/ga+cy+br+gd+kw+gv-en all-wikis
assamese-english:
${MAKE} SRC=as TRG=en MODELHOME=${HOME}/research/Opus-MT-train/work/models/as-en all-wikis
english-assamese:
${MAKE} SRC=en TRG=as MODELHOME=${HOME}/research/Opus-MT-train/work/models/en-as translate.submit
small-romance:
for l in wa frp oc ca rm lld fur lij lmo gl lad an mwl co nap scn vec sc la; do \
${MAKE} SRC=$$l TRG=en MODELHOME=../models/fr+fr_BE+fr_CA+fr_FR+wa+frp+oc+ca+rm+lld+fur+lij+lmo+es+es_AR+es_CL+es_CO+es_CR+es_DO+es_EC+es_ES+es_GT+es_HN+es_MX+es_NI+es_PA+es_PE+es_PR+es_SV+es_UY+es_VE+pt+pt_br+pt_BR+pt_PT+gl+lad+an+mwl+it+it_IT+co+nap+scn+vec+sc+ro+la-en all-wikis; \
done
# Tagalog (tl, tgl), Central Bikol (bcl), Malayalam (ml, mal), Bengali (bn, ben), and Mongolian (mn, mon)
wikimedia-focus-wikis:
for l in tl bcl ml bn mn; do \
${MAKE} SRC=$$l TRG=en all-wikis; \
done
finland-focus-wikis:
for l in ru et so ku fa sq vi th pl tr es ar; do \
${MAKE} SRC=$$l TRG=fi all-wikitext; \
done
uralic-wiki-texts:
for l in se kv vep; do \
${MAKE} SRC=$$l TRG=en all-wikitext; \
done
# should be included: vep
uralic-wikis:
for s in se kv vep; do \
for t in se sma smn sms smj vep et fi kv krl nb no nn ru sv en; do \
if [ "$$s" != "$$t" ]; then \
${MAKE} SRC=$$s TRG=$$t \
MULTI_TARGET_MODEL=1 \
MODELHOME=${HOME}/research/Opus-MT-train/work/models/se+sma+smj+smn+sms+vep+et+fi+kv+krl+nb+no+nn+ru+sv+en-se+sma+smj+smn+sms+vep+et+fi+kv+krl+nb+no+nn+ru+sv+en \
all-wikis; \
fi \
done \
done
# fetch sami corpora from giellatekno
sami-corp:
for l in sme sma smn sms smj; do \
${MAKE} SRC=$$l giellatekno/$$l/corp.$$l.aa.gz; \
done
giellatekno/${SRC}/corp.${SRC}.aa.gz:
${MAKE} victorio.uit.no/biggies/trunk/langs/${SRC}
mkdir -p ${dir $@}
find victorio.uit.no/biggies/trunk/langs/${SRC}/corp -type f -regex '.*/[^.]*\.txt' |\
xargs cat | grep . | sed 's/ ¶//' |\
$(TOKENIZER)/detokenizer.perl -l fi | \
split -l ${SPLIT_SIZE} - giellatekno/${SRC}/corp.${SRC}.
gzip -f giellatekno/${SRC}/corp.${SRC}.*
victorio.uit.no/biggies/trunk/langs/${SRC}:
${WGET} -r -np https://victorio.uit.no/biggies/trunk/langs/${SRC}/corp
giellatekno/se: giellatekno/sme
-cd giellatekno && ln -s sme se
-cd giellatekno/sme && ln -s corp.sme.aa.gz corp.se.aa.gz
# cleanup-uralic:
# for s in se sma smn sms smj kv krl vep; do \
# rm -fr $$s-$$s; \
# done
translate-sami: translate-sami-wiki translate-sami-corp
translate-sami-corp: sami-corp giellatekno/se
for s in se sma smn sms smj; do \
for t in se sma smn sms smj et fi kv krl nb no nn ru sv en; do \
if [ "$$s" != "$$t" ]; then \
${MAKE} SRC=$$s TRG=$$t \
WIKI_DIR=giellatekno/$$s \
WIKISOURCE=corp \
MULTI_TARGET_MODEL=1 \
MODELHOME=${HOME}/research/Opus-MT-train/work/models/se+sma+smj+smn+sms+vep+et+fi+kv+krl+nb+no+nn+ru+sv+en-se+sma+smj+smn+sms+vep+et+fi+kv+krl+nb+no+nn+ru+sv+en \
HPC_MEM=4g HPC_CORES=1 WALLTIME=72 \
translate.submit; \
fi \
done \
done
translate-sami-wiki:
for s in se; do \
for t in se sma smn sms smj vep et fi kv krl nb no nn ru sv en; do \
if [ "$$s" != "$$t" ]; then \
${MAKE} SRC=$$s TRG=$$t \
MULTI_TARGET_MODEL=1 \
MODELHOME=${HOME}/research/Opus-MT-train/work/models/se+sma+smj+smn+sms+vep+et+fi+kv+krl+nb+no+nn+ru+sv+en-se+sma+smj+smn+sms+vep+et+fi+kv+krl+nb+no+nn+ru+sv+en \
HPC_MEM=4g HPC_CORES=1 WALLTIME=72 \
translate.submit; \
fi \
done \
done
for s in no nn ru sv en; do \
for t in se sma smn sms smj; do \
if [ "$$s" != "$$t" ]; then \
${MAKE} SRC=$$s TRG=$$t \
MULTI_TARGET_MODEL=1 \
MODELHOME=${HOME}/research/Opus-MT-train/work/models/se+sma+smj+smn+sms+vep+et+fi+kv+krl+nb+no+nn+ru+sv+en-se+sma+smj+smn+sms+vep+et+fi+kv+krl+nb+no+nn+ru+sv+en \
HPC_MEM=4g HPC_CORES=1 WALLTIME=72 \
translate.submit; \
fi \
done \
done
### newer Sami targets using the updated multilingual models
translate-sami-xx-wiki:
for s in se; do \
for t in sma smn sms smj fi no sv; do \
${MAKE} SRC=$$s TRG=$$t \
MULTI_TARGET_MODEL=1 \
MODELHOME=${HOME}/research/Opus-MT-train/models/se+sma+smj+smn+sms-fi+nb+no+nn+ru+sv+en \
HPC_MEM=4g HPC_CORES=1 WALLTIME=72 \
translate.submit; \
done \
done
translate-sami-xx-corp: sami-corp giellatekno/se
for s in se sma smn sms smj; do \
for t in fi no sv; do \
if [ "$$s" != "$$t" ]; then \
${MAKE} SRC=$$s TRG=$$t \
WIKI_DIR=giellatekno/$$s \
WIKISOURCE=corp \
MULTI_TARGET_MODEL=1 \
MODELHOME=${HOME}/research/Opus-MT-train/models/se+sma+smj+smn+sms-fi+nb+no+nn+ru+sv+en \
HPC_MEM=4g HPC_CORES=1 WALLTIME=72 \
translate.submit; \
fi \
done \
done
translate-xx-sami-wiki:
for s in fi no nn sv; do \
for t in se sma smn sms smj; do \
${MAKE} SRC=$$s TRG=$$t \
MULTI_TARGET_MODEL=1 \
MODELHOME=${HOME}/research/Opus-MT-train/models/fi+nb+no+nn+ru+sv+en-se+sma+smj+smn+sms \
HPC_MEM=4g HPC_CORES=1 WALLTIME=72 \
translate.submit; \
done \
done
get-data: ${WIKI_JSON}
extract-text: ${WIKI_TXT}
extract-doc: ${WIKI_DOC}
prepare-model: ${LANGPAIR}/${MODELNAME}/decoder.yml
prepare-data: ${WIKI_PRE}
translate: ${WIKI_LATEST_TRG}
${MAKE} ${WIKI_LATEST_SRC}
## translate all parts
translate-all-parts: ${LANGPAIR}/${MODELNAME}/decoder.yml
for p in ${PARTS}; do \
${MAKE} PART=$$p translate; \
done
## create jobs for translating all parts
translate-all-parts-jobs: ${LANGPAIR}/${MODELNAME}/decoder.yml
for p in ${PARTS}; do \
${MAKE} PART=$$p translate.submit; \
done
print-names:
echo ${LANGNAME}
echo ${UDPIPE_MODEL}
echo ${WIKI_JSON}
echo ${MODELNAME}
## fetch the latest model
## ---> TODO: should we fetch from ObjectStorage instead?
${LANGPAIR}/${MODELNAME}/decoder.yml:
ifneq (${MODELZIP},)
mkdir -p ${dir $@}
cp ${MODELZIP} ${dir $@}
cd ${dir $@} && unzip *.zip
endif
## pre-process data
## ---> TODO: does that work for multilingual data that need a prefix?
ifeq (${MULTI_TARGET_MODEL},1)
PREPROCESS_ARGS = ${SRC} ${TRG} ${LANGPAIR}/${MODELNAME}/source.spm
else
PREPROCESS_ARGS = ${SRC} ${LANGPAIR}/${MODELNAME}/source.spm
endif
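## the preprocess script applies the model's SentencePiece segmentation
## (and, for multi-target models, prepends the target-language token);
## an illustrative, made-up before/after:
##
##   Hello world!   ->   ▁Hello ▁world !
##                  ->   >>fi<< ▁Hello ▁world !   (multi-target case)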
${OUTPUT_DIR}/%.${PART}_${MODELNAME}.${LANGPAIR}.${SRC}.spm.gz: ${WIKI_DIR}/%.${SRC}.${PART}.gz
ifneq (${MODELZIP},)
mkdir -p ${dir $@}
${MAKE} ${LANGPAIR}/${MODELNAME}/decoder.yml
${GZCAT} $< |\
grep -v '[<>{}]' |\
${LANGPAIR}/${MODELNAME}/preprocess.sh ${PREPROCESS_ARGS} |\
perl -e 'while (<>){next if (split(/\s+/)>${MAX_LENGTH});print;}' |\
head -${MAX_SENTENCES} |\
gzip -c > $@
endif
## merge SentencePiece segments in the source text
## (Why? because we filter out some data from the original wiki text, see above)
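## illustrative example of the merging below:
##
##   echo '▁Hello ▁world !' | sed 's/ //g;s/▁/ /g'   ->   ' Hello world!'
##
## (the second sed line then trims the leading/trailing spaces)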
${WIKI_SRC}: ${WIKI_PRE}
ifneq (${MODELZIP},)
mkdir -p ${dir $@}
${GZCAT} $< |\
sed 's/ //g;s/▁/ /g' | \
sed 's/^ *//;s/ *$$//' |\
sed 's/^>>[a-z]*<< //' |\
gzip -c > $@
endif
## remove prefix from latest translation files
ALL_LATEST = ${wildcard */latest/*.gz}
fix-prefix:
for d in ${ALL_LATEST}; do \
echo "fix $$d"; \
${ZCAT} $$d | sed 's/^>>[a-z]*<< //' > $$d.fixed; \
cat $$d.fixed | gzip -c > $$d; \
rm -f $$d.fixed; \
done
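## example of the prefix removal (hypothetical label):
##
##   echo '>>gle<< this is a test' | sed 's/^>>[a-z]*<< //'  ->  this is a test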
## overwrite the file with the latest translations
## --> this allows multiple translation iterations
## without duplicating the data we want to use in MT training
${WIKI_LATEST_SRC}: ${WIKI_SRC}
mkdir -p ${dir $@}
cp $< $@
${WIKI_LATEST_TRG}: ${WIKI_TRG}
mkdir -p ${dir $@}
cp $< $@
## translate
%.${LANGPAIR}.${TRG}.gz: %.${LANGPAIR}.${SRC}.spm.gz
ifneq (${MODELZIP},)
mkdir -p ${dir $@}
${MAKE} ${LANGPAIR}/${MODELNAME}/decoder.yml
${LOAD_ENV} && cd ${LANGPAIR}/${MODELNAME} && ${MARIAN_DECODER} \
-i ${PWD}/$< \
-c decoder.yml \
-d ${MARIAN_GPUS} \
${MARIAN_DECODER_FLAGS} |\
sed 's/ //g;s/▁/ /g' | sed 's/^ *//;s/ *$$//' |\
gzip -c > ${PWD}/$@
#ifneq (${LANGPAIR},)
#ifneq (${MODELNAME},)
# rm -fr ${LANGPAIR}/${MODELNAME}
#endif
#endif
endif
## index of all downloadable files
index.html:
${WGET} -nv -O $@ https://dumps.wikimedia.org/other/cirrussearch/current
## wiki in json format
${WIKI_JSON}:
${WGET} -nv https://dumps.wikimedia.org/other/cirrussearch/current/${WIKI_JSON}
## languages with nonbreaking prefix files
## (i.e. support for the Moses sentence splitter)
# ca cs de el en es fi fr ga hu is it lt lv nl pl pt ro ru sk sl sv ta yue zh
# MOSES_LANGS = ca cs de el en es fi fr ga hu is it lt lv nl pl pt ro ru sk sl sv ta yue zh
MOSES_LANGS = ca cs de el en es fi fr ga hu is it lt lv nl pl pt ro ru sk sl sv ta
## use UDPipe for sentence splitting if a model exists
## and LANGID is not supported by the Moses tools (the Moses scripts are much faster!)
ifneq (${UDPIPE_MODEL},)
ifneq ($(filter-out ${MOSES_LANGS},${LANGID}),)
SENTSPLITTER = udpipe --input=horizontal --tokenize \
${UDPIPE_MODELS}/${UDPIPE_MODEL} |\
grep '^\# *text *= *\|\# newpar' |\
sed 's/^\# *text *= *//'
endif
endif
## fallback = moses tools
SENTSPLITTER ?= sed 's/^ *$$/\# newpar/' | \
${MOSESSCRIPTS}/ems/support/split-sentences.perl -l ${LANGID} |\
sed -e "s/\# newpar/\n\# newpar\n/g"
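## both splitter variants emit one sentence per line, with '# newpar' lines
## marking paragraph boundaries; e.g. (illustrative) the input
## "First sentence. Second one." followed by an empty line becomes
##
##   First sentence.
##   Second one.
##   # newpar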
## extract sentences and normalize
## - requires jq, udpipe, and moses-scripts
${WIKI_TXT}: ${WIKI_JSON}
mkdir -p ${dir $@}
${LOAD_MODULES} \
${GZCAT} $< | ${JQ} -r '.text' | \
grep -v 'null' |\
grep -v '[<>{}]' |\
${SENTSPLITTER} |\
$(TOKENIZER)/replace-unicode-punctuation.perl |\
$(TOKENIZER)/remove-non-printing-char.perl |\
sed 's/ */ /g;s/^ *//g;s/ *$$//g' |\
python3 ../scripts/filter/mono-match-lang.py -l ${LANGID} |\
${SORT} -u | ${SHUFFLE} |\
split -l ${SPLIT_SIZE} - ${patsubst %${PART}.gz,%,$@}
gzip -f ${patsubst %${PART}.gz,%,$@}*
# $(TOKENIZER)/normalize-punctuation.perl |\
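## the jq step above extracts the plain-text field from each cirrussearch
## JSON record, e.g. (illustrative):
##
##   echo '{"text":"Hello world"}' | jq -r '.text'   ->   Hello world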
${WIKI_DOC}: ${WIKI_JSON}
mkdir -p ${dir $@}
${LOAD_MODULES} \
${GZCAT} $< | ${JQ} -r '.text' | \
sed 's/^ *null *$$//' |\
grep -v '[<>{}]' |\
${SENTSPLITTER} |\
$(TOKENIZER)/replace-unicode-punctuation.perl |\
$(TOKENIZER)/remove-non-printing-char.perl |\
sed 's/ */ /g;s/^ *//g;s/ *$$//g' |\
sed 's/^# newpar$$//' |\
python3 ../scripts/filter/mono-match-lang.py -l ${LANGID} |\
gzip -c > $@
check-length:
for d in `find . -maxdepth 1 -type d -name '*-*' -printf "%f "`; do \
s=`echo $$d | cut -f1 -d'-'`; \
t=`echo $$d | cut -f2 -d'-'`; \
echo "check $$d"; \
for S in `ls $$d/*.$$s.gz`; do \
T=`echo $$S | sed "s/\.$$s\.gz/.$$t.gz/"`; \
echo "$$S -- $$T"; \
${GZCAT} $$S | wc -l; \
${GZCAT} $$T | wc -l; \
if [ `${GZCAT} $$S | wc -l` != `${GZCAT} $$T | wc -l` ]; then \
echo "$$S != $$T"; \
fi \
done \
done
## OLD: without splitting into parts
#
# ${WIKI_TXT}: ${WIKI_JSON}
# ${LOAD_MODULES} \
# ${ZCAT} $< | ${JQ} -r '.text' | \
# grep -v 'null' |\
# ${SENTSPLITTER} |\
# $(TOKENIZER)/replace-unicode-punctuation.perl |\
# $(TOKENIZER)/remove-non-printing-char.perl |\
# $(TOKENIZER)/normalize-punctuation.perl |\
# sed 's/ */ /g;s/^ *//g;s/ *$$//g' |\
# gzip -c > $@
# afrikaans-afribooms-ud-2.4-190531.udpipe af
# ancient_greek-perseus-ud-2.4-190531.udpipe
# ancient_greek-proiel-ud-2.4-190531.udpipe
# arabic-padt-ud-2.4-190531.udpipe ar
# armenian-armtdp-ud-2.4-190531.udpipe
# basque-bdt-ud-2.4-190531.udpipe eu
# belarusian-hse-ud-2.4-190531.udpipe
# bulgarian-btb-ud-2.4-190531.udpipe bg
# catalan-ancora-ud-2.4-190531.udpipe ca
# chinese-gsd-ud-2.4-190531.udpipe zh
# classical_chinese-kyoto-ud-2.4-190531.udpipe zh_tw
# coptic-scriptorium-ud-2.4-190531.udpipe
# croatian-set-ud-2.4-190531.udpipe hr
# czech-cac-ud-2.4-190531.udpipe cs
# czech-cltt-ud-2.4-190531.udpipe cs
# czech-fictree-ud-2.4-190531.udpipe cs
# czech-pdt-ud-2.4-190531.udpipe cs
# danish-ddt-ud-2.4-190531.udpipe da
# dutch-alpino-ud-2.4-190531.udpipe nl
# dutch-lassysmall-ud-2.4-190531.udpipe nl
# english-ewt-ud-2.4-190531.udpipe en
# english-gum-ud-2.4-190531.udpipe en
# english-lines-ud-2.4-190531.udpipe en
# english-partut-ud-2.4-190531.udpipe en
# estonian-edt-ud-2.4-190531.udpipe et
# estonian-ewt-ud-2.4-190531.udpipe et
# finnish-ftb-ud-2.4-190531.udpipe fi
# finnish-tdt-ud-2.4-190531.udpipe fi
# french-gsd-ud-2.4-190531.udpipe fr
# french-partut-ud-2.4-190531.udpipe fr
# french-sequoia-ud-2.4-190531.udpipe fr
# french-spoken-ud-2.4-190531.udpipe fr
# galician-ctg-ud-2.4-190531.udpipe gl
# galician-treegal-ud-2.4-190531.udpipe gl
# german-gsd-ud-2.4-190531.udpipe de
# gothic-proiel-ud-2.4-190531.udpipe
# greek-gdt-ud-2.4-190531.udpipe el
# hebrew-htb-ud-2.4-190531.udpipe he
# hindi-hdtb-ud-2.4-190531.udpipe hi
# hungarian-szeged-ud-2.4-190531.udpipe hu
# indonesian-gsd-ud-2.4-190531.udpipe id
# irish-idt-ud-2.4-190531.udpipe ga
# italian-isdt-ud-2.4-190531.udpipe it
# italian-partut-ud-2.4-190531.udpipe it
# italian-postwita-ud-2.4-190531.udpipe it
# italian-vit-ud-2.4-190531.udpipe it
# japanese-gsd-ud-2.4-190531.udpipe ja
# korean-gsd-ud-2.4-190531.udpipe ko
# korean-kaist-ud-2.4-190531.udpipe ko
# latin-ittb-ud-2.4-190531.udpipe la
# latin-perseus-ud-2.4-190531.udpipe la
# latin-proiel-ud-2.4-190531.udpipe la
# latvian-lvtb-ud-2.4-190531.udpipe lv
# lithuanian-alksnis-ud-2.4-190531.udpipe lt
# lithuanian-hse-ud-2.4-190531.udpipe lt
# maltese-mudt-ud-2.4-190531.udpipe mt
# marathi-ufal-ud-2.4-190531.udpipe
# north_sami-giella-ud-2.4-190531.udpipe
# norwegian-bokmaal-ud-2.4-190531.udpipe nb
# norwegian-nynorsklia-ud-2.4-190531.udpipe nn
# norwegian-nynorsk-ud-2.4-190531.udpipe nn
# old_church_slavonic-proiel-ud-2.4-190531.udpipe
# old_french-srcmf-ud-2.4-190531.udpipe
# old_russian-torot-ud-2.4-190531.udpipe
# persian-seraji-ud-2.4-190531.udpipe fa
# polish-lfg-ud-2.4-190531.udpipe pl
# polish-pdb-ud-2.4-190531.udpipe pl
# portuguese-bosque-ud-2.4-190531.udpipe pt
# portuguese-gsd-ud-2.4-190531.udpipe pt
# romanian-nonstandard-ud-2.4-190531.udpipe ro
# romanian-rrt-ud-2.4-190531.udpipe ro
# russian-gsd-ud-2.4-190531.udpipe ru
# russian-syntagrus-ud-2.4-190531.udpipe ru
# russian-taiga-ud-2.4-190531.udpipe ru
# serbian-set-ud-2.4-190531.udpipe sr
# slovak-snk-ud-2.4-190531.udpipe sk
# slovenian-ssj-ud-2.4-190531.udpipe sl
# slovenian-sst-ud-2.4-190531.udpipe sl
# spanish-ancora-ud-2.4-190531.udpipe es
# spanish-gsd-ud-2.4-190531.udpipe es
# swedish-lines-ud-2.4-190531.udpipe sv
# swedish-talbanken-ud-2.4-190531.udpipe sv
# tamil-ttb-ud-2.4-190531.udpipe
# telugu-mtg-ud-2.4-190531.udpipe
# turkish-imst-ud-2.4-190531.udpipe tr
# ukrainian-iu-ud-2.4-190531.udpipe
# urdu-udtb-ud-2.4-190531.udpipe
# uyghur-udt-ud-2.4-190531.udpipe
# vietnamese-vtb-ud-2.4-190531.udpipe vi
# wolof-wtb-ud-2.4-190531.udpipe
# https://dumps.wikimedia.org/other/cirrussearch/current/afwiki-20191209-cirrussearch-content.json.gz
# https://dumps.wikimedia.org/other/cirrussearch/current/afwiki-20191209-cirrussearch-general.json.gz
# https://dumps.wikimedia.org/other/cirrussearch/current/afwikibooks-20191209-cirrussearch-content.json.gz
# https://dumps.wikimedia.org/other/cirrussearch/current/afwikibooks-20191209-cirrussearch-general.json.gz
# https://dumps.wikimedia.org/other/cirrussearch/current/afwikiquote-20191209-cirrussearch-content.json.gz
# https://dumps.wikimedia.org/other/cirrussearch/current/afwikiquote-20191209-cirrussearch-general.json.gz
# https://dumps.wikimedia.org/other/cirrussearch/current/afwiktionary-20191209-cirrussearch-content.json.gz
# https://dumps.wikimedia.org/other/cirrussearch/current/afwiktionary-20191209-cirrussearch-general.json.gz
# enwiki-20191209-cirrussearch-content.json.gz 10-Dec-2019 11:04 22622822308
# enwiki-20191209-cirrussearch-content.json.gz.tmp 10-Dec-2019 10:57 21460369408
# enwiki-20191209-cirrussearch-general.json.gz 10-Dec-2019 16:22 50406322974
# enwiki-20191209-cirrussearch-general.json.gz.tmp 10-Dec-2019 15:50 44720914432
# enwikibooks-20191209-cirrussearch-content.json.gz 10-Dec-2019 16:24 319454731
# enwikibooks-20191209-cirrussearch-general.json.gz 10-Dec-2019 16:25 97206925
# enwikinews-20191209-cirrussearch-content.json.gz 10-Dec-2019 16:26 53746769
# enwikinews-20191209-cirrussearch-general.json.gz 10-Dec-2019 16:36 364098656
# enwikiquote-20191209-cirrussearch-content.json.gz 10-Dec-2019 16:38 234637326
# enwikiquote-20191209-cirrussearch-general.json.gz 10-Dec-2019 16:38 66848855
# enwikisource-20191209-cirrussearch-content.json.gz 10-Dec-2019 17:09 5236203374
# enwikisource-20191209-cirrussearch-content.json..> 10-Dec-2019 17:06 4597481472
# enwikisource-20191209-cirrussearch-general.json.gz 10-Dec-2019 17:11 152492247
# enwikiversity-20191209-cirrussearch-content.jso..> 10-Dec-2019 17:12 145288148
# enwikiversity-20191209-cirrussearch-general.jso..> 10-Dec-2019 17:13 193051475
# enwikivoyage-20191209-cirrussearch-content.json.gz 10-Dec-2019 17:14 179134384
# enwikivoyage-20191209-cirrussearch-general.json.gz 10-Dec-2019 17:15 99357806
# enwiktionary-20191209-cirrussearch-content.json.gz 10-Dec-2019 17:36 2319801836
# enwiktionary-20191209-cirrussearch-content.json..> 10-Dec-2019 17:23 918503424
# enwiktionary-20191209-cirrussearch-general.json.gz 10-Dec-2019 17:42 848846623
# enwiktionary-20191209-cirrussearch-general.json..> 10-Dec-2019 17:40 661585920