From fe16a0c4dd78c5ea7edca2451d7f818d09ec9529 Mon Sep 17 00:00:00 2001 From: Joerg Tiedemann Date: Sat, 11 Jan 2020 00:29:06 +0200 Subject: [PATCH] backtranslation scripts --- Makefile.data | 4 - {work-bt => backtranslate}/Makefile | 116 ++++++++++++++++++++++++--- {work-bt => backtranslate}/README.md | 0 3 files changed, 103 insertions(+), 17 deletions(-) rename {work-bt => backtranslate}/Makefile (76%) rename {work-bt => backtranslate}/README.md (100%) diff --git a/Makefile.data b/Makefile.data index eee8d306..8211d416 100644 --- a/Makefile.data +++ b/Makefile.data @@ -703,7 +703,6 @@ else @echo "$@ already exists!" @echo "WARNING! No new BPE model is created even though the data has changed!" @echo "WARNING! Delete the file if you want to start from scratch!" - touch $@ endif ## no labels on the target language side @@ -716,7 +715,6 @@ else @echo "$@ already exists!" @echo "WARNING! No new BPE codes are created!" @echo "WARNING! Delete the file if you want to start from scratch!" - touch $@ endif @@ -795,7 +793,6 @@ else @echo "$@ already exists!" @echo "WARNING! No new SPM model is created even though the data has changed!" @echo "WARNING! Delete the file if you want to start from scratch!" - touch $@ endif ## no labels on the target language side @@ -813,7 +810,6 @@ else @echo "$@ already exists!" @echo "WARNING! No new SPM model created!" @echo "WARNING! Delete the file if you want to start from scratch!" - touch $@ endif diff --git a/work-bt/Makefile b/backtranslate/Makefile similarity index 76% rename from work-bt/Makefile rename to backtranslate/Makefile index c1b8d345..4c58dd93 100644 --- a/work-bt/Makefile +++ b/backtranslate/Makefile @@ -1,3 +1,26 @@ +# +# backtranslate wiki data +# +# only works with sentencepiece models! 
+# + +include ../Makefile.env +include ../Makefile.config +include ../Makefile.slurm + +SRC = af +TRG = en + +## maximum input length (number sentence piece segments) +MAX_LENGTH = 250 + + +LANGPAIR = ${SRC}-${TRG} + + +MODELHOME = ../models/${LANGPAIR} +MODELZIP = ${lastword ${sort ${wildcard ${MODELHOME}/*.zip}}} + LOAD_MODULES = module use -a /projappl/nlpl/software/modules/etc/ && \ module load nlpl-udpipe nlpl-opus && @@ -6,24 +29,36 @@ WIKILANGS = ${sort $(patsubst >%wiki-,%,${shell grep -o '>..wiki-' index.html}) ${sort $(patsubst >%wiki-,%,${shell grep -o '>...wiki-' index.html})} -LANGID = af +LANGID = ${SRC} WIKI_TXT = wiki.${LANGID}.gz +WIKI_SRC = wiki.${LANGPAIR}.${SRC}.gz +WIKI_PRE = wiki.${LANGPAIR}.${SRC}.spm.gz +WIKI_TRG = wiki.${LANGPAIR}.${TRG}.gz + +## find wiki downloads +WIKI_JSON = $(shell grep -o '${LANGID}wiki-[0-9]*-cirrussearch-content.json.gz' index.html | head -1) + +## find UDPipe model LANGNAME = ${shell ${LOAD_MODULES} opus-iso639 -e ${LANGID} | \ cut -f1 -d';' | tr ' ' '-' | tr '[:upper:]' '[:lower:]'} -UDPIPE_MODEL = ${notdir $(shell find ${UDPIPE_MODELS}/ -name "${LANGNAME}*.udpipe" | head -1)} -WIKI_JSON = $(shell grep -o '${LANGID}wiki-[0-9]*-cirrussearch-content.json.gz' index.html | head -1) +UDPIPE_MODEL = ${notdir $(shell ${LOAD_MODULES} find ${UDPIPE_MODELS}/ -name "${LANGNAME}*.udpipe" | head -1)} + all: index.html + ${MAKE} ${WIKI_SRC} ${WIKI_TRG} + +all-wikis: index.html for l in ${WIKILANGS}; do \ ${MAKE} LANGID=$$l wiki-txt; \ done -wiki-txt: - if [ "${UDPIPE_MODEL}" != "" ]; then \ - ${MAKE} ${WIKI_TXT}; \ - fi + +wiki-txt: ${WIKI_TXT} +prepare-model: ${LANGPAIR}/decoder.yml +prepare-data: ${WIKI_PRE} +translate: ${WIKI_SRC} ${WIKI_TRG} print-names: echo ${LANGNAME} @@ -31,22 +66,77 @@ print-names: echo ${WIKI_JSON} + +${LANGPAIR}/decoder.yml: +ifneq (${MODELZIP},) + mkdir -p ${dir $@} + cp ${MODELZIP} ${dir $@} + cd ${dir $@} && unzip *.zip +endif + + +%.${LANGPAIR}.${SRC}.spm.gz: %.${SRC}.gz +ifneq (${MODELZIP},) + 
${MAKE} ${LANGPAIR}/decoder.yml + zcat $< |\ + ${LANGPAIR}/preprocess.sh ${SRC} ${LANGPAIR}/source.spm |\ + perl -ne 'print if (split(/\s+/)<=${MAX_LENGTH});' |\ + gzip -c > $@ +endif + +${WIKI_SRC}: ${WIKI_PRE} +ifneq (${MODELZIP},) + zcat $< |\ + sed 's/ //g;s/▁/ /g' | \ + sed 's/^ *//;s/ *$$//' |\ + gzip -c > $@ +endif + +%.${LANGPAIR}.${TRG}.gz: %.${LANGPAIR}.${SRC}.spm.gz +ifneq (${MODELZIP},) + ${MAKE} ${LANGPAIR}/decoder.yml + ${LOADMODS} && cd ${LANGPAIR} && ${MARIAN}/marian-decoder \ + -i ${PWD}/$< \ + -c decoder.yml \ + -d ${MARIAN_GPUS} \ + ${MARIAN_DECODER_FLAGS} |\ + sed 's/ //g;s/▁/ /g' | sed 's/^ *//;s/ *$$//' |\ + gzip -c > ${PWD}/$@ +endif + + +## index of all downloadable files +index.html: + wget -O $@ https://dumps.wikimedia.org/other/cirrussearch/current + +## wiki in json format ${WIKI_JSON}: wget https://dumps.wikimedia.org/other/cirrussearch/current/${WIKI_JSON} +ifneq (${UDPIPE_MODEL},) + SENTSPLITTER = udpipe --input=horizontal --tokenize \ + ${UDPIPE_MODELS}/${UDPIPE_MODEL} |\ + grep '^\# *text *= *' |\ + sed 's/^\# *text *= *//' +else + SENTSPLITTER = ${MOSESSCRIPTS}/ems/support/split-sentences.perl -l ${LANGID} +endif + +## extract sentences and normalize +## - requires jq, udpipe, and moses-scripts ${WIKI_TXT}: ${WIKI_JSON} ${LOAD_MODULES} \ zcat $< | jq -r '.text' | \ grep -v 'null' |\ - udpipe --input=horizontal --tokenize \ - ${UDPIPE_MODELS}/${UDPIPE_MODEL} |\ - grep '^\# *text *= *' |\ - sed 's/^\# *text *= *//' |\ + ${SENTSPLITTER} |\ + $(TOKENIZER)/replace-unicode-punctuation.perl |\ + $(TOKENIZER)/remove-non-printing-char.perl |\ + $(TOKENIZER)/normalize-punctuation.perl |\ + sed 's/ */ /g;s/^ *//g;s/ *$$//g' |\ gzip -c > $@ -index.html: - wget -O $@ https://dumps.wikimedia.org/other/cirrussearch/current + diff --git a/work-bt/README.md b/backtranslate/README.md similarity index 100% rename from work-bt/README.md rename to backtranslate/README.md