backtranslation scripts

This commit is contained in:
Joerg Tiedemann 2020-01-11 00:29:06 +02:00
parent 1178dadf8d
commit fe16a0c4dd
3 changed files with 103 additions and 17 deletions

View File

@@ -703,7 +703,6 @@ else
@echo "$@ already exists!"
@echo "WARNING! No new BPE model is created even though the data has changed!"
@echo "WARNING! Delete the file if you want to start from scratch!"
touch $@
endif
## no labels on the target language side
@@ -716,7 +715,6 @@ else
@echo "$@ already exists!"
@echo "WARNING! No new BPE codes are created!"
@echo "WARNING! Delete the file if you want to start from scratch!"
touch $@
endif
@@ -795,7 +793,6 @@ else
@echo "$@ already exists!"
@echo "WARNING! No new SPM model is created even though the data has changed!"
@echo "WARNING! Delete the file if you want to start from scratch!"
touch $@
endif
## no labels on the target language side
@@ -813,7 +810,6 @@ else
@echo "$@ already exists!"
@echo "WARNING! No new SPM model created!"
@echo "WARNING! Delete the file if you want to start from scratch!"
touch $@
endif
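All four hunks touch the same guard pattern: when a BPE/SPM model file already exists, the recipe only prints the warnings above instead of retraining, and with the `touch $@` gone the existing model keeps its old timestamp, so targets that depend on it are not rebuilt just because the training data changed. A minimal sketch of that guard, with hypothetical names (`train_bpe` stands in for the actual learner call, which is not shown in these hunks):

${BPEMODEL}: train.txt
ifeq ($(wildcard ${BPEMODEL}),)
	train_bpe < $< > $@        # hypothetical training call
else
	@echo "$@ already exists!"
	@echo "WARNING! Delete the file if you want to start from scratch!"
endif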

View File

@@ -1,3 +1,26 @@
#
# backtranslate wiki data
#
# only works with sentencepiece models!
#
include ../Makefile.env
include ../Makefile.config
include ../Makefile.slurm
SRC = af
TRG = en
## maximum input length (number of sentence-piece segments)
MAX_LENGTH = 250
LANGPAIR = ${SRC}-${TRG}
MODELHOME = ../models/${LANGPAIR}
MODELZIP = ${lastword ${sort ${wildcard ${MODELHOME}/*.zip}}}
LOAD_MODULES = module use -a /projappl/nlpl/software/modules/etc/ && \
module load nlpl-udpipe nlpl-opus &&
@@ -6,24 +29,36 @@ WIKILANGS = ${sort $(patsubst >%wiki-,%,${shell grep -o '>..wiki-' index.html})
${sort $(patsubst >%wiki-,%,${shell grep -o '>...wiki-' index.html})}
LANGID = af
LANGID = ${SRC}
WIKI_TXT = wiki.${LANGID}.gz
WIKI_SRC = wiki.${LANGPAIR}.${SRC}.gz
WIKI_PRE = wiki.${LANGPAIR}.${SRC}.spm.gz
WIKI_TRG = wiki.${LANGPAIR}.${TRG}.gz
## find wiki downloads
WIKI_JSON = $(shell grep -o '${LANGID}wiki-[0-9]*-cirrussearch-content.json.gz' index.html | head -1)
## find UDPipe model
LANGNAME = ${shell ${LOAD_MODULES} opus-iso639 -e ${LANGID} | \
cut -f1 -d';' | tr ' ' '-' | tr '[:upper:]' '[:lower:]'}
UDPIPE_MODEL = ${notdir $(shell find ${UDPIPE_MODELS}/ -name "${LANGNAME}*.udpipe" | head -1)}
UDPIPE_MODEL = ${notdir $(shell ${LOAD_MODULES} find ${UDPIPE_MODELS}/ -name "${LANGNAME}*.udpipe" | head -1)}
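The UDPipe model is located indirectly: the ISO-639 code is mapped to a language name with opus-iso639, normalized, and then used as a glob prefix inside ${UDPIPE_MODELS}. A rough walk-through for the default LANGID=af (the exact opus-iso639 output and the model file name are assumptions for illustration only):

opus-iso639 -e af                                            # e.g. "Afrikaans; ..."  (assumed output format)
# cut -f1 -d';' | tr ' ' '-' | tr '[:upper:]' '[:lower:]'    # -> afrikaans
find ${UDPIPE_MODELS}/ -name "afrikaans*.udpipe" | head -1
# -> e.g. afrikaans-afribooms-ud-2.4-190531.udpipe           # illustrative file name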
all: index.html
	${MAKE} ${WIKI_SRC} ${WIKI_TRG}

all-wikis: index.html
	for l in ${WIKILANGS}; do \
	  ${MAKE} LANGID=$$l wiki-txt; \
	done

wiki-txt:
	if [ "${UDPIPE_MODEL}" != "" ]; then \
	  ${MAKE} ${WIKI_TXT}; \
	fi

wiki-txt: ${WIKI_TXT}

prepare-model: ${LANGPAIR}/decoder.yml
prepare-data: ${WIKI_PRE}
translate: ${WIKI_SRC} ${WIKI_TRG}

print-names:
	echo ${LANGNAME}
@@ -31,22 +66,77 @@ print-names:
	echo ${WIKI_JSON}
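In shell terms, the targets above are typically driven like this (SRC/TRG/LANGID can be overridden on the command line; the comments only restate the dependencies defined above):

make all                  # download index.html, extract the af wiki text and translate it into en
make SRC=fi TRG=en all    # same pipeline for another pair, provided ../models/fi-en contains a model zip
make all-wikis            # extract plain text for every wiki dump listed in index.html
make prepare-data         # stop after creating the sentencepiece-segmented source file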
${LANGPAIR}/decoder.yml:
ifneq (${MODELZIP},)
	mkdir -p ${dir $@}
	cp ${MODELZIP} ${dir $@}
	cd ${dir $@} && unzip *.zip
endif
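prepare-model simply unpacks the newest release zip for the language pair into a local ${LANGPAIR} directory. Only the files referenced elsewhere in this Makefile are certain to be there; the rest of the listing is an assumption about what such a package typically ships:

ls af-en/               # after "make prepare-model" with the default af-en pair
# decoder.yml           # marian-decoder configuration (used by the translation rule below)
# preprocess.sh         # tokenization + sentencepiece segmentation script (used below)
# source.spm            # sentencepiece model for the source language (used below)
# *.npz, *.vocab.yml    # model weights and vocabulary (assumed package contents)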
%.${LANGPAIR}.${SRC}.spm.gz: %.${SRC}.gz
ifneq (${MODELZIP},)
	${MAKE} ${LANGPAIR}/decoder.yml
	zcat $< |\
	${LANGPAIR}/preprocess.sh ${SRC} ${LANGPAIR}/source.spm |\
	perl -ne 'print if (split(/\s+/)<=${MAX_LENGTH});' |\
	gzip -c > $@
endif
${WIKI_SRC}: ${WIKI_PRE}
ifneq (${MODELZIP},)
	zcat $< |\
	sed 's/ //g;s/▁/ /g' | \
	sed 's/^ *//;s/ *$$//' |\
	gzip -c > $@
endif
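The two sed calls undo the sentencepiece segmentation of the length-filtered input so that the published source file is plain text again: remove all spaces, turn the ▁ marker back into a space, and trim the edges. A quick illustration on a made-up segmented line:

echo '▁This ▁is ▁back trans lation .' | sed 's/ //g;s/▁/ /g' | sed 's/^ *//;s/ *$//'
# -> This is backtranslation.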
%.${LANGPAIR}.${TRG}.gz: %.${LANGPAIR}.${SRC}.spm.gz
ifneq (${MODELZIP},)
	${MAKE} ${LANGPAIR}/decoder.yml
	${LOADMODS} && cd ${LANGPAIR} && ${MARIAN}/marian-decoder \
		-i ${PWD}/$< \
		-c decoder.yml \
		-d ${MARIAN_GPUS} \
		${MARIAN_DECODER_FLAGS} |\
	sed 's/ //g;s/▁/ /g' | sed 's/^ *//;s/ *$$//' |\
	gzip -c > ${PWD}/$@
endif
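MARIAN_GPUS and MARIAN_DECODER_FLAGS come from the included makefiles and are not part of this commit; a plausible batching/beam setup for marian-decoder (an assumption, not taken from this repository) would look like:

# hypothetical device list and decoder settings
MARIAN_GPUS = 0 1 2 3
MARIAN_DECODER_FLAGS = -b 4 --mini-batch 16 --maxi-batch 100 --maxi-batch-sort src --quiet-translation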
## index of all downloadable files
index.html:
	wget -O $@ https://dumps.wikimedia.org/other/cirrussearch/current

## wiki in json format
${WIKI_JSON}:
	wget https://dumps.wikimedia.org/other/cirrussearch/current/${WIKI_JSON}
ifneq (${UDPIPE_MODEL},)
SENTSPLITTER = udpipe --input=horizontal --tokenize \
${UDPIPE_MODELS}/${UDPIPE_MODEL} |\
grep '^\# *text *= *' |\
sed 's/^\# *text *= *//'
else
SENTSPLITTER = ${MOSESSCRIPTS}/ems/support/split-sentences.perl -l ${LANGID}
endif
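Both SENTSPLITTER branches turn raw paragraphs into one sentence per line. The udpipe branch relies on the CoNLL-U output carrying a `# text = ...` comment for each sentence, which the grep/sed pair extracts; roughly (the model file name is only a placeholder):

echo 'Eerste sin. Tweede sin.' \
  | udpipe --input=horizontal --tokenize afrikaans-afribooms.udpipe \
  | grep '^# *text *= *' | sed 's/^# *text *= *//'
# Eerste sin.
# Tweede sin.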
## extract sentences and normalize
## - requires jq, udpipe, and moses-scripts
${WIKI_TXT}: ${WIKI_JSON}
	${LOAD_MODULES} \
	zcat $< | jq -r '.text' | \
	grep -v 'null' |\
	${SENTSPLITTER} |\
	$(TOKENIZER)/replace-unicode-punctuation.perl |\
	$(TOKENIZER)/remove-non-printing-char.perl |\
	$(TOKENIZER)/normalize-punctuation.perl |\
	sed 's/  */ /g;s/^ *//g;s/ *$$//g' |\
	gzip -c > $@
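The cirrussearch content dumps are in Elasticsearch bulk format: metadata action lines alternate with article documents, and only the latter carry a .text field. `jq -r '.text'` therefore prints either the article text or the literal string null, and the grep drops the null lines (and, as a side effect, any article whose text contains the word null). A made-up two-line dump:

printf '%s\n' '{"index":{"_type":"page","_id":"1"}}' '{"title":"Tafelberg","text":"Tafelberg is ..."}' \
  | jq -r '.text' | grep -v 'null'
# -> Tafelberg is ...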