From bb98f03df59dd8942e23bbb43a4cdcb5128eb53a Mon Sep 17 00:00:00 2001 From: Joerg Tiedemann Date: Wed, 22 Jan 2020 13:33:28 +0200 Subject: [PATCH] backtranslate bugfix --- TODO.md | 14 ++++++++++++++ backtranslate/Makefile | 19 ++++++++++++++++--- 2 files changed, 30 insertions(+), 3 deletions(-) diff --git a/TODO.md b/TODO.md index 4f18d8ab..814aec25 100644 --- a/TODO.md +++ b/TODO.md @@ -1,8 +1,22 @@ # Things to do + +## Backtranslation + +* status: basically working, need better integration?! * add backtranslations to training data * can use monolingual data from tokenized wikipedia dumps: https://sites.google.com/site/rmyeid/projects/polyglot * https://dumps.wikimedia.org/backup-index.html * better in JSON: https://dumps.wikimedia.org/other/cirrussearch/current/ +## Fine-tuning and domain adaptation + +* status: basically working +* do we want to publish fine-tuned data or rather the fine-tuning procedures? (using a docker container?) + + +## Show-case some selected language pairs + +* collaboration with wikimedia +* focus languages: Tagalog (tl, tgl), Central Bikol (bcl), Malayalam (ml, mal), Bengali (bn, ben), and Mongolian (mn, mon) diff --git a/backtranslate/Makefile b/backtranslate/Makefile index 6653551f..29d9a84e 100644 --- a/backtranslate/Makefile +++ b/backtranslate/Makefile @@ -28,6 +28,12 @@ MODELHOME = ../models/${LANGPAIR} MODELZIP = ${lastword ${sort ${wildcard ${MODELHOME}/*-20*.zip}}} MODELNAME = ${patsubst %.zip,%,${notdir ${MODELZIP}}} +ifeq (${MODELNAME},) + MODELHOME = ../work-spm/models/${LANGPAIR} + MODELZIP = ${lastword ${sort ${wildcard ${MODELHOME}/*-20*.zip}}} + MODELNAME = ${patsubst %.zip,%,${notdir ${MODELZIP}}} +endif + LOAD_MODULES = module use -a /projappl/nlpl/software/modules/etc/ && \ module load nlpl-udpipe nlpl-opus && @@ -46,7 +52,6 @@ WIKI_TRG = ${LANGPAIR}/${WIKISOURCE}.${PART}_${MODELNAME}.${LANGPAIR}.${TRG}.gz ## don't delete translated text if the process crashes .PRECIOUS: ${WIKI_TRG} - ## find wiki downloads 
WIKI_JSON = $(shell grep -o '${LANGID}${WIKISOURCE}-[0-9]*-cirrussearch-content.json.gz' index.html | head -1) @@ -68,12 +73,14 @@ all: index.html ${MAKE} ${WIKI_SRC} ${WIKI_TRG} -WIKISOURCES = wiki wikibooks wikinews wikiquote wikisource +WIKISOURCES = wiki wikibooks wikinews wikiquote wikisource wiktionary all-wikis: for w in ${WIKISOURCES}; do \ ${MAKE} WIKISOURCE=$$w prepare-data; \ - ${MAKE} WIKISOURCE=$$w HPC_CORES=1 WALLTIME=72 translate.submit; \ + if [ `find ${WIKI_DIR} -name "$$w.${LANGID}.${PART}.gz" | wc -l` -gt 0 ]; then \ + ${MAKE} WIKISOURCE=$$w HPC_CORES=1 WALLTIME=72 translate.submit; \ + fi \ done @@ -83,6 +90,12 @@ all-wikilangs: index.html done +# Tagalog (tl, tgl), Central Bikol (bcl), Malayalam (ml, mal), Bengali (bn, ben), and Mongolian (mn, mon) +focus-wikis: + for l in tl bcl ml bn mn; do \ + ${MAKE} SRC=$$l TRG=en all-wikis; \ + done + extract-text: ${WIKI_TXT} prepare-model: ${LANGPAIR}/decoder.yml prepare-data: ${WIKI_PRE}