backtranslate bugfix

This commit is contained in:
Joerg Tiedemann 2020-01-22 13:33:28 +02:00
parent a0d8140cf2
commit bb98f03df5
2 changed files with 30 additions and 3 deletions

14
TODO.md
View File

@ -1,8 +1,22 @@
# Things to do
## Backtranslation
* status: basically working, need better integration?!
* add backtranslations to training data
* can use monolingual data from tokenized wikipedia dumps: https://sites.google.com/site/rmyeid/projects/polyglot
* https://dumps.wikimedia.org/backup-index.html
* better in JSON: https://dumps.wikimedia.org/other/cirrussearch/current/
## Fine-tuning and domain adaptation
* status: basically working
* do we want to publish fine-tuned data or rather the fine-tuning procedures? (using a docker container?)
## Show-case some selected language pairs
* collaboration with wikimedia
* focus languages: Tagalog (tl, tgl), Central Bikol (bcl), Malayalam (ml, mal), Bengali (bn, ben), and Mongolian (mn, mon)

View File

@ -28,6 +28,12 @@ MODELHOME = ../models/${LANGPAIR}
## select the model package to use: the last entry (after sorting) among
## the zip files in ${MODELHOME} whose names match *-20*.zip
## (presumably the date stamp in the file name makes the last one the
##  most recent package -- TODO confirm)
MODELZIP = ${lastword ${sort ${wildcard ${MODELHOME}/*-20*.zip}}}
## model name = file name of that package without directory and .zip extension
MODELNAME = ${patsubst %.zip,%,${notdir ${MODELZIP}}}
## fallback: if no package was found (MODELNAME expands to nothing),
## look for packages in the work-spm model directory instead
ifeq (${MODELNAME},)
MODELHOME = ../work-spm/models/${LANGPAIR}
MODELZIP = ${lastword ${sort ${wildcard ${MODELHOME}/*-20*.zip}}}
MODELNAME = ${patsubst %.zip,%,${notdir ${MODELZIP}}}
endif
LOAD_MODULES = module use -a /projappl/nlpl/software/modules/etc/ && \
module load nlpl-udpipe nlpl-opus &&
@ -46,7 +52,6 @@ WIKI_TRG = ${LANGPAIR}/${WIKISOURCE}.${PART}_${MODELNAME}.${LANGPAIR}.${TRG}.gz
## don't delete translated text if the process crashes
.PRECIOUS: ${WIKI_TRG}
## find wiki downloads
WIKI_JSON = $(shell grep -o '${LANGID}${WIKISOURCE}-[0-9]*-cirrussearch-content.json.gz' index.html | head -1)
@ -68,12 +73,14 @@ all: index.html
${MAKE} ${WIKI_SRC} ${WIKI_TRG}
WIKISOURCES = wiki wikibooks wikinews wikiquote wikisource
WIKISOURCES = wiki wikibooks wikinews wikiquote wikisource wiktionary
## Prepare the data of every wiki type listed in ${WIKISOURCES} and submit
## a translation job for each wiki type that has something to translate.
## A job is only submitted if at least one file named
## <wiki>.${LANGID}.${PART}.gz exists below ${WIKI_DIR}.
## NOTE: the -name pattern has to be in double quotes. Inside single quotes
## the shell does not expand the loop variable w, find would look for a
## file that literally starts with a dollar sign, and no job would ever
## be submitted.
all-wikis:
	for w in ${WIKISOURCES}; do \
	  ${MAKE} WIKISOURCE=$$w prepare-data; \
	  if [ `find ${WIKI_DIR} -name "$$w.${LANGID}.${PART}.gz" | wc -l` -gt 0 ]; then \
	    ${MAKE} WIKISOURCE=$$w HPC_CORES=1 WALLTIME=72 translate.submit; \
	  fi \
	done
@ -83,6 +90,12 @@ all-wikilangs: index.html
done
## Run the all-wikis target with English as target language for each of the
## focus languages: Tagalog (tl, tgl), Central Bikol (bcl), Malayalam (ml, mal),
## Bengali (bn, ben), and Mongolian (mn, mon)
focus-wikis:
	for lang in tl bcl ml bn mn; do \
	  $(MAKE) SRC=$$lang TRG=en all-wikis; \
	done
extract-text: ${WIKI_TXT}
prepare-model: ${LANGPAIR}/decoder.yml
prepare-data: ${WIKI_PRE}