From e31550a3ad160ea7d22f924c05205247413c7b8d Mon Sep 17 00:00:00 2001 From: Joerg Tiedemann Date: Fri, 28 Aug 2020 10:53:11 +0300 Subject: [PATCH] enabled fetching OPUS data instead of reading local files if necessary --- Makefile | 5 +- backtranslate/Makefile | 2 + backtranslate/README.md | 101 +++++++++++++++++++++++++++++++++++++++- doc/README.md | 13 ++---- lib/data.mk | 17 ++++--- 5 files changed, 118 insertions(+), 20 deletions(-) diff --git a/Makefile b/Makefile index 38f7a104..138c41e6 100644 --- a/Makefile +++ b/Makefile @@ -197,6 +197,7 @@ all: ${WORKDIR}/config.mk # select project_2002688 (OPUS-MT) # - "make store" overrides # - "make fetch" does not override (delete dir first) +# - storing data will resolve symbolic links #--------------------------------------------------------------------- .PHONY: store store-data fetch fetch-data @@ -208,7 +209,7 @@ WORK_CONTAINER ?= OPUS-MT_${notdir ${WORKHOME}}-${WHOAMI} ## store workdir on allas store: - cd ${WORK_SRCDIR} && a-put -b ${WORK_CONTAINER} --override ${LANGPAIRSTR} + cd ${WORK_SRCDIR} && a-put -b ${WORK_CONTAINER} --follow-links --override ${LANGPAIRSTR} ## fetch workdir from allas fetch: @@ -218,7 +219,7 @@ fetch: ## store and fetch data dir (raw data files) store-data: - cd ${WORK_SRCDIR} && a-put -b ${WORK_CONTAINER} --override data + cd ${WORK_SRCDIR} && a-put -b ${WORK_CONTAINER} --follow-links --override data fetch-data: mkdir -p ${WORK_DESTDIR} diff --git a/backtranslate/Makefile b/backtranslate/Makefile index fc6dda75..086191e5 100644 --- a/backtranslate/Makefile +++ b/backtranslate/Makefile @@ -161,6 +161,7 @@ wiki-iso639-doc: # WIKISOURCES = wiki wikibooks wikinews wikiquote wikisource wiktionary WIKISOURCES = wiki wikibooks wikinews wikiquote wikisource +.PHONY: translate-all-wikis translate-all-wikis: ${LANGPAIR}/${MODELNAME}/decoder.yml for w in ${WIKISOURCES}; do \ ${MAKE} WIKISOURCE=$$w extract-text; \ @@ -171,6 +172,7 @@ translate-all-wikis: ${LANGPAIR}/${MODELNAME}/decoder.yml fi \ 
done +.PHONY: translate-all-wikiparts translate-all-wikiparts: ${LANGPAIR}/${MODELNAME}/decoder.yml for w in ${WIKISOURCES}; do \ ${MAKE} WIKISOURCE=$$w extract-text; \ diff --git a/backtranslate/README.md b/backtranslate/README.md index f13bafc9..f0cf8aef 100644 --- a/backtranslate/README.md +++ b/backtranslate/README.md @@ -1,5 +1,104 @@ +# Back-translation -# Translate data as synthetic training data +Translate monolingual data (extracted from various wikimedia sources) to create synthetic training data. + + +## Overview + +Relevant makefiles: + +* [Makefile](Makefile) +* [lib/config.mk](https://github.com/Helsinki-NLP/OPUS-MT-train/blob/master/lib/config.mk) + +Main recipes: + +* `all`: translate wiki data for the specified language +* `get-data`: +* `extract-text`: +* `extract-doc`: +* `prepare-model`: +* `prepare-data`: +* `translate`: +* `check-length`: +* `print-names`: +* `print-modelname`: + + +Recipes for fetching data and pre-processing batch jobs: + +* `index.html`: +* `all-wikitext`: +* `all-wikilangs`: +* `all-wikilangs-fast`: +* `all-wikis-all-langs`: +* `all-wikidocs-all-langs`: +* `wiki-iso639`: link (shuffled) wikisources to iso639-3 conform language labels +* `wiki-iso639-doc`: same as above but for non-shuffled wikisources with document boundaries + + +Recipes for translating wiki data: + +* `translate-all-parts`: +* `translate-all-wikis`: +* `translate-all-wikiparts`: +* `translate-all-parts-jobs`: +* `translate-all-wikis-jobs`: +* `translate-all-wikiparts-jobs`: + + +Recipes for Sami languages: + +* `sami-corp`: +* `translate-sami`: +* `translate-sami-corp`: +* `translate-sami-wiki`: +* `translate-sami-xx-wiki`: +* `translate-sami-xx-corp`: +* `translate-xx-sami-wiki`: + + +Recipes for Celtic languages: + +* `fetch-celtic`: +* `translate-celtic-english`: +* `translate-english-celtic`: +* `breton`: + + +Recipes for Nordic and Uralic languages: + +* `finland-focus-wikis`: +* `translate-thl`: +* `all-nordic-wikidocs`: +* `uralic-wiki-texts`: +* 
`uralic-wikis`: + + +Other task-specific recipes: + +* `xnli-wikidocs`: +* `small-romance`: +* `wikimedia-focus-wikis`: + + + +Parameters / variables: + +* `SRC`: +* `TRG`: +* `WIKISOURCE`: +* `SPLIT_SIZE`: +* `MAX_LENGTH`: +* `MAX_SENTENCES`: +* `PART`: +* `MODELSDIR`: +* `MULTI_TARGET_MODEL`: +* `WIKI_HOME`: +* `WIKIDOC_HOME`: + + + +## Detailed information Use Wiki data: diff --git a/doc/README.md b/doc/README.md index d3e66a2d..3d1b1981 100644 --- a/doc/README.md +++ b/doc/README.md @@ -5,10 +5,10 @@ The build targets are all included in various makefiles and the main idea is to The package includes 4 components: -* [basic training](https://github.com/Helsinki-NLP/OPUS-MT-train/blob/master/Makefile) of bilingual and multilingual models -* [back-translation](https://github.com/Helsinki-NLP/OPUS-MT-train/blob/master/backtranslate/Makefile) for data augmentation -* [fine-tuning](https://github.com/Helsinki-NLP/OPUS-MT-train/blob/master/finetune/Makefile) for domain adaptation -* [pivoting](https://github.com/Helsinki-NLP/OPUS-MT-train/blob/master/pivoting/Makefile) for data augmentation +* basic training of bilingual and multilingual models ([Makefile](https://github.com/Helsinki-NLP/OPUS-MT-train/blob/master/Makefile)) +* [Generating back-translations](https://github.com/Helsinki-NLP/OPUS-MT-train/blob/master/backtranslate/README.md) for data augmentation ([Makefile](https://github.com/Helsinki-NLP/OPUS-MT-train/blob/master/backtranslate/Makefile)) +* [Fine-tuning models](https://github.com/Helsinki-NLP/OPUS-MT-train/blob/master/finetune/README.md) for domain adaptation ([Makefile](https://github.com/Helsinki-NLP/OPUS-MT-train/blob/master/finetune/Makefile)) +* [Generate pivot-language-based translations](https://github.com/Helsinki-NLP/OPUS-MT-train/blob/master/pivoting/README.md) for data augmentation ([Makefile](https://github.com/Helsinki-NLP/OPUS-MT-train/blob/master/pivoting/Makefile)) More information about specific tasks: @@ -17,11 +17,8 @@ More 
information about specific tasks: * [Training models](Train.md) * [Testing models](Test.md) * [Running batch jobs](BatchJobs.md) -* [Generating back-translations](https://github.com/Helsinki-NLP/OPUS-MT-train/blob/master/backtranslate/README.md) -* [Fine-tuning models](https://github.com/Helsinki-NLP/OPUS-MT-train/blob/master/finetune/README.md) -* [Generate pivot-language-based translations](https://github.com/Helsinki-NLP/OPUS-MT-train/blob/master/pivoting/README.md) +* [Packaging, releases and storage](ReleaseAndStore.md) * [Models for the Tatoeba MT Challenge](TatoebaChallenge.md) -* [Packaging, releases and storage](ReleaseAndStore.md) Tutorials (to-do) diff --git a/lib/data.mk b/lib/data.mk index e7c0f740..f432c242 100644 --- a/lib/data.mk +++ b/lib/data.mk @@ -250,19 +250,18 @@ ${TRAIN_ALG}: ${TRAIN_SRC}.clean.${PRE_SRC}${TRAINSIZE}.gz \ rm -f ${@:.${SRCEXT}.raw=.xml} ${@:.${SRCEXT}.raw=.ids} ${dir $@}/README ${dir $@}/LICENSE; \ elif [ -e ${OPUSHOME}/$$c/latest/xml/${LANGPAIR}.xml.gz ]; then \ echo "extract $$c (${LANGPAIR}) from XML in local OPUS copy"; \ - opus_read ${OPUSREAD_ARGS} -rd ${OPUSHOME} \ - -dl ${dir $@} -d $$c -s ${SRC} -t ${TRG} \ - -wm moses -p raw -w $@ ${@:.${SRCEXT}.raw=.${TRGEXT}.raw}; \ - elif [ -e ${OPUSHOME}/$$c/latest/xml/${LANGPAIR}.xml.gz ]; then \ - echo "fetch $$c (${LANGPAIR}) from OPUS"; \ - opus_read ${OPUSREAD_ARGS} -q -dl ${dir $@} -d $$c -s ${SRC} -t ${TRG} \ + opus_read ${OPUSREAD_ARGS} -ln -rd ${OPUSHOME} -d $$c -s ${SRC} -t ${TRG} \ -wm moses -p raw -w $@ ${@:.${SRCEXT}.raw=.${TRGEXT}.raw}; \ else \ - echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"; \ - echo "!! 
skip $@"; \ - echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"; \ + echo "fetch $$c (${LANGPAIR}) from OPUS"; \ + opus_read ${OPUSREAD_ARGS} -ln -q -dl ${TMPDIR} -d $$c -s ${SRC} -t ${TRG} \ + -wm moses -p raw -w $@ ${@:.${SRCEXT}.raw=.${TRGEXT}.raw}; \ fi ) +# echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"; \ +# echo "!! skip $@"; \ +# echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"; \ + %.${TRGEXT}.raw: %.${SRCEXT}.raw @echo "done!"