mirror of
https://github.com/Helsinki-NLP/OPUS-MT-train.git
synced 2024-08-16 15:50:30 +03:00
enabled fetching OPUS data instead of reading local files if necessary
This commit is contained in:
parent
94eeec13eb
commit
e31550a3ad
5
Makefile
5
Makefile
@ -197,6 +197,7 @@ all: ${WORKDIR}/config.mk
|
||||
# select project_2002688 (OPUS-MT)
|
||||
# - "make store" overrides
|
||||
# - "make fetch" does not override (delete dir first)
|
||||
# - storing data will resolve symbolic links
|
||||
#---------------------------------------------------------------------
|
||||
|
||||
.PHONY: store store-data fetch fetch-data
|
||||
@ -208,7 +209,7 @@ WORK_CONTAINER ?= OPUS-MT_${notdir ${WORKHOME}}-${WHOAMI}
|
||||
|
||||
## store workdir on allas
|
||||
store:
|
||||
cd ${WORK_SRCDIR} && a-put -b ${WORK_CONTAINER} --override ${LANGPAIRSTR}
|
||||
cd ${WORK_SRCDIR} && a-put -b ${WORK_CONTAINER} --follow-links --override ${LANGPAIRSTR}
|
||||
|
||||
## fetch workdir from allas
|
||||
fetch:
|
||||
@ -218,7 +219,7 @@ fetch:
|
||||
|
||||
## store and fetch data dir (raw data files)
|
||||
store-data:
|
||||
cd ${WORK_SRCDIR} && a-put -b ${WORK_CONTAINER} --override data
|
||||
cd ${WORK_SRCDIR} && a-put -b ${WORK_CONTAINER} --follow-links --override data
|
||||
|
||||
fetch-data:
|
||||
mkdir -p ${WORK_DESTDIR}
|
||||
|
@ -161,6 +161,7 @@ wiki-iso639-doc:
|
||||
# WIKISOURCES = wiki wikibooks wikinews wikiquote wikisource wiktionary
|
||||
WIKISOURCES = wiki wikibooks wikinews wikiquote wikisource
|
||||
|
||||
.PHONY: translate-all-wikis
|
||||
translate-all-wikis: ${LANGPAIR}/${MODELNAME}/decoder.yml
|
||||
for w in ${WIKISOURCES}; do \
|
||||
${MAKE} WIKISOURCE=$$w extract-text; \
|
||||
@ -171,6 +172,7 @@ translate-all-wikis: ${LANGPAIR}/${MODELNAME}/decoder.yml
|
||||
fi \
|
||||
done
|
||||
|
||||
.PHONY: translate-all-wikiparts
|
||||
translate-all-wikiparts: ${LANGPAIR}/${MODELNAME}/decoder.yml
|
||||
for w in ${WIKISOURCES}; do \
|
||||
${MAKE} WIKISOURCE=$$w extract-text; \
|
||||
|
@ -1,5 +1,104 @@
|
||||
# Back-translation
|
||||
|
||||
# Translate data as synthetic training data
|
||||
Translate monolingual data (extracted from various Wikimedia sources) to create synthetic training data.
|
||||
|
||||
|
||||
## Overview
|
||||
|
||||
Relevant makefiles:
|
||||
|
||||
* [Makefile](Makefile)
|
||||
* [lib/config.mk](https://github.com/Helsinki-NLP/OPUS-MT-train/blob/master/lib/config.mk)
|
||||
|
||||
Main recipes:
|
||||
|
||||
* `all`: translate wiki data for the specified language
|
||||
* `get-data`:
|
||||
* `extract-text`:
|
||||
* `extract-doc`:
|
||||
* `prepare-model`:
|
||||
* `prepare-data`:
|
||||
* `translate`:
|
||||
* `check-length`:
|
||||
* `print-names`:
|
||||
* `print-modelname`:
|
||||
|
||||
|
||||
Recipes for fetching data and pre-processing batch jobs:
|
||||
|
||||
* `index.html`:
|
||||
* `all-wikitext`:
|
||||
* `all-wikilangs`:
|
||||
* `all-wikilangs-fast`:
|
||||
* `all-wikis-all-langs`:
|
||||
* `all-wikidocs-all-langs`:
|
||||
* `wiki-iso639`: link (shuffled) wikisources to ISO 639-3-conformant language labels
|
||||
* `wiki-iso639-doc`: same as above but for non-shuffled wikisources with document boundaries
|
||||
|
||||
|
||||
Recipes for translating wiki data:
|
||||
|
||||
* `translate-all-parts`:
|
||||
* `translate-all-wikis`:
|
||||
* `translate-all-wikiparts`:
|
||||
* `translate-all-parts-jobs`:
|
||||
* `translate-all-wikis-jobs`:
|
||||
* `translate-all-wikiparts-jobs`:
|
||||
|
||||
|
||||
Recipes for Sami languages:
|
||||
|
||||
* `sami-corp`:
|
||||
* `translate-sami`:
|
||||
* `translate-sami-corp`:
|
||||
* `translate-sami-wiki`:
|
||||
* `translate-sami-xx-wiki`:
|
||||
* `translate-sami-xx-corp`:
|
||||
* `translate-xx-sami-wiki`:
|
||||
|
||||
|
||||
Recipes for Celtic languages:
|
||||
|
||||
* `fetch-celtic`:
|
||||
* `translate-celtic-english`:
|
||||
* `translate-english-celtic`:
|
||||
* `breton`:
|
||||
|
||||
|
||||
Recipes for Nordic and Uralic languages:
|
||||
|
||||
* `finland-focus-wikis`:
|
||||
* `translate-thl`:
|
||||
* `all-nordic-wikidocs`:
|
||||
* `uralic-wiki-texts`:
|
||||
* `uralic-wikis`:
|
||||
|
||||
|
||||
Other task-specific recipes:
|
||||
|
||||
* `xnli-wikidocs`:
|
||||
* `small-romance`:
|
||||
* `wikimedia-focus-wikis`:
|
||||
|
||||
|
||||
|
||||
Parameters / variables:
|
||||
|
||||
* `SRC`:
|
||||
* `TRG`:
|
||||
* `WIKISOURCE`:
|
||||
* `SPLIT_SIZE`:
|
||||
* `MAX_LENGTH`:
|
||||
* `MAX_SENTENCES`:
|
||||
* `PART`:
|
||||
* `MODELSDIR`:
|
||||
* `MULTI_TARGET_MODEL`:
|
||||
* `WIKI_HOME`:
|
||||
* `WIKIDOC_HOME`:
|
||||
|
||||
|
||||
|
||||
## Detailed information
|
||||
|
||||
Use Wiki data:
|
||||
|
||||
|
@ -5,10 +5,10 @@ The build targets are all included in various makefiles and the main idea is to
|
||||
|
||||
The package includes 4 components:
|
||||
|
||||
* [basic training](https://github.com/Helsinki-NLP/OPUS-MT-train/blob/master/Makefile) of bilingual and multilingual models
|
||||
* [back-translation](https://github.com/Helsinki-NLP/OPUS-MT-train/blob/master/backtranslate/Makefile) for data augmentation
|
||||
* [fine-tuning](https://github.com/Helsinki-NLP/OPUS-MT-train/blob/master/finetune/Makefile) for domain adaptation
|
||||
* [pivoting](https://github.com/Helsinki-NLP/OPUS-MT-train/blob/master/pivoting/Makefile) for data augmentation
|
||||
* basic training of bilingual and multilingual models ([Makefile](https://github.com/Helsinki-NLP/OPUS-MT-train/blob/master/Makefile))
|
||||
* [Generating back-translations](https://github.com/Helsinki-NLP/OPUS-MT-train/blob/master/backtranslate/README.md) for data augmentation ([Makefile](https://github.com/Helsinki-NLP/OPUS-MT-train/blob/master/backtranslate/Makefile))
|
||||
* [Fine-tuning models](https://github.com/Helsinki-NLP/OPUS-MT-train/blob/master/finetune/README.md) for domain adaptation ([Makefile](https://github.com/Helsinki-NLP/OPUS-MT-train/blob/master/finetune/Makefile))
|
||||
* [Generating pivot-language-based translations](https://github.com/Helsinki-NLP/OPUS-MT-train/blob/master/pivoting/README.md) for data augmentation ([Makefile](https://github.com/Helsinki-NLP/OPUS-MT-train/blob/master/pivoting/Makefile))
|
||||
|
||||
|
||||
More information about specific tasks:
|
||||
@ -17,11 +17,8 @@ More information about specific tasks:
|
||||
* [Training models](Train.md)
|
||||
* [Testing models](Test.md)
|
||||
* [Running batch jobs](BatchJobs.md)
|
||||
* [Generating back-translations](https://github.com/Helsinki-NLP/OPUS-MT-train/blob/master/backtranslate/README.md)
|
||||
* [Fine-tuning models](https://github.com/Helsinki-NLP/OPUS-MT-train/blob/master/finetune/README.md)
|
||||
* [Generate pivot-language-based translations](https://github.com/Helsinki-NLP/OPUS-MT-train/blob/master/pivoting/README.md)
|
||||
* [Models for the Tatoeba MT Challenge](TatoebaChallenge.md)
|
||||
* [Packaging, releases and storage](ReleaseAndStore.md)
|
||||
* [Models for the Tatoeba MT Challenge](TatoebaChallenge.md)
|
||||
|
||||
|
||||
Tutorials (to-do)
|
||||
|
17
lib/data.mk
17
lib/data.mk
@ -250,19 +250,18 @@ ${TRAIN_ALG}: ${TRAIN_SRC}.clean.${PRE_SRC}${TRAINSIZE}.gz \
|
||||
rm -f ${@:.${SRCEXT}.raw=.xml} ${@:.${SRCEXT}.raw=.ids} ${dir $@}/README ${dir $@}/LICENSE; \
|
||||
elif [ -e ${OPUSHOME}/$$c/latest/xml/${LANGPAIR}.xml.gz ]; then \
|
||||
echo "extract $$c (${LANGPAIR}) from XML in local OPUS copy"; \
|
||||
opus_read ${OPUSREAD_ARGS} -rd ${OPUSHOME} \
|
||||
-dl ${dir $@} -d $$c -s ${SRC} -t ${TRG} \
|
||||
-wm moses -p raw -w $@ ${@:.${SRCEXT}.raw=.${TRGEXT}.raw}; \
|
||||
elif [ -e ${OPUSHOME}/$$c/latest/xml/${LANGPAIR}.xml.gz ]; then \
|
||||
echo "fetch $$c (${LANGPAIR}) from OPUS"; \
|
||||
opus_read ${OPUSREAD_ARGS} -q -dl ${dir $@} -d $$c -s ${SRC} -t ${TRG} \
|
||||
opus_read ${OPUSREAD_ARGS} -ln -rd ${OPUSHOME} -d $$c -s ${SRC} -t ${TRG} \
|
||||
-wm moses -p raw -w $@ ${@:.${SRCEXT}.raw=.${TRGEXT}.raw}; \
|
||||
else \
|
||||
echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"; \
|
||||
echo "!! skip $@"; \
|
||||
echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"; \
|
||||
echo "fetch $$c (${LANGPAIR}) from OPUS"; \
|
||||
opus_read ${OPUSREAD_ARGS} -ln -q -dl ${TMPDIR} -d $$c -s ${SRC} -t ${TRG} \
|
||||
-wm moses -p raw -w $@ ${@:.${SRCEXT}.raw=.${TRGEXT}.raw}; \
|
||||
fi )
|
||||
|
||||
# echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"; \
|
||||
# echo "!! skip $@"; \
|
||||
# echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"; \
|
||||
|
||||
|
||||
%.${TRGEXT}.raw: %.${SRCEXT}.raw
|
||||
@echo "done!"
|
||||
|
Loading…
Reference in New Issue
Block a user