mirror of
https://github.com/Helsinki-NLP/OPUS-MT-train.git
synced 2024-09-11 12:25:37 +03:00
enabled fetching OPUS data instead of reading local files if necessary
This commit is contained in:
parent
94eeec13eb
commit
e31550a3ad
5
Makefile
5
Makefile
@ -197,6 +197,7 @@ all: ${WORKDIR}/config.mk
|
|||||||
# select project_2002688 (OPUS-MT)
|
# select project_2002688 (OPUS-MT)
|
||||||
# - "make store" overrides
|
# - "make store" overrides
|
||||||
# - "make fetch" does not override (delete dir first)
|
# - "make fetch" does not override (delete dir first)
|
||||||
|
# - storing data will resolve symbolic links
|
||||||
#---------------------------------------------------------------------
|
#---------------------------------------------------------------------
|
||||||
|
|
||||||
.PHONY: store store-data fetch fetch-data
|
.PHONY: store store-data fetch fetch-data
|
||||||
@ -208,7 +209,7 @@ WORK_CONTAINER ?= OPUS-MT_${notdir ${WORKHOME}}-${WHOAMI}
|
|||||||
|
|
||||||
## store workdir on allas
|
## store workdir on allas
|
||||||
store:
|
store:
|
||||||
cd ${WORK_SRCDIR} && a-put -b ${WORK_CONTAINER} --override ${LANGPAIRSTR}
|
cd ${WORK_SRCDIR} && a-put -b ${WORK_CONTAINER} --follow-links --override ${LANGPAIRSTR}
|
||||||
|
|
||||||
## fetch workdir from allas
|
## fetch workdir from allas
|
||||||
fetch:
|
fetch:
|
||||||
@ -218,7 +219,7 @@ fetch:
|
|||||||
|
|
||||||
## store and fetch data dir (raw data files)
|
## store and fetch data dir (raw data files)
|
||||||
store-data:
|
store-data:
|
||||||
cd ${WORK_SRCDIR} && a-put -b ${WORK_CONTAINER} --override data
|
cd ${WORK_SRCDIR} && a-put -b ${WORK_CONTAINER} --follow-links --override data
|
||||||
|
|
||||||
fetch-data:
|
fetch-data:
|
||||||
mkdir -p ${WORK_DESTDIR}
|
mkdir -p ${WORK_DESTDIR}
|
||||||
|
@ -161,6 +161,7 @@ wiki-iso639-doc:
|
|||||||
# WIKISOURCES = wiki wikibooks wikinews wikiquote wikisource wiktionary
|
# WIKISOURCES = wiki wikibooks wikinews wikiquote wikisource wiktionary
|
||||||
WIKISOURCES = wiki wikibooks wikinews wikiquote wikisource
|
WIKISOURCES = wiki wikibooks wikinews wikiquote wikisource
|
||||||
|
|
||||||
|
.PHONY: translate-all-wikis
|
||||||
translate-all-wikis: ${LANGPAIR}/${MODELNAME}/decoder.yml
|
translate-all-wikis: ${LANGPAIR}/${MODELNAME}/decoder.yml
|
||||||
for w in ${WIKISOURCES}; do \
|
for w in ${WIKISOURCES}; do \
|
||||||
${MAKE} WIKISOURCE=$$w extract-text; \
|
${MAKE} WIKISOURCE=$$w extract-text; \
|
||||||
@ -171,6 +172,7 @@ translate-all-wikis: ${LANGPAIR}/${MODELNAME}/decoder.yml
|
|||||||
fi \
|
fi \
|
||||||
done
|
done
|
||||||
|
|
||||||
|
.PHONY: translate-all-wikiparts
|
||||||
translate-all-wikiparts: ${LANGPAIR}/${MODELNAME}/decoder.yml
|
translate-all-wikiparts: ${LANGPAIR}/${MODELNAME}/decoder.yml
|
||||||
for w in ${WIKISOURCES}; do \
|
for w in ${WIKISOURCES}; do \
|
||||||
${MAKE} WIKISOURCE=$$w extract-text; \
|
${MAKE} WIKISOURCE=$$w extract-text; \
|
||||||
|
@ -1,5 +1,104 @@
|
|||||||
|
# Back-translation
|
||||||
|
|
||||||
# Translate data as synthetic training data
|
Translate monolingual data (extracted from various Wikimedia sources) to create synthetic training data.
|
||||||
|
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
Relevant makefiles:
|
||||||
|
|
||||||
|
* [Makefile](Makefile)
|
||||||
|
* [lib/config.mk](https://github.com/Helsinki-NLP/OPUS-MT-train/blob/master/lib/config.mk)
|
||||||
|
|
||||||
|
Main recipes:
|
||||||
|
|
||||||
|
* `all`: translate wiki data for the specified language
|
||||||
|
* `get-data`:
|
||||||
|
* `extract-text`:
|
||||||
|
* `extract-doc`:
|
||||||
|
* `prepare-model`:
|
||||||
|
* `prepare-data`:
|
||||||
|
* `translate`:
|
||||||
|
* `check-length`:
|
||||||
|
* `print-names`:
|
||||||
|
* `print-modelname`:
|
||||||
|
|
||||||
|
|
||||||
|
Recipes for fetching data and pre-processing batch jobs:
|
||||||
|
|
||||||
|
* `index.html`:
|
||||||
|
* `all-wikitext`:
|
||||||
|
* `all-wikilangs`:
|
||||||
|
* `all-wikilangs-fast`:
|
||||||
|
* `all-wikis-all-langs`:
|
||||||
|
* `all-wikidocs-all-langs`:
|
||||||
|
* `wiki-iso639`: link (shuffled) wikisources to ISO 639-3-conformant language labels
|
||||||
|
* `wiki-iso639-doc`: same as `wiki-iso639`, but for non-shuffled wikisources with document boundaries
|
||||||
|
|
||||||
|
|
||||||
|
Recipes for translating wiki data:
|
||||||
|
|
||||||
|
* `translate-all-parts`:
|
||||||
|
* `translate-all-wikis`:
|
||||||
|
* `translate-all-wikiparts`:
|
||||||
|
* `translate-all-parts-jobs`:
|
||||||
|
* `translate-all-wikis-jobs`:
|
||||||
|
* `translate-all-wikiparts-jobs`:
|
||||||
|
|
||||||
|
|
||||||
|
Recipes for Sami languages:
|
||||||
|
|
||||||
|
* `sami-corp`:
|
||||||
|
* `translate-sami`:
|
||||||
|
* `translate-sami-corp`:
|
||||||
|
* `translate-sami-wiki`:
|
||||||
|
* `translate-sami-xx-wiki`:
|
||||||
|
* `translate-sami-xx-corp`:
|
||||||
|
* `translate-xx-sami-wiki`:
|
||||||
|
|
||||||
|
|
||||||
|
Recipes for Celtic languages:
|
||||||
|
|
||||||
|
* `fetch-celtic`:
|
||||||
|
* `translate-celtic-english`:
|
||||||
|
* `translate-english-celtic`:
|
||||||
|
* `breton`:
|
||||||
|
|
||||||
|
|
||||||
|
Recipes for Nordic and Uralic languages:
|
||||||
|
|
||||||
|
* `finland-focus-wikis`:
|
||||||
|
* `translate-thl`:
|
||||||
|
* `all-nordic-wikidocs`:
|
||||||
|
* `uralic-wiki-texts`:
|
||||||
|
* `uralic-wikis`:
|
||||||
|
|
||||||
|
|
||||||
|
Other task-specific recipes:
|
||||||
|
|
||||||
|
* `xnli-wikidocs`:
|
||||||
|
* `small-romance`:
|
||||||
|
* `wikimedia-focus-wikis`:
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
Parameters / variables:
|
||||||
|
|
||||||
|
* `SRC`:
|
||||||
|
* `TRG`:
|
||||||
|
* `WIKISOURCE`:
|
||||||
|
* `SPLIT_SIZE`:
|
||||||
|
* `MAX_LENGTH`:
|
||||||
|
* `MAX_SENTENCES`:
|
||||||
|
* `PART`:
|
||||||
|
* `MODELSDIR`:
|
||||||
|
* `MULTI_TARGET_MODEL`:
|
||||||
|
* `WIKI_HOME`:
|
||||||
|
* `WIKIDOC_HOME`:
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
## Detailed information
|
||||||
|
|
||||||
Use Wiki data:
|
Use Wiki data:
|
||||||
|
|
||||||
|
@ -5,10 +5,10 @@ The build targets are all included in various makefiles and the main idea is to
|
|||||||
|
|
||||||
The package includes 4 components:
|
The package includes 4 components:
|
||||||
|
|
||||||
* [basic training](https://github.com/Helsinki-NLP/OPUS-MT-train/blob/master/Makefile) of bilingual and multilingual models
|
* basic training of bilingual and multilingual models ([Makefile](https://github.com/Helsinki-NLP/OPUS-MT-train/blob/master/Makefile))
|
||||||
* [back-translation](https://github.com/Helsinki-NLP/OPUS-MT-train/blob/master/backtranslate/Makefile) for data augmentation
|
* [Generating back-translations](https://github.com/Helsinki-NLP/OPUS-MT-train/blob/master/backtranslate/README.md) for data augmentation ([Makefile](https://github.com/Helsinki-NLP/OPUS-MT-train/blob/master/backtranslate/Makefile))
|
||||||
* [fine-tuning](https://github.com/Helsinki-NLP/OPUS-MT-train/blob/master/finetune/Makefile) for domain adaptation
|
* [Fine-tuning models](https://github.com/Helsinki-NLP/OPUS-MT-train/blob/master/finetune/README.md) for domain adaptation ([Makefile](https://github.com/Helsinki-NLP/OPUS-MT-train/blob/master/finetune/Makefile))
|
||||||
* [pivoting](https://github.com/Helsinki-NLP/OPUS-MT-train/blob/master/pivoting/Makefile) for data augmentation
|
* [Generate pivot-language-based translations](https://github.com/Helsinki-NLP/OPUS-MT-train/blob/master/pivoting/README.md) for data augmentation ([Makefile](https://github.com/Helsinki-NLP/OPUS-MT-train/blob/master/pivoting/Makefile))
|
||||||
|
|
||||||
|
|
||||||
More information about specific tasks:
|
More information about specific tasks:
|
||||||
@ -17,11 +17,8 @@ More information about specific tasks:
|
|||||||
* [Training models](Train.md)
|
* [Training models](Train.md)
|
||||||
* [Testing models](Test.md)
|
* [Testing models](Test.md)
|
||||||
* [Running batch jobs](BatchJobs.md)
|
* [Running batch jobs](BatchJobs.md)
|
||||||
* [Generating back-translations](https://github.com/Helsinki-NLP/OPUS-MT-train/blob/master/backtranslate/README.md)
|
|
||||||
* [Fine-tuning models](https://github.com/Helsinki-NLP/OPUS-MT-train/blob/master/finetune/README.md)
|
|
||||||
* [Generate pivot-language-based translations](https://github.com/Helsinki-NLP/OPUS-MT-train/blob/master/pivoting/README.md)
|
|
||||||
* [Models for the Tatoeba MT Challenge](TatoebaChallenge.md)
|
|
||||||
* [Packaging, releases and storage](ReleaseAndStore.md)
|
* [Packaging, releases and storage](ReleaseAndStore.md)
|
||||||
|
* [Models for the Tatoeba MT Challenge](TatoebaChallenge.md)
|
||||||
|
|
||||||
|
|
||||||
Tutorials (to-do)
|
Tutorials (to-do)
|
||||||
|
17
lib/data.mk
17
lib/data.mk
@ -250,19 +250,18 @@ ${TRAIN_ALG}: ${TRAIN_SRC}.clean.${PRE_SRC}${TRAINSIZE}.gz \
|
|||||||
rm -f ${@:.${SRCEXT}.raw=.xml} ${@:.${SRCEXT}.raw=.ids} ${dir $@}/README ${dir $@}/LICENSE; \
|
rm -f ${@:.${SRCEXT}.raw=.xml} ${@:.${SRCEXT}.raw=.ids} ${dir $@}/README ${dir $@}/LICENSE; \
|
||||||
elif [ -e ${OPUSHOME}/$$c/latest/xml/${LANGPAIR}.xml.gz ]; then \
|
elif [ -e ${OPUSHOME}/$$c/latest/xml/${LANGPAIR}.xml.gz ]; then \
|
||||||
echo "extract $$c (${LANGPAIR}) from XML in local OPUS copy"; \
|
echo "extract $$c (${LANGPAIR}) from XML in local OPUS copy"; \
|
||||||
opus_read ${OPUSREAD_ARGS} -rd ${OPUSHOME} \
|
opus_read ${OPUSREAD_ARGS} -ln -rd ${OPUSHOME} -d $$c -s ${SRC} -t ${TRG} \
|
||||||
-dl ${dir $@} -d $$c -s ${SRC} -t ${TRG} \
|
|
||||||
-wm moses -p raw -w $@ ${@:.${SRCEXT}.raw=.${TRGEXT}.raw}; \
|
|
||||||
elif [ -e ${OPUSHOME}/$$c/latest/xml/${LANGPAIR}.xml.gz ]; then \
|
|
||||||
echo "fetch $$c (${LANGPAIR}) from OPUS"; \
|
|
||||||
opus_read ${OPUSREAD_ARGS} -q -dl ${dir $@} -d $$c -s ${SRC} -t ${TRG} \
|
|
||||||
-wm moses -p raw -w $@ ${@:.${SRCEXT}.raw=.${TRGEXT}.raw}; \
|
-wm moses -p raw -w $@ ${@:.${SRCEXT}.raw=.${TRGEXT}.raw}; \
|
||||||
else \
|
else \
|
||||||
echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"; \
|
echo "fetch $$c (${LANGPAIR}) from OPUS"; \
|
||||||
echo "!! skip $@"; \
|
opus_read ${OPUSREAD_ARGS} -ln -q -dl ${TMPDIR} -d $$c -s ${SRC} -t ${TRG} \
|
||||||
echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"; \
|
-wm moses -p raw -w $@ ${@:.${SRCEXT}.raw=.${TRGEXT}.raw}; \
|
||||||
fi )
|
fi )
|
||||||
|
|
||||||
|
# echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"; \
|
||||||
|
# echo "!! skip $@"; \
|
||||||
|
# echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"; \
|
||||||
|
|
||||||
|
|
||||||
%.${TRGEXT}.raw: %.${SRCEXT}.raw
|
%.${TRGEXT}.raw: %.${SRCEXT}.raw
|
||||||
@echo "done!"
|
@echo "done!"
|
||||||
|
Loading…
Reference in New Issue
Block a user