mirror of
https://github.com/Helsinki-NLP/OPUS-MT-train.git
synced 2024-10-26 21:19:02 +03:00
information about license for pre-trained models added
This commit is contained in:
parent
cb3b77573e
commit
37a83a9eba
43
Makefile
43
Makefile
@ -181,15 +181,6 @@ include lib/models/doclevel.mk
|
||||
include lib/models/simplify.mk
|
||||
|
||||
|
||||
# include Makefile.env
|
||||
# include Makefile.config
|
||||
# include Makefile.dist
|
||||
# include Makefile.tasks
|
||||
# include Makefile.data
|
||||
# include Makefile.doclevel
|
||||
# include Makefile.generic
|
||||
# include Makefile.slurm
|
||||
|
||||
|
||||
.PHONY: all
|
||||
all: ${WORKDIR}/config.mk
|
||||
@ -199,8 +190,12 @@ all: ${WORKDIR}/config.mk
|
||||
${MAKE} compare
|
||||
|
||||
|
||||
## TODO: does not look good to remove index.html from backtranslation dir
|
||||
## but we need to refresh the file from time to time (new wiki packages!)
|
||||
#---------------------------------------------------------------------
|
||||
# run everything including backtranslation of wiki-data
|
||||
#
|
||||
## TODO: need to refresh backtranslate/index.html from time to time!
|
||||
## ---> necessary for fetching latest wikidump with the correct link
|
||||
#---------------------------------------------------------------------
|
||||
|
||||
.PHONY: all-and-backtranslate
|
||||
all-and-backtranslate: ${WORKDIR}/config.mk
|
||||
@ -209,9 +204,11 @@ all-and-backtranslate: ${WORKDIR}/config.mk
|
||||
${MAKE} eval
|
||||
${MAKE} compare
|
||||
${MAKE} local-dist
|
||||
rm -f backtranslate/index.html
|
||||
${MAKE} -C backtranslate index.html
|
||||
${MAKE} -C backtranslate SRC=${SRC} TRG=${TRG} MODELHOME=${MODELDIR} all
|
||||
${MAKE} -C backtranslate \
|
||||
SRC=${SRC} TRG=${TRG} \
|
||||
MODELHOME=${MODELDIR} \
|
||||
MAX_SENTENCES=${shell ${TRAIN_SRC}.clean.${PRE_SRC}.gz | head -1000000 | wc -l} \
|
||||
all
|
||||
|
||||
.PHONY: all-and-backtranslate-allwikis
|
||||
all-and-backtranslate-allwikis: ${WORKDIR}/config.mk
|
||||
@ -220,20 +217,27 @@ all-and-backtranslate-allwikis: ${WORKDIR}/config.mk
|
||||
${MAKE} eval
|
||||
${MAKE} compare
|
||||
${MAKE} local-dist
|
||||
rm -f backtranslate/index.html
|
||||
${MAKE} -C backtranslate index.html
|
||||
-${MAKE} -C backtranslate SRC=${SRC} TRG=${TRG} MODELHOME=${MODELDIR} all-wikitext
|
||||
${MAKE} -C backtranslate SRC=${SRC} TRG=${TRG} MODELHOME=${MODELDIR} translate-all-wikis
|
||||
|
||||
${MAKE} -C backtranslate \
|
||||
SRC=${SRC} TRG=${TRG} \
|
||||
MAX_SENTENCES=${shell ${TRAIN_SRC}.clean.${PRE_SRC}.gz | head -1000000 | wc -l} \
|
||||
MODELHOME=${MODELDIR} \
|
||||
translate-all-wikis
|
||||
|
||||
.PHONY: all-with-bt
|
||||
all-with-bt:
|
||||
${MAKE} all
|
||||
${MAKE} SRCLANGS="${TRGLANGS}" TRGLANGS="${SRCLANGS}" all-and-backtranslate
|
||||
${MAKE} SRCLANGS="${TRGLANGS}" TRGLANGS="${SRCLANGS}" \
|
||||
MAX_SENTENCES=${shell ${TRAIN_SRC}.clean.${PRE_SRC}.gz | head -1000000 | wc -l} \
|
||||
all-and-backtranslate
|
||||
${MAKE} all-bt
|
||||
|
||||
|
||||
|
||||
#------------------------------------------------------------------------
|
||||
# create slurm jobs
|
||||
#------------------------------------------------------------------------
|
||||
|
||||
.PHONY: all-job
|
||||
all-job: ${WORKDIR}/config.mk
|
||||
${MAKE} data
|
||||
@ -251,7 +255,6 @@ train-and-eval-job:
|
||||
# make various data sets (and word alignment)
|
||||
#------------------------------------------------------------------------
|
||||
|
||||
|
||||
.PHONY: data
|
||||
data: ${TRAIN_SRC}.clean.${PRE_SRC}.gz ${TRAIN_TRG}.clean.${PRE_TRG}.gz \
|
||||
${DEV_SRC}.${PRE_SRC} ${DEV_TRG}.${PRE_TRG}
|
||||
|
10
README.md
10
README.md
@ -3,7 +3,12 @@
|
||||
This package includes scripts for training NMT models using MarianNMT and OPUS data for [OPUS-MT](https://github.com/Helsinki-NLP/Opus-MT). More details are given in the [Makefile](Makefile) but documentation needs to be improved. Also, the targets require a specific environment and right now only work well on the CSC HPC cluster in Finland.
|
||||
|
||||
|
||||
## Structure
|
||||
## Pre-trained models
|
||||
|
||||
The subdirectory [models](https://github.com/Helsinki-NLP/Opus-MT-train/tree/master/models) contains information about pre-trained models that can be downloaded from this project. They are distributed under a [CC-BY 4.0 license](https://creativecommons.org/licenses/by/4.0/).
|
||||
|
||||
|
||||
## Structure of the training scripts
|
||||
|
||||
Essential files for making new models:
|
||||
|
||||
@ -18,7 +23,7 @@ Essential files for making new models:
|
||||
There are also make targets for specific models and tasks. Look into `lib/models/` to see what has been defined already.
|
||||
Note that this frequently changes! There is, for example:
|
||||
|
||||
* `lib/models/multilingua.mk`: various multilingual models
|
||||
* `lib/models/multilingual.mk`: various multilingual models
|
||||
* `lib/models/celtic.mk`: data and models for Celtic languages
|
||||
* `lib/models/doclevel.mk`: experimental document-level models
|
||||
|
||||
@ -53,6 +58,7 @@ make -j 8 SRCLANG=en TRGLANG=fr data
|
||||
|
||||
## Upload to Object Storage
|
||||
|
||||
This is only for internal use:
|
||||
|
||||
```
|
||||
swift upload OPUS-MT --changed --skip-identical name-of-file
|
||||
|
@ -19,11 +19,16 @@ MULTI_TARGET_MODEL = 0
|
||||
## can be general wikipedia, wikinews, wikibooks, ...
|
||||
WIKISOURCE = wiki
|
||||
|
||||
## split size in nr-of-lines
|
||||
## default part to be selected = aa
|
||||
SPLIT_SIZE = 1000000
|
||||
PART = aa
|
||||
|
||||
## maximum input length (number of sentence piece segments)
|
||||
## maximum number of sentences to be translated (top N lines)
|
||||
MAX_LENGTH = 100
|
||||
MAX_SENTENCES = 1000000
|
||||
PART = aa
|
||||
MAX_SENTENCES = ${SPLIT_SIZE}
|
||||
|
||||
|
||||
LANGPAIR = ${SRC}-${TRG}
|
||||
|
||||
@ -304,7 +309,7 @@ giellatekno/${SRC}/corp.${SRC}.aa.gz:
|
||||
find victorio.uit.no/biggies/trunk/langs/${SRC}/corp -type f -regex '.*/[^.]*.txt' |\
|
||||
xargs cat | grep . | sed 's/ ¶//' |\
|
||||
$(TOKENIZER)/detokenizer.perl -l fi | \
|
||||
split -l ${MAX_SENTENCES} - giellatekno/${SRC}/corp.${SRC}.
|
||||
split -l ${SPLIT_SIZE} - giellatekno/${SRC}/corp.${SRC}.
|
||||
gzip -f giellatekno/${SRC}/corp.${SRC}.*
|
||||
|
||||
victorio.uit.no/biggies/trunk/langs/${SRC}:
|
||||
@ -567,7 +572,7 @@ ${WIKI_TXT}: ${WIKI_JSON}
|
||||
sed 's/ */ /g;s/^ *//g;s/ *$$//g' |\
|
||||
python3 ../scripts/filter/mono-match-lang.py -l ${LANGID} |\
|
||||
${SORT} -u | ${SHUFFLE} |\
|
||||
split -l ${MAX_SENTENCES} - ${patsubst %${PART}.gz,%,$@}
|
||||
split -l ${SPLIT_SIZE} - ${patsubst %${PART}.gz,%,$@}
|
||||
gzip -f ${patsubst %${PART}.gz,%,$@}*
|
||||
|
||||
|
||||
|
@ -11,7 +11,7 @@ MEMAD_LANGS = de en fi fr nl sv
|
||||
memad-multi:
|
||||
${MAKE} SRCLANGS="${MEMAD_LANGS}" TRGLANGS="${MEMAD_LANGS}" MODELTYPE=transformer data
|
||||
${MAKE} SRCLANGS="${MEMAD_LANGS}" TRGLANGS="${MEMAD_LANGS}" MODELTYPE=transformer \
|
||||
WALLTIME=72 HPC_MEM=8g HPC_CORES=1 train.submit-multigpu
|
||||
WALLTIME=72 HPC_MEM=8g HPC_CORES=1 HPC_DISK=1500 train.submit-multigpu
|
||||
|
||||
memad2en:
|
||||
${MAKE} LANGS="${MEMAD_LANGS}" PIVOT=en all2pivot
|
||||
|
@ -63,8 +63,12 @@ allopus2pivot-small:
|
||||
${MAKE} WALLTIME=72 SRCLANGS="$$l" TRGLANGS=${PIVOT} train-if-small; \
|
||||
done
|
||||
|
||||
|
||||
train-if-small:
|
||||
if [ ${BPESIZE} -lt 12000 ]; then \
|
||||
${MAKE} HPC_CORES=1 HPC_MEM=4g opus-enxx.submit; \
|
||||
fi
|
||||
|
||||
train-if-small-old:
|
||||
if [ ${BPESIZE} -lt 12000 ]; then \
|
||||
${MAKE} data; \
|
||||
${MAKE} train-and-eval-job; \
|
||||
@ -74,7 +78,6 @@ train-if-small:
|
||||
|
||||
|
||||
|
||||
|
||||
## make models with backtranslations in both directions
|
||||
## for English-to-other language models
|
||||
##
|
||||
@ -86,4 +89,6 @@ train-if-small:
|
||||
opus-enxx:
|
||||
${MAKE} SRCLANGS=${TRG} TRGLANGS=${SRC} all-and-backtranslate-allwikis
|
||||
${MAKE} all-and-backtranslate-bt
|
||||
${MAKE} best_dist
|
||||
${MAKE} SRCLANGS=${TRG} TRGLANGS=${SRC} all-bt
|
||||
${MAKE} SRCLANGS=${TRG} TRGLANGS=${SRC} best_dist
|
||||
|
@ -1,4 +1,8 @@
|
||||
|
||||
# OPUS-MT - models
|
||||
|
||||
This is a repository of pre-trained models from the OPUS-MT project
|
||||
|
||||
## License
|
||||
|
||||
This machine translation model is part of OPUS-MT
|
Loading…
Reference in New Issue
Block a user