information about license for pre-trained models added

Joerg Tiedemann 2020-05-15 20:01:07 +03:00
parent cb3b77573e
commit 37a83a9eba
6 changed files with 53 additions and 30 deletions

View File

@@ -181,15 +181,6 @@ include lib/models/doclevel.mk
include lib/models/simplify.mk
# include Makefile.env
# include Makefile.config
# include Makefile.dist
# include Makefile.tasks
# include Makefile.data
# include Makefile.doclevel
# include Makefile.generic
# include Makefile.slurm
.PHONY: all
all: ${WORKDIR}/config.mk
@@ -199,8 +190,12 @@ all: ${WORKDIR}/config.mk
${MAKE} compare
## TODO: does not look good to remove index.html from backtranslation dir
## but we need to refresh the file from time to time (new wiki packages!)
#---------------------------------------------------------------------
# run everything including backtranslation of wiki-data
#
## TODO: need to refresh backtranslate/index.html from time to time!
## ---> necessary for fetching latest wikidump with the correct link
#---------------------------------------------------------------------
.PHONY: all-and-backtranslate
all-and-backtranslate: ${WORKDIR}/config.mk
@@ -209,9 +204,11 @@ all-and-backtranslate: ${WORKDIR}/config.mk
${MAKE} eval
${MAKE} compare
${MAKE} local-dist
rm -f backtranslate/index.html
${MAKE} -C backtranslate index.html
${MAKE} -C backtranslate SRC=${SRC} TRG=${TRG} MODELHOME=${MODELDIR} all
${MAKE} -C backtranslate \
SRC=${SRC} TRG=${TRG} \
MODELHOME=${MODELDIR} \
MAX_SENTENCES=${shell zcat ${TRAIN_SRC}.clean.${PRE_SRC}.gz | head -1000000 | wc -l} \
all
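The inline `${shell ...}` call above (repeated in two targets below) caps the number of sentences handed to backtranslation at the size of the cleaned training corpus, up to one million lines; the `zcat` is assumed, since the corpus is gzipped. A minimal sketch of the same computation in isolation, with a hypothetical corpus name and assuming GNU make:

```
# sketch: cap MAX_SENTENCES at min(corpus size, 1000000 lines)
# train.clean.spm.gz is a hypothetical file name for illustration
MAX_SENTENCES = ${shell zcat train.clean.spm.gz | head -1000000 | wc -l}

show-max-sentences:
	@echo "translating at most ${MAX_SENTENCES} sentences"
```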
.PHONY: all-and-backtranslate-allwikis
all-and-backtranslate-allwikis: ${WORKDIR}/config.mk
@@ -220,20 +217,27 @@ all-and-backtranslate-allwikis: ${WORKDIR}/config.mk
${MAKE} eval
${MAKE} compare
${MAKE} local-dist
rm -f backtranslate/index.html
${MAKE} -C backtranslate index.html
-${MAKE} -C backtranslate SRC=${SRC} TRG=${TRG} MODELHOME=${MODELDIR} all-wikitext
${MAKE} -C backtranslate SRC=${SRC} TRG=${TRG} MODELHOME=${MODELDIR} translate-all-wikis
${MAKE} -C backtranslate \
SRC=${SRC} TRG=${TRG} \
MAX_SENTENCES=${shell zcat ${TRAIN_SRC}.clean.${PRE_SRC}.gz | head -1000000 | wc -l} \
MODELHOME=${MODELDIR} \
translate-all-wikis
.PHONY: all-with-bt
all-with-bt:
${MAKE} all
${MAKE} SRCLANGS="${TRGLANGS}" TRGLANGS="${SRCLANGS}" all-and-backtranslate
${MAKE} SRCLANGS="${TRGLANGS}" TRGLANGS="${SRCLANGS}" \
MAX_SENTENCES=${shell zcat ${TRAIN_SRC}.clean.${PRE_SRC}.gz | head -1000000 | wc -l} \
all-and-backtranslate
${MAKE} all-bt
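A hypothetical invocation of this target (language pair illustrative): it builds the forward model, swaps SRCLANGS and TRGLANGS to train the reverse model and backtranslate wiki data, and then retrains the forward direction on the backtranslated data via `all-bt`:

```
make SRCLANGS=en TRGLANGS=fi all-with-bt
```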
#------------------------------------------------------------------------
# create slurm jobs
#------------------------------------------------------------------------
.PHONY: all-job
all-job: ${WORKDIR}/config.mk
${MAKE} data
@@ -251,7 +255,6 @@ train-and-eval-job:
# make various data sets (and word alignment)
#------------------------------------------------------------------------
.PHONY: data
data: ${TRAIN_SRC}.clean.${PRE_SRC}.gz ${TRAIN_TRG}.clean.${PRE_TRG}.gz \
${DEV_SRC}.${PRE_SRC} ${DEV_TRG}.${PRE_TRG}

View File

@@ -3,7 +3,12 @@
This package includes scripts for training NMT models with MarianNMT and OPUS data for [OPUS-MT](https://github.com/Helsinki-NLP/Opus-MT). More details are given in the [Makefile](Makefile), but the documentation still needs to be improved. Note that the targets require a specific environment and currently only work well on the CSC HPC cluster in Finland.
## Structure
## Pre-trained models
The subdirectory [models](https://github.com/Helsinki-NLP/Opus-MT-train/tree/master/models) contains information about pre-trained models that can be downloaded from this project. They are distributed under a [CC-BY 4.0 license](https://creativecommons.org/licenses/by/4.0/).
## Structure of the training scripts
Essential files for making new models:
@@ -18,7 +23,7 @@ Essential files for making new models:
There are also make targets for specific models and tasks. Look into `lib/models/` to see what has been defined already.
Note that this frequently changes! There is, for example:
* `lib/models/multilingua.mk`: various multilingual models
* `lib/models/multilingual.mk`: various multilingual models
* `lib/models/celtic.mk`: data and models for Celtic languages
* `lib/models/doclevel.mk`: experimental document-level models
@@ -53,6 +58,7 @@ make -j 8 SRCLANG=en TRGLANG=fr data
## Upload to Object Storage
This is only for internal use:
```
swift upload OPUS-MT --changed --skip-identical name-of-file
```
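The `swift upload` command also accepts directories; a hedged example (the path is illustrative) for refreshing an entire model directory in the same way:

```
swift upload OPUS-MT --changed --skip-identical models/en-fr
```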

View File

@@ -19,11 +19,16 @@ MULTI_TARGET_MODEL = 0
## can be general wikipedia, wikinews, wikibooks, ...
WIKISOURCE = wiki
## split size in nr-of-lines
## default part to be selected = aa
SPLIT_SIZE = 1000000
PART = aa
## maximum input length (number of sentence piece segments)
## maximum number of sentences to be translated (top N lines)
MAX_LENGTH = 100
MAX_SENTENCES = 1000000
PART = aa
MAX_LENGTH = 100
MAX_SENTENCES = ${SPLIT_SIZE}
LANGPAIR = ${SRC}-${TRG}
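All of the settings above are ordinary make variables and can be overridden per invocation; a hypothetical example (language pair and values illustrative) using the `translate-all-wikis` target from this commit:

```
make -C backtranslate SRC=fi TRG=en SPLIT_SIZE=500000 MAX_SENTENCES=200000 translate-all-wikis
```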
@@ -304,7 +309,7 @@ giellatekno/${SRC}/corp.${SRC}.aa.gz:
find victorio.uit.no/biggies/trunk/langs/${SRC}/corp -type f -regex '.*/[^.]*.txt' |\
xargs cat | grep . | sed 's/ ¶//' |\
$(TOKENIZER)/detokenizer.perl -l fi | \
split -l ${MAX_SENTENCES} - giellatekno/${SRC}/corp.${SRC}.
split -l ${SPLIT_SIZE} - giellatekno/${SRC}/corp.${SRC}.
gzip -f giellatekno/${SRC}/corp.${SRC}.*
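The `split` call writes fixed-size shards with two-letter suffixes (`aa`, `ab`, ...), which is why `PART = aa` picks the first shard by default. A minimal shell sketch of the naming scheme, with hypothetical file names:

```
# shard a corpus into 1M-line pieces: corp.fi.aa, corp.fi.ab, ...
split -l 1000000 corp.fi.txt corp.fi.
gzip -f corp.fi.??    # -> corp.fi.aa.gz, corp.fi.ab.gz, ...
```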
victorio.uit.no/biggies/trunk/langs/${SRC}:
@@ -567,7 +572,7 @@ ${WIKI_TXT}: ${WIKI_JSON}
sed 's/ */ /g;s/^ *//g;s/ *$$//g' |\
python3 ../scripts/filter/mono-match-lang.py -l ${LANGID} |\
${SORT} -u | ${SHUFFLE} |\
split -l ${MAX_SENTENCES} - ${patsubst %${PART}.gz,%,$@}
split -l ${SPLIT_SIZE} - ${patsubst %${PART}.gz,%,$@}
gzip -f ${patsubst %${PART}.gz,%,$@}*

View File

@@ -11,7 +11,7 @@ MEMAD_LANGS = de en fi fr nl sv
memad-multi:
${MAKE} SRCLANGS="${MEMAD_LANGS}" TRGLANGS="${MEMAD_LANGS}" MODELTYPE=transformer data
${MAKE} SRCLANGS="${MEMAD_LANGS}" TRGLANGS="${MEMAD_LANGS}" MODELTYPE=transformer \
WALLTIME=72 HPC_MEM=8g HPC_CORES=1 train.submit-multigpu
WALLTIME=72 HPC_MEM=8g HPC_CORES=1 HPC_DISK=1500 train.submit-multigpu
memad2en:
${MAKE} LANGS="${MEMAD_LANGS}" PIVOT=en all2pivot

View File

@@ -63,8 +63,12 @@ allopus2pivot-small:
${MAKE} WALLTIME=72 SRCLANGS="$$l" TRGLANGS=${PIVOT} train-if-small; \
done
train-if-small:
if [ ${BPESIZE} -lt 12000 ]; then \
${MAKE} HPC_CORES=1 HPC_MEM=4g opus-enxx.submit; \
fi
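Since the size test runs in the recipe's shell, BPESIZE must expand to a plain integer at that point. A hypothetical invocation (values illustrative) that passes the guard and submits the training job:

```
make BPESIZE=4000 SRCLANGS=ga TRGLANGS=en train-if-small
```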
train-if-small-old:
if [ ${BPESIZE} -lt 12000 ]; then \
${MAKE} data; \
${MAKE} train-and-eval-job; \
@@ -74,7 +78,6 @@ train-if-small:
## make models with backtranslations in both directions
## for English-to-other language models
##
@@ -86,4 +89,6 @@ train-if-small:
opus-enxx:
${MAKE} SRCLANGS=${TRG} TRGLANGS=${SRC} all-and-backtranslate-allwikis
${MAKE} all-and-backtranslate-bt
${MAKE} best_dist
${MAKE} SRCLANGS=${TRG} TRGLANGS=${SRC} all-bt
${MAKE} SRCLANGS=${TRG} TRGLANGS=${SRC} best_dist
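A hedged end-to-end example (assuming SRC and TRG are derived from SRCLANGS and TRGLANGS as in the main Makefile): this builds the reverse model with wiki backtranslation, then the forward model on top of the backtranslated data, and packages the best distribution for both directions:

```
make SRCLANGS=en TRGLANGS=br opus-enxx
```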

View File

@@ -1,4 +1,8 @@
# OPUS-MT - models
This is a repository of pre-trained models from the OPUS-MT project.
## License
This machine translation model is part of OPUS-MT