mirror of
https://github.com/Helsinki-NLP/OPUS-MT-train.git
synced 2024-10-05 16:47:21 +03:00
new elg models
This commit is contained in:
parent
cf8ffd68d9
commit
427c4dee5e
10
lib/data.mk
10
lib/data.mk
@ -48,10 +48,10 @@ ifdef DATA_SAMPLING_WEIGHT
|
||||
# FIT_DATA_SIZE: per-language-pair sample size.
# Only set when ${WORKDIR}/train/size_per_language_pair.txt exists.
# The value is field 2 (cut -f2) of the line(s) that scripts/data-sample-sizes.pl
# prints for ${SORTED_LANGPAIR}, called with -w ${DATA_SAMPLING_WEIGHT};
# -m ${MAX_DATA_SIZE} is added only when MAX_DATA_SIZE is defined.
# The assignment uses recursive '=', so the ${shell ...} call is re-run each
# time FIT_DATA_SIZE is expanded.
# NOTE(review): each assignment is followed by two alternative continuation
# lines: an unanchored grep '${SORTED_LANGPAIR}' and an anchored
# grep '^${SORTED_LANGPAIR} '. This looks like the before/after pair of a
# diff; confirm which one is actually in the file. The unanchored pattern
# also matches any line that merely contains the pair as a substring.
ifneq (${wildcard ${WORKDIR}/train/size_per_language_pair.txt},)
|
||||
ifdef MAX_DATA_SIZE
|
||||
FIT_DATA_SIZE = ${shell ${REPOHOME}scripts/data-sample-sizes.pl -w ${DATA_SAMPLING_WEIGHT} -m ${MAX_DATA_SIZE} \
|
||||
${WORKDIR}/train/size_per_language_pair.txt | grep '${SORTED_LANGPAIR}' | cut -f2}
|
||||
${WORKDIR}/train/size_per_language_pair.txt | grep '^${SORTED_LANGPAIR} ' | cut -f2}
|
||||
else
|
||||
FIT_DATA_SIZE = ${shell ${REPOHOME}scripts/data-sample-sizes.pl -w ${DATA_SAMPLING_WEIGHT} \
|
||||
${WORKDIR}/train/size_per_language_pair.txt | grep '${SORTED_LANGPAIR}' | cut -f2}
|
||||
${WORKDIR}/train/size_per_language_pair.txt | grep '^${SORTED_LANGPAIR} ' | cut -f2}
|
||||
endif
|
||||
|
||||
endif
|
||||
endif
|
||||
@ -59,7 +59,7 @@ endif
|
||||
|
||||
# print-data-sampling-size: debugging helper.
# Runs scripts/data-sample-sizes.pl on ${WORKDIR}/train/size_per_language_pair.txt
# with a fixed weight of 0.3 (not ${DATA_SAMPLING_WEIGHT}), prints field 2 of
# the line(s) matching ${SORTED_LANGPAIR}, and then echoes the current value
# of ${FIT_DATA_SIZE} for ${LANGPAIR}.
# NOTE(review): two alternative grep lines follow the command (unanchored vs.
# anchored '^${SORTED_LANGPAIR} '). This looks like the before/after pair of a
# diff; confirm which one is actually in the file.
print-data-sampling-size:
|
||||
${REPOHOME}scripts/data-sample-sizes.pl -w 0.3 \
|
||||
${WORKDIR}/train/size_per_language_pair.txt | grep '${SORTED_LANGPAIR}' | cut -f2
|
||||
${WORKDIR}/train/size_per_language_pair.txt | grep '^${SORTED_LANGPAIR} ' | cut -f2
|
||||
@echo "sample size for ${LANGPAIR}: ${FIT_DATA_SIZE}"
|
||||
|
||||
print-data-sampling-sizes:
|
||||
@ -651,7 +651,7 @@ endif
|
||||
@echo "..... add info about training data"
|
||||
@mkdir -p ${dir ${LOCAL_TRAIN_SRC}} ${dir ${LOCAL_TRAIN_TRG}}
|
||||
@echo -n "* ${SRC}-${TRG}: " >> ${dir ${LOCAL_TRAIN_SRC}}README.md
|
||||
@for d in ${wildcard ${CLEAN_TRAIN_SRC}}; do \
|
||||
for d in ${wildcard ${CLEAN_TRAIN_SRC}}; do \
|
||||
l=`${GZIP} -cd < $$d ${CUT_DATA_SETS} 2>/dev/null | wc -l`; \
|
||||
if [ $$l -gt 0 ]; then \
|
||||
echo "$$d" | xargs basename | \
|
||||
@ -677,7 +677,7 @@ endif
|
||||
# --> do this when FIT_DATA_SIZE is set!
|
||||
######################################
|
||||
ifeq (${SHUFFLE_DATA},1)
|
||||
@if [ -s ${LOCAL_TRAIN_SRC}.${LANGPAIR}.src ]; then \
|
||||
if [ -s ${LOCAL_TRAIN_SRC}.${LANGPAIR}.src ]; then \
|
||||
echo "..... shuffle training data"; \
|
||||
paste ${LOCAL_TRAIN_SRC}.${LANGPAIR}.src ${LOCAL_TRAIN_TRG}.${LANGPAIR}.trg |\
|
||||
${SHUFFLE} > ${LOCAL_TRAIN_SRC}.shuffled; \
|
||||
|
@ -398,6 +398,31 @@ elg-new-bigmodels:
|
||||
${MAKE} MARIAN_EXTRA=--no-restore-corpus MODELTYPE=transformer-big tatoeba-heb2deu-trainjob
|
||||
${MAKE} MARIAN_EXTRA=--no-restore-corpus MODELTYPE=transformer-big tatoeba-vie2deu-trainjob
|
||||
|
||||
# elg-new-bigmodels1: start transformer-big train jobs towards fin and deu.
# Every sub-make is called with MARIAN_EXTRA=--no-restore-corpus and
# MODELTYPE=transformer-big.
#   1. deu->fin: delete work/deu-fin/train/*.gz and size_per_language_pair.txt,
#      then run tatoeba-deu2fin-trainjob-bt (no DATA_SAMPLING_WEIGHT).
#   2. for l in spa fra por ita tur ara zho zls zlw: delete the same files
#      under work/<l>-fin and work/<l>-deu, then run tatoeba-<l>2fin-trainjob
#      and tatoeba-<l>2deu-trainjob with DATA_SAMPLING_WEIGHT=0.3.
#   3. for l in bat gmq heb vie: same as step 2, but for <l>-deu only.
# The rm -f calls presumably force the training data and size table to be
# regenerated with the new sampling weight -- TODO confirm.
# The directories are hard-coded as work/..., not derived from a variable.
# Loop commands are chained with ';', so a failing sub-make does not abort
# the remaining iterations (assuming the recipe shell runs without -e).
elg-new-bigmodels1:
|
||||
rm -f work/deu-fin/train/*.gz work/deu-fin/train/size_per_language_pair.txt
|
||||
${MAKE} MARIAN_EXTRA=--no-restore-corpus MODELTYPE=transformer-big tatoeba-deu2fin-trainjob-bt
|
||||
for l in spa fra por ita tur ara zho zls zlw; do \
|
||||
rm -f work/$${l}-fin/train/*.gz work/$${l}-fin/train/size_per_language_pair.txt; \
|
||||
rm -f work/$${l}-deu/train/*.gz work/$${l}-deu/train/size_per_language_pair.txt; \
|
||||
${MAKE} MARIAN_EXTRA=--no-restore-corpus MODELTYPE=transformer-big DATA_SAMPLING_WEIGHT=0.3 tatoeba-$${l}2fin-trainjob; \
|
||||
${MAKE} MARIAN_EXTRA=--no-restore-corpus MODELTYPE=transformer-big DATA_SAMPLING_WEIGHT=0.3 tatoeba-$${l}2deu-trainjob; \
|
||||
done
|
||||
for l in bat gmq heb vie; do \
|
||||
rm -f work/$${l}-deu/train/*.gz work/$${l}-deu/train/size_per_language_pair.txt; \
|
||||
${MAKE} MARIAN_EXTRA=--no-restore-corpus MODELTYPE=transformer-big DATA_SAMPLING_WEIGHT=0.3 tatoeba-$${l}2deu-trainjob; \
|
||||
done
|
||||
|
||||
|
||||
# elg-new-bigmodels2: start transformer-big train jobs from fin and deu.
# Every sub-make is called with MARIAN_EXTRA=--no-restore-corpus and
# MODELTYPE=transformer-big.
#   1. fin->deu: run tatoeba-fin2deu-trainjob-bt (no DATA_SAMPLING_WEIGHT).
#   2. for l in spa fra por ita tur ara zho zls zlw: run
#      tatoeba-fin2<l>-trainjob and tatoeba-deu2<l>-trainjob with
#      DATA_SAMPLING_WEIGHT=0.3.
#   3. for l in bat gmq heb vie: run tatoeba-deu2<l>-trainjob with
#      DATA_SAMPLING_WEIGHT=0.3.
# No files under work/ are removed by this target.
elg-new-bigmodels2:
|
||||
${MAKE} MARIAN_EXTRA=--no-restore-corpus MODELTYPE=transformer-big tatoeba-fin2deu-trainjob-bt
|
||||
for l in spa fra por ita tur ara zho zls zlw; do \
|
||||
${MAKE} MARIAN_EXTRA=--no-restore-corpus MODELTYPE=transformer-big DATA_SAMPLING_WEIGHT=0.3 tatoeba-fin2$$l-trainjob; \
|
||||
${MAKE} MARIAN_EXTRA=--no-restore-corpus MODELTYPE=transformer-big DATA_SAMPLING_WEIGHT=0.3 tatoeba-deu2$$l-trainjob; \
|
||||
done
|
||||
for l in bat gmq heb vie; do \
|
||||
${MAKE} MARIAN_EXTRA=--no-restore-corpus MODELTYPE=transformer-big DATA_SAMPLING_WEIGHT=0.3 tatoeba-deu2$$l-trainjob; \
|
||||
done
|
||||
|
||||
|
||||
# elg-zho: run the tatoeba-zho2eng-trainjob target with MODELTYPE=transformer-big.
# Neither MARIAN_EXTRA nor DATA_SAMPLING_WEIGHT is passed to the sub-make here.
elg-zho:
|
||||
${MAKE} MODELTYPE=transformer-big tatoeba-zho2eng-trainjob
|
||||
|
Loading…
Reference in New Issue
Block a user