mirror of
https://github.com/Helsinki-NLP/OPUS-MT-train.git
synced 2024-10-26 13:11:38 +03:00
balance dev data in multilingual models and a bug fixed in preprocess script
This commit is contained in:
parent
3cd0bd3f75
commit
cde8f0d0af
@ -27,13 +27,6 @@ ifdef LANGPAIRS
|
||||
endif
|
||||
|
||||
|
||||
## final default is sv-fi
|
||||
## NEW: don't do this ... can create confusion ...
|
||||
##
|
||||
# SRCLANGS ?= sv
|
||||
# TRGLANGS ?= fi
|
||||
|
||||
|
||||
## set SRC and TRG unless they are specified already
|
||||
ifneq (${words ${SRCLANGS}},1)
|
||||
SRC ?= multi
|
||||
@ -54,43 +47,66 @@ endif
|
||||
# TRG ?= ${lastword ${TRGLANGS}}
|
||||
|
||||
|
||||
##----------------------------------------------------------------------
|
||||
## SKIP_LANGPAIRS can be used to skip certain language pairs
|
||||
## in data preparation for multilingual models
|
||||
## ---> this can be good to skip BIG language pairs
|
||||
## that would very much dominate all the data
|
||||
## must be a pattern that can be matched by egrep
|
||||
## e.g. en-de|en-fr
|
||||
##----------------------------------------------------------------------
|
||||
|
||||
SKIP_LANGPAIRS ?= "nothing"
|
||||
|
||||
|
||||
##----------------------------------------------------------------------
|
||||
## set SHUFFLE_DATA if you want to shuffle data for
|
||||
## each language pair to be added to the training data
|
||||
## --> especially useful in connection with FIT_DATA_SIZE
|
||||
##
|
||||
##----------------------------------------------------------------------
|
||||
|
||||
# SHUFFLE_DATA = 1
|
||||
|
||||
## devtest data is shuffled by default
|
||||
SHUFFLE_DEVDATA = 1
|
||||
|
||||
|
||||
##----------------------------------------------------------------------
|
||||
## set FIT_DATA_SIZE to a specific value to fit the training data
|
||||
## to a certain number of lines for each language pair in the collection
|
||||
## --> especially useful for multilingual models for balancing the
|
||||
## size for each language pair
|
||||
## the script does both, over- and undersampling
|
||||
##
|
||||
##----------------------------------------------------------------------
|
||||
|
||||
# FIT_DATA_SIZE = 100000
|
||||
|
||||
## similar for the dev data: set FIT_DEVDATA_SIZE to
|
||||
## balance the size of the devdata for each language pair
|
||||
##
|
||||
# FIT_DEVDATA_SIZE =
|
||||
|
||||
## define a default dev size fit for multilingual models
|
||||
## TODO: is 1000 too small? or too big?
|
||||
## TODO: should this depend on the number of languages involved?
|
||||
|
||||
ifneq (${words ${TRGLANGS}},1)
|
||||
FIT_DEVDATA_SIZE ?= 1000
|
||||
endif
|
||||
ifneq (${words ${SRCLANGS}},1)
|
||||
FIT_DEVDATA_SIZE ?= 1000
|
||||
endif
|
||||
|
||||
## maximum number of times the same data set may be repeated
|
||||
## in oversampling
|
||||
MAX_OVER_SAMPLING ?= 50
|
||||
|
||||
|
||||
##----------------------------------------------------------------------
|
||||
## set CHECK_TRAINDATA_SIZE if you want to check that each
|
||||
## bitext has equal number of lines in source and target
|
||||
## ---> this only prints a warning if not
|
||||
##
|
||||
##----------------------------------------------------------------------
|
||||
# CHECK_TRAINDATA_SIZE
|
||||
|
||||
|
||||
@ -134,11 +150,10 @@ endif
|
||||
DEVSIZE = 2500
|
||||
TESTSIZE = 2500
|
||||
|
||||
## NEW: significantly reduce devminsize
|
||||
## (= absolute minimum we need as devdata)
|
||||
## NEW: define an alternative small size for DEV and TEST
|
||||
## OLD DEVMINSIZE:
|
||||
# DEVMINSIZE = 1000
|
||||
## set some additional thresholds for
|
||||
## the size of test and dev data
|
||||
## DEVMINSIZE is the absolute minimum we require
|
||||
## to run any training procedures
|
||||
|
||||
DEVSMALLSIZE = 1000
|
||||
TESTSMALLSIZE = 1000
|
||||
@ -154,24 +169,8 @@ OPUSREAD_ARGS =
|
||||
## resources in OPUS
|
||||
##----------------------------------------------------------------------------
|
||||
|
||||
## OLD: get corpora directly from the file system
|
||||
#
|
||||
# ELRA_CORPORA = ${patsubst %/latest/xml/${LANGPAIR}.xml.gz,%,\
|
||||
# ${patsubst ${OPUSHOME}/%,%,\
|
||||
# ${shell ls ${OPUSHOME}/ELRA-*/latest/xml/${LANGPAIR}.xml.gz 2>/dev/null}}}
|
||||
#
|
||||
# EXCLUDE_CORPORA ?= WMT-News MPC1 ${ELRA_CORPORA}
|
||||
#
|
||||
# OPUSCORPORA = $(filter-out ${EXCLUDE_CORPORA},${patsubst %/latest/xml/${LANGPAIR}.xml.gz,%,\
|
||||
# ${patsubst ${OPUSHOME}/%,%,\
|
||||
# ${shell ls ${OPUSHOME}/*/latest/xml/${LANGPAIR}.xml.gz 2>/dev/null}}})
|
||||
#
|
||||
# OPUSMONOCORPORA = $(filter-out ${EXCLUDE_CORPORA} ,${patsubst %/latest/mono/${LANGID}.txt.gz,%,\
|
||||
# ${patsubst ${OPUSHOME}/%,%,\
|
||||
# ${shell ls ${OPUSHOME}/*/latest/mono/${LANGID}.txt.gz}}})
|
||||
|
||||
|
||||
## NEW: get data from the OPUS-API
|
||||
## get available data from the OPUS-API
|
||||
|
||||
OPUSAPI = http://opus.nlpl.eu/opusapi/
|
||||
|
||||
@ -185,7 +184,6 @@ get-elra-bitexts = ${shell wget -qq -O - ${OPUSAPI}?source=${1}\&target=${2}\&
|
||||
${JQ} '.corpora[]' | tr '"' ' ' | grep '^ *ELR[CA][-_]'}
|
||||
|
||||
|
||||
|
||||
## start of some functions to check whether there is a resource for downloading
|
||||
## open question: links to the latest release do not exist in the storage
|
||||
## --> would it be better to get that done via the OPUS API?
|
||||
@ -362,8 +360,8 @@ TEST_TRG ?= ${WORKDIR}/test/${TESTSET_NAME}.trg
|
||||
|
||||
MODEL_SUBDIR =
|
||||
MODEL = ${MODEL_SUBDIR}${DATASET}${TRAINSIZE}.${PRE_SRC}-${PRE_TRG}
|
||||
# MODELTYPE = transformer-align
|
||||
MODELTYPE = transformer
|
||||
MODELTYPE = transformer-align
|
||||
# MODELTYPE = transformer
|
||||
NR = 1
|
||||
|
||||
MODEL_BASENAME = ${MODEL}.${MODELTYPE}.model${NR}
|
||||
@ -597,6 +595,9 @@ ifdef SHUFFLE_DATA
|
||||
endif
|
||||
ifdef FIT_DATA_SIZE
|
||||
echo "FIT_DATA_SIZE = ${FIT_DATA_SIZE}" >> $@
|
||||
endif
|
||||
ifdef FIT_DEVDATA_SIZE
|
||||
echo "FIT_DEVDATA_SIZE = ${FIT_DEVDATA_SIZE}" >> $@
|
||||
endif
|
||||
echo "MAX_OVER_SAMPLING = ${MAX_OVER_SAMPLING}" >> $@
|
||||
echo "USE_REST_DEVDATA = ${USE_REST_DEVDATA}" >> $@
|
||||
|
79
lib/data.mk
79
lib/data.mk
@ -9,7 +9,10 @@
|
||||
# - shuffle dev/test data and divide into two disjoint sets
|
||||
# - reverse data sets for the other translation direction (bilingual models only)
|
||||
# - run word alignment if necessary (models with guided alignment = transformer-align)
|
||||
|
||||
#
|
||||
#
|
||||
# TODO: write data info to some model-specific file instead of README.md
|
||||
# (applies for train/val/test!)
|
||||
|
||||
|
||||
## training data size (generates count if not in README.md)
|
||||
@ -348,6 +351,14 @@ endif
|
||||
endif
|
||||
|
||||
|
||||
## add language labels to the source language
|
||||
## if we have multiple target languages
|
||||
|
||||
ifeq (${USE_TARGET_LABELS},1)
|
||||
LABEL_SOURCE_DATA = | sed "s/^/>>${TRG}<< /"
|
||||
endif
|
||||
|
||||
|
||||
## add to the training data
|
||||
|
||||
add-to-local-train-data: ${CLEAN_TRAIN_SRC} ${CLEAN_TRAIN_TRG}
|
||||
@ -377,17 +388,10 @@ endif
|
||||
done
|
||||
echo "" >> ${dir ${LOCAL_TRAIN_SRC}}README.md
|
||||
######################################
|
||||
# do we need to add target language labels?
|
||||
# create local data files (add label if necessary)
|
||||
######################################
|
||||
ifeq (${USE_TARGET_LABELS},1)
|
||||
@echo "set target language labels";
|
||||
@${ZCAT} ${wildcard ${CLEAN_TRAIN_SRC}} ${CUT_DATA_SETS} 2>/dev/null |\
|
||||
sed "s/^/>>${TRG}<< /" > ${LOCAL_TRAIN_SRC}.${LANGPAIR}.src
|
||||
else
|
||||
@echo "only one target language"
|
||||
@${ZCAT} ${wildcard ${CLEAN_TRAIN_SRC}} ${CUT_DATA_SETS} 2>/dev/null \
|
||||
> ${LOCAL_TRAIN_SRC}.${LANGPAIR}.src
|
||||
endif
|
||||
${LABEL_SOURCE_DATA} > ${LOCAL_TRAIN_SRC}.${LANGPAIR}.src
|
||||
@${ZCAT} ${wildcard ${CLEAN_TRAIN_TRG}} ${CUT_DATA_SETS} 2>/dev/null \
|
||||
> ${LOCAL_TRAIN_TRG}.${LANGPAIR}.trg
|
||||
######################################
|
||||
@ -442,6 +446,10 @@ show-devdata:
|
||||
raw-devdata: ${DEV_SRC} ${DEV_TRG}
|
||||
|
||||
|
||||
## TODO: should we have some kind of balanced shuffling
|
||||
## to avoid bias towards bigger language pairs?
|
||||
## maybe introduce over/undersampling of dev data like we have for train data?
|
||||
|
||||
${DEV_SRC}.shuffled.gz:
|
||||
mkdir -p ${dir $@}
|
||||
rm -f ${DEV_SRC} ${DEV_TRG}
|
||||
@ -461,13 +469,13 @@ ifeq (${SHUFFLE_DEVDATA},0)
|
||||
else
|
||||
paste ${DEV_SRC} ${DEV_TRG} | ${UNIQ} | ${SHUFFLE} | ${GZIP} -c > $@
|
||||
endif
|
||||
echo -n "* total size of shuffled dev data: " >> ${dir ${DEV_SRC}}README.md
|
||||
${GZIP} -cd < $@ | wc -l >> ${dir ${DEV_SRC}}README.md
|
||||
echo -n "* total-size-shuffled: " >> ${dir ${DEV_SRC}}README.md
|
||||
${GZIP} -cd < $@ | wc -l >> ${dir ${DEV_SRC}}README.md
|
||||
|
||||
## OLD: don't uniq the dev-data ...
|
||||
##
|
||||
# paste ${DEV_SRC} ${DEV_TRG} | ${SHUFFLE} | ${GZIP} -c > $@
|
||||
|
||||
# echo -n "* total size of shuffled dev data: " >> ${dir ${DEV_SRC}}README.md
|
||||
|
||||
|
||||
## if we have less than twice the amount of DEVMINSIZE in the data set
|
||||
@ -513,17 +521,17 @@ else
|
||||
@${GZIP} -cd < $< | cut -f2 | tail -n +$$((${DEVSIZE} + 1)) | ${GZIP} -c > ${DEV_TRG}.notused.gz
|
||||
endif
|
||||
@echo "" >> ${dir ${DEV_SRC}}/README.md
|
||||
@echo -n "* devset = top " >> ${dir ${DEV_SRC}}/README.md
|
||||
@echo -n "* devset-selected: top " >> ${dir ${DEV_SRC}}/README.md
|
||||
@wc -l < ${DEV_SRC} | tr "\n" ' ' >> ${dir ${DEV_SRC}}/README.md
|
||||
@echo " lines of ${notdir $@}.shuffled!" >> ${dir ${DEV_SRC}}/README.md
|
||||
ifeq (${DEVSET},${TESTSET})
|
||||
@echo -n "* testset = next " >> ${dir ${DEV_SRC}}/README.md
|
||||
@echo -n "* testset-selected: next " >> ${dir ${DEV_SRC}}/README.md
|
||||
@wc -l < ${TEST_SRC} | tr "\n" ' ' >> ${dir ${DEV_SRC}}/README.md
|
||||
@echo " lines of ${notdir $@}.shuffled!" >> ${dir ${DEV_SRC}}/README.md
|
||||
@echo "* remaining lines are added to traindata" >> ${dir ${DEV_SRC}}/README.md
|
||||
@echo "* devset-unused: added to traindata" >> ${dir ${DEV_SRC}}/README.md
|
||||
@echo "# Test data" > ${dir ${TEST_SRC}}/README.md
|
||||
@echo "" >> ${dir ${TEST_SRC}}/README.md
|
||||
@echo -n "testset = next " >> ${dir ${TEST_SRC}}/README.md
|
||||
@echo -n "testset-selected: next " >> ${dir ${TEST_SRC}}/README.md
|
||||
@wc -l < ${TEST_SRC} | tr "\n" ' ' >> ${dir ${TEST_SRC}}/README.md
|
||||
@echo " lines of ../val/${notdir $@}.shuffled!" >> ${dir ${TEST_SRC}}/README.md
|
||||
endif
|
||||
@ -538,15 +546,20 @@ add-to-dev-data: ${CLEAN_DEV_SRC} ${CLEAN_DEV_TRG}
|
||||
@mkdir -p ${dir ${DEV_SRC}}
|
||||
@echo -n "* ${LANGPAIR}: ${DEVSET}, " >> ${dir ${DEV_SRC}}README.md
|
||||
@${ZCAT} ${CLEAN_DEV_SRC} 2>/dev/null | wc -l >> ${dir ${DEV_SRC}}README.md
|
||||
ifeq (${USE_TARGET_LABELS},1)
|
||||
@echo "more than one target language";
|
||||
@${ZCAT} ${CLEAN_DEV_SRC} 2>/dev/null |\
|
||||
sed "s/^/>>${TRG}<< /" >> ${DEV_SRC}
|
||||
#-----------------------------------------------------------------
|
||||
# sample devdata to balance size between different language pairs
|
||||
# (only if FIT_DEVDATA_SIZE is set)
|
||||
#-----------------------------------------------------------------
|
||||
ifdef FIT_DEVDATA_SIZE
|
||||
@echo "sample dev data to fit size = ${FIT_DEVDATA_SIZE}"
|
||||
@scripts/fit-data-size.pl -m ${MAX_OVER_SAMPLING} ${FIT_DEVDATA_SIZE} \
|
||||
${CLEAN_DEV_SRC} 2>/dev/null ${LABEL_SOURCE_DATA} >> ${DEV_SRC}
|
||||
@scripts/fit-data-size.pl -m ${MAX_OVER_SAMPLING} ${FIT_DEVDATA_SIZE} \
|
||||
${CLEAN_DEV_TRG} 2>/dev/null >> ${DEV_TRG}
|
||||
else
|
||||
@echo "only one target language"
|
||||
@${ZCAT} ${CLEAN_DEV_SRC} 2>/dev/null >> ${DEV_SRC}
|
||||
@${ZCAT} ${CLEAN_DEV_SRC} 2>/dev/null ${LABEL_SOURCE_DATA} >> ${DEV_SRC}
|
||||
@${ZCAT} ${CLEAN_DEV_TRG} 2>/dev/null >> ${DEV_TRG}
|
||||
endif
|
||||
@${ZCAT} ${CLEAN_DEV_TRG} 2>/dev/null >> ${DEV_TRG}
|
||||
|
||||
|
||||
####################
|
||||
@ -583,8 +596,8 @@ ifneq (${TESTSET},${DEVSET})
|
||||
paste ${TEST_SRC} ${TEST_TRG} | ${SHUFFLE} | ${GZIP} -c > $@.shuffled.gz; \
|
||||
${GZIP} -cd < $@.shuffled.gz | cut -f1 | tail -${TESTSIZE} > ${TEST_SRC}; \
|
||||
${GZIP} -cd < $@.shuffled.gz | cut -f2 | tail -${TESTSIZE} > ${TEST_TRG}; \
|
||||
echo "" >> ${dir $@}/README.md; \
|
||||
echo "testset = top ${TESTSIZE} lines of $@.shuffled!" >> ${dir $@}/README.md; \
|
||||
echo "" >> ${dir $@}/README.md; \
|
||||
echo "testset-selected: top ${TESTSIZE} lines of $@.shuffled!" >> ${dir $@}/README.md; \
|
||||
fi \
|
||||
else \
|
||||
echo "test set $@ exists already! Don't overwrite!"; \
|
||||
@ -611,18 +624,8 @@ ${TEST_TRG}: ${TEST_SRC}
|
||||
add-to-test-data: ${CLEAN_TEST_SRC}
|
||||
@echo "add to testset: ${CLEAN_TEST_SRC}"
|
||||
@echo "* ${LANGPAIR}: ${TESTSET}" >> ${dir ${TEST_SRC}}README.md
|
||||
ifeq (${USE_TARGET_LABELS},1)
|
||||
@echo "more than one target language";
|
||||
@echo "${ZCAT} ${CLEAN_TEST_SRC} | sed 's/^/>>${TRG}<< /' >> ${TEST_SRC}"
|
||||
@${ZCAT} ${CLEAN_TEST_SRC} 2>/dev/null |\
|
||||
sed "s/^/>>${TRG}<< /" >> ${TEST_SRC}
|
||||
else
|
||||
@echo "only one target language"
|
||||
@echo "${ZCAT} ${CLEAN_TEST_SRC} >> ${TEST_SRC}"
|
||||
@${ZCAT} ${CLEAN_TEST_SRC} 2>/dev/null >> ${TEST_SRC}
|
||||
endif
|
||||
@echo "${ZCAT} ${CLEAN_TEST_TRG} >> ${TEST_TRG}"
|
||||
@${ZCAT} ${CLEAN_TEST_TRG} 2>/dev/null >> ${TEST_TRG}
|
||||
@${ZCAT} ${CLEAN_TEST_SRC} 2>/dev/null ${LABEL_SOURCE_DATA} >> ${TEST_SRC}
|
||||
@${ZCAT} ${CLEAN_TEST_TRG} 2>/dev/null >> ${TEST_TRG}
|
||||
|
||||
|
||||
|
||||
|
77
lib/dist.mk
77
lib/dist.mk
@ -200,11 +200,15 @@ RAWSRCLANGS = ${sort ${basename ${basename ${subst _,.,${subst -,.,${SRCLANGS}}}
|
||||
RAWTRGLANGS = ${sort ${basename ${basename ${subst _,.,${subst -,.,${TRGLANGS}}}}}}
|
||||
|
||||
## language labels in multilingual models
|
||||
## (NEW: take them directly from the model vocabulary
|
||||
## to avoid listing labels that are not used)
|
||||
|
||||
# LANGUAGELABELS = ${patsubst %,>>%<<,${TRGLANGS}}
|
||||
LANGUAGELABELS = ${shell grep '">>.*<<"' ${MODEL_SRCVOCAB} | cut -f1 -d: | sed 's/"//g'}
|
||||
|
||||
## BETTER: take them directly from the model vocabulary!
|
||||
## advantage: list all labels that are valid in the model
|
||||
## disadvantage: can be misleading because we may have labels that are not trained
|
||||
##
|
||||
LANGUAGELABELS = ${shell grep '">>.*<<"' ${MODEL_SRCVOCAB} | cut -f1 -d: | sed 's/"//g'}
|
||||
LANGUAGELABELSRAW = ${shell echo "${LANGUAGELABELS}" | sed 's/>>//g;s/<<//g'}
|
||||
|
||||
|
||||
|
||||
model-yml: ${MODEL_YML}
|
||||
@ -244,18 +248,24 @@ ${MODEL_YML}: ${MODEL_FINAL}
|
||||
@echo "release-date: $(DATE)" >> $@
|
||||
@echo "dataset-name: $(DATASET)" >> $@
|
||||
@echo "modeltype: $(MODELTYPE)" >> $@
|
||||
@echo "vocabulary:" >> $@
|
||||
@echo " source: ${notdir ${MODEL_SRCVOCAB}}" >> $@
|
||||
@echo " target: ${notdir ${MODEL_TRGVOCAB}}" >> $@
|
||||
@echo "pre-processing: ${PREPROCESS_DESCRIPTION}" >> $@
|
||||
@echo "subwords:" >> $@
|
||||
@echo " - source: ${PRE_SRC}" >> $@
|
||||
@echo " - target: ${PRE_TRG}" >> $@
|
||||
@echo " source: ${PRE_SRC}" >> $@
|
||||
@echo " target: ${PRE_TRG}" >> $@
|
||||
@echo "subword-models:" >> $@
|
||||
@echo " - source: source.${SUBWORD_TYPE}" >> $@
|
||||
@echo " - target: target.${SUBWORD_TYPE}" >> $@
|
||||
@echo " source: source.${SUBWORD_TYPE}" >> $@
|
||||
@echo " target: target.${SUBWORD_TYPE}" >> $@
|
||||
ifdef USE_TARGET_LABELS
|
||||
@echo "use-target-labels:" >> $@
|
||||
@for t in ${TRGLANGS}; do \
|
||||
echo " - >>$$t<<" >> $@; \
|
||||
@for t in ${LANGUAGELABELSRAW}; do \
|
||||
echo " - \">>$$t<<\"" >> $@; \
|
||||
done
|
||||
# @for t in ${TRGLANGS}; do \
|
||||
# echo " - '>>$$t<<'" >> $@; \
|
||||
# done
|
||||
endif
|
||||
@echo "source-languages:" >> $@
|
||||
@for s in ${RAWSRCLANGS}; do\
|
||||
@ -271,13 +281,13 @@ ifneq ("$(wildcard ${WORKDIR}/train/README.md)","")
|
||||
tr "#" "\n" | grep '^ ${DATASET}~' | \
|
||||
tail -1 | tr "~" "\n" | grep '^\* ' | \
|
||||
grep -v ': *$$' | grep -v ' 0$$' | \
|
||||
grep -v 'total size' | sed 's/^\* / - /' >> $@
|
||||
grep -v 'total size' | sed 's/^\* / /' >> $@
|
||||
endif
|
||||
ifneq ("$(wildcard ${WORKDIR}/val/README.md)","")
|
||||
@echo "validation-data:" >> $@
|
||||
grep '^\* ' ${WORKDIR}/val/README.md | \
|
||||
grep -v ' 0$$' | \
|
||||
sed 's/^\* / - /' >> $@
|
||||
sed 's/^\* / /' >> $@
|
||||
endif
|
||||
##-----------------------------
|
||||
## add benchmark results
|
||||
@ -301,11 +311,11 @@ ifneq ("$(wildcard ${TEST_EVALUATION})","")
|
||||
cut -f7 -d ' ' > $@.6
|
||||
@paste -d '/' $@.4 $@.5 > $@.7
|
||||
@echo "test-data:" >> $@
|
||||
@paste -d' ' $@.1 $@.7 | sed 's/ /: /;s/^/ - /;' >> $@
|
||||
@paste -d' ' $@.1 $@.7 | sed 's/ /: /;s/^/ /;' >> $@
|
||||
@echo "BLEU-scores:" >> $@
|
||||
@paste -d' ' $@.1 $@.2 | sed 's/ /: /;s/^/ - /' >> $@
|
||||
@paste -d' ' $@.1 $@.2 | sed 's/ /: /;s/^/ /' >> $@
|
||||
@echo "chr-F-scores:" >> $@
|
||||
@paste -d' ' $@.1 $@.3 | sed 's/ /: /;s/^/ - /' >> $@
|
||||
@paste -d' ' $@.1 $@.3 | sed 's/ /: /;s/^/ /' >> $@
|
||||
@rm -f $@.1 $@.2 $@.3 $@.4 $@.5 $@.6 $@.7
|
||||
endif
|
||||
|
||||
@ -785,7 +795,7 @@ ${EVALTRANSL}: # ${WORKHOME}/eval/%.test.txt: ${MODELSHOME}/%.compare
|
||||
|
||||
|
||||
######################################################################
|
||||
## handle old models in previous work directories
|
||||
## misc recipes ... all kinds of fixes
|
||||
## obsolete now?
|
||||
######################################################################
|
||||
|
||||
@ -948,3 +958,38 @@ remove-underperforming:
|
||||
echo "keep $$d"; \
|
||||
fi \
|
||||
done
|
||||
|
||||
|
||||
## dist-remove-no-date-dist:
##   write the object listing of ${MODEL_CONTAINER} to index.txt (in the
##   current directory) and run `swift delete` on every listed entry that
##   matches the grep pattern `opus.zip`
##   NOTE(review): the target name suggests these are release packages without
##   a date in their file name -- confirm before running, deletion is permanent
##   NOTE(review): the dot in the grep pattern is unescaped (matches any character)
dist-remove-no-date-dist:
|
||||
swift list ${MODEL_CONTAINER} > index.txt
|
||||
for d in `grep opus.zip index.txt`; do \
|
||||
swift delete ${MODEL_CONTAINER} $$d; \
|
||||
done
|
||||
|
||||
## dist-remove-old-yml:
##   write the object listing of the Tatoeba-MT-models container to index.txt
##   and run `swift delete` on every listed entry that matches `old-yml`
##   (the container name is hard-coded here; ${MODEL_CONTAINER} is not used)
dist-remove-old-yml:
|
||||
swift list Tatoeba-MT-models > index.txt
|
||||
for d in `grep old-yml index.txt`; do \
|
||||
swift delete Tatoeba-MT-models $$d; \
|
||||
done
|
||||
|
||||
## dist-fix-preprocess:
##   patch preprocess.sh inside the zip packages stored in Tatoeba-MT-models
##   for each listed entry matching '.zip' (working inside ./tmp):
##     - download the package and extract preprocess.sh from it
##     - keep the extracted script as preprocess-old.sh and create a new
##       preprocess.sh in which everything from `perl -C -pe` to the end of
##       that line is replaced by the fixed perl substitution given in the
##       sed command below
##     - only if old and new script differ: update the script in the zip file
##       and upload the package again
##     - remove the local preprocess.sh and the downloaded package
##   preprocess-old.sh is not removed; it is overwritten in the next iteration
##   NOTE(review): ${dir $@} is expanded by make, not by the shell; for a target
##   without a directory part it should expand to ./ so chmod acts on the file
##   in tmp -- confirm
##   NOTE(review): the dot in the grep pattern '.zip' is unescaped
dist-fix-preprocess:
|
||||
mkdir -p tmp
|
||||
( cd tmp; \
|
||||
swift list Tatoeba-MT-models > index.txt; \
|
||||
for d in `grep '.zip' index.txt`; do \
|
||||
echo "check $$d ..."; \
|
||||
swift download Tatoeba-MT-models $$d; \
|
||||
unzip $$d preprocess.sh; \
|
||||
mv preprocess.sh preprocess-old.sh; \
|
||||
sed 's#perl -C -pe.*$$#perl -C -pe "s/(?!\\n)\\p{C}/ /g;" |#' \
|
||||
< preprocess-old.sh > preprocess.sh; \
|
||||
chmod +x ${dir $@}/preprocess.sh; \
|
||||
if [ `diff preprocess-old.sh preprocess.sh | wc -l` -gt 0 ]; then \
|
||||
echo "replace old preprocess in $$d and upload again"; \
|
||||
zip -u $$d preprocess.sh; \
|
||||
swift upload Tatoeba-MT-models --changed --skip-identical $$d; \
|
||||
fi; \
|
||||
rm -f preprocess.sh; \
|
||||
rm -f $$d; \
|
||||
done )
|
||||
|
||||
|
@ -129,8 +129,8 @@ else ifeq (${shell hostname --domain 2>/dev/null},bullx)
|
||||
MOSESHOME = ${APPLHOME}/mosesdecoder
|
||||
MOSESSCRIPTS = ${MOSESHOME}/scripts
|
||||
EFLOMAL_HOME = ${APPLHOME}/eflomal/
|
||||
MARIAN_HOME = ${APPLHOME}/marian-dev/build/
|
||||
MARIAN = ${APPLHOME}/marian-dev/build
|
||||
MARIAN_HOME = ${APPLHOME}/marian/build/
|
||||
MARIAN = ${APPLHOME}/marian/build
|
||||
SPM_HOME = ${MARIAN_HOME}
|
||||
GPU = v100
|
||||
GPU_MODULES = python-env
|
||||
|
@ -21,8 +21,9 @@ AMERICASNLP_TRGALL = $(sort ${AMERICASNLP_TRG} ${AMERICASNLP_TRG_EXTRA} \
|
||||
# AMERICASNLP_TRGALL = ${sort ${AMERICASNLP_TRG} ${AMERICASNLP_TRG_EXTRA} ${LANGGROUP_NAI} ${LANGGROUP_SAI}}
|
||||
|
||||
|
||||
AMERICASNLP_BPESIZE = 32000
|
||||
AMERICASNLP_FIT_DATA_SIZE = 100000
|
||||
AMERICASNLP_PIVOT = en
|
||||
AMERICASNLP_PIVOT = en
|
||||
|
||||
|
||||
# /scratch/project_2001194/yves/americas/backtrans/merged/*.dedup.*
|
||||
@ -78,6 +79,27 @@ americasnlp-testdata:
|
||||
done
|
||||
|
||||
|
||||
AMERICASNLP_YVES_SPLITDIR = /scratch/project_2001194/yves/americas/backtrans/split_dev
|
||||
|
||||
## americasnlp-finetunedata:
##   create gzipped fine-tuning data sets in ${AMERICASNLP_WORK}/data/simple
##   from the split dev files in ${AMERICASNLP_YVES_SPLITDIR}
##   for each language l in the list below:
##     - p is the language pair name; the ID that sorts first in the string
##       comparison of l and "es" is put first (l-es or es-l)
##     - americasnlp2021-tunedev   = first 50 lines of dev1_l.{l,es}
##     - americasnlp2021-tunetrain = dev1_l.{l,es} from line 51 onwards
##     - americasnlp2021-tunetest  = complete dev2_l.{l,es}
##   NOTE(review): the output directory is not created by this recipe
americasnlp-finetunedata:
|
||||
for l in aym bzd cni gn hch nah oto quy shp tar; do \
|
||||
if [ $$l \< es ]; then p="$$l-es"; else p="es-$$l"; fi; \
|
||||
head -50 ${AMERICASNLP_YVES_SPLITDIR}/dev1_$$l.$$l |\
|
||||
gzip -c > ${AMERICASNLP_WORK}/data/simple/americasnlp2021-tunedev.$$p.clean.$$l.gz; \
|
||||
head -50 ${AMERICASNLP_YVES_SPLITDIR}/dev1_$$l.es |\
|
||||
gzip -c > ${AMERICASNLP_WORK}/data/simple/americasnlp2021-tunedev.$$p.clean.es.gz; \
|
||||
tail -n +51 ${AMERICASNLP_YVES_SPLITDIR}/dev1_$$l.$$l |\
|
||||
gzip -c > ${AMERICASNLP_WORK}/data/simple/americasnlp2021-tunetrain.$$p.clean.$$l.gz; \
|
||||
tail -n +51 ${AMERICASNLP_YVES_SPLITDIR}/dev1_$$l.es |\
|
||||
gzip -c > ${AMERICASNLP_WORK}/data/simple/americasnlp2021-tunetrain.$$p.clean.es.gz; \
|
||||
gzip -c < ${AMERICASNLP_YVES_SPLITDIR}/dev2_$$l.$$l \
|
||||
> ${AMERICASNLP_WORK}/data/simple/americasnlp2021-tunetest.$$p.clean.$$l.gz; \
|
||||
gzip -c < ${AMERICASNLP_YVES_SPLITDIR}/dev2_$$l.es \
|
||||
> ${AMERICASNLP_WORK}/data/simple/americasnlp2021-tunetest.$$p.clean.es.gz; \
|
||||
done
|
||||
|
||||
|
||||
|
||||
AMERICASNLP_YVES_BTDIR = /scratch/project_2001194/yves/americas/backtrans/merged
|
||||
|
||||
americasnlp-btdata:
|
||||
@ -97,9 +119,9 @@ americasnlp-btdata:
|
||||
BACKTRANS_HOME=${AMERICASNLP_BTHOME} \
|
||||
TESTSET_HOME=${AMERICASNLP_TESTSETS} \
|
||||
MODELTYPE=transformer-align \
|
||||
SRCBPESIZE=32000 \
|
||||
TRGBPESIZE=32000 \
|
||||
BPESIZE=32000 \
|
||||
SRCBPESIZE=${AMERICASNLP_BPESIZE} \
|
||||
TRGBPESIZE=${AMERICASNLP_BPESIZE} \
|
||||
BPESIZE=${AMERICASNLP_BPESIZE} \
|
||||
GPUJOB_HPC_MEM=8g \
|
||||
MARIAN_VALID_FREQ=2500 \
|
||||
DATASET=americasnlp \
|
||||
@ -123,29 +145,101 @@ americasnlp-btdata:
|
||||
MARIAN_EARLY_STOPPING=${TUNE_EARLY_STOPPING} \
|
||||
MARIAN_EXTRA='-e 5 --no-restore-corpus' \
|
||||
GPUJOB_SUBMIT=${TUNE_GPUJOB_SUBMIT} \
|
||||
DATASET=americasnlp-tuned4${TUNE_SRC}2${TUNE_TRG} \
|
||||
DATASET=opus-americasnlp-tuned4${TUNE_SRC}2${TUNE_TRG} \
|
||||
SRCLANGS="${TUNE_SRC}" \
|
||||
TRGLANGS="${TUNE_TRG}" \
|
||||
USE_TARGET_LABELS=1 \
|
||||
USE_REST_DEVDATA=0 \
|
||||
WORKHOME=${AMERICASNLP_WORK} \
|
||||
BACKTRANS_HOME=${AMERICASNLP_BTHOME} \
|
||||
TESTSET_HOME=${AMERICASNLP_TESTSETS} \
|
||||
MODELTYPE=transformer-align \
|
||||
SRCBPESIZE=32000 \
|
||||
TRGBPESIZE=32000 \
|
||||
BPESIZE=32000 \
|
||||
SRCBPESIZE=${AMERICASNLP_BPESIZE} \
|
||||
TRGBPESIZE=${AMERICASNLP_BPESIZE} \
|
||||
BPESIZE=${AMERICASNLP_BPESIZE} \
|
||||
GPUJOB_HPC_MEM=8g \
|
||||
TRAINSET=americasnlp2021-train \
|
||||
EXTRA_TRAINSET=americasnlp2021-train \
|
||||
DEVSET=americasnlp2021-dev \
|
||||
TESTSET=americasnlp2021-test \
|
||||
DEVSET_NAME=americasnlp2021-dev \
|
||||
TESTSET_NAME=americasnlp2021-test \
|
||||
FIT_DATA_SIZE=${AMERICASNLP_FIT_DATA_SIZE} \
|
||||
SHUFFLE_DATA=1 \
|
||||
LANGPAIRSTR="es-xx" \
|
||||
TRAINSET=americasnlp2021-tunetrain \
|
||||
EXTRA_TRAINSET=americasnlp2021-tunetrain \
|
||||
DEVSET=americasnlp2021-tunedev \
|
||||
TESTSET=americasnlp2021-tunetest \
|
||||
DEVSET_NAME=americasnlp2021-tunedev \
|
||||
TESTSET_NAME=americasnlp2021-tunetest \
|
||||
LANGPAIRSTR="es+en-xx" \
|
||||
${@:-americasnlp-langtune=}
|
||||
|
||||
|
||||
## tune for all languages
|
||||
|
||||
## %-americasnlp-tune:
##   run the make goal given by the target stem (target name without the
##   suffix -americasnlp-tune) with the settings for tuning on all ten
##   target languages at once:
##     - MODEL_LATEST / MODEL_LATEST_VOCAB point to the es+en-xx
##       opus-americasnlp model in ${AMERICASNLP_WORK}
##       (presumably the starting point for continued training -- confirm
##        where MODEL_LATEST is consumed)
##     - validation/display/save frequency, early stopping and job submission
##       are taken from the TUNE_* variables
##     - train/dev/test sets are the americasnlp2021-tune* data sets
##     - the result is stored as DATASET=opus-americasnlp-tuned4es2all
##   NOTE(review): the model path hard-codes spm32k although the subword size
##   is configurable via AMERICASNLP_BPESIZE
##   NOTE(review): the recipe calls plain `make`, not ${MAKE}
%-americasnlp-tune:
|
||||
make MODEL_LATEST=${AMERICASNLP_WORK}/es+en-xx/opus-americasnlp.spm32k-spm32k.transformer-align.model1.npz.best-perplexity.npz \
|
||||
MODEL_LATEST_VOCAB=${AMERICASNLP_WORK}/es+en-xx/opus-americasnlp.spm32k-spm32k.vocab.yml \
|
||||
MARIAN_VALID_FREQ=${TUNE_VALID_FREQ} \
|
||||
MARIAN_DISP_FREQ=${TUNE_DISP_FREQ} \
|
||||
MARIAN_SAVE_FREQ=${TUNE_SAVE_FREQ} \
|
||||
MARIAN_EARLY_STOPPING=${TUNE_EARLY_STOPPING} \
|
||||
MARIAN_EXTRA='-e 5 --no-restore-corpus' \
|
||||
GPUJOB_SUBMIT=${TUNE_GPUJOB_SUBMIT} \
|
||||
DATASET=opus-americasnlp-tuned4es2all \
|
||||
SRCLANGS="es" \
|
||||
TRGLANGS="aym bzd cni gn hch nah oto quy shp tar" \
|
||||
USE_TARGET_LABELS=1 \
|
||||
USE_REST_DEVDATA=0 \
|
||||
WORKHOME=${AMERICASNLP_WORK} \
|
||||
BACKTRANS_HOME=${AMERICASNLP_BTHOME} \
|
||||
TESTSET_HOME=${AMERICASNLP_TESTSETS} \
|
||||
MODELTYPE=transformer-align \
|
||||
SRCBPESIZE=${AMERICASNLP_BPESIZE} \
|
||||
TRGBPESIZE=${AMERICASNLP_BPESIZE} \
|
||||
BPESIZE=${AMERICASNLP_BPESIZE} \
|
||||
GPUJOB_HPC_MEM=8g \
|
||||
TRAINSET=americasnlp2021-tunetrain \
|
||||
EXTRA_TRAINSET=americasnlp2021-tunetrain \
|
||||
DEVSET=americasnlp2021-tunedev \
|
||||
TESTSET=americasnlp2021-tunetest \
|
||||
DEVSET_NAME=americasnlp2021-tunedev \
|
||||
TESTSET_NAME=americasnlp2021-tunetest \
|
||||
LANGPAIRSTR="es+en-xx" \
|
||||
${@:-americasnlp-tune=}
|
||||
|
||||
|
||||
## %-americasnlp-tunebt:
##   same settings as the tuning recipe for all languages, but based on the
##   model that includes back-translations:
##     - MODEL_LATEST / MODEL_LATEST_VOCAB point to the es+en-xx
##       opus-americasnlp+bt model in ${AMERICASNLP_WORK}
##     - the result is stored as DATASET=opus-americasnlp+bt-tuned4es2all
##   the make goal is the target stem (target name without the suffix
##   -americasnlp-tunebt)
##   NOTE(review): the model path hard-codes spm32k although the subword size
##   is configurable via AMERICASNLP_BPESIZE
##   NOTE(review): the recipe calls plain `make`, not ${MAKE}
%-americasnlp-tunebt:
|
||||
make MODEL_LATEST=${AMERICASNLP_WORK}/es+en-xx/opus-americasnlp+bt.spm32k-spm32k.transformer-align.model1.npz.best-perplexity.npz \
|
||||
MODEL_LATEST_VOCAB=${AMERICASNLP_WORK}/es+en-xx/opus-americasnlp+bt.spm32k-spm32k.vocab.yml \
|
||||
MARIAN_VALID_FREQ=${TUNE_VALID_FREQ} \
|
||||
MARIAN_DISP_FREQ=${TUNE_DISP_FREQ} \
|
||||
MARIAN_SAVE_FREQ=${TUNE_SAVE_FREQ} \
|
||||
MARIAN_EARLY_STOPPING=${TUNE_EARLY_STOPPING} \
|
||||
MARIAN_EXTRA='-e 5 --no-restore-corpus' \
|
||||
GPUJOB_SUBMIT=${TUNE_GPUJOB_SUBMIT} \
|
||||
DATASET=opus-americasnlp+bt-tuned4es2all \
|
||||
SRCLANGS="es" \
|
||||
TRGLANGS="aym bzd cni gn hch nah oto quy shp tar" \
|
||||
USE_TARGET_LABELS=1 \
|
||||
USE_REST_DEVDATA=0 \
|
||||
WORKHOME=${AMERICASNLP_WORK} \
|
||||
BACKTRANS_HOME=${AMERICASNLP_BTHOME} \
|
||||
TESTSET_HOME=${AMERICASNLP_TESTSETS} \
|
||||
MODELTYPE=transformer-align \
|
||||
SRCBPESIZE=${AMERICASNLP_BPESIZE} \
|
||||
TRGBPESIZE=${AMERICASNLP_BPESIZE} \
|
||||
BPESIZE=${AMERICASNLP_BPESIZE} \
|
||||
GPUJOB_HPC_MEM=8g \
|
||||
TRAINSET=americasnlp2021-tunetrain \
|
||||
EXTRA_TRAINSET=americasnlp2021-tunetrain \
|
||||
DEVSET=americasnlp2021-tunedev \
|
||||
TESTSET=americasnlp2021-tunetest \
|
||||
DEVSET_NAME=americasnlp2021-tunedev \
|
||||
TESTSET_NAME=americasnlp2021-tunetest \
|
||||
LANGPAIRSTR="es+en-xx" \
|
||||
${@:-americasnlp-tunebt=}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
%-americasnlp-reverse:
|
||||
make TRGLANGS="${AMERICASNLP_SRC}" \
|
||||
@ -154,9 +248,9 @@ americasnlp-btdata:
|
||||
BACKTRANS_HOME=${AMERICASNLP_BTHOME} \
|
||||
TESTSET_HOME=${AMERICASNLP_TESTSETS} \
|
||||
MODELTYPE=transformer-align \
|
||||
SRCBPESIZE=32000 \
|
||||
TRGBPESIZE=32000 \
|
||||
BPESIZE=32000 \
|
||||
SRCBPESIZE=${AMERICASNLP_BPESIZE} \
|
||||
TRGBPESIZE=${AMERICASNLP_BPESIZE} \
|
||||
BPESIZE=${AMERICASNLP_BPESIZE} \
|
||||
GPUJOB_HPC_MEM=8g \
|
||||
MARIAN_VALID_FREQ=2500 \
|
||||
DATASET=americasnlp \
|
||||
@ -179,9 +273,9 @@ americasnlp-btdata:
|
||||
BACKTRANS_HOME=${AMERICASNLP_BTHOME} \
|
||||
TESTSET_HOME=${AMERICASNLP_TESTSETS} \
|
||||
MODELTYPE=transformer-align \
|
||||
SRCBPESIZE=32000 \
|
||||
TRGBPESIZE=32000 \
|
||||
BPESIZE=32000 \
|
||||
SRCBPESIZE=${AMERICASNLP_BPESIZE} \
|
||||
TRGBPESIZE=${AMERICASNLP_BPESIZE} \
|
||||
BPESIZE=${AMERICASNLP_BPESIZE} \
|
||||
GPUJOB_HPC_MEM=8g \
|
||||
DATASET=opus-americasnlp \
|
||||
EXTRA_TRAINSET=americasnlp2021-train \
|
||||
@ -201,9 +295,9 @@ americasnlp-btdata:
|
||||
BACKTRANS_HOME=${AMERICASNLP_BTHOME} \
|
||||
TESTSET_HOME=${AMERICASNLP_TESTSETS} \
|
||||
MODELTYPE=transformer-align \
|
||||
SRCBPESIZE=32000 \
|
||||
TRGBPESIZE=32000 \
|
||||
BPESIZE=32000 \
|
||||
SRCBPESIZE=${AMERICASNLP_BPESIZE} \
|
||||
TRGBPESIZE=${AMERICASNLP_BPESIZE} \
|
||||
BPESIZE=${AMERICASNLP_BPESIZE} \
|
||||
GPUJOB_HPC_MEM=8g \
|
||||
DATASET=opus-americasnlp \
|
||||
EXTRA_TRAINSET=americasnlp2021-train \
|
||||
|
@ -134,10 +134,11 @@ endif
|
||||
ifneq (${wildcard ${TATOEBA_TRGLABELFILE}},)
|
||||
TATOEBA_TRGLANGS = ${shell cat ${TATOEBA_TRGLABELFILE}}
|
||||
endif
|
||||
ifndef USE_TARGET_LABELS
|
||||
# ifndef USE_TARGET_LABELS
|
||||
ifdef TATOEBA_TRGLANGS
|
||||
ifneq (${words ${TATOEBA_TRGLANGS}},1)
|
||||
USE_TARGET_LABELS = 1
|
||||
TARGET_LABELS = $(patsubst %,>>%<<,${TATOEBA_TRGLANGS})
|
||||
TARGET_LABELS = $(patsubst %,>>%<<,$(sort ${TATOEBA_TRGLANGS} ${TATOEBA_TRGLANG_GROUP}))
|
||||
endif
|
||||
endif
|
||||
|
||||
@ -616,6 +617,12 @@ find-srclanggroup = $(call find-langgroup,$(firstword ${subst -, ,${subst 2, ,${
|
||||
find-trglanggroup = $(call find-langgroup,$(lastword ${subst -, ,${subst 2, ,${1}}}),${2})
|
||||
|
||||
|
||||
## tatoeba-%-langs:
##   print the source and the target languages that find-srclanggroup and
##   find-trglanggroup return for a stem of the form <src>2<trg>
##   (first line = source languages, second line = target languages)
##   - the leading '-' tells make to ignore errors from this recipe
##   - the shell variables s and t are set but not used in the echo commands
tatoeba-%-langs:
|
||||
-( s=$(firstword $(subst 2, ,$(patsubst tatoeba-%-langs,%,$@))); \
|
||||
t=$(lastword $(subst 2, ,$(patsubst tatoeba-%-langs,%,$@))); \
|
||||
echo "${call find-srclanggroup,${patsubst tatoeba-%-langs,%,$@},${PIVOT}}"; \
|
||||
echo "${call find-trglanggroup,${patsubst tatoeba-%-langs,%,$@},${PIVOT}}"; )
|
||||
|
||||
|
||||
## create data sets (also works for language groups)
|
||||
tatoeba-%-data:
|
||||
@ -627,13 +634,25 @@ tatoeba-%-data:
|
||||
if [ `echo $$T | tr ' ' "\n" | wc -l` -ge ${MIN_TRGLANGS} ]; then \
|
||||
if [ `echo $$S | tr ' ' "\n" | wc -l` -le ${MAX_SRCLANGS} ]; then \
|
||||
if [ `echo $$T | tr ' ' "\n" | wc -l` -le ${MAX_TRGLANGS} ]; then \
|
||||
${MAKE} LANGPAIRSTR=$$s-$$t SRCLANGS="$$S" TRGLANGS="$$T" tatoeba-prepare; \
|
||||
${MAKE} LANGPAIRSTR=$$s-$$t \
|
||||
SRCLANGS="$$S" TRGLANGS="$$T" \
|
||||
TATOEBA_SRCLANG_GROUP="`langgroup -n $$s`" \
|
||||
TATOEBA_TRGLANG_GROUP="`langgroup -n $$t`" \
|
||||
tatoeba-prepare; \
|
||||
fi \
|
||||
fi \
|
||||
fi \
|
||||
fi )
|
||||
|
||||
|
||||
## test target: print the results of the language group lookups
## for the example group 'roa' (no data sets are created here)
##   1. find-trglanggroup for eng2roa (empty pivot argument)
##   2. find-langgroup for roa (empty pivot argument)
##   3. output of `langgroup roa | xargs iso639 -m -n`
##   4. the same output filtered by ${OPUS_LANGS3}
tatoeba-labeltest:
|
||||
@echo "${call find-trglanggroup,eng2roa,}"
|
||||
@echo "${call find-langgroup,roa,}"
|
||||
@echo "${shell langgroup roa | xargs iso639 -m -n}"
|
||||
@echo "$(filter ${OPUS_LANGS3},${shell langgroup roa | xargs iso639 -m -n})"
|
||||
|
||||
|
||||
## start the training job
|
||||
## - create config file
|
||||
## - create data sets
|
||||
@ -648,7 +667,10 @@ tatoeba-%-train:
|
||||
if [ `echo $$T | tr ' ' "\n" | wc -l` -ge ${MIN_TRGLANGS} ]; then \
|
||||
if [ `echo $$S | tr ' ' "\n" | wc -l` -le ${MAX_SRCLANGS} ]; then \
|
||||
if [ `echo $$T | tr ' ' "\n" | wc -l` -le ${MAX_TRGLANGS} ]; then \
|
||||
${MAKE} LANGPAIRSTR=$$s-$$t SRCLANGS="$$S" TRGLANGS="$$T" tatoeba-job; \
|
||||
${MAKE} LANGPAIRSTR=$$s-$$t SRCLANGS="$$S" TRGLANGS="$$T" \
|
||||
TATOEBA_SRCLANG_GROUP="`langgroup -n $$s`" \
|
||||
TATOEBA_TRGLANG_GROUP="`langgroup -n $$t`" \
|
||||
tatoeba-job; \
|
||||
fi \
|
||||
fi \
|
||||
fi \
|
||||
@ -670,6 +692,8 @@ tatoeba-%-eval:
|
||||
${MAKE} LANGPAIRSTR=$$s-$$t \
|
||||
SRCLANGS="${call find-srclanggroup,${patsubst tatoeba-%-eval,%,$@},${PIVOT}}" \
|
||||
TRGLANGS="${call find-trglanggroup,${patsubst tatoeba-%-eval,%,$@},${PIVOT}}" \
|
||||
TATOEBA_SRCLANG_GROUP="`langgroup -n $$s`" \
|
||||
TATOEBA_TRGLANG_GROUP="`langgroup -n $$t`" \
|
||||
compare-tatoeba; \
|
||||
fi \
|
||||
fi )
|
||||
@ -683,8 +707,14 @@ tatoeba-%-multieval:
|
||||
T="${call find-trglanggroup,${patsubst tatoeba-%-multieval,%,$@},${PIVOT}}"; \
|
||||
if [ -e ${TATOEBA_WORK}/$$s-$$t ]; then \
|
||||
if [ `find ${TATOEBA_WORK}/$$s-$$t/ -name '*.npz' | wc -l` -gt 0 ]; then \
|
||||
${MAKE} LANGPAIRSTR=$$s-$$t SRCLANGS="$$S" TRGLANGS="$$T" tatoeba-multilingual-eval; \
|
||||
${MAKE} LANGPAIRSTR=$$s-$$t SRCLANGS="$$S" TRGLANGS="$$T" tatoeba-sublang-eval; \
|
||||
${MAKE} LANGPAIRSTR=$$s-$$t SRCLANGS="$$S" TRGLANGS="$$T" \
|
||||
TATOEBA_SRCLANG_GROUP="`langgroup -n $$s`" \
|
||||
TATOEBA_TRGLANG_GROUP="`langgroup -n $$t`" \
|
||||
tatoeba-multilingual-eval; \
|
||||
${MAKE} LANGPAIRSTR=$$s-$$t SRCLANGS="$$S" TRGLANGS="$$T" \
|
||||
TATOEBA_SRCLANG_GROUP="`langgroup -n $$s`" \
|
||||
TATOEBA_TRGLANG_GROUP="`langgroup -n $$t`" \
|
||||
tatoeba-sublang-eval; \
|
||||
fi \
|
||||
fi )
|
||||
|
||||
@ -697,6 +727,8 @@ tatoeba-%-eval-testsets:
|
||||
${MAKE} LANGPAIRSTR=$$s-$$t \
|
||||
SRCLANGS="${call find-srclanggroup,${patsubst tatoeba-%-eval-testsets,%,$@},${PIVOT}}" \
|
||||
TRGLANGS="${call find-trglanggroup,${patsubst tatoeba-%-eval-testsets,%,$@},${PIVOT}}" \
|
||||
TATOEBA_SRCLANG_GROUP="`langgroup -n $$s`" \
|
||||
TATOEBA_TRGLANG_GROUP="`langgroup -n $$t`" \
|
||||
eval-testsets-tatoeba; \
|
||||
fi \
|
||||
fi )
|
||||
@ -710,6 +742,8 @@ tatoeba-%-testsets:
|
||||
${MAKE} LANGPAIRSTR=$$s-$$t \
|
||||
SRCLANGS="${call find-srclanggroup,${patsubst tatoeba-%-testsets,%,$@},${PIVOT}}" \
|
||||
TRGLANGS="${call find-trglanggroup,${patsubst tatoeba-%-testsets,%,$@},${PIVOT}}" \
|
||||
TATOEBA_SRCLANG_GROUP="`langgroup -n $$s`" \
|
||||
TATOEBA_TRGLANG_GROUP="`langgroup -n $$t`" \
|
||||
tatoeba-multilingual-testsets; \
|
||||
fi \
|
||||
fi )
|
||||
@ -719,7 +753,7 @@ tatoeba-%-testsets:
|
||||
## - model specific test set
|
||||
## - other language-specific test sets
|
||||
## - individual language pairs for multilingual models
|
||||
tatoeba-%-evalall: tatoeba-%-eval-testsets tatoeba-%-multieval
|
||||
tatoeba-%-evalall: tatoeba-%-eval tatoeba-%-multieval tatoeba-%-eval-testsets
|
||||
@echo "Done!"
|
||||
|
||||
|
||||
@ -739,6 +773,8 @@ tatoeba-%-dist:
|
||||
${MAKE} LANGPAIRSTR=$$s-$$t \
|
||||
SRCLANGS="${call find-srclanggroup,${patsubst tatoeba-%-dist,%,$@},${PIVOT}}" \
|
||||
TRGLANGS="${call find-trglanggroup,${patsubst tatoeba-%-dist,%,$@},${PIVOT}}" \
|
||||
TATOEBA_SRCLANG_GROUP="`langgroup -n $$s`" \
|
||||
TATOEBA_TRGLANG_GROUP="`langgroup -n $$t`" \
|
||||
release-tatoeba; \
|
||||
fi )
|
||||
|
||||
@ -759,6 +795,8 @@ tatoeba-%-refresh-release-readme:
|
||||
${MAKE} LANGPAIRSTR=$$s-$$t \
|
||||
SRCLANGS="${call find-srclanggroup,${patsubst tatoeba-%-refresh-release-readme,%,$@},${PIVOT}}" \
|
||||
TRGLANGS="${call find-trglanggroup,${patsubst tatoeba-%-refresh-release-readme,%,$@},${PIVOT}}" \
|
||||
TATOEBA_SRCLANG_GROUP="`langgroup -n $$s`" \
|
||||
TATOEBA_TRGLANG_GROUP="`langgroup -n $$t`" \
|
||||
refresh-release-readme-tatoeba; \
|
||||
fi )
|
||||
|
||||
@ -781,6 +819,8 @@ tatoeba-%-refresh-release: tatoeba-%-refresh-release-yml tatoeba-%-refresh-relea
|
||||
${MAKE} LANGPAIRSTR=$$s-$$t \
|
||||
SRCLANGS="${call find-srclanggroup,${patsubst tatoeba-%-refresh-release,%,$@},${PIVOT}}" \
|
||||
TRGLANGS="${call find-trglanggroup,${patsubst tatoeba-%-refresh-release,%,$@},${PIVOT}}" \
|
||||
TATOEBA_SRCLANG_GROUP="`langgroup -n $$s`" \
|
||||
TATOEBA_TRGLANG_GROUP="`langgroup -n $$t`" \
|
||||
refresh-release-tatoeba; \
|
||||
fi )
|
||||
|
||||
|
@ -16,6 +16,45 @@ bcl-en:
|
||||
${MAKE} SRCLANGS="bcl" TRGLANGS="en" DEVSET=wikimedia all-job
|
||||
|
||||
|
||||
en-bcl-nt:
|
||||
${MAKE} SRCLANGS="en" TRGLANGS="bcl" \
|
||||
DATASET=${DATASET}+nt \
|
||||
EXTRA_TRAINSET="new-testament" \
|
||||
DEVSET=wikimedia all-job
|
||||
|
||||
bcl-en-nt:
|
||||
${MAKE} SRCLANGS="bcl" TRGLANGS="en" \
|
||||
DATASET=${DATASET}+nt \
|
||||
EXTRA_TRAINSET="new-testament" \
|
||||
DEVSET=wikimedia all-job
|
||||
|
||||
%-en-bcl:
|
||||
${MAKE} SRCLANGS="en" TRGLANGS="bcl" DEVSET=wikimedia ${@:-en-bcl=}
|
||||
|
||||
|
||||
## generic wrapper for the bcl->en direction (e.g. "make train-bcl-en"):
## the "-bcl-en" suffix is stripped from the target name and the remaining
## goal is passed to a recursive make call with the language settings below.
## (source is bcl and target is en, matching the bcl-en and %-bcl-en-nt rules)
%-bcl-en:
	${MAKE} SRCLANGS="bcl" TRGLANGS="en" DEVSET=wikimedia ${@:-bcl-en=}
|
||||
|
||||
|
||||
%-en-bcl-nt:
|
||||
${MAKE} SRCLANGS="en" TRGLANGS="bcl" \
|
||||
DATASET=${DATASET}+nt \
|
||||
EXTRA_TRAINSET="new-testament" \
|
||||
DEVSET=wikimedia \
|
||||
${@:-en-bcl-nt=}
|
||||
|
||||
%-bcl-en-nt:
|
||||
${MAKE} SRCLANGS="bcl" TRGLANGS="en" \
|
||||
DATASET=${DATASET}+nt \
|
||||
EXTRA_TRAINSET="new-testament" \
|
||||
DEVSET=wikimedia \
|
||||
${@:-bcl-en-nt=}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# ENAS_BPE = 4000
|
||||
ENAS_BPE = 1000
|
||||
@ -32,7 +71,35 @@ ENBCL_BPE = 1000
|
||||
${@:-en-as=}
|
||||
|
||||
|
||||
%-en-bcl:
|
||||
${MAKE} SRCLANGS="en" TRGLANGS="bcl" DEVSET=wikimedia ${@:-en-bcl=}
|
||||
|
||||
|
||||
|
||||
WIKI_BT2ENG = abk ady afr amh ang ara arg asm ast awa aze bak bam bar bel ben bod bre bul cat ceb ces cha che chr chv cor cos crh csb cym dan deu dsb ell epo est eus ewe ext fao fas fij fin fra frr fry ful gla gle glg glv got grn guj hat hau haw hbs heb hif hin hsb hun hye ibo ido iku ile ilo ina isl ita jam jav jbo jpn kab kal kan kat kaz khm kin kir kom kor ksh kur lad lao lat lav lfn lij lin lit lmo ltz lug mah mai mal mar mdf mkd mlg mlt mnw mon mri msa mwl mya myv nau nav nds nep nld nor nov nya oci ori oss pag pan pap pdc pms pol por pus que roh rom ron rue run rus sag sah san scn sco sin slv sme smo sna snd som spa sqi stq sun swa swe tah tam tat tel tet tgk tgl tha tir ton tpi tso tuk tur tyv udm uig ukr urd uzb vec vie vol war wln wol xal xho yid yor zho zul
|
||||
|
||||
|
||||
## start jobs for all languages where we have back-translations
|
||||
|
||||
wiki-eng2all-with-bt:
|
||||
for l in ${WIKI_BT2ENG}; do \
|
||||
if [ -d work-tatoeba/$$l-eng ]; then \
|
||||
if [ `cat work-tatoeba/$$l-eng/opus-langlabels.src | tr " " "\n" | grep . | wc -l` -eq 1 ]; then \
|
||||
echo "fetch back-translations for $$l-eng"; \
|
||||
${MAKE} -C bt-tatoeba SRC=$$l TRG=eng fetch-bt; \
|
||||
echo "start training eng-$$l with backtranslation data"; \
|
||||
${MAKE} HPC_MEM=32g HPC_CORES=4 tatoeba-eng2$$l-train-bt.submitcpu; \
|
||||
fi \
|
||||
fi \
|
||||
done
|
||||
|
||||
WIKI_BT2ENG_PARENTS = ${sort ${shell langgroup -p ${WIKI_BT2ENG}}}
|
||||
|
||||
wiki-eng2allgroups-with-bt:
|
||||
for l in $(filter-out roa,${WIKI_BT2ENG_PARENTS}); do \
|
||||
if [ -d work-tatoeba/eng-$$l ]; then \
|
||||
echo "mv work-tatoeba/eng-$$l work-tatoeba-old"; \
|
||||
mv work-tatoeba/eng-$$l work-tatoeba-old; \
|
||||
fi; \
|
||||
echo "start training eng-$$l with backtranslation data"; \
|
||||
${MAKE} HPC_MEM=32g HPC_CORES=4 tatoeba-eng2$$l-train-bt-1m.submitcpu; \
|
||||
done
|
||||
|
||||
|
@ -28,6 +28,9 @@ SPMEXTRA =
|
||||
## set to 1 if you want to generate SPM vocab file
|
||||
GENERATE_SPM_VOC = 0
|
||||
|
||||
# SPM_INPUT_SIZE = 10000000
|
||||
SPM_INPUT_SIZE = 2000000
|
||||
SPM_SHUFFLE_INPUT = 0
|
||||
|
||||
## we keep the dependency on LOCAL_TRAIN_SRC
|
||||
## to make multi-threaded make calls behave properly
|
||||
@ -52,10 +55,12 @@ endif
|
||||
if [ `cat ${LOCAL_TRAIN_SRC}.charfreq | wc -l` -gt 1000 ]; then \
|
||||
${SPM_TRAIN} ${SPMEXTRA} \
|
||||
--model_prefix=$@ --vocab_size=$(SRCBPESIZE) --input=${LOCAL_TRAIN_SRC}.text \
|
||||
--input_sentence_size ${SPM_INPUT_SIZE} --shuffle_input_sentence ${SPM_SHUFFLE_INPUT} \
|
||||
--character_coverage=0.9995 --hard_vocab_limit=false; \
|
||||
else \
|
||||
${SPM_TRAIN} ${SPMEXTRA} \
|
||||
--model_prefix=$@ --vocab_size=$(SRCBPESIZE) --input=${LOCAL_TRAIN_SRC}.text \
|
||||
--input_sentence_size ${SPM_INPUT_SIZE} --shuffle_input_sentence ${SPM_SHUFFLE_INPUT} \
|
||||
--character_coverage=1.0 --hard_vocab_limit=false; \
|
||||
fi
|
||||
mv $@.model $@
|
||||
@ -82,10 +87,12 @@ else
|
||||
if [ `cat ${LOCAL_TRAIN_TRG}.charfreq | wc -l` -gt 1000 ]; then \
|
||||
${SPM_TRAIN} ${SPMEXTRA} \
|
||||
--model_prefix=$@ --vocab_size=$(TRGBPESIZE) --input=${LOCAL_TRAIN_TRG}.text \
|
||||
--input_sentence_size ${SPM_INPUT_SIZE} --shuffle_input_sentence ${SPM_SHUFFLE_INPUT} \
|
||||
--character_coverage=0.9995 --hard_vocab_limit=false; \
|
||||
else \
|
||||
${SPM_TRAIN} ${SPMEXTRA} \
|
||||
--model_prefix=$@ --vocab_size=$(TRGBPESIZE) --input=${LOCAL_TRAIN_TRG}.text \
|
||||
--input_sentence_size ${SPM_INPUT_SIZE} --shuffle_input_sentence ${SPM_SHUFFLE_INPUT} \
|
||||
--character_coverage=1.0 --hard_vocab_limit=false; \
|
||||
fi
|
||||
mv $@.model $@
|
||||
@ -161,10 +168,12 @@ ifeq ($(wildcard ${SPMMODEL}),)
|
||||
if [ `cat ${LOCAL_MONO_DATA}.${PRE}.charfreq | wc -l` -gt 1000 ]; then \
|
||||
${SPM_TRAIN} ${SPMEXTRA} \
|
||||
--model_prefix=$@ --vocab_size=$(TRGBPESIZE) --input=$<.text \
|
||||
--input_sentence_size ${SPM_INPUT_SIZE} --shuffle_input_sentence ${SPM_SHUFFLE_INPUT} \
|
||||
--character_coverage=0.9995 --hard_vocab_limit=false; \
|
||||
else \
|
||||
${SPM_TRAIN} ${SPMEXTRA} \
|
||||
--model_prefix=$@ --vocab_size=$(TRGBPESIZE) --input=$<.text \
|
||||
--input_sentence_size ${SPM_INPUT_SIZE} --shuffle_input_sentence ${SPM_SHUFFLE_INPUT} \
|
||||
--character_coverage=1.0 --hard_vocab_limit=false; \
|
||||
fi
|
||||
mv $@.model $@
|
||||
|
10
lib/train.mk
10
lib/train.mk
@ -12,6 +12,9 @@ ifeq (${SUBWORDS},spm)
|
||||
## sentence piece models (concatenate and yamlify)
|
||||
|
||||
${MODEL_VOCAB}: ${SPMSRCMODEL} ${SPMTRGMODEL}
|
||||
ifneq (${MODEL_LATEST_VOCAB},)
|
||||
cp ${MODEL_LATEST_VOCAB} ${MODEL_VOCAB}
|
||||
else
|
||||
cut -f1 < ${word 1,$^}.vocab > ${@:.vocab.yml=.src.vocab}
|
||||
cut -f1 < ${word 2,$^}.vocab > ${@:.vocab.yml=.trg.vocab}
|
||||
ifeq (${USE_TARGET_LABELS},1)
|
||||
@ -23,6 +26,8 @@ endif
|
||||
cut -f2 $@.numbered | sed 's/\\/\\\\/g;s/\"/\\\"/g;s/^\(.*\)$$/"\1"/;s/$$/:/'> $@.tokens
|
||||
paste -d ' ' $@.tokens $@.ids > $@
|
||||
rm -f $@.tokens $@.ids $@.numbered
|
||||
endif
|
||||
|
||||
|
||||
else
|
||||
|
||||
@ -173,10 +178,11 @@ endif
|
||||
--transformer-dropout ${MARIAN_DROPOUT} \
|
||||
--label-smoothing 0.1 \
|
||||
--learn-rate 0.0003 --lr-warmup 16000 --lr-decay-inv-sqrt 16000 --lr-report \
|
||||
--optimizer-params 0.9 0.98 1e-09 --clip-norm 5 \
|
||||
--optimizer-params 0.9 0.98 1e-09 --clip-norm 5 --fp16 \
|
||||
${MARIAN_TIE_EMBEDDINGS} \
|
||||
--devices ${MARIAN_GPUS} \
|
||||
--sync-sgd --seed ${SEED} \
|
||||
--sync-sgd \
|
||||
--seed ${SEED} \
|
||||
--sqlite \
|
||||
--tempdir ${TMPDIR} \
|
||||
--exponential-smoothing
|
||||
|
@ -1,17 +1,3 @@
|
||||
# wikimedia-2020-01-17.zip
|
||||
|
||||
* dataset: wikimedia
|
||||
* model: transformer-align
|
||||
* pre-processing: normalization + SentencePiece
|
||||
* download: [wikimedia-2020-01-17.zip](https://object.pouta.csc.fi/OPUS-MT-models/bcl-en/wikimedia-2020-01-17.zip)
|
||||
* test set translations: [wikimedia-2020-01-17.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/bcl-en/wikimedia-2020-01-17.test.txt)
|
||||
* test set scores: [wikimedia-2020-01-17.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/bcl-en/wikimedia-2020-01-17.eval.txt)
|
||||
|
||||
## Benchmarks
|
||||
|
||||
| testset | BLEU | chr-F |
|
||||
|-----------------------|-------|-------|
|
||||
| JW300.bcl.en | 56.8 | 0.705 |
|
||||
|
||||
# opus-2020-01-20.zip
|
||||
|
||||
@ -28,6 +14,7 @@
|
||||
|-----------------------|-------|-------|
|
||||
| JW300.bcl.en | 56.8 | 0.705 |
|
||||
|
||||
|
||||
# opus-2020-02-11.zip
|
||||
|
||||
* dataset: opus
|
||||
@ -43,6 +30,7 @@
|
||||
|-----------------------|-------|-------|
|
||||
| JW300.bcl.en | 56.1 | 0.697 |
|
||||
|
||||
|
||||
# opus+bt-2020-05-23.zip
|
||||
|
||||
* dataset: opus+bt
|
||||
@ -69,3 +57,38 @@
|
||||
|-----------------------|-------|-------|
|
||||
| JW300.bcl.en | 57.6 | 0.712 |
|
||||
|
||||
|
||||
# opus+nt-2021-03-29.zip
|
||||
|
||||
* dataset: opus+nt
|
||||
* model: transformer-align
|
||||
* source language(s): bcl
|
||||
* target language(s): en
|
||||
* model: transformer-align
|
||||
* pre-processing: normalization + SentencePiece (spm4k,spm4k)
|
||||
* download: [opus+nt-2021-03-29.zip](https://object.pouta.csc.fi/OPUS-MT-models/bcl-en/opus+nt-2021-03-29.zip)
|
||||
## Training data: opus+nt
|
||||
|
||||
* bcl-en: JW300 (470468) new-testament (11623)
|
||||
* bcl-en: total size = 482091
|
||||
* total size (opus+nt): 482047
|
||||
|
||||
|
||||
## Validation data
|
||||
|
||||
* bcl-en: wikimedia, 1153
|
||||
* total-size-shuffled: 775
|
||||
|
||||
* devset-selected: top 250 lines of wikimedia.src.shuffled!
|
||||
* testset-selected: next 525 lines of wikimedia.src.shuffled!
|
||||
* devset-unused: added to traindata
|
||||
|
||||
* test set translations: [opus+nt-2021-03-29.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/bcl-en/opus+nt-2021-03-29.test.txt)
|
||||
* test set scores: [opus+nt-2021-03-29.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/bcl-en/opus+nt-2021-03-29.eval.txt)
|
||||
|
||||
## Benchmarks
|
||||
|
||||
| testset | BLEU | chr-F | #sent | #words | BP |
|
||||
|---------|-------|-------|-------|--------|----|
|
||||
| wikimedia.bcl-en | 10.4 | 0.320 | 525 | 27109 | 0.477 |
|
||||
|
||||
|
@ -16,7 +16,10 @@ my $file = shift(@ARGV);
|
||||
my $count=0;
|
||||
my $repeated=0;
|
||||
while ($count < $size){
|
||||
open F,"<$file" || die "cannot read from $file!\n";
|
||||
# (Re)open the input file for this pass over the data; files ending in .gz
# are streamed through `gzip -cd`.
# The low-precedence `or` is required here: with `||` the die() attaches to
# the string argument (always true), so a failed open was never reported and
# the read loop below would simply see no lines.
if ($file =~ /\.gz$/) {
    # list-form pipe open: no shell is involved, so unusual file names are safe
    open(F, '-|', 'gzip', '-cd', $file) or die "cannot open $file";
}
else {
    open(F, '<', $file) or die "cannot read from $file!\n";
}
|
||||
while (<F>){
|
||||
$count++;
|
||||
print;
|
||||
|
@ -12,12 +12,7 @@
|
||||
# to avoid removing newline characters!
|
||||
|
||||
|
||||
if [ `hostname -d` == "bullx" ]; then
|
||||
APPLHOME=/projappl/project_2001569
|
||||
SPMENCODE=${APPLHOME}/marian-dev/build-spm/spm_encode
|
||||
else
|
||||
SPMENCODE=`which spm_encode || echo "${PWD}/tools/marian-dev/build/spm_encode"`
|
||||
fi
|
||||
SPMENCODE=`which spm_encode || echo "${PWD}/tools/marian-dev/build/spm_encode"`
|
||||
|
||||
|
||||
if [ "$4" == "noflags" ]; then
|
||||
@ -97,7 +92,7 @@ else
|
||||
-e 's/【/\[/g' \
|
||||
-e 's/】/\]/g' \
|
||||
-e 's/%/\%/g' |
|
||||
perl -C -pe 's/(?!\n)\p{C}/ /g;'
|
||||
perl -C -pe 's/(?!\n)\p{C}/ /g;' |
|
||||
# squeeze runs of spaces into a single space and trim both ends of the line;
# the first pattern must be "two spaces + star" (= one or more spaces),
# because " *" also matches the empty string between any two characters
sed 's/  */ /g;s/^ *//g;s/ *$//g' |
|
||||
${SPMENCODE} --model $3 |
|
||||
sed "s/^/>>$2<< /"
|
||||
|
@ -12,12 +12,7 @@
|
||||
# to avoid removing newline characters!
|
||||
#
|
||||
|
||||
if [ `hostname -d` == "bullx" ]; then
|
||||
APPLHOME=/projappl/project_2001569
|
||||
SPMENCODE=${APPLHOME}/marian-dev/build-spm/spm_encode
|
||||
else
|
||||
SPMENCODE=`which spm_encode || echo "${PWD}/tools/marian-dev/build/spm_encode"`
|
||||
fi
|
||||
SPMENCODE=`which spm_encode || echo "${PWD}/tools/marian-dev/build/spm_encode"`
|
||||
|
||||
## simple pre-processing steps adapted from Moses tools
|
||||
|
||||
@ -57,7 +52,7 @@ sed -e 's/,/,/g' \
|
||||
-e 's/【/\[/g' \
|
||||
-e 's/】/\]/g' \
|
||||
-e 's/%/\%/g' |
|
||||
perl -C -pe 's/(?!\n)\p{C}/ /g;'
|
||||
perl -C -pe 's/(?!\n)\p{C}/ /g;' |
|
||||
# squeeze runs of spaces into a single space and trim both ends of the line;
# the first pattern must be "two spaces + star" (= one or more spaces),
# because " *" also matches the empty string between any two characters
sed 's/  */ /g;s/^ *//g;s/ *$//g' |
|
||||
${SPMENCODE} --model $2
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user