balance dev data in multilingual models and fix a bug in the preprocess script

Joerg Tiedemann 2021-03-30 00:00:28 +03:00
parent 3cd0bd3f75
commit cde8f0d0af
13 changed files with 438 additions and 157 deletions

View File

@ -27,13 +27,6 @@ ifdef LANGPAIRS
endif
## final default is sv-fi
## NEW: don't do this ... can create confusion ...
##
# SRCLANGS ?= sv
# TRGLANGS ?= fi
## set SRC and TRG unless they are specified already
ifneq (${words ${SRCLANGS}},1)
SRC ?= multi
@ -54,43 +47,66 @@ endif
# TRG ?= ${lastword ${TRGLANGS}}
##----------------------------------------------------------------------
## SKIP_LANGPAIRS can be used to skip certain language pairs
## in data preparation for multilingual models
## ---> this is useful for skipping BIG language pairs
## that would otherwise dominate the data
## must be a pattern that can be matched by egrep
## e.g. en-de|en-fr
##----------------------------------------------------------------------
SKIP_LANGPAIRS ?= "nothing"
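## example (sketch; the "data" target name is an assumption):
##
##   make SRCLANGS="en" TRGLANGS="de fr fi" SKIP_LANGPAIRS="en-de|en-fr" data
##
## would prepare only the en-fi bitext for this multilingual setup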
##----------------------------------------------------------------------
## set SHUFFLE_DATA if you want to shuffle data for
## each language pair to be added to the training data
## --> especially useful in connection with FIT_DATA_SIZE
##
##----------------------------------------------------------------------
# SHUFFLE_DATA = 1
## devtest data is shuffled by default
SHUFFLE_DEVDATA = 1
##----------------------------------------------------------------------
## set FIT_DATA_SIZE to a specific value to fit the training data
## to a certain number of lines for each language pair in the collection
## --> especially useful for multilingual models to balance
## the size across language pairs
## the script does both over- and undersampling
##
##----------------------------------------------------------------------
# FIT_DATA_SIZE = 100000
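## example (sketch; the "data" target name is an assumption):
##
##   make SRCLANGS="fi et hu" TRGLANGS="en" SHUFFLE_DATA=1 FIT_DATA_SIZE=100000 data
##
## fits each of the three bitexts to roughly 100k training lines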
## similar for the dev data: set FIT_DEVDATA_SIZE to
## balance the size of the devdata for each language pair
##
# FIT_DEVDATA_SIZE =
## define a default dev size fit for multilingual models
## TODO: is 1000 too small? or too big?
## TODO: should this depend on the number of languages involved?
ifneq (${words ${TRGLANGS}},1)
FIT_DEVDATA_SIZE ?= 1000
endif
ifneq (${words ${SRCLANGS}},1)
FIT_DEVDATA_SIZE ?= 1000
endif
## maximum number of times the same data set is repeated
## when oversampling
MAX_OVER_SAMPLING ?= 50
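## illustration with made-up numbers: for FIT_DATA_SIZE=100000 and
## MAX_OVER_SAMPLING=50, a 1,500-line bitext would need ~67 repetitions
## but is capped at 50 (about 75,000 lines), while a 250,000-line bitext
## is reduced to its first 100,000 lines (hence the SHUFFLE_DATA option above)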
##----------------------------------------------------------------------
## set CHECK_TRAINDATA_SIZE if you want to check that each
## bitext has an equal number of lines in source and target
## ---> this only prints a warning if they differ
##
##----------------------------------------------------------------------
# CHECK_TRAINDATA_SIZE
@ -134,11 +150,10 @@ endif
DEVSIZE = 2500
TESTSIZE = 2500
## NEW: significantly reduce devminsize
## (= absolute minimum we need as devdata)
## NEW: define an alternative small size for DEV and TEST
## OLD DEVMINSIZE:
# DEVMINSIZE = 1000
## set some additional thresholds for
## the size of test and dev data
## DEVMINSIZE is the absolute minimum we require
## to run any training procedures
DEVSMALLSIZE = 1000
TESTSMALLSIZE = 1000
@ -154,24 +169,8 @@ OPUSREAD_ARGS =
## resources in OPUS
##----------------------------------------------------------------------------
## OLD: get corpora directly from the file system
#
# ELRA_CORPORA = ${patsubst %/latest/xml/${LANGPAIR}.xml.gz,%,\
# ${patsubst ${OPUSHOME}/%,%,\
# ${shell ls ${OPUSHOME}/ELRA-*/latest/xml/${LANGPAIR}.xml.gz 2>/dev/null}}}
#
# EXCLUDE_CORPORA ?= WMT-News MPC1 ${ELRA_CORPORA}
#
# OPUSCORPORA = $(filter-out ${EXCLUDE_CORPORA},${patsubst %/latest/xml/${LANGPAIR}.xml.gz,%,\
# ${patsubst ${OPUSHOME}/%,%,\
# ${shell ls ${OPUSHOME}/*/latest/xml/${LANGPAIR}.xml.gz 2>/dev/null}}})
#
# OPUSMONOCORPORA = $(filter-out ${EXCLUDE_CORPORA} ,${patsubst %/latest/mono/${LANGID}.txt.gz,%,\
# ${patsubst ${OPUSHOME}/%,%,\
# ${shell ls ${OPUSHOME}/*/latest/mono/${LANGID}.txt.gz}}})
## NEW: get data from the OPUS-API
## get available data from the OPUS-API
OPUSAPI = http://opus.nlpl.eu/opusapi/
@ -185,7 +184,6 @@ get-elra-bitexts = ${shell wget -qq -O - ${OPUSAPI}?source=${1}\&target=${2}\&
${JQ} '.corpora[]' | tr '"' ' ' | grep '^ *ELR[CA][-_]'}
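## example query (illustration only; parameters assumed from the function above):
##
##   wget -qq -O - "http://opus.nlpl.eu/opusapi/?source=fi&target=sv&preprocessing=xml&version=latest" | jq '.corpora[]'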
## start of some functions to check whether there is a resource for downloading
## open question: links to the latest release do not exist in the storage
## --> would it be better to get that done via the OPUS API?
@ -362,8 +360,8 @@ TEST_TRG ?= ${WORKDIR}/test/${TESTSET_NAME}.trg
MODEL_SUBDIR =
MODEL = ${MODEL_SUBDIR}${DATASET}${TRAINSIZE}.${PRE_SRC}-${PRE_TRG}
# MODELTYPE = transformer-align
MODELTYPE = transformer
MODELTYPE = transformer-align
# MODELTYPE = transformer
NR = 1
MODEL_BASENAME = ${MODEL}.${MODELTYPE}.model${NR}
@ -597,6 +595,9 @@ ifdef SHUFFLE_DATA
endif
ifdef FIT_DATA_SIZE
echo "FIT_DATA_SIZE = ${FIT_DATA_SIZE}" >> $@
endif
ifdef FIT_DEVDATA_SIZE
echo "FIT_DEVDATA_SIZE = ${FIT_DEVDATA_SIZE}" >> $@
endif
echo "MAX_OVER_SAMPLING = ${MAX_OVER_SAMPLING}" >> $@
echo "USE_REST_DEVDATA = ${USE_REST_DEVDATA}" >> $@

View File

@ -9,7 +9,10 @@
# - shuffle dev/test data and divide into two disjoint sets
# - reverse data sets for the other translation direction (bilingual models only)
# - run word alignment if necessary (models with guided alignment = transformer-align)
#
#
# TODO: write data info to some model-specific file instead of README.md
# (applies for train/val/test!)
## training data size (generates count if not in README.md)
@ -348,6 +351,14 @@ endif
endif
## add language labels to the source language
## if we have multiple target languages
ifeq (${USE_TARGET_LABELS},1)
LABEL_SOURCE_DATA = | sed "s/^/>>${TRG}<< /"
endif
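## example: with TRG=fi the filter above turns a source line
## "good morning" into ">>fi<< good morning", i.e.
##
##   echo "good morning" | sed "s/^/>>fi<< /"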
## add to the training data
add-to-local-train-data: ${CLEAN_TRAIN_SRC} ${CLEAN_TRAIN_TRG}
@ -377,17 +388,10 @@ endif
done
echo "" >> ${dir ${LOCAL_TRAIN_SRC}}README.md
######################################
# do we need to add target language labels?
# create local data files (add label if necessary)
######################################
ifeq (${USE_TARGET_LABELS},1)
@echo "set target language labels";
@${ZCAT} ${wildcard ${CLEAN_TRAIN_SRC}} ${CUT_DATA_SETS} 2>/dev/null |\
sed "s/^/>>${TRG}<< /" > ${LOCAL_TRAIN_SRC}.${LANGPAIR}.src
else
@echo "only one target language"
@${ZCAT} ${wildcard ${CLEAN_TRAIN_SRC}} ${CUT_DATA_SETS} 2>/dev/null \
> ${LOCAL_TRAIN_SRC}.${LANGPAIR}.src
endif
${LABEL_SOURCE_DATA} > ${LOCAL_TRAIN_SRC}.${LANGPAIR}.src
@${ZCAT} ${wildcard ${CLEAN_TRAIN_TRG}} ${CUT_DATA_SETS} 2>/dev/null \
> ${LOCAL_TRAIN_TRG}.${LANGPAIR}.trg
######################################
@ -442,6 +446,10 @@ show-devdata:
raw-devdata: ${DEV_SRC} ${DEV_TRG}
## TODO: should we have some kind of balanced shuffling
## to avoid bias towards bigger language pairs?
## maybe introduce over/undersampling of dev data like we have for train data?
${DEV_SRC}.shuffled.gz:
mkdir -p ${dir $@}
rm -f ${DEV_SRC} ${DEV_TRG}
@ -461,13 +469,13 @@ ifeq (${SHUFFLE_DEVDATA},0)
else
paste ${DEV_SRC} ${DEV_TRG} | ${UNIQ} | ${SHUFFLE} | ${GZIP} -c > $@
endif
echo -n "* total size of shuffled dev data: " >> ${dir ${DEV_SRC}}README.md
${GZIP} -cd < $@ | wc -l >> ${dir ${DEV_SRC}}README.md
echo -n "* total-size-shuffled: " >> ${dir ${DEV_SRC}}README.md
${GZIP} -cd < $@ | wc -l >> ${dir ${DEV_SRC}}README.md
## OLD: don't uniq the dev-data ...
##
# paste ${DEV_SRC} ${DEV_TRG} | ${SHUFFLE} | ${GZIP} -c > $@
# echo -n "* total size of shuffled dev data: " >> ${dir ${DEV_SRC}}README.md
## if we have less than twice the amount of DEVMINSIZE in the data set
@ -513,17 +521,17 @@ else
@${GZIP} -cd < $< | cut -f2 | tail -n +$$((${DEVSIZE} + 1)) | ${GZIP} -c > ${DEV_TRG}.notused.gz
endif
@echo "" >> ${dir ${DEV_SRC}}/README.md
@echo -n "* devset = top " >> ${dir ${DEV_SRC}}/README.md
@echo -n "* devset-selected: top " >> ${dir ${DEV_SRC}}/README.md
@wc -l < ${DEV_SRC} | tr "\n" ' ' >> ${dir ${DEV_SRC}}/README.md
@echo " lines of ${notdir $@}.shuffled!" >> ${dir ${DEV_SRC}}/README.md
ifeq (${DEVSET},${TESTSET})
@echo -n "* testset = next " >> ${dir ${DEV_SRC}}/README.md
@echo -n "* testset-selected: next " >> ${dir ${DEV_SRC}}/README.md
@wc -l < ${TEST_SRC} | tr "\n" ' ' >> ${dir ${DEV_SRC}}/README.md
@echo " lines of ${notdir $@}.shuffled!" >> ${dir ${DEV_SRC}}/README.md
@echo "* remaining lines are added to traindata" >> ${dir ${DEV_SRC}}/README.md
@echo "* devset-unused: added to traindata" >> ${dir ${DEV_SRC}}/README.md
@echo "# Test data" > ${dir ${TEST_SRC}}/README.md
@echo "" >> ${dir ${TEST_SRC}}/README.md
@echo -n "testset = next " >> ${dir ${TEST_SRC}}/README.md
@echo -n "testset-selected: next " >> ${dir ${TEST_SRC}}/README.md
@wc -l < ${TEST_SRC} | tr "\n" ' ' >> ${dir ${TEST_SRC}}/README.md
@echo " lines of ../val/${notdir $@}.shuffled!" >> ${dir ${TEST_SRC}}/README.md
endif
@ -538,15 +546,20 @@ add-to-dev-data: ${CLEAN_DEV_SRC} ${CLEAN_DEV_TRG}
@mkdir -p ${dir ${DEV_SRC}}
@echo -n "* ${LANGPAIR}: ${DEVSET}, " >> ${dir ${DEV_SRC}}README.md
@${ZCAT} ${CLEAN_DEV_SRC} 2>/dev/null | wc -l >> ${dir ${DEV_SRC}}README.md
ifeq (${USE_TARGET_LABELS},1)
@echo "more than one target language";
@${ZCAT} ${CLEAN_DEV_SRC} 2>/dev/null |\
sed "s/^/>>${TRG}<< /" >> ${DEV_SRC}
#-----------------------------------------------------------------
# sample devdata to balance size between different language pairs
# (only if FIT_DEVDATA_SIZE is set)
#-----------------------------------------------------------------
ifdef FIT_DEVDATA_SIZE
@echo "sample dev data to fit size = ${FIT_DEVDATA_SIZE}"
@scripts/fit-data-size.pl -m ${MAX_OVER_SAMPLING} ${FIT_DEVDATA_SIZE} \
${CLEAN_DEV_SRC} 2>/dev/null ${LABEL_SOURCE_DATA} >> ${DEV_SRC}
@scripts/fit-data-size.pl -m ${MAX_OVER_SAMPLING} ${FIT_DEVDATA_SIZE} \
${CLEAN_DEV_TRG} 2>/dev/null >> ${DEV_TRG}
else
@echo "only one target language"
@${ZCAT} ${CLEAN_DEV_SRC} 2>/dev/null >> ${DEV_SRC}
@${ZCAT} ${CLEAN_DEV_SRC} 2>/dev/null ${LABEL_SOURCE_DATA} >> ${DEV_SRC}
@${ZCAT} ${CLEAN_DEV_TRG} 2>/dev/null >> ${DEV_TRG}
endif
@${ZCAT} ${CLEAN_DEV_TRG} 2>/dev/null >> ${DEV_TRG}
####################
@ -583,8 +596,8 @@ ifneq (${TESTSET},${DEVSET})
paste ${TEST_SRC} ${TEST_TRG} | ${SHUFFLE} | ${GZIP} -c > $@.shuffled.gz; \
${GZIP} -cd < $@.shuffled.gz | cut -f1 | tail -${TESTSIZE} > ${TEST_SRC}; \
${GZIP} -cd < $@.shuffled.gz | cut -f2 | tail -${TESTSIZE} > ${TEST_TRG}; \
echo "" >> ${dir $@}/README.md; \
echo "testset = top ${TESTSIZE} lines of $@.shuffled!" >> ${dir $@}/README.md; \
echo "" >> ${dir $@}/README.md; \
echo "testset-selected: top ${TESTSIZE} lines of $@.shuffled!" >> ${dir $@}/README.md; \
fi \
else \
echo "test set $@ exists already! Don't overwrite!"; \
@ -611,18 +624,8 @@ ${TEST_TRG}: ${TEST_SRC}
add-to-test-data: ${CLEAN_TEST_SRC}
@echo "add to testset: ${CLEAN_TEST_SRC}"
@echo "* ${LANGPAIR}: ${TESTSET}" >> ${dir ${TEST_SRC}}README.md
ifeq (${USE_TARGET_LABELS},1)
@echo "more than one target language";
@echo "${ZCAT} ${CLEAN_TEST_SRC} | sed 's/^/>>${TRG}<< /' >> ${TEST_SRC}"
@${ZCAT} ${CLEAN_TEST_SRC} 2>/dev/null |\
sed "s/^/>>${TRG}<< /" >> ${TEST_SRC}
else
@echo "only one target language"
@echo "${ZCAT} ${CLEAN_TEST_SRC} >> ${TEST_SRC}"
@${ZCAT} ${CLEAN_TEST_SRC} 2>/dev/null >> ${TEST_SRC}
endif
@echo "${ZCAT} ${CLEAN_TEST_TRG} >> ${TEST_TRG}"
@${ZCAT} ${CLEAN_TEST_TRG} 2>/dev/null >> ${TEST_TRG}
@${ZCAT} ${CLEAN_TEST_SRC} 2>/dev/null ${LABEL_SOURCE_DATA} >> ${TEST_SRC}
@${ZCAT} ${CLEAN_TEST_TRG} 2>/dev/null >> ${TEST_TRG}

View File

@ -200,11 +200,15 @@ RAWSRCLANGS = ${sort ${basename ${basename ${subst _,.,${subst -,.,${SRCLANGS}}}
RAWTRGLANGS = ${sort ${basename ${basename ${subst _,.,${subst -,.,${TRGLANGS}}}}}}
## language labels in multilingual models
## (NEW: take them directly from the model vocabulary
## to avoid listing labels that are not used)
# LANGUAGELABELS = ${patsubst %,>>%<<,${TRGLANGS}}
LANGUAGELABELS = ${shell grep '">>.*<<"' ${MODEL_SRCVOCAB} | cut -f1 -d: | sed 's/"//g'}
## BETTER: take them directly from the model vocabulary!
## advantage: lists all labels that are valid in the model
## disadvantage: can be misleading because we may have labels that are not trained
##
LANGUAGELABELS = ${shell grep '">>.*<<"' ${MODEL_SRCVOCAB} | cut -f1 -d: | sed 's/"//g'}
LANGUAGELABELSRAW = ${shell echo "${LANGUAGELABELS}" | sed 's/>>//g;s/<<//g'}
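## illustration (hypothetical vocabulary file and entry): a vocab line such as
##   ">>fin<<": 42
## run through the pipeline above, e.g.
##   grep '">>.*<<"' model.vocab.yml | cut -f1 -d: | sed 's/"//g'
## yields the label >>fin<< and, after stripping the brackets, the raw code fin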
model-yml: ${MODEL_YML}
@ -244,18 +248,24 @@ ${MODEL_YML}: ${MODEL_FINAL}
@echo "release-date: $(DATE)" >> $@
@echo "dataset-name: $(DATASET)" >> $@
@echo "modeltype: $(MODELTYPE)" >> $@
@echo "vocabulary:" >> $@
@echo " source: ${notdir ${MODEL_SRCVOCAB}}" >> $@
@echo " target: ${notdir ${MODEL_TRGVOCAB}}" >> $@
@echo "pre-processing: ${PREPROCESS_DESCRIPTION}" >> $@
@echo "subwords:" >> $@
@echo " - source: ${PRE_SRC}" >> $@
@echo " - target: ${PRE_TRG}" >> $@
@echo " source: ${PRE_SRC}" >> $@
@echo " target: ${PRE_TRG}" >> $@
@echo "subword-models:" >> $@
@echo " - source: source.${SUBWORD_TYPE}" >> $@
@echo " - target: target.${SUBWORD_TYPE}" >> $@
@echo " source: source.${SUBWORD_TYPE}" >> $@
@echo " target: target.${SUBWORD_TYPE}" >> $@
ifdef USE_TARGET_LABELS
@echo "use-target-labels:" >> $@
@for t in ${TRGLANGS}; do \
echo " - >>$$t<<" >> $@; \
@for t in ${LANGUAGELABELSRAW}; do \
echo " - \">>$$t<<\"" >> $@; \
done
# @for t in ${TRGLANGS}; do \
# echo " - '>>$$t<<'" >> $@; \
# done
endif
@echo "source-languages:" >> $@
@for s in ${RAWSRCLANGS}; do\
@ -271,13 +281,13 @@ ifneq ("$(wildcard ${WORKDIR}/train/README.md)","")
tr "#" "\n" | grep '^ ${DATASET}~' | \
tail -1 | tr "~" "\n" | grep '^\* ' | \
grep -v ': *$$' | grep -v ' 0$$' | \
grep -v 'total size' | sed 's/^\* / - /' >> $@
grep -v 'total size' | sed 's/^\* / /' >> $@
endif
ifneq ("$(wildcard ${WORKDIR}/val/README.md)","")
@echo "validation-data:" >> $@
grep '^\* ' ${WORKDIR}/val/README.md | \
grep -v ' 0$$' | \
sed 's/^\* / - /' >> $@
sed 's/^\* / /' >> $@
endif
##-----------------------------
## add benchmark results
@ -301,11 +311,11 @@ ifneq ("$(wildcard ${TEST_EVALUATION})","")
cut -f7 -d ' ' > $@.6
@paste -d '/' $@.4 $@.5 > $@.7
@echo "test-data:" >> $@
@paste -d' ' $@.1 $@.7 | sed 's/ /: /;s/^/ - /;' >> $@
@paste -d' ' $@.1 $@.7 | sed 's/ /: /;s/^/ /;' >> $@
@echo "BLEU-scores:" >> $@
@paste -d' ' $@.1 $@.2 | sed 's/ /: /;s/^/ - /' >> $@
@paste -d' ' $@.1 $@.2 | sed 's/ /: /;s/^/ /' >> $@
@echo "chr-F-scores:" >> $@
@paste -d' ' $@.1 $@.3 | sed 's/ /: /;s/^/ - /' >> $@
@paste -d' ' $@.1 $@.3 | sed 's/ /: /;s/^/ /' >> $@
@rm -f $@.1 $@.2 $@.3 $@.4 $@.5 $@.6 $@.7
endif
@ -785,7 +795,7 @@ ${EVALTRANSL}: # ${WORKHOME}/eval/%.test.txt: ${MODELSHOME}/%.compare
######################################################################
## handle old models in previous work directories
## misc recipes ... all kinds of fixes
## obsolete now?
######################################################################
@ -948,3 +958,38 @@ remove-underperforming:
echo "keep $$d"; \
fi \
done
dist-remove-no-date-dist:
swift list ${MODEL_CONTAINER} > index.txt
for d in `grep opus.zip index.txt`; do \
swift delete ${MODEL_CONTAINER} $$d; \
done
dist-remove-old-yml:
swift list Tatoeba-MT-models > index.txt
for d in `grep old-yml index.txt`; do \
swift delete Tatoeba-MT-models $$d; \
done
dist-fix-preprocess:
mkdir -p tmp
( cd tmp; \
swift list Tatoeba-MT-models > index.txt; \
for d in `grep '.zip' index.txt`; do \
echo "check $$d ..."; \
swift download Tatoeba-MT-models $$d; \
unzip $$d preprocess.sh; \
mv preprocess.sh preprocess-old.sh; \
sed 's#perl -C -pe.*$$#perl -C -pe "s/(?!\\n)\\p{C}/ /g;" |#' \
< preprocess-old.sh > preprocess.sh; \
chmod +x ${dir $@}/preprocess.sh; \
if [ `diff preprocess-old.sh preprocess.sh | wc -l` -gt 0 ]; then \
echo "replace old preprocess in $$d and upload again"; \
zip -u $$d preprocess.sh; \
swift upload Tatoeba-MT-models --changed --skip-identical $$d; \
fi; \
rm -f preprocess.sh; \
rm -f $$d; \
done )

View File

@ -129,8 +129,8 @@ else ifeq (${shell hostname --domain 2>/dev/null},bullx)
MOSESHOME = ${APPLHOME}/mosesdecoder
MOSESSCRIPTS = ${MOSESHOME}/scripts
EFLOMAL_HOME = ${APPLHOME}/eflomal/
MARIAN_HOME = ${APPLHOME}/marian-dev/build/
MARIAN = ${APPLHOME}/marian-dev/build
MARIAN_HOME = ${APPLHOME}/marian/build/
MARIAN = ${APPLHOME}/marian/build
SPM_HOME = ${MARIAN_HOME}
GPU = v100
GPU_MODULES = python-env

View File

@ -21,8 +21,9 @@ AMERICASNLP_TRGALL = $(sort ${AMERICASNLP_TRG} ${AMERICASNLP_TRG_EXTRA} \
# AMERICASNLP_TRGALL = ${sort ${AMERICASNLP_TRG} ${AMERICASNLP_TRG_EXTRA} ${LANGGROUP_NAI} ${LANGGROUP_SAI}}
AMERICASNLP_BPESIZE = 32000
AMERICASNLP_FIT_DATA_SIZE = 100000
AMERICASNLP_PIVOT = en
AMERICASNLP_PIVOT = en
# /scratch/project_2001194/yves/americas/backtrans/merged/*.dedup.*
@ -78,6 +79,27 @@ americasnlp-testdata:
done
AMERICASNLP_YVES_SPLITDIR = /scratch/project_2001194/yves/americas/backtrans/split_dev
americasnlp-finetunedata:
for l in aym bzd cni gn hch nah oto quy shp tar; do \
if [ $$l \< es ]; then p="$$l-es"; else p="es-$$l"; fi; \
head -50 ${AMERICASNLP_YVES_SPLITDIR}/dev1_$$l.$$l |\
gzip -c > ${AMERICASNLP_WORK}/data/simple/americasnlp2021-tunedev.$$p.clean.$$l.gz; \
head -50 ${AMERICASNLP_YVES_SPLITDIR}/dev1_$$l.es |\
gzip -c > ${AMERICASNLP_WORK}/data/simple/americasnlp2021-tunedev.$$p.clean.es.gz; \
tail -n +51 ${AMERICASNLP_YVES_SPLITDIR}/dev1_$$l.$$l |\
gzip -c > ${AMERICASNLP_WORK}/data/simple/americasnlp2021-tunetrain.$$p.clean.$$l.gz; \
tail -n +51 ${AMERICASNLP_YVES_SPLITDIR}/dev1_$$l.es |\
gzip -c > ${AMERICASNLP_WORK}/data/simple/americasnlp2021-tunetrain.$$p.clean.es.gz; \
gzip -c < ${AMERICASNLP_YVES_SPLITDIR}/dev2_$$l.$$l \
> ${AMERICASNLP_WORK}/data/simple/americasnlp2021-tunetest.$$p.clean.$$l.gz; \
gzip -c < ${AMERICASNLP_YVES_SPLITDIR}/dev2_$$l.es \
> ${AMERICASNLP_WORK}/data/simple/americasnlp2021-tunetest.$$p.clean.es.gz; \
done
AMERICASNLP_YVES_BTDIR = /scratch/project_2001194/yves/americas/backtrans/merged
americasnlp-btdata:
@ -97,9 +119,9 @@ americasnlp-btdata:
BACKTRANS_HOME=${AMERICASNLP_BTHOME} \
TESTSET_HOME=${AMERICASNLP_TESTSETS} \
MODELTYPE=transformer-align \
SRCBPESIZE=32000 \
TRGBPESIZE=32000 \
BPESIZE=32000 \
SRCBPESIZE=${AMERICASNLP_BPESIZE} \
TRGBPESIZE=${AMERICASNLP_BPESIZE} \
BPESIZE=${AMERICASNLP_BPESIZE} \
GPUJOB_HPC_MEM=8g \
MARIAN_VALID_FREQ=2500 \
DATASET=americasnlp \
@ -123,29 +145,101 @@ americasnlp-btdata:
MARIAN_EARLY_STOPPING=${TUNE_EARLY_STOPPING} \
MARIAN_EXTRA='-e 5 --no-restore-corpus' \
GPUJOB_SUBMIT=${TUNE_GPUJOB_SUBMIT} \
DATASET=americasnlp-tuned4${TUNE_SRC}2${TUNE_TRG} \
DATASET=opus-americasnlp-tuned4${TUNE_SRC}2${TUNE_TRG} \
SRCLANGS="${TUNE_SRC}" \
TRGLANGS="${TUNE_TRG}" \
USE_TARGET_LABELS=1 \
USE_REST_DEVDATA=0 \
WORKHOME=${AMERICASNLP_WORK} \
BACKTRANS_HOME=${AMERICASNLP_BTHOME} \
TESTSET_HOME=${AMERICASNLP_TESTSETS} \
MODELTYPE=transformer-align \
SRCBPESIZE=32000 \
TRGBPESIZE=32000 \
BPESIZE=32000 \
SRCBPESIZE=${AMERICASNLP_BPESIZE} \
TRGBPESIZE=${AMERICASNLP_BPESIZE} \
BPESIZE=${AMERICASNLP_BPESIZE} \
GPUJOB_HPC_MEM=8g \
TRAINSET=americasnlp2021-train \
EXTRA_TRAINSET=americasnlp2021-train \
DEVSET=americasnlp2021-dev \
TESTSET=americasnlp2021-test \
DEVSET_NAME=americasnlp2021-dev \
TESTSET_NAME=americasnlp2021-test \
FIT_DATA_SIZE=${AMERICASNLP_FIT_DATA_SIZE} \
SHUFFLE_DATA=1 \
LANGPAIRSTR="es-xx" \
TRAINSET=americasnlp2021-tunetrain \
EXTRA_TRAINSET=americasnlp2021-tunetrain \
DEVSET=americasnlp2021-tunedev \
TESTSET=americasnlp2021-tunetest \
DEVSET_NAME=americasnlp2021-tunedev \
TESTSET_NAME=americasnlp2021-tunetest \
LANGPAIRSTR="es+en-xx" \
${@:-americasnlp-langtune=}
## tune for all languages
%-americasnlp-tune:
make MODEL_LATEST=${AMERICASNLP_WORK}/es+en-xx/opus-americasnlp.spm32k-spm32k.transformer-align.model1.npz.best-perplexity.npz \
MODEL_LATEST_VOCAB=${AMERICASNLP_WORK}/es+en-xx/opus-americasnlp.spm32k-spm32k.vocab.yml \
MARIAN_VALID_FREQ=${TUNE_VALID_FREQ} \
MARIAN_DISP_FREQ=${TUNE_DISP_FREQ} \
MARIAN_SAVE_FREQ=${TUNE_SAVE_FREQ} \
MARIAN_EARLY_STOPPING=${TUNE_EARLY_STOPPING} \
MARIAN_EXTRA='-e 5 --no-restore-corpus' \
GPUJOB_SUBMIT=${TUNE_GPUJOB_SUBMIT} \
DATASET=opus-americasnlp-tuned4es2all \
SRCLANGS="es" \
TRGLANGS="aym bzd cni gn hch nah oto quy shp tar" \
USE_TARGET_LABELS=1 \
USE_REST_DEVDATA=0 \
WORKHOME=${AMERICASNLP_WORK} \
BACKTRANS_HOME=${AMERICASNLP_BTHOME} \
TESTSET_HOME=${AMERICASNLP_TESTSETS} \
MODELTYPE=transformer-align \
SRCBPESIZE=${AMERICASNLP_BPESIZE} \
TRGBPESIZE=${AMERICASNLP_BPESIZE} \
BPESIZE=${AMERICASNLP_BPESIZE} \
GPUJOB_HPC_MEM=8g \
TRAINSET=americasnlp2021-tunetrain \
EXTRA_TRAINSET=americasnlp2021-tunetrain \
DEVSET=americasnlp2021-tunedev \
TESTSET=americasnlp2021-tunetest \
DEVSET_NAME=americasnlp2021-tunedev \
TESTSET_NAME=americasnlp2021-tunetest \
LANGPAIRSTR="es+en-xx" \
${@:-americasnlp-tune=}
%-americasnlp-tunebt:
make MODEL_LATEST=${AMERICASNLP_WORK}/es+en-xx/opus-americasnlp+bt.spm32k-spm32k.transformer-align.model1.npz.best-perplexity.npz \
MODEL_LATEST_VOCAB=${AMERICASNLP_WORK}/es+en-xx/opus-americasnlp+bt.spm32k-spm32k.vocab.yml \
MARIAN_VALID_FREQ=${TUNE_VALID_FREQ} \
MARIAN_DISP_FREQ=${TUNE_DISP_FREQ} \
MARIAN_SAVE_FREQ=${TUNE_SAVE_FREQ} \
MARIAN_EARLY_STOPPING=${TUNE_EARLY_STOPPING} \
MARIAN_EXTRA='-e 5 --no-restore-corpus' \
GPUJOB_SUBMIT=${TUNE_GPUJOB_SUBMIT} \
DATASET=opus-americasnlp+bt-tuned4es2all \
SRCLANGS="es" \
TRGLANGS="aym bzd cni gn hch nah oto quy shp tar" \
USE_TARGET_LABELS=1 \
USE_REST_DEVDATA=0 \
WORKHOME=${AMERICASNLP_WORK} \
BACKTRANS_HOME=${AMERICASNLP_BTHOME} \
TESTSET_HOME=${AMERICASNLP_TESTSETS} \
MODELTYPE=transformer-align \
SRCBPESIZE=${AMERICASNLP_BPESIZE} \
TRGBPESIZE=${AMERICASNLP_BPESIZE} \
BPESIZE=${AMERICASNLP_BPESIZE} \
GPUJOB_HPC_MEM=8g \
TRAINSET=americasnlp2021-tunetrain \
EXTRA_TRAINSET=americasnlp2021-tunetrain \
DEVSET=americasnlp2021-tunedev \
TESTSET=americasnlp2021-tunetest \
DEVSET_NAME=americasnlp2021-tunedev \
TESTSET_NAME=americasnlp2021-tunetest \
LANGPAIRSTR="es+en-xx" \
${@:-americasnlp-tunebt=}
%-americasnlp-reverse:
make TRGLANGS="${AMERICASNLP_SRC}" \
@ -154,9 +248,9 @@ americasnlp-btdata:
BACKTRANS_HOME=${AMERICASNLP_BTHOME} \
TESTSET_HOME=${AMERICASNLP_TESTSETS} \
MODELTYPE=transformer-align \
SRCBPESIZE=32000 \
TRGBPESIZE=32000 \
BPESIZE=32000 \
SRCBPESIZE=${AMERICASNLP_BPESIZE} \
TRGBPESIZE=${AMERICASNLP_BPESIZE} \
BPESIZE=${AMERICASNLP_BPESIZE} \
GPUJOB_HPC_MEM=8g \
MARIAN_VALID_FREQ=2500 \
DATASET=americasnlp \
@ -179,9 +273,9 @@ americasnlp-btdata:
BACKTRANS_HOME=${AMERICASNLP_BTHOME} \
TESTSET_HOME=${AMERICASNLP_TESTSETS} \
MODELTYPE=transformer-align \
SRCBPESIZE=32000 \
TRGBPESIZE=32000 \
BPESIZE=32000 \
SRCBPESIZE=${AMERICASNLP_BPESIZE} \
TRGBPESIZE=${AMERICASNLP_BPESIZE} \
BPESIZE=${AMERICASNLP_BPESIZE} \
GPUJOB_HPC_MEM=8g \
DATASET=opus-americasnlp \
EXTRA_TRAINSET=americasnlp2021-train \
@ -201,9 +295,9 @@ americasnlp-btdata:
BACKTRANS_HOME=${AMERICASNLP_BTHOME} \
TESTSET_HOME=${AMERICASNLP_TESTSETS} \
MODELTYPE=transformer-align \
SRCBPESIZE=32000 \
TRGBPESIZE=32000 \
BPESIZE=32000 \
SRCBPESIZE=${AMERICASNLP_BPESIZE} \
TRGBPESIZE=${AMERICASNLP_BPESIZE} \
BPESIZE=${AMERICASNLP_BPESIZE} \
GPUJOB_HPC_MEM=8g \
DATASET=opus-americasnlp \
EXTRA_TRAINSET=americasnlp2021-train \

View File

@ -134,10 +134,11 @@ endif
ifneq (${wildcard ${TATOEBA_TRGLABELFILE}},)
TATOEBA_TRGLANGS = ${shell cat ${TATOEBA_TRGLABELFILE}}
endif
ifndef USE_TARGET_LABELS
# ifndef USE_TARGET_LABELS
ifdef TATOEBA_TRGLANGS
ifneq (${words ${TATOEBA_TRGLANGS}},1)
USE_TARGET_LABELS = 1
TARGET_LABELS = $(patsubst %,>>%<<,${TATOEBA_TRGLANGS})
TARGET_LABELS = $(patsubst %,>>%<<,$(sort ${TATOEBA_TRGLANGS} ${TATOEBA_TRGLANG_GROUP}))
endif
endif
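## example (hypothetical values): with TATOEBA_TRGLANGS="fin est" and
## TATOEBA_TRGLANG_GROUP="fiu", TARGET_LABELS becomes ">>est<< >>fin<< >>fiu<<",
## presumably so that the group label is also accepted as a target-language token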
@ -616,6 +617,12 @@ find-srclanggroup = $(call find-langgroup,$(firstword ${subst -, ,${subst 2, ,${
find-trglanggroup = $(call find-langgroup,$(lastword ${subst -, ,${subst 2, ,${1}}}),${2})
tatoeba-%-langs:
-( s=$(firstword $(subst 2, ,$(patsubst tatoeba-%-langs,%,$@))); \
t=$(lastword $(subst 2, ,$(patsubst tatoeba-%-langs,%,$@))); \
echo "${call find-srclanggroup,${patsubst tatoeba-%-langs,%,$@},${PIVOT}}"; \
echo "${call find-trglanggroup,${patsubst tatoeba-%-langs,%,$@},${PIVOT}}"; )
## create data sets (also works for language groups)
tatoeba-%-data:
@ -627,13 +634,25 @@ tatoeba-%-data:
if [ `echo $$T | tr ' ' "\n" | wc -l` -ge ${MIN_TRGLANGS} ]; then \
if [ `echo $$S | tr ' ' "\n" | wc -l` -le ${MAX_SRCLANGS} ]; then \
if [ `echo $$T | tr ' ' "\n" | wc -l` -le ${MAX_TRGLANGS} ]; then \
${MAKE} LANGPAIRSTR=$$s-$$t SRCLANGS="$$S" TRGLANGS="$$T" tatoeba-prepare; \
${MAKE} LANGPAIRSTR=$$s-$$t \
SRCLANGS="$$S" TRGLANGS="$$T" \
TATOEBA_SRCLANG_GROUP="`langgroup -n $$s`" \
TATOEBA_TRGLANG_GROUP="`langgroup -n $$t`" \
tatoeba-prepare; \
fi \
fi \
fi \
fi )
## create data sets (also works for language groups)
tatoeba-labeltest:
@echo "${call find-trglanggroup,eng2roa,}"
@echo "${call find-langgroup,roa,}"
@echo "${shell langgroup roa | xargs iso639 -m -n}"
@echo "$(filter ${OPUS_LANGS3},${shell langgroup roa | xargs iso639 -m -n})"
## start the training job
## - create config file
## - create data sets
@ -648,7 +667,10 @@ tatoeba-%-train:
if [ `echo $$T | tr ' ' "\n" | wc -l` -ge ${MIN_TRGLANGS} ]; then \
if [ `echo $$S | tr ' ' "\n" | wc -l` -le ${MAX_SRCLANGS} ]; then \
if [ `echo $$T | tr ' ' "\n" | wc -l` -le ${MAX_TRGLANGS} ]; then \
${MAKE} LANGPAIRSTR=$$s-$$t SRCLANGS="$$S" TRGLANGS="$$T" tatoeba-job; \
${MAKE} LANGPAIRSTR=$$s-$$t SRCLANGS="$$S" TRGLANGS="$$T" \
TATOEBA_SRCLANG_GROUP="`langgroup -n $$s`" \
TATOEBA_TRGLANG_GROUP="`langgroup -n $$t`" \
tatoeba-job; \
fi \
fi \
fi \
@ -670,6 +692,8 @@ tatoeba-%-eval:
${MAKE} LANGPAIRSTR=$$s-$$t \
SRCLANGS="${call find-srclanggroup,${patsubst tatoeba-%-eval,%,$@},${PIVOT}}" \
TRGLANGS="${call find-trglanggroup,${patsubst tatoeba-%-eval,%,$@},${PIVOT}}" \
TATOEBA_SRCLANG_GROUP="`langgroup -n $$s`" \
TATOEBA_TRGLANG_GROUP="`langgroup -n $$t`" \
compare-tatoeba; \
fi \
fi )
@ -683,8 +707,14 @@ tatoeba-%-multieval:
T="${call find-trglanggroup,${patsubst tatoeba-%-multieval,%,$@},${PIVOT}}"; \
if [ -e ${TATOEBA_WORK}/$$s-$$t ]; then \
if [ `find ${TATOEBA_WORK}/$$s-$$t/ -name '*.npz' | wc -l` -gt 0 ]; then \
${MAKE} LANGPAIRSTR=$$s-$$t SRCLANGS="$$S" TRGLANGS="$$T" tatoeba-multilingual-eval; \
${MAKE} LANGPAIRSTR=$$s-$$t SRCLANGS="$$S" TRGLANGS="$$T" tatoeba-sublang-eval; \
${MAKE} LANGPAIRSTR=$$s-$$t SRCLANGS="$$S" TRGLANGS="$$T" \
TATOEBA_SRCLANG_GROUP="`langgroup -n $$s`" \
TATOEBA_TRGLANG_GROUP="`langgroup -n $$t`" \
tatoeba-multilingual-eval; \
${MAKE} LANGPAIRSTR=$$s-$$t SRCLANGS="$$S" TRGLANGS="$$T" \
TATOEBA_SRCLANG_GROUP="`langgroup -n $$s`" \
TATOEBA_TRGLANG_GROUP="`langgroup -n $$t`" \
tatoeba-sublang-eval; \
fi \
fi )
@ -697,6 +727,8 @@ tatoeba-%-eval-testsets:
${MAKE} LANGPAIRSTR=$$s-$$t \
SRCLANGS="${call find-srclanggroup,${patsubst tatoeba-%-eval-testsets,%,$@},${PIVOT}}" \
TRGLANGS="${call find-trglanggroup,${patsubst tatoeba-%-eval-testsets,%,$@},${PIVOT}}" \
TATOEBA_SRCLANG_GROUP="`langgroup -n $$s`" \
TATOEBA_TRGLANG_GROUP="`langgroup -n $$t`" \
eval-testsets-tatoeba; \
fi \
fi )
@ -710,6 +742,8 @@ tatoeba-%-testsets:
${MAKE} LANGPAIRSTR=$$s-$$t \
SRCLANGS="${call find-srclanggroup,${patsubst tatoeba-%-testsets,%,$@},${PIVOT}}" \
TRGLANGS="${call find-trglanggroup,${patsubst tatoeba-%-testsets,%,$@},${PIVOT}}" \
TATOEBA_SRCLANG_GROUP="`langgroup -n $$s`" \
TATOEBA_TRGLANG_GROUP="`langgroup -n $$t`" \
tatoeba-multilingual-testsets; \
fi \
fi )
@ -719,7 +753,7 @@ tatoeba-%-testsets:
## - model specific test set
## - other language-specific test sets
## - individual language pairs for multilingual models
tatoeba-%-evalall: tatoeba-%-eval-testsets tatoeba-%-multieval
tatoeba-%-evalall: tatoeba-%-eval tatoeba-%-multieval tatoeba-%-eval-testsets
@echo "Done!"
@ -739,6 +773,8 @@ tatoeba-%-dist:
${MAKE} LANGPAIRSTR=$$s-$$t \
SRCLANGS="${call find-srclanggroup,${patsubst tatoeba-%-dist,%,$@},${PIVOT}}" \
TRGLANGS="${call find-trglanggroup,${patsubst tatoeba-%-dist,%,$@},${PIVOT}}" \
TATOEBA_SRCLANG_GROUP="`langgroup -n $$s`" \
TATOEBA_TRGLANG_GROUP="`langgroup -n $$t`" \
release-tatoeba; \
fi )
@ -759,6 +795,8 @@ tatoeba-%-refresh-release-readme:
${MAKE} LANGPAIRSTR=$$s-$$t \
SRCLANGS="${call find-srclanggroup,${patsubst tatoeba-%-refresh-release-readme,%,$@},${PIVOT}}" \
TRGLANGS="${call find-trglanggroup,${patsubst tatoeba-%-refresh-release-readme,%,$@},${PIVOT}}" \
TATOEBA_SRCLANG_GROUP="`langgroup -n $$s`" \
TATOEBA_TRGLANG_GROUP="`langgroup -n $$t`" \
refresh-release-readme-tatoeba; \
fi )
@ -781,6 +819,8 @@ tatoeba-%-refresh-release: tatoeba-%-refresh-release-yml tatoeba-%-refresh-relea
${MAKE} LANGPAIRSTR=$$s-$$t \
SRCLANGS="${call find-srclanggroup,${patsubst tatoeba-%-refresh-release,%,$@},${PIVOT}}" \
TRGLANGS="${call find-trglanggroup,${patsubst tatoeba-%-refresh-release,%,$@},${PIVOT}}" \
TATOEBA_SRCLANG_GROUP="`langgroup -n $$s`" \
TATOEBA_TRGLANG_GROUP="`langgroup -n $$t`" \
refresh-release-tatoeba; \
fi )

View File

@ -16,6 +16,45 @@ bcl-en:
${MAKE} SRCLANGS="bcl" TRGLANGS="en" DEVSET=wikimedia all-job
en-bcl-nt:
${MAKE} SRCLANGS="en" TRGLANGS="bcl" \
DATASET=${DATASET}+nt \
EXTRA_TRAINSET="new-testament" \
DEVSET=wikimedia all-job
bcl-en-nt:
${MAKE} SRCLANGS="bcl" TRGLANGS="en" \
DATASET=${DATASET}+nt \
EXTRA_TRAINSET="new-testament" \
DEVSET=wikimedia all-job
%-en-bcl:
${MAKE} SRCLANGS="en" TRGLANGS="bcl" DEVSET=wikimedia ${@:-en-bcl=}
%-bcl-en:
${MAKE} SRCLANGS="en" TRGLANGS="bcl" DEVSET=wikimedia ${@:-bcl-en=}
%-en-bcl-nt:
${MAKE} SRCLANGS="en" TRGLANGS="bcl" \
DATASET=${DATASET}+nt \
EXTRA_TRAINSET="new-testament" \
DEVSET=wikimedia \
${@:-en-bcl-nt=}
%-bcl-en-nt:
${MAKE} SRCLANGS="bcl" TRGLANGS="en" \
DATASET=${DATASET}+nt \
EXTRA_TRAINSET="new-testament" \
DEVSET=wikimedia \
${@:-bcl-en-nt=}
# ENAS_BPE = 4000
ENAS_BPE = 1000
@ -32,7 +71,35 @@ ENBCL_BPE = 1000
${@:-en-as=}
%-en-bcl:
${MAKE} SRCLANGS="en" TRGLANGS="bcl" DEVSET=wikimedia ${@:-en-bcl=}
WIKI_BT2ENG = abk ady afr amh ang ara arg asm ast awa aze bak bam bar bel ben bod bre bul cat ceb ces cha che chr chv cor cos crh csb cym dan deu dsb ell epo est eus ewe ext fao fas fij fin fra frr fry ful gla gle glg glv got grn guj hat hau haw hbs heb hif hin hsb hun hye ibo ido iku ile ilo ina isl ita jam jav jbo jpn kab kal kan kat kaz khm kin kir kom kor ksh kur lad lao lat lav lfn lij lin lit lmo ltz lug mah mai mal mar mdf mkd mlg mlt mnw mon mri msa mwl mya myv nau nav nds nep nld nor nov nya oci ori oss pag pan pap pdc pms pol por pus que roh rom ron rue run rus sag sah san scn sco sin slv sme smo sna snd som spa sqi stq sun swa swe tah tam tat tel tet tgk tgl tha tir ton tpi tso tuk tur tyv udm uig ukr urd uzb vec vie vol war wln wol xal xho yid yor zho zul
## start jobs for all languages where we have back-translations
wiki-eng2all-with-bt:
for l in ${WIKI_BT2ENG}; do \
if [ -d work-tatoeba/$$l-eng ]; then \
if [ `cat work-tatoeba/$$l-eng/opus-langlabels.src | tr " " "\n" | grep . | wc -l` -eq 1 ]; then \
echo "fetch back-translations for $$l-eng"; \
${MAKE} -C bt-tatoeba SRC=$$l TRG=eng fetch-bt; \
echo "start training eng-$$l with backtranslation data"; \
${MAKE} HPC_MEM=32g HPC_CORES=4 tatoeba-eng2$$l-train-bt.submitcpu; \
fi \
fi \
done
WIKI_BT2ENG_PARENTS = ${sort ${shell langgroup -p ${WIKI_BT2ENG}}}
wiki-eng2allgroups-with-bt:
for l in $(filter-out roa,${WIKI_BT2ENG_PARENTS}); do \
if [ -d work-tatoeba/eng-$$l ]; then \
echo "mv work-tatoeba/eng-$$l work-tatoeba-old"; \
mv work-tatoeba/eng-$$l work-tatoeba-old; \
fi; \
echo "start training eng-$$l with backtranslation data"; \
${MAKE} HPC_MEM=32g HPC_CORES=4 tatoeba-eng2$$l-train-bt-1m.submitcpu; \
done

View File

@ -28,6 +28,9 @@ SPMEXTRA =
## set to 1 if you want to generate SPM vocab file
GENERATE_SPM_VOC = 0
# SPM_INPUT_SIZE = 10000000
SPM_INPUT_SIZE = 2000000
SPM_SHUFFLE_INPUT = 0
## we keep the dependency on LOCAL_TRAIN_SRC
## to make multi-threaded make calls behave properly
@ -52,10 +55,12 @@ endif
if [ `cat ${LOCAL_TRAIN_SRC}.charfreq | wc -l` -gt 1000 ]; then \
${SPM_TRAIN} ${SPMEXTRA} \
--model_prefix=$@ --vocab_size=$(SRCBPESIZE) --input=${LOCAL_TRAIN_SRC}.text \
--input_sentence_size ${SPM_INPUT_SIZE} --shuffle_input_sentence ${SPM_SHUFFLE_INPUT} \
--character_coverage=0.9995 --hard_vocab_limit=false; \
else \
${SPM_TRAIN} ${SPMEXTRA} \
--model_prefix=$@ --vocab_size=$(SRCBPESIZE) --input=${LOCAL_TRAIN_SRC}.text \
--input_sentence_size ${SPM_INPUT_SIZE} --shuffle_input_sentence ${SPM_SHUFFLE_INPUT} \
--character_coverage=1.0 --hard_vocab_limit=false; \
fi
mv $@.model $@
@ -82,10 +87,12 @@ else
if [ `cat ${LOCAL_TRAIN_TRG}.charfreq | wc -l` -gt 1000 ]; then \
${SPM_TRAIN} ${SPMEXTRA} \
--model_prefix=$@ --vocab_size=$(TRGBPESIZE) --input=${LOCAL_TRAIN_TRG}.text \
--input_sentence_size ${SPM_INPUT_SIZE} --shuffle_input_sentence ${SPM_SHUFFLE_INPUT} \
--character_coverage=0.9995 --hard_vocab_limit=false; \
else \
${SPM_TRAIN} ${SPMEXTRA} \
--model_prefix=$@ --vocab_size=$(TRGBPESIZE) --input=${LOCAL_TRAIN_TRG}.text \
--input_sentence_size ${SPM_INPUT_SIZE} --shuffle_input_sentence ${SPM_SHUFFLE_INPUT} \
--character_coverage=1.0 --hard_vocab_limit=false; \
fi
mv $@.model $@
@ -161,10 +168,12 @@ ifeq ($(wildcard ${SPMMODEL}),)
if [ `cat ${LOCAL_MONO_DATA}.${PRE}.charfreq | wc -l` -gt 1000 ]; then \
${SPM_TRAIN} ${SPMEXTRA} \
--model_prefix=$@ --vocab_size=$(TRGBPESIZE) --input=$<.text \
--input_sentence_size ${SPM_INPUT_SIZE} --shuffle_input_sentence ${SPM_SHUFFLE_INPUT} \
--character_coverage=0.9995 --hard_vocab_limit=false; \
else \
${SPM_TRAIN} ${SPMEXTRA} \
--model_prefix=$@ --vocab_size=$(TRGBPESIZE) --input=$<.text \
--input_sentence_size ${SPM_INPUT_SIZE} --shuffle_input_sentence ${SPM_SHUFFLE_INPUT} \
--character_coverage=1.0 --hard_vocab_limit=false; \
fi
mv $@.model $@

View File

@ -12,6 +12,9 @@ ifeq (${SUBWORDS},spm)
## sentence piece models (concatenate and yamlify)
${MODEL_VOCAB}: ${SPMSRCMODEL} ${SPMTRGMODEL}
ifneq (${MODEL_LATEST_VOCAB},)
cp ${MODEL_LATEST_VOCAB} ${MODEL_VOCAB}
else
cut -f1 < ${word 1,$^}.vocab > ${@:.vocab.yml=.src.vocab}
cut -f1 < ${word 2,$^}.vocab > ${@:.vocab.yml=.trg.vocab}
ifeq (${USE_TARGET_LABELS},1)
@ -23,6 +26,8 @@ endif
cut -f2 $@.numbered | sed 's/\\/\\\\/g;s/\"/\\\"/g;s/^\(.*\)$$/"\1"/;s/$$/:/'> $@.tokens
paste -d ' ' $@.tokens $@.ids > $@
rm -f $@.tokens $@.ids $@.numbered
endif
else
@ -173,10 +178,11 @@ endif
--transformer-dropout ${MARIAN_DROPOUT} \
--label-smoothing 0.1 \
--learn-rate 0.0003 --lr-warmup 16000 --lr-decay-inv-sqrt 16000 --lr-report \
--optimizer-params 0.9 0.98 1e-09 --clip-norm 5 \
--optimizer-params 0.9 0.98 1e-09 --clip-norm 5 --fp16 \
${MARIAN_TIE_EMBEDDINGS} \
--devices ${MARIAN_GPUS} \
--sync-sgd --seed ${SEED} \
--sync-sgd \
--seed ${SEED} \
--sqlite \
--tempdir ${TMPDIR} \
--exponential-smoothing

View File

@ -1,17 +1,3 @@
# wikimedia-2020-01-17.zip
* dataset: wikimedia
* model: transformer-align
* pre-processing: normalization + SentencePiece
* download: [wikimedia-2020-01-17.zip](https://object.pouta.csc.fi/OPUS-MT-models/bcl-en/wikimedia-2020-01-17.zip)
* test set translations: [wikimedia-2020-01-17.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/bcl-en/wikimedia-2020-01-17.test.txt)
* test set scores: [wikimedia-2020-01-17.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/bcl-en/wikimedia-2020-01-17.eval.txt)
## Benchmarks
| testset | BLEU | chr-F |
|-----------------------|-------|-------|
| JW300.bcl.en | 56.8 | 0.705 |
# opus-2020-01-20.zip
@ -28,6 +14,7 @@
|-----------------------|-------|-------|
| JW300.bcl.en | 56.8 | 0.705 |
# opus-2020-02-11.zip
* dataset: opus
@ -43,6 +30,7 @@
|-----------------------|-------|-------|
| JW300.bcl.en | 56.1 | 0.697 |
# opus+bt-2020-05-23.zip
* dataset: opus+bt
@ -69,3 +57,38 @@
|-----------------------|-------|-------|
| JW300.bcl.en | 57.6 | 0.712 |
# opus+nt-2021-03-29.zip
* dataset: opus+nt
* model: transformer-align
* source language(s): bcl
* target language(s): en
* model: transformer-align
* pre-processing: normalization + SentencePiece (spm4k,spm4k)
* download: [opus+nt-2021-03-29.zip](https://object.pouta.csc.fi/OPUS-MT-models/bcl-en/opus+nt-2021-03-29.zip)
## Training data: opus+nt
* bcl-en: JW300 (470468) new-testament (11623)
* bcl-en: total size = 482091
* total size (opus+nt): 482047
## Validation data
* bcl-en: wikimedia, 1153
* total-size-shuffled: 775
* devset-selected: top 250 lines of wikimedia.src.shuffled!
* testset-selected: next 525 lines of wikimedia.src.shuffled!
* devset-unused: added to traindata
* test set translations: [opus+nt-2021-03-29.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/bcl-en/opus+nt-2021-03-29.test.txt)
* test set scores: [opus+nt-2021-03-29.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/bcl-en/opus+nt-2021-03-29.eval.txt)
## Benchmarks
| testset | BLEU | chr-F | #sent | #words | BP |
|---------|-------|-------|-------|--------|----|
| wikimedia.bcl-en | 10.4 | 0.320 | 525 | 27109 | 0.477 |

View File

@ -16,7 +16,10 @@ my $file = shift(@ARGV);
my $count=0;
my $repeated=0;
while ($count < $size){
open F,"<$file" || die "cannot read from $file!\n";
if ($file=~/\.gz$/){
open F,"gzip -cd <$file |" || die "cannot open $file";
}
else{ open F,"<$file" || die "cannot read from $file!\n"; }
while (<F>){
$count++;
print;
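# example usage (file names are hypothetical; the Makefile above calls the script
# in the same way): fit a gzipped dev file to 1000 lines with at most 50x oversampling
#
#   scripts/fit-data-size.pl -m 50 1000 dev.fi-sv.clean.fi.gz >> dev.src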

View File

@ -12,12 +12,7 @@
# to avoid removing newline characters!
if [ `hostname -d` == "bullx" ]; then
APPLHOME=/projappl/project_2001569
SPMENCODE=${APPLHOME}/marian-dev/build-spm/spm_encode
else
SPMENCODE=`which spm_encode || echo "${PWD}/tools/marian-dev/build/spm_encode"`
fi
SPMENCODE=`which spm_encode || echo "${PWD}/tools/marian-dev/build/spm_encode"`
if [ "$4" == "noflags" ]; then
@ -97,7 +92,7 @@ else
-e 's/【/\[/g' \
-e 's/】/\]/g' \
-e 's/％/\%/g' |
perl -C -pe 's/(?!\n)\p{C}/ /g;'
perl -C -pe 's/(?!\n)\p{C}/ /g;' |
sed 's/ */ /g;s/^ *//g;s/ *$//g' |
${SPMENCODE} --model $3 |
sed "s/^/>>$2<< /"

View File

@ -12,12 +12,7 @@
# to avoid removing newline characters!
#
if [ `hostname -d` == "bullx" ]; then
APPLHOME=/projappl/project_2001569
SPMENCODE=${APPLHOME}/marian-dev/build-spm/spm_encode
else
SPMENCODE=`which spm_encode || echo "${PWD}/tools/marian-dev/build/spm_encode"`
fi
SPMENCODE=`which spm_encode || echo "${PWD}/tools/marian-dev/build/spm_encode"`
## simple pre-processing steps adapted from Moses tools
sed -e 's/，/,/g' \
-e 's/【/\[/g' \
-e 's/】/\]/g' \
-e 's/％/\%/g' |
perl -C -pe 's/(?!\n)\p{C}/ /g;'
perl -C -pe 's/(?!\n)\p{C}/ /g;' |
sed 's/ */ /g;s/^ *//g;s/ *$//g' |
${SPMENCODE} --model $2