mirror of
https://github.com/Helsinki-NLP/OPUS-MT-train.git
synced 2024-07-14 15:40:33 +03:00
fixed multilingual tatoeba evaluation
This commit is contained in:
parent
cc16be10d4
commit
e141772b34
1
Makefile
1
Makefile
@ -376,6 +376,7 @@ eval-ensemble: ${WORKDIR}/${TESTSET_NAME}.${MODEL}${NR}.${MODELTYPE}.ensemble.${
|
||||
## train and evaluate
|
||||
train-and-eval: ${WORKDIR}/${MODEL}.${MODELTYPE}.model${NR}.done
|
||||
${MAKE} ${WORKDIR}/${TESTSET_NAME}.${MODEL}${NR}.${MODELTYPE}.${SRC}.${TRG}.compare
|
||||
${MAKE} eval-testsets
|
||||
|
||||
## train model and start back-translation jobs once the model is ready
|
||||
## (requires creating a dist package)
|
||||
|
14
NOTES.md
14
NOTES.md
@ -1,4 +1,18 @@
|
||||
|
||||
# related projects
|
||||
|
||||
* https://browser.mt (bergamot project)
|
||||
* https://nteu.eu
|
||||
* https://gourmet-project.eu
|
||||
* https://elitr.eu
|
||||
* https://www.european-language-grid.eu
|
||||
|
||||
Multilingual data:
|
||||
|
||||
* http://lr-coordination.eu (ELRC)
|
||||
* https://www.pret-a-llod.eu
|
||||
* https://www.taus.net
|
||||
|
||||
|
||||
further resources: (from http://techiaith.cymru/translation/demo/?lang=en)
|
||||
contact: Dewi Jones (d.b.jones@bangor.ac.uk)
|
||||
|
@ -3,28 +3,62 @@
|
||||
# model configurations
|
||||
#
|
||||
|
||||
## various ways of setting the model languages
|
||||
|
||||
# SRCLANGS = da no sv
|
||||
# TRGLANGS = fi
|
||||
## (1) explicitly set source and target languages, for example:
|
||||
## SRCLANGS="da no sv" TRGLANGS="fi da"
|
||||
##
|
||||
## (2) specify language pairs, for example:
|
||||
## LANGPAIRS="de-en fi-sv da-es"
|
||||
## this will set SRCLANGS="de fi da" TRGLANGS="en sv es"
|
||||
##
|
||||
## (3) specify language pairs but make a symmetric model, for example:
|
||||
## LANGPAIRS="de-en fi-sv da-es" SYMMETRIC=1
|
||||
## this will set SRCLANGS="da de en es fi sv" TRGLANGS="da de en es fi sv"
|
||||
##
|
||||
## (4) only specify LANGS, for example
|
||||
## LANGS="de en sv"
|
||||
##     this will set SRCLANGS="de en sv" TRGLANGS="de en sv"
|
||||
|
||||
## if LANGS is set with more than one language
|
||||
## --> assume multilingual model with the same languages on both sides
|
||||
## unless SRCLANGS and TRGLANGS are set to something else
|
||||
ifeq (${words ${LANGS}},1)
|
||||
|
||||
|
||||
## if LANGPAIRS is not set but SRC and TRG are set
|
||||
## then set LANGPAIRS to SRC-TRG
|
||||
ifndef LANGPAIRS
|
||||
ifdef SRC
|
||||
ifdef TRG
|
||||
LANGPAIRS := ${SRC}-${TRG}
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
## if LANGPAIRS are set and the model is not supposed to be SYMMETRIC
|
||||
## then set SRCLANGS and TRGLANGS to the languages in LANGPAIRS
|
||||
ifdef LANGPAIRS
|
||||
ifneq (${SYMMETRIC},1)
|
||||
SRCLANGS ?= ${sort ${shell echo "${LANGPAIRS}" | tr ' ' "\n" | cut -f1 -d '-'}}
|
||||
TRGLANGS ?= ${sort ${shell echo "${LANGPAIRS}" | tr ' ' "\n" | cut -f2 -d '-'}}
|
||||
endif
|
||||
endif
|
||||
|
||||
## if LANGPAIRS is set and LANGS is not
|
||||
## then get all languages in LANGPAIRS
|
||||
ifdef LANGPAIRS
|
||||
LANGS ?= ${sort ${subst -, ,${LANGPAIRS}}}
|
||||
endif
|
||||
|
||||
## if more than one language is in LANGS
|
||||
## then assume a symmetric multilingual model
|
||||
ifneq (${words ${LANGS}},1)
|
||||
SRCLANGS ?= ${LANGS}
|
||||
TRGLANGS ?= ${LANGS}
|
||||
endif
|
||||
|
||||
## set to SRC and TRG if necessary
|
||||
ifndef SRCLANGS
|
||||
SRCLANGS := ${SRC}
|
||||
TRGLANGS := ${TRG}
|
||||
endif
|
||||
|
||||
## Swedish - Finnish as default if nothing is set
|
||||
## final default is sv-fi
|
||||
SRCLANGS ?= sv
|
||||
TRGLANGS ?= fi
|
||||
|
||||
|
||||
## set SRC and TRG unless they are specified already
|
||||
SRC ?= ${firstword ${SRCLANGS}}
|
||||
TRG ?= ${lastword ${TRGLANGS}}
|
||||
@ -59,6 +93,13 @@ SKIP_LANGPAIRS ?= "nothing"
|
||||
MAX_OVER_SAMPLING ?= 50
|
||||
|
||||
|
||||
## set CHECK_TRAINDATA_SIZE if you want to check that each
|
||||
## bitext has equal number of lines in source and target
|
||||
## ---> this only prints a warning if not
|
||||
##
|
||||
# CHECK_TRAINDATA_SIZE
|
||||
|
||||
|
||||
# sorted languages and langpair used to match resources in OPUS
|
||||
SORTLANGS = $(sort ${SRC} ${TRG})
|
||||
SPACE = $(empty) $(empty)
|
||||
@ -419,7 +460,6 @@ ${WORKDIR}/config.mk:
|
||||
s=`head -10000001 ${LOCAL_TRAIN_SRC} | wc -l`; \
|
||||
S=`cat ${LOCAL_TRAIN_SRC}.charfreq | wc -l`; \
|
||||
T=`cat ${LOCAL_TRAIN_TRG}.charfreq | wc -l`; \
|
||||
rm -f ${LOCAL_TRAIN_SRC} ${LOCAL_TRAIN_TRG}; \
|
||||
fi; \
|
||||
if [ $$s -gt ${LARGEST_TRAINSIZE} ]; then \
|
||||
echo "# ${LANGPAIRSTR} training data bigger than 10 million" > $@; \
|
||||
|
36
lib/data.mk
36
lib/data.mk
@ -169,7 +169,7 @@ clean-data-source: ${DATA_SRC} ${DATA_TRG}
|
||||
|
||||
|
||||
## monolingual data sets (for sentence piece models)
|
||||
.INTERMEDIATE: ${LOCAL_MONO_DATA}.${PRE} ${LOCAL_MONO_DATA}.raw ${LOCAL_MONO_DATA}.${PRE}.charfreq
|
||||
.INTERMEDIATE: ${LOCAL_MONO_DATA}.${PRE} ${LOCAL_MONO_DATA}.raw
|
||||
|
||||
mono-data: ${LOCAL_MONO_DATA}.${PRE}
|
||||
|
||||
@ -265,7 +265,7 @@ ${TRAIN_ALG}: ${TRAIN_SRC}.clean.${PRE_SRC}${TRAINSIZE}.gz \
|
||||
|
||||
## TODO: does this cause the same data to be redone over and over again?
|
||||
##
|
||||
.INTERMEDIATE: ${LOCAL_TRAIN_SRC} ${LOCAL_TRAIN_TRG} ${LOCAL_TRAIN_SRC}.charfreq ${LOCAL_TRAIN_TRG}.charfreq
|
||||
.INTERMEDIATE: ${LOCAL_TRAIN_SRC} ${LOCAL_TRAIN_TRG}
|
||||
|
||||
ifeq (${USE_REST_DEVDATA},1)
|
||||
LOCAL_TRAINDATA_DEPENDENCIES = ${DEV_SRC} ${DEV_TRG}
|
||||
@ -274,6 +274,7 @@ endif
|
||||
## add training data for each language combination
|
||||
## and put it together in local space
|
||||
${LOCAL_TRAIN_SRC}: ${LOCAL_TRAINDATA_DEPENDENCIES}
|
||||
# ifeq (${wildcard $@},)
|
||||
mkdir -p ${dir $@}
|
||||
echo "" > ${dir $@}README.md
|
||||
echo "# ${notdir ${TRAIN_BASE}}" >> ${dir $@}README.md
|
||||
@ -295,23 +296,43 @@ ifeq (${USE_REST_DEVDATA},1)
|
||||
${GZIP} -cd < ${DEV_TRG}.notused.gz >> ${LOCAL_TRAIN_TRG}; \
|
||||
fi
|
||||
endif
|
||||
# else
|
||||
# @echo "*****************************************"
|
||||
# @echo "local training data $@ exists already!"
|
||||
# @echo "delete if it needs to be re-done!!!"
|
||||
# @echo "*****************************************"
|
||||
# endif
|
||||
|
||||
|
||||
${LOCAL_TRAIN_TRG}: ${LOCAL_TRAIN_SRC}
|
||||
@echo "done!"
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
## cut the data sets immediately if we don't have
|
||||
## to shuffle first! This saves a lot of time!
|
||||
|
||||
ifndef SHUFFLE_DATA
|
||||
ifdef FIT_DATA_SIZE
|
||||
CUT_DATA_SETS = | head -${FIT_DATA_SIZE}
|
||||
endif
|
||||
endif
|
||||
|
||||
|
||||
## add to the training data
|
||||
## NEW: take away dependence on the clean pre-processed data
|
||||
## to avoid re-doing existing data and also avoid problems
|
||||
## of extra data that do not exist for a particular language pair
|
||||
## in multilingual data sets
|
||||
## TODO: introduce under and over-sampling for multilingual data sets ...
|
||||
|
||||
add-to-local-train-data:
|
||||
ifneq (${CLEAN_TRAIN_SRC},)
|
||||
${MAKE} ${CLEAN_TRAIN_SRC} ${CLEAN_TRAIN_TRG}
|
||||
endif
|
||||
ifneq (${wildcard ${CLEAN_TRAIN_SRC}},)
|
||||
ifdef CHECK_TRAINDATA_SIZE
|
||||
@if [ `${GZIP} -cd < ${wildcard ${CLEAN_TRAIN_SRC}} | wc -l` != `${GZIP} -cd < ${wildcard ${CLEAN_TRAIN_TRG}} | wc -l` ]; then \
|
||||
echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"; \
|
||||
echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"; \
|
||||
@ -323,9 +344,10 @@ ifneq (${wildcard ${CLEAN_TRAIN_SRC}},)
|
||||
echo ${CLEAN_TRAIN_SRC}; \
|
||||
echo ${CLEAN_TRAIN_TRG}; \
|
||||
fi
|
||||
endif
|
||||
echo -n "* ${SRC}-${TRG}: " >> ${dir ${LOCAL_TRAIN_SRC}}README.md
|
||||
for d in ${wildcard ${CLEAN_TRAIN_SRC}}; do \
|
||||
l=`${GZIP} -cd < $$d | wc -l`; \
|
||||
l=`${GZIP} -cd < $$d ${CUT_DATA_SETS} | wc -l`; \
|
||||
if [ $$l -gt 0 ]; then \
|
||||
echo "$$d" | xargs basename | \
|
||||
sed -e 's#.${SRC}.gz$$##' \
|
||||
@ -340,13 +362,13 @@ ifneq (${wildcard ${CLEAN_TRAIN_SRC}},)
|
||||
######################################
|
||||
ifeq (${USE_TARGET_LABELS},1)
|
||||
echo "set target language labels";
|
||||
${GZIP} -cd < ${wildcard ${CLEAN_TRAIN_SRC}} |\
|
||||
${GZIP} -cd < ${wildcard ${CLEAN_TRAIN_SRC}} ${CUT_DATA_SETS} |\
|
||||
sed "s/^/>>${TRG}<< /" > ${LOCAL_TRAIN_SRC}.${LANGPAIR}.src
|
||||
else
|
||||
echo "only one target language"
|
||||
${GZIP} -cd < ${wildcard ${CLEAN_TRAIN_SRC}} > ${LOCAL_TRAIN_SRC}.${LANGPAIR}.src
|
||||
${GZIP} -cd < ${wildcard ${CLEAN_TRAIN_SRC}} ${CUT_DATA_SETS} > ${LOCAL_TRAIN_SRC}.${LANGPAIR}.src
|
||||
endif
|
||||
${GZIP} -cd < ${wildcard ${CLEAN_TRAIN_TRG}} > ${LOCAL_TRAIN_TRG}.${LANGPAIR}.trg
|
||||
${GZIP} -cd < ${wildcard ${CLEAN_TRAIN_TRG}} ${CUT_DATA_SETS} > ${LOCAL_TRAIN_TRG}.${LANGPAIR}.trg
|
||||
######################################
|
||||
# SHUFFLE_DATA is set?
|
||||
# --> shuffle data for each langpair
|
||||
|
@ -111,6 +111,7 @@ endif
|
||||
|
||||
## other tools and their locations
|
||||
|
||||
SCRIPTDIR = ${PWD}/scripts
|
||||
WORDALIGN = ${EFLOMAL_HOME}align.py
|
||||
ATOOLS = ${FASTALIGN_HOME}atools
|
||||
|
||||
|
@ -46,6 +46,7 @@ TATOEBA_RAWGIT = https://raw.githubusercontent.com/Helsinki-NLP/Tatoeba-Challen
|
||||
TATOEBA_WORK = ${PWD}/work-tatoeba
|
||||
TATOEBA_DATA = ${TATOEBA_WORK}/data/${PRE}
|
||||
|
||||
|
||||
tatoeba-job:
|
||||
${MAKE} tatoeba-prepare
|
||||
${MAKE} all-job-tatoeba
|
||||
@ -60,7 +61,6 @@ ifneq (${SRCLANGS},${TRGLANGS})
|
||||
endif
|
||||
|
||||
|
||||
|
||||
tatoeba-prepare: ${TATOEBA_DATA}/Tatoeba-train.${LANGPAIR}.clean.${SRCEXT}.gz
|
||||
${MAKE} local-config-tatoeba
|
||||
${MAKE} data-tatoeba
|
||||
@ -74,7 +74,13 @@ tatoeba-eval:
|
||||
tatoeba-data: ${TATOEBA_DATA}/Tatoeba-train.${LANGPAIR}.clean.${SRCEXT}.gz
|
||||
tatoeba-labels: ${TATOEBA_DATA}/Tatoeba-train.${LANGPAIRSTR}.clean.${SRCEXT}.labels
|
||||
|
||||
|
||||
tatoeba-results-md:
|
||||
${MAKE} tatoeba-results-BLEU-sorted.md \
|
||||
tatoeba-results-BLEU-sorted-model.md \
|
||||
tatoeba-results-BLEU-sorted-langpair.md \
|
||||
tatoeba-results-chrF2-sorted.md \
|
||||
tatoeba-results-chrF2-sorted-model.md \
|
||||
tatoeba-results-chrF2-sorted-langpair.md
|
||||
|
||||
|
||||
|
||||
@ -89,21 +95,31 @@ tatoeba-subset-%: tatoeba-%.md
|
||||
done
|
||||
|
||||
|
||||
## set FIT_DATA_SIZE for under/over-sampling of data!
|
||||
###############################################################################
|
||||
## multilingual models from an entire subset
|
||||
###############################################################################
|
||||
|
||||
## training:
|
||||
## set FIT_DATA_SIZE to biggest one in subset but at least 10000
|
||||
## set of language pairs is directly taken from the markdown page at github
|
||||
tatoeba-multilingual-subset-%: tatoeba-%.md
|
||||
for l in `grep '\[' $< | cut -f2 -d '[' | cut -f1 -d ']'`; do \
|
||||
s=`echo $$l | cut -f1 -d '-'`; \
|
||||
t=`echo $$l | cut -f2 -d '-'`; \
|
||||
${MAKE} SRCLANGS=$$s TRGLANGS=$$t tatoeba-data; \
|
||||
done
|
||||
${MAKE} ${patsubst tatoeba-%.md,tatoeba-trainsize-%.txt,$<}
|
||||
( l=`grep '\[' $< | cut -f2 -d '[' | cut -f1 -d ']' | tr ' -' "\n\n" | sort -u | tr "\n" ' ' | sed 's/ *$$//'`; \
|
||||
s=`sort -k2,2nr ${patsubst tatoeba-%.md,tatoeba-trainsize-%.txt,$<} | head -1 | cut -f2 -d' '`; \
|
||||
tatoeba-multilingual-subset-%: tatoeba-%.md tatoeba-trainsize-%.txt
|
||||
( l="${shell grep '\[' $< | cut -f2 -d '[' | cut -f1 -d ']' | tr "\n" ' ' | sed 's/ *$$//'}"; \
|
||||
s=${shell sort -k2,2nr $(word 2,$^) | head -1 | cut -f2 -d' '}; \
|
||||
if [ $$s -lt 10000 ]; then s=10000; fi; \
|
||||
${MAKE} FIT_DATA_SIZE=$$s \
|
||||
SRCLANGS="$$l" TRGLANGS="$$l" \
|
||||
LANGPAIRSTR=${<:.md=} tatoeba-job )
|
||||
${MAKE} LANGPAIRS="$$l" \
|
||||
SYMMETRIC=1 \
|
||||
FIT_DATA_SIZE=$$s \
|
||||
LANGPAIRSTR=${<:.md=} \
|
||||
tatoeba-multilingual-train; )
|
||||
|
||||
|
||||
## evaluate all language pairs in both directions
|
||||
tatoeba-multilingual-evalsubset-%: tatoeba-%.md
|
||||
${MAKE} LANGPAIRS="`grep '\[' $< | cut -f2 -d '[' | cut -f1 -d ']' | tr \"\n\" ' '`" \
|
||||
LANGPAIRSTR=${<:.md=} tatoeba-multilingual-testsets
|
||||
${MAKE} LANGPAIRS="`grep '\[' $< | cut -f2 -d '[' | cut -f1 -d ']' | tr \"\n\" ' '`" \
|
||||
LANGPAIRSTR=${<:.md=} SYMMETRIC=1 tatoeba-multilingual-eval
|
||||
|
||||
|
||||
## print all data sizes in this set
|
||||
tatoeba-trainsize-%.txt: tatoeba-%.md
|
||||
@ -120,11 +136,86 @@ tatoeba-%.md:
|
||||
|
||||
|
||||
|
||||
|
||||
###############################################################################
|
||||
## evaluate multilingual models for various language pairs
|
||||
###############################################################################
|
||||
|
||||
|
||||
## fetch the Tatoeba data for all combinations of source and target
## languages and start the training job afterwards
##
## each combination is requested with the alphabetically smaller
## language ID as SRCLANGS and the other one as TRGLANGS
## (if both IDs are identical, the same ID is used on both sides)
##
## NOTE: the whole loop has to be one single shell command, i.e. every
##       line except the final 'done' needs a trailing backslash
tatoeba-multilingual-train:
	for s in ${SRCLANGS}; do \
	  for t in ${TRGLANGS}; do \
	    if [ $$s \< $$t ]; then \
	      ${MAKE} SRCLANGS=$$s TRGLANGS=$$t tatoeba-data; \
	    else \
	      ${MAKE} SRCLANGS=$$t TRGLANGS=$$s tatoeba-data; \
	    fi; \
	  done; \
	done
	${MAKE} tatoeba-job
|
||||
|
||||
|
||||
## evaluate all individual language pairs for a multilingual model
##
## first make sure that the test sets are in place
## (target tatoeba-multilingual-testsets) and then run 'compare'
## for each source/target combination that has a test set
## in ${TATOEBA_WORK}/${LANGPAIRSTR}/test
tatoeba-multilingual-eval:
	${MAKE} tatoeba-multilingual-testsets
	for s in ${SRCLANGS}; do \
	  for t in ${TRGLANGS}; do \
	    [ -e ${TATOEBA_WORK}/${LANGPAIRSTR}/test/Tatoeba-test.$$s-$$t.src ] || continue; \
	    ${MAKE} SRC=$$s TRG=$$t \
		TRAINSET=Tatoeba-train \
		DEVSET=Tatoeba-dev \
		TESTSET=Tatoeba-test.$$s-$$t \
		TESTSET_NAME=Tatoeba-test.$$s-$$t \
		USE_REST_DEVDATA=0 \
		HELDOUTSIZE=0 \
		DEVSIZE=5000 \
		TESTSIZE=10000 \
		DEVMINSIZE=200 \
		WORKHOME=${TATOEBA_WORK} \
		USE_TARGET_LABELS=1 \
	    compare; \
	  done; \
	done
|
||||
|
||||
# print-info:
|
||||
|
||||
|
||||
## copy testsets into the multilingual test directory
##
## for each source/target combination without an existing .src file:
##  - download test.txt from ${TATOEBA_RAWGIT}/data/test/<s>-<t>
##  - if that file is empty, try the reverse direction <t>-<s>
##    and swap the columns accordingly
##  - write the source side with a >>label<< prefix to *.src
##    and the target side to *.trg
##  - remove the downloaded *.txt file
##
## the fields written by 'cut' are TAB-separated, i.e. the language label
## has to be split off at the first TAB (and not at the first space)
tatoeba-multilingual-testsets:
	mkdir -p "${TATOEBA_WORK}/${LANGPAIRSTR}/test"
	for s in ${SRCLANGS}; do \
	  for t in ${TRGLANGS}; do \
	    f="${TATOEBA_WORK}/${LANGPAIRSTR}/test/Tatoeba-test.$$s-$$t"; \
	    if [ ! -e "$$f.src" ]; then \
	      wget -q -O "$$f.txt" ${TATOEBA_RAWGIT}/data/test/$$s-$$t/test.txt; \
	      if [ -s "$$f.txt" ]; then \
	        echo "make Tatoeba-test.$$s-$$t"; \
	        cut -f2,3 "$$f.txt" | sed 's/^\([^\t]*\)\t/>>\1<< /' > "$$f.src"; \
	        cut -f4 "$$f.txt" > "$$f.trg"; \
	      else \
	        wget -q -O "$$f.txt" ${TATOEBA_RAWGIT}/data/test/$$t-$$s/test.txt; \
	        if [ -s "$$f.txt" ]; then \
	          echo "make Tatoeba-test.$$s-$$t"; \
	          cut -f1,4 "$$f.txt" | sed 's/^\([^\t]*\)\t/>>\1<< /' > "$$f.src"; \
	          cut -f3 "$$f.txt" > "$$f.trg"; \
	        fi; \
	      fi; \
	      rm -f "$$f.txt"; \
	    fi; \
	  done; \
	done
|
||||
|
||||
|
||||
|
||||
###############################################################################
|
||||
## generic targets for tatoeba models
|
||||
###############################################################################
|
||||
|
||||
|
||||
## generic target for tatoeba challenge jobs
|
||||
# %-tatoeba: ${TATOEBA_DATA}/Tatoeba-train.${LANGPAIR}.clean.${SRCEXT}.gz
|
||||
# %-tatoeba: ${TATOEBA_DATA}/Tatoeba-train.${LANGPAIRSTR}.clean.${SRCEXT}.labels \
|
||||
# ${TATOEBA_DATA}/Tatoeba-train.${LANGPAIRSTR}.clean.${TRGEXT}.labels
|
||||
# %-tatoeba: ${TATOEBA_DATA}/Tatoeba-train.${LANGPAIR}.clean.${SRCEXT}.gz
|
||||
%-tatoeba: ${TATOEBA_DATA}/Tatoeba-train.${LANGPAIRSTR}.clean.${SRCEXT}.labels
|
||||
${MAKE} TRAINSET=Tatoeba-train \
|
||||
DEVSET=Tatoeba-dev \
|
||||
@ -139,11 +230,15 @@ tatoeba-%.md:
|
||||
WORKHOME=${TATOEBA_WORK} \
|
||||
SRCLANGS="${shell cat $< | sed 's/ *$$//'}" \
|
||||
TRGLANGS="${shell cat $(<:.${SRCEXT}.labels=.${TRGEXT}.labels) | sed 's/ *$$//'}" \
|
||||
LANGPAIRSTR=${LANGPAIRSTR} \
|
||||
EMAIL= \
|
||||
${@:-tatoeba=}
|
||||
|
||||
|
||||
|
||||
## all language labels in all language pairs
|
||||
## (each language pair may include several language variants)
|
||||
## --> this is necessary to set the languages that are present in a model
|
||||
|
||||
${TATOEBA_DATA}/Tatoeba-train.${LANGPAIRSTR}.clean.${SRCEXT}.labels:
|
||||
for s in ${SRCLANGS}; do \
|
||||
for t in ${TRGLANGS}; do \
|
||||
@ -187,6 +282,10 @@ ${TATOEBA_DATA}/Tatoeba-train.${LANGPAIRSTR}.clean.${SRCEXT}.labels:
|
||||
|
||||
|
||||
|
||||
###############################################################################
|
||||
## generate data files
|
||||
###############################################################################
|
||||
|
||||
|
||||
## don't delete those files
|
||||
.SECONDARY: ${TATOEBA_DATA}/Tatoeba-train.${LANGPAIR}.clean.${SRCEXT}.gz \
|
||||
@ -322,3 +421,59 @@ FIXLANGIDS = | sed 's/zho\(\)_HK/yue\1/;s/zho\(\)_CN/cmn\1/;s/zho\(\)_TW/cmn\1/
|
||||
|
||||
%/Tatoeba-test.${LANGPAIR}.clean.${SRCEXT}.gz %/Tatoeba-test.${LANGPAIR}.clean.${TRGEXT}.gz: %/Tatoeba-train.${LANGPAIR}.clean.${SRCEXT}.gz
|
||||
echo "done!"
|
||||
|
||||
|
||||
## make Tatoeba test files available in testset collection
|
||||
## --> useful for testing various languages when creating multilingual models
|
||||
testsets/${LANGPAIR}/Tatoeba-test.${LANGPAIR}.%: ${TATOEBA_DATA}/Tatoeba-test.${LANGPAIR}.clean.%
|
||||
mkdir -p ${dir $@}
|
||||
cp $< $@
|
||||
|
||||
|
||||
|
||||
tatoeba-results%.md: tatoeba-results%
|
||||
echo "# Tatoeba translation results" >$@
|
||||
echo "" >>$@
|
||||
echo "| Model | LangPair | Score | Details |" >> $@
|
||||
echo "|-----------------:|------------|-----------:|---------:|" >> $@
|
||||
cat $< | sed 's/ / | /g;s/^/| /;s/$$/ |/' >> $@
|
||||
|
||||
tatoeba-results-BLEU-sorted:
|
||||
grep BLEU work-tatoeba/*/*eval | \
|
||||
sed 's/BLEU.*1.4.2//' | cut -f2- -d'/' |sort -k3,3n | \
|
||||
sed 's/Tatoeba.*-align//' | sed "s#/.#\t#" | \
|
||||
sed 's#.eval: = #\t#' | sed 's/\([0-9]\) /\1 /' | grep -v eval > $@
|
||||
|
||||
tatoeba-results-BLEU-sorted-model:
|
||||
grep BLEU work-tatoeba/*/*eval | \
|
||||
sed 's/BLEU.*1.4.2//' | cut -f2- -d'/' | \
|
||||
sed 's/Tatoeba.*-align//' | sed "s#/.#\t#" | \
|
||||
sed 's#.eval: = #\t#' | sed 's/\([0-9]\) /\1 /' | \
|
||||
grep -v eval | sort -k1,1 -k3,3n > $@
|
||||
|
||||
tatoeba-results-BLEU-sorted-langpair:
|
||||
grep BLEU work-tatoeba/*/*eval | \
|
||||
sed 's/BLEU.*1.4.2//' | cut -f2- -d'/' | \
|
||||
sed 's/Tatoeba.*-align//' | sed "s#/.#\t#" | \
|
||||
sed 's#.eval: = #\t#' | sed 's/\([0-9]\) /\1 /' | \
|
||||
grep -v eval | sort -k2,2 -k3,3n > $@
|
||||
|
||||
tatoeba-results-chrF2-sorted:
|
||||
grep chrF2 work-tatoeba/*/*eval | \
|
||||
sed 's/chrF2.*1.4.2//' | cut -f2- -d'/' |sort -k3,3n | \
|
||||
sed 's/Tatoeba.*-align//' | sed "s#/.#\t#" | \
|
||||
sed 's#.eval: = #\t#' > $@
|
||||
|
||||
tatoeba-results-chrF2-sorted-model:
|
||||
grep chrF2 work-tatoeba/*/*eval | \
|
||||
sed 's/chrF.*1.4.2//' | cut -f2- -d'/' | \
|
||||
sed 's/Tatoeba.*-align//' | sed "s#/.#\t#" | \
|
||||
sed 's#.eval: = #\t#' | sort -k1,1 -k3,3n > $@
|
||||
|
||||
tatoeba-results-chrF2-sorted-langpair:
|
||||
grep chrF2 work-tatoeba/*/*eval | \
|
||||
sed 's/chrF2.*1.4.2//' | cut -f2- -d'/' | \
|
||||
sed 's/Tatoeba.*-align//' | sed "s#/.#\t#" | \
|
||||
sed 's#.eval: = #\t#' | sort -k2,2 -k3,3n > $@
|
||||
|
||||
|
||||
|
@ -13,6 +13,8 @@
|
||||
# ---> don't need to re-create models for each language pair
|
||||
#
|
||||
|
||||
.INTERMEDIATE: ${LOCAL_MONO_DATA}.${PRE}.charfreq
|
||||
.INTERMEDIATE: ${LOCAL_TRAIN_SRC}.charfreq ${LOCAL_TRAIN_TRG}.charfreq
|
||||
|
||||
##----------------------------------------------
|
||||
## sentence piece
|
||||
@ -240,7 +242,7 @@ endif
|
||||
## document-level models (with guided alignment)
|
||||
%.src.spm${TRGBPESIZE:000=}k.doc${CONTEXT_SIZE}.gz:
|
||||
${MAKE} PRE_SRC=spm${SRCBPESIZE:000=}k PRE_TRG=spm${TRGBPESIZE:000=}k wordalign
|
||||
./large-context.pl -l ${CONTEXT_SIZE} \
|
||||
${SCRIPTDIR}/large-context.pl -l ${CONTEXT_SIZE} \
|
||||
${patsubst %.src.spm${TRGBPESIZE:000=}k.doc${CONTEXT_SIZE}.gz,%.src.spm${SRCBPESIZE:000=}k.gz,$@} \
|
||||
${patsubst %.src.spm${TRGBPESIZE:000=}k.doc${CONTEXT_SIZE}.gz,%.trg.spm${TRGBPESIZE:000=}k.gz,$@} \
|
||||
${patsubst %.src.spm${TRGBPESIZE:000=}k.doc${CONTEXT_SIZE}.gz,%.spm${SRCBPESIZE:000=}k-spm${TRGBPESIZE:000=}k.src-trg.alg.gz,$@} \
|
||||
|
0
verify-wordalign.pl → scripts/verify-wordalign.pl
Normal file → Executable file
0
verify-wordalign.pl → scripts/verify-wordalign.pl
Normal file → Executable file
Loading…
Reference in New Issue
Block a user