fixed multilingual tatoeba evaluation

This commit is contained in:
Joerg Tiedemann 2020-06-11 00:54:40 +03:00
parent cc16be10d4
commit e141772b34
9 changed files with 277 additions and 42 deletions

View File

@@ -376,6 +376,7 @@ eval-ensemble: ${WORKDIR}/${TESTSET_NAME}.${MODEL}${NR}.${MODELTYPE}.ensemble.${
## train and evaluate
train-and-eval: ${WORKDIR}/${MODEL}.${MODELTYPE}.model${NR}.done
${MAKE} ${WORKDIR}/${TESTSET_NAME}.${MODEL}${NR}.${MODELTYPE}.${SRC}.${TRG}.compare
${MAKE} eval-testsets
## train model and start back-translation jobs once the model is ready
## (requires creating a dist package)
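## a hypothetical invocation (sketch; target and variable names as defined above):
##   make SRCLANGS="da no sv" TRGLANGS=fi train-and-eval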

View File

@@ -1,4 +1,18 @@
# related projects
* https://browser.mt (bergamot project)
* https://nteu.eu
* https://gourmet-project.eu
* https://elitr.eu
* https://www.european-language-grid.eu
Multilingual data:
* http://lr-coordination.eu (ELRC)
* https://www.pret-a-llod.eu
* https://www.taus.net
Further resources (from http://techiaith.cymru/translation/demo/?lang=en):
contact: Dewi Jones (d.b.jones@bangor.ac.uk)

View File

@@ -3,28 +3,62 @@
# model configurations
#
## various ways of setting the model languages
# SRCLANGS = da no sv
# TRGLANGS = fi
## (1) explicitly set source and target languages, for example:
## SRCLANGS="da no sv" TRGLANGS="fi da"
##
## (2) specify language pairs, for example:
## LANGPAIRS="de-en fi-sv da-es"
## this will set SRCLANGS="de fi da" TRGLANGS="en sv es"
##
## (3) specify language pairs but make a symmetric model, for example:
## LANGPAIRS="de-en fi-sv da-es" SYMMETRIC=1
## this will set SRCLANGS="da de en es fi sv" TRGLANGS="da de en es fi sv"
##
## (4) only specify LANGS, for example:
## LANGS="de en sv"
## this will set SRCLANGS="de en sv" TRGLANGS="de en sv"
## if LANGS is set with more than one language
## --> assume multilingual model with the same languages on both sides
## unless SRCLANGS and TRGLANGS are set to something else
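## illustrative invocations for options (1)-(4) above (sketches; the language
## sets are made-up examples and the build goal may differ in practice):
##   make SRCLANGS="da no sv" TRGLANGS="fi da" train       # option (1)
##   make LANGPAIRS="de-en fi-sv da-es" train              # option (2)
##   make LANGPAIRS="de-en fi-sv da-es" SYMMETRIC=1 train  # option (3)
##   make LANGS="de en sv" train                           # option (4)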
ifeq (${words ${LANGS}},1)
## if LANGPAIRS is not set but SRC and TRG are set
## then set LANGPAIRS to SRC-TRG
ifndef LANGPAIRS
ifdef SRC
ifdef TRG
LANGPAIRS := ${SRC}-${TRG}
endif
endif
endif
## if LANGPAIRS are set and the model is not supposed to be SYMMETRIC
## then set SRCLANGS and TRGLANGS to the languages in LANGPAIRS
ifdef LANGPAIRS
ifneq (${SYMMETRIC},1)
SRCLANGS ?= ${sort ${shell echo "${LANGPAIRS}" | tr ' ' "\n" | cut -f1 -d '-'}}
TRGLANGS ?= ${sort ${shell echo "${LANGPAIRS}" | tr ' ' "\n" | cut -f2 -d '-'}}
endif
endif
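## sketch of the expansion above, e.g. for LANGPAIRS="de-en fi-sv da-es":
##   echo "de-en fi-sv da-es" | tr ' ' "\n" | cut -f1 -d '-'   # -> de fi da
## ${sort ...} then sorts and deduplicates the result:
##   SRCLANGS="da de fi"  TRGLANGS="en es sv"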
## if LANGPAIRS is set and LANGS is not
## then get all languages in LANGPAIRS
ifdef LANGPAIRS
LANGS ?= ${sort ${subst -, ,${LANGPAIRS}}}
endif
## if more than one language is in LANGS
## then assume a symmetric multilingual model
ifneq (${words ${LANGS}},1)
SRCLANGS ?= ${LANGS}
TRGLANGS ?= ${LANGS}
endif
## set to SRC and TRG if necessary
ifndef SRCLANGS
SRCLANGS := ${SRC}
TRGLANGS := ${TRG}
endif
## Swedish - Finnish as default if nothing is set
## final default is sv-fi
SRCLANGS ?= sv
TRGLANGS ?= fi
## set SRC and TRG unless they are specified already
SRC ?= ${firstword ${SRCLANGS}}
TRG ?= ${lastword ${TRGLANGS}}
@@ -59,6 +93,13 @@ SKIP_LANGPAIRS ?= "nothing"
MAX_OVER_SAMPLING ?= 50
## set CHECK_TRAINDATA_SIZE if you want to check that each
## bitext has the same number of lines on the source and target side
## ---> this only prints a warning if the counts differ
##
# CHECK_TRAINDATA_SIZE
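## a hypothetical invocation enabling the check (sketch):
##   make CHECK_TRAINDATA_SIZE=1 data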
# sorted languages and langpair used to match resources in OPUS
SORTLANGS = $(sort ${SRC} ${TRG})
SPACE = $(empty) $(empty)
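## e.g. (sketch): SRC=sv TRG=fi gives SORTLANGS = "fi sv", matching the
## alphabetically sorted language-pair names used for OPUS resources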
@@ -419,7 +460,6 @@ ${WORKDIR}/config.mk:
s=`head -10000001 ${LOCAL_TRAIN_SRC} | wc -l`; \
S=`cat ${LOCAL_TRAIN_SRC}.charfreq | wc -l`; \
T=`cat ${LOCAL_TRAIN_TRG}.charfreq | wc -l`; \
rm -f ${LOCAL_TRAIN_SRC} ${LOCAL_TRAIN_TRG}; \
fi; \
if [ $$s -gt ${LARGEST_TRAINSIZE} ]; then \
echo "# ${LANGPAIRSTR} training data bigger than 10 million" > $@; \

View File

@@ -169,7 +169,7 @@ clean-data-source: ${DATA_SRC} ${DATA_TRG}
## monolingual data sets (for sentence piece models)
.INTERMEDIATE: ${LOCAL_MONO_DATA}.${PRE} ${LOCAL_MONO_DATA}.raw ${LOCAL_MONO_DATA}.${PRE}.charfreq
.INTERMEDIATE: ${LOCAL_MONO_DATA}.${PRE} ${LOCAL_MONO_DATA}.raw
mono-data: ${LOCAL_MONO_DATA}.${PRE}
@@ -265,7 +265,7 @@ ${TRAIN_ALG}: ${TRAIN_SRC}.clean.${PRE_SRC}${TRAINSIZE}.gz \
## TODO: does this cause the same data to be redone over and over again?
##
.INTERMEDIATE: ${LOCAL_TRAIN_SRC} ${LOCAL_TRAIN_TRG} ${LOCAL_TRAIN_SRC}.charfreq ${LOCAL_TRAIN_TRG}.charfreq
.INTERMEDIATE: ${LOCAL_TRAIN_SRC} ${LOCAL_TRAIN_TRG}
ifeq (${USE_REST_DEVDATA},1)
LOCAL_TRAINDATA_DEPENDENCIES = ${DEV_SRC} ${DEV_TRG}
@@ -274,6 +274,7 @@ endif
## add training data for each language combination
## and put it together in local space
${LOCAL_TRAIN_SRC}: ${LOCAL_TRAINDATA_DEPENDENCIES}
# ifeq (${wildcard $@},)
mkdir -p ${dir $@}
echo "" > ${dir $@}README.md
echo "# ${notdir ${TRAIN_BASE}}" >> ${dir $@}README.md
@@ -295,23 +296,43 @@ ifeq (${USE_REST_DEVDATA},1)
${GZIP} -cd < ${DEV_TRG}.notused.gz >> ${LOCAL_TRAIN_TRG}; \
fi
endif
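## (sketch) with USE_REST_DEVDATA=1, dev sentences that were not used for the
## actual dev set (the *.notused.gz leftovers above) are appended to the training data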
# else
# @echo "*****************************************"
# @echo "local training data $@ exists already!"
# @echo "delete if it needs to be re-done!!!"
# @echo "*****************************************"
# endif
${LOCAL_TRAIN_TRG}: ${LOCAL_TRAIN_SRC}
@echo "done!"
## cut the data sets immediately if we don't have
## to shuffle first! This saves a lot of time!
ifndef SHUFFLE_DATA
ifdef FIT_DATA_SIZE
CUT_DATA_SETS = | head -${FIT_DATA_SIZE}
endif
endif
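## e.g. (sketch): FIT_DATA_SIZE=100000 without SHUFFLE_DATA turns the pipelines
## below into "... | head -100000 ...", truncating each data set on the fly
## instead of shuffling and cutting it afterwards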
## add to the training data
## NEW: remove the dependency on the clean pre-processed data
## to avoid re-doing existing data, and to avoid problems with
## data sets that do not exist for a particular language pair
## in multilingual setups
## TODO: introduce under- and over-sampling for multilingual data sets ...
add-to-local-train-data:
ifneq (${CLEAN_TRAIN_SRC},)
${MAKE} ${CLEAN_TRAIN_SRC} ${CLEAN_TRAIN_TRG}
endif
ifneq (${wildcard ${CLEAN_TRAIN_SRC}},)
ifdef CHECK_TRAINDATA_SIZE
@if [ `${GZIP} -cd < ${wildcard ${CLEAN_TRAIN_SRC}} | wc -l` != `${GZIP} -cd < ${wildcard ${CLEAN_TRAIN_TRG}} | wc -l` ]; then \
echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"; \
echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"; \
@@ -323,9 +344,10 @@ ifneq (${wildcard ${CLEAN_TRAIN_SRC}},)
echo ${CLEAN_TRAIN_SRC}; \
echo ${CLEAN_TRAIN_TRG}; \
fi
endif
echo -n "* ${SRC}-${TRG}: " >> ${dir ${LOCAL_TRAIN_SRC}}README.md
for d in ${wildcard ${CLEAN_TRAIN_SRC}}; do \
l=`${GZIP} -cd < $$d | wc -l`; \
l=`${GZIP} -cd < $$d ${CUT_DATA_SETS} | wc -l`; \
if [ $$l -gt 0 ]; then \
echo "$$d" | xargs basename | \
sed -e 's#.${SRC}.gz$$##' \
@@ -340,13 +362,13 @@ ifneq (${wildcard ${CLEAN_TRAIN_SRC}},)
######################################
ifeq (${USE_TARGET_LABELS},1)
echo "set target language labels";
${GZIP} -cd < ${wildcard ${CLEAN_TRAIN_SRC}} |\
${GZIP} -cd < ${wildcard ${CLEAN_TRAIN_SRC}} ${CUT_DATA_SETS} |\
sed "s/^/>>${TRG}<< /" > ${LOCAL_TRAIN_SRC}.${LANGPAIR}.src
else
echo "only one target language"
${GZIP} -cd < ${wildcard ${CLEAN_TRAIN_SRC}} > ${LOCAL_TRAIN_SRC}.${LANGPAIR}.src
${GZIP} -cd < ${wildcard ${CLEAN_TRAIN_SRC}} ${CUT_DATA_SETS} > ${LOCAL_TRAIN_SRC}.${LANGPAIR}.src
endif
${GZIP} -cd < ${wildcard ${CLEAN_TRAIN_TRG}} > ${LOCAL_TRAIN_TRG}.${LANGPAIR}.trg
${GZIP} -cd < ${wildcard ${CLEAN_TRAIN_TRG}} ${CUT_DATA_SETS} > ${LOCAL_TRAIN_TRG}.${LANGPAIR}.trg
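# (sketch) e.g. with TRG=fi, the sed above turns a source line "hello world"
# into ">>fi<< hello world" so the model knows which target language to produce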
######################################
# SHUFFLE_DATA is set?
# --> shuffle data for each langpair

View File

@@ -111,6 +111,7 @@ endif
## other tools and their locations
SCRIPTDIR = ${PWD}/scripts
WORDALIGN = ${EFLOMAL_HOME}align.py
ATOOLS = ${FASTALIGN_HOME}atools
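## NB (sketch): no path separator is added above, so EFLOMAL_HOME and
## FASTALIGN_HOME are presumably expected to end in a trailing slash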

View File

@@ -46,6 +46,7 @@ TATOEBA_RAWGIT = https://raw.githubusercontent.com/Helsinki-NLP/Tatoeba-Challen
TATOEBA_WORK = ${PWD}/work-tatoeba
TATOEBA_DATA = ${TATOEBA_WORK}/data/${PRE}
tatoeba-job:
${MAKE} tatoeba-prepare
${MAKE} all-job-tatoeba
@@ -60,7 +61,6 @@ ifneq (${SRCLANGS},${TRGLANGS})
endif
tatoeba-prepare: ${TATOEBA_DATA}/Tatoeba-train.${LANGPAIR}.clean.${SRCEXT}.gz
${MAKE} local-config-tatoeba
${MAKE} data-tatoeba
@@ -74,7 +74,13 @@ tatoeba-eval:
tatoeba-data: ${TATOEBA_DATA}/Tatoeba-train.${LANGPAIR}.clean.${SRCEXT}.gz
tatoeba-labels: ${TATOEBA_DATA}/Tatoeba-train.${LANGPAIRSTR}.clean.${SRCEXT}.labels
tatoeba-results-md:
${MAKE} tatoeba-results-BLEU-sorted.md \
tatoeba-results-BLEU-sorted-model.md \
tatoeba-results-BLEU-sorted-langpair.md \
tatoeba-results-chrF2-sorted.md \
tatoeba-results-chrF2-sorted-model.md \
tatoeba-results-chrF2-sorted-langpair.md
@@ -89,21 +95,31 @@ tatoeba-subset-%: tatoeba-%.md
done
## set FIT_DATA_SIZE for under/over-sampling of data!
###############################################################################
## multilingual models from an entire subset
###############################################################################
## training:
## set FIT_DATA_SIZE to the biggest data size in the subset, but to at least 10000
## the set of language pairs is taken directly from the markdown page on GitHub
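## a hypothetical invocation (sketch; the subset name is made up and must match
## one of the markdown pages in the Tatoeba-Challenge repository):
##   make tatoeba-multilingual-subset-lowest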
tatoeba-multilingual-subset-%: tatoeba-%.md
for l in `grep '\[' $< | cut -f2 -d '[' | cut -f1 -d ']'`; do \
s=`echo $$l | cut -f1 -d '-'`; \
t=`echo $$l | cut -f2 -d '-'`; \
${MAKE} SRCLANGS=$$s TRGLANGS=$$t tatoeba-data; \
done
${MAKE} ${patsubst tatoeba-%.md,tatoeba-trainsize-%.txt,$<}
( l=`grep '\[' $< | cut -f2 -d '[' | cut -f1 -d ']' | tr ' -' "\n\n" | sort -u | tr "\n" ' ' | sed 's/ *$$//'`; \
s=`sort -k2,2nr ${patsubst tatoeba-%.md,tatoeba-trainsize-%.txt,$<} | head -1 | cut -f2 -d' '`; \
tatoeba-multilingual-subset-%: tatoeba-%.md tatoeba-trainsize-%.txt
( l="${shell grep '\[' $< | cut -f2 -d '[' | cut -f1 -d ']' | tr "\n" ' ' | sed 's/ *$$//'}"; \
s=${shell sort -k2,2nr $(word 2,$^) | head -1 | cut -f2 -d' '}; \
if [ $$s -lt 10000 ]; then s=10000; fi; \
${MAKE} FIT_DATA_SIZE=$$s \
SRCLANGS="$$l" TRGLANGS="$$l" \
LANGPAIRSTR=${<:.md=} tatoeba-job )
${MAKE} LANGPAIRS="$$l" \
SYMMETRIC=1 \
FIT_DATA_SIZE=$$s \
LANGPAIRSTR=${<:.md=} \
tatoeba-multilingual-train; )
## evaluate all language pairs in both directions
tatoeba-multilingual-evalsubset-%: tatoeba-%.md
${MAKE} LANGPAIRS="`grep '\[' $< | cut -f2 -d '[' | cut -f1 -d ']' | tr \"\n\" ' '`" \
LANGPAIRSTR=${<:.md=} tatoeba-multilingual-testsets
${MAKE} LANGPAIRS="`grep '\[' $< | cut -f2 -d '[' | cut -f1 -d ']' | tr \"\n\" ' '`" \
LANGPAIRSTR=${<:.md=} SYMMETRIC=1 tatoeba-multilingual-eval
## print all data sizes in this set
tatoeba-trainsize-%.txt: tatoeba-%.md
@@ -120,11 +136,86 @@ tatoeba-%.md:
###############################################################################
## evaluate multilingual models for various language pairs
###############################################################################
tatoeba-multilingual-train:
for s in ${SRCLANGS}; do \
for t in ${TRGLANGS}; do \
if [ $$s \< $$t ]; then \
${MAKE} SRCLANGS=$$s TRGLANGS=$$t tatoeba-data; \
else \
${MAKE} SRCLANGS=$$t TRGLANGS=$$s tatoeba-data; \
fi \
done \
done
${MAKE} tatoeba-job
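## (sketch) the escaped \< above is the shell test for lexicographic order, so
## each unordered pair (e.g. en-fi vs fi-en) fetches its data only once, under
## the alphabetically sorted name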
## evaluate all individual language pairs for a multilingual model
tatoeba-multilingual-eval:
${MAKE} tatoeba-multilingual-testsets
for s in ${SRCLANGS}; do \
for t in ${TRGLANGS}; do \
if [ -e ${TATOEBA_WORK}/${LANGPAIRSTR}/test/Tatoeba-test.$$s-$$t.src ]; then \
${MAKE} SRC=$$s TRG=$$t \
TRAINSET=Tatoeba-train \
DEVSET=Tatoeba-dev \
TESTSET=Tatoeba-test.$$s-$$t \
TESTSET_NAME=Tatoeba-test.$$s-$$t \
USE_REST_DEVDATA=0 \
HELDOUTSIZE=0 \
DEVSIZE=5000 \
TESTSIZE=10000 \
DEVMINSIZE=200 \
WORKHOME=${TATOEBA_WORK} \
USE_TARGET_LABELS=1 \
compare; \
fi \
done \
done
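## (sketch) each pair reuses the single multilingual model in
## ${TATOEBA_WORK}/${LANGPAIRSTR} but is scored on its own Tatoeba-test.$s-$t set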
# print-info:
## copy testsets into the multilingual test directory
tatoeba-multilingual-testsets:
for s in ${SRCLANGS}; do \
for t in ${TRGLANGS}; do \
if [ ! -e ${TATOEBA_WORK}/${LANGPAIRSTR}/test/Tatoeba-test.$$s-$$t.src ]; then \
wget -q -O ${TATOEBA_WORK}/${LANGPAIRSTR}/test/Tatoeba-test.$$s-$$t.txt ${TATOEBA_RAWGIT}/data/test/$$s-$$t/test.txt; \
if [ -s ${TATOEBA_WORK}/${LANGPAIRSTR}/test/Tatoeba-test.$$s-$$t.txt ]; then \
echo "make Tatoeba-test.$$s-$$t"; \
cut -f2,3 ${TATOEBA_WORK}/${LANGPAIRSTR}/test/Tatoeba-test.$$s-$$t.txt | sed 's/^\([^ ]*\) />>\1<< /' \
> ${TATOEBA_WORK}/${LANGPAIRSTR}/test/Tatoeba-test.$$s-$$t.src; \
cut -f4 ${TATOEBA_WORK}/${LANGPAIRSTR}/test/Tatoeba-test.$$s-$$t.txt \
> ${TATOEBA_WORK}/${LANGPAIRSTR}/test/Tatoeba-test.$$s-$$t.trg; \
else \
wget -q -O ${TATOEBA_WORK}/${LANGPAIRSTR}/test/Tatoeba-test.$$s-$$t.txt ${TATOEBA_RAWGIT}/data/test/$$t-$$s/test.txt; \
if [ -s ${TATOEBA_WORK}/${LANGPAIRSTR}/test/Tatoeba-test.$$s-$$t.txt ]; then \
echo "make Tatoeba-test.$$s-$$t"; \
cut -f1,4 ${TATOEBA_WORK}/${LANGPAIRSTR}/test/Tatoeba-test.$$s-$$t.txt | sed 's/^\([^ ]*\) />>\1<< /' \
> ${TATOEBA_WORK}/${LANGPAIRSTR}/test/Tatoeba-test.$$s-$$t.src; \
cut -f3 ${TATOEBA_WORK}/${LANGPAIRSTR}/test/Tatoeba-test.$$s-$$t.txt \
> ${TATOEBA_WORK}/${LANGPAIRSTR}/test/Tatoeba-test.$$s-$$t.trg; \
fi \
fi; \
rm -f ${TATOEBA_WORK}/${LANGPAIRSTR}/test/Tatoeba-test.$$s-$$t.txt; \
fi \
done \
done
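## (sketch) the test.txt columns appear to be: src-lang, trg-lang, src-text,
## trg-text; "cut -f2,3" keeps "trg-lang src-text" and the sed rewrites e.g.
## "fi some text" into ">>fi<< some text"; the reversed $t-$s case mirrors
## this with "cut -f1,4" and "cut -f3"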
###############################################################################
## generic targets for tatoeba models
###############################################################################
## generic target for tatoeba challenge jobs
# %-tatoeba: ${TATOEBA_DATA}/Tatoeba-train.${LANGPAIR}.clean.${SRCEXT}.gz
# %-tatoeba: ${TATOEBA_DATA}/Tatoeba-train.${LANGPAIRSTR}.clean.${SRCEXT}.labels \
# ${TATOEBA_DATA}/Tatoeba-train.${LANGPAIRSTR}.clean.${TRGEXT}.labels
# %-tatoeba: ${TATOEBA_DATA}/Tatoeba-train.${LANGPAIR}.clean.${SRCEXT}.gz
%-tatoeba: ${TATOEBA_DATA}/Tatoeba-train.${LANGPAIRSTR}.clean.${SRCEXT}.labels
${MAKE} TRAINSET=Tatoeba-train \
DEVSET=Tatoeba-dev \
@@ -139,11 +230,15 @@ tatoeba-%.md:
WORKHOME=${TATOEBA_WORK} \
SRCLANGS="${shell cat $< | sed 's/ *$$//'}" \
TRGLANGS="${shell cat $(<:.${SRCEXT}.labels=.${TRGEXT}.labels) | sed 's/ *$$//'}" \
LANGPAIRSTR=${LANGPAIRSTR} \
EMAIL= \
${@:-tatoeba=}
## all language labels in all language pairs
## (each language pair may include several language variants)
## --> this is necessary to set the languages that are present in a model
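## e.g. (sketch): a pair involving zho may contain cmn or yue variants
## (cf. FIXLANGIDS below), and the label files record the variants that
## actually occur in the training data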
${TATOEBA_DATA}/Tatoeba-train.${LANGPAIRSTR}.clean.${SRCEXT}.labels:
for s in ${SRCLANGS}; do \
for t in ${TRGLANGS}; do \
@@ -187,6 +282,10 @@ ${TATOEBA_DATA}/Tatoeba-train.${LANGPAIRSTR}.clean.${SRCEXT}.labels:
###############################################################################
## generate data files
###############################################################################
## don't delete those files
.SECONDARY: ${TATOEBA_DATA}/Tatoeba-train.${LANGPAIR}.clean.${SRCEXT}.gz \
@@ -322,3 +421,59 @@ FIXLANGIDS = | sed 's/zho\(\)_HK/yue\1/;s/zho\(\)_CN/cmn\1/;s/zho\(\)_TW/cmn\1/
%/Tatoeba-test.${LANGPAIR}.clean.${SRCEXT}.gz %/Tatoeba-test.${LANGPAIR}.clean.${TRGEXT}.gz: %/Tatoeba-train.${LANGPAIR}.clean.${SRCEXT}.gz
echo "done!"
## make Tatoeba test files available in testset collection
## --> useful for testing various languages when creating multilingual models
testsets/${LANGPAIR}/Tatoeba-test.${LANGPAIR}.%: ${TATOEBA_DATA}/Tatoeba-test.${LANGPAIR}.clean.%
mkdir -p ${dir $@}
cp $< $@
tatoeba-results%.md: tatoeba-results%
echo "# Tatoeba translation results" >$@
echo "" >>$@
echo "| Model | LangPair | Score | Details |" >> $@
echo "|-----------------:|------------|-----------:|---------:|" >> $@
cat $< | sed 's/ / | /g;s/^/| /;s/$$/ |/' >> $@
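# (sketch) the sed wraps each field-separated result line in table cells,
# producing markdown rows like "| <model> | <langpair> | <score> | <details> |"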
tatoeba-results-BLEU-sorted:
grep BLEU work-tatoeba/*/*eval | \
sed 's/BLEU.*1.4.2//' | cut -f2- -d'/' |sort -k3,3n | \
sed 's/Tatoeba.*-align//' | sed "s#/.#\t#" | \
sed 's#.eval: = #\t#' | sed 's/\([0-9]\) /\1 /' | grep -v eval > $@
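# (sketch) the grep/sed chain strips the "BLEU ... 1.4.2" scorer banner
# (presumably a sacrebleu version) and the leading work-tatoeba/ path,
# leaving "<model> <langpair> <score>" rows sorted by score (sort -k3,3n)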
tatoeba-results-BLEU-sorted-model:
grep BLEU work-tatoeba/*/*eval | \
sed 's/BLEU.*1.4.2//' | cut -f2- -d'/' | \
sed 's/Tatoeba.*-align//' | sed "s#/.#\t#" | \
sed 's#.eval: = #\t#' | sed 's/\([0-9]\) /\1 /' | \
grep -v eval | sort -k1,1 -k3,3n > $@
tatoeba-results-BLEU-sorted-langpair:
grep BLEU work-tatoeba/*/*eval | \
sed 's/BLEU.*1.4.2//' | cut -f2- -d'/' | \
sed 's/Tatoeba.*-align//' | sed "s#/.#\t#" | \
sed 's#.eval: = #\t#' | sed 's/\([0-9]\) /\1 /' | \
grep -v eval | sort -k2,2 -k3,3n > $@
tatoeba-results-chrF2-sorted:
grep chrF2 work-tatoeba/*/*eval | \
sed 's/chrF2.*1.4.2//' | cut -f2- -d'/' |sort -k3,3n | \
sed 's/Tatoeba.*-align//' | sed "s#/.#\t#" | \
sed 's#.eval: = #\t#' > $@
tatoeba-results-chrF2-sorted-model:
grep chrF2 work-tatoeba/*/*eval | \
sed 's/chrF.*1.4.2//' | cut -f2- -d'/' | \
sed 's/Tatoeba.*-align//' | sed "s#/.#\t#" | \
sed 's#.eval: = #\t#' | sort -k1,1 -k3,3n > $@
tatoeba-results-chrF2-sorted-langpair:
grep chrF2 work-tatoeba/*/*eval | \
sed 's/chrF2.*1.4.2//' | cut -f2- -d'/' | \
sed 's/Tatoeba.*-align//' | sed "s#/.#\t#" | \
sed 's#.eval: = #\t#' | sort -k2,2 -k3,3n > $@

View File

@@ -13,6 +13,8 @@
# ---> don't need to re-create models for each language pair
#
.INTERMEDIATE: ${LOCAL_MONO_DATA}.${PRE}.charfreq
.INTERMEDIATE: ${LOCAL_TRAIN_SRC}.charfreq ${LOCAL_TRAIN_TRG}.charfreq
##----------------------------------------------
## sentence piece
@@ -240,7 +242,7 @@ endif
## document-level models (with guided alignment)
%.src.spm${TRGBPESIZE:000=}k.doc${CONTEXT_SIZE}.gz:
${MAKE} PRE_SRC=spm${SRCBPESIZE:000=}k PRE_TRG=spm${TRGBPESIZE:000=}k wordalign
./large-context.pl -l ${CONTEXT_SIZE} \
${SCRIPTDIR}/large-context.pl -l ${CONTEXT_SIZE} \
${patsubst %.src.spm${TRGBPESIZE:000=}k.doc${CONTEXT_SIZE}.gz,%.src.spm${SRCBPESIZE:000=}k.gz,$@} \
${patsubst %.src.spm${TRGBPESIZE:000=}k.doc${CONTEXT_SIZE}.gz,%.trg.spm${TRGBPESIZE:000=}k.gz,$@} \
${patsubst %.src.spm${TRGBPESIZE:000=}k.doc${CONTEXT_SIZE}.gz,%.spm${SRCBPESIZE:000=}k-spm${TRGBPESIZE:000=}k.src-trg.alg.gz,$@} \

verify-wordalign.pl → scripts/verify-wordalign.pl Normal file → Executable file
View File