From e141772b34522fb092d61d56195862763bec9d12 Mon Sep 17 00:00:00 2001
From: Joerg Tiedemann
Date: Thu, 11 Jun 2020 00:54:40 +0300
Subject: [PATCH] fixed multilingual tatoeba evaluation

---
 Makefile | 1 +
 NOTES.md | 14 ++
 lib/config.mk | 68 ++++--
 lib/data.mk | 36 +++-
 lib/env.mk | 1 +
 lib/models/tatoeba.mk | 195 ++++++++++++++++--
 lib/sentencepiece.mk | 4 +-
 large-context.pl => scripts/large-context.pl | 0
 .../verify-wordalign.pl | 0
 9 files changed, 277 insertions(+), 42 deletions(-)
 rename large-context.pl => scripts/large-context.pl (100%)
 rename verify-wordalign.pl => scripts/verify-wordalign.pl (100%)
 mode change 100644 => 100755

diff --git a/Makefile b/Makefile
index 3e0c0031..69e1dd5c 100644
--- a/Makefile
+++ b/Makefile
@@ -376,6 +376,7 @@ eval-ensemble: ${WORKDIR}/${TESTSET_NAME}.${MODEL}${NR}.${MODELTYPE}.ensemble.${
 ## train and evaluate
 train-and-eval: ${WORKDIR}/${MODEL}.${MODELTYPE}.model${NR}.done
 	${MAKE} ${WORKDIR}/${TESTSET_NAME}.${MODEL}${NR}.${MODELTYPE}.${SRC}.${TRG}.compare
+	${MAKE} eval-testsets
 
 ## train model and start back-translation jobs once the model is ready
 ## (requires to create a dist package)
diff --git a/NOTES.md b/NOTES.md
index 166668aa..26f1dbb3 100644
--- a/NOTES.md
+++ b/NOTES.md
@@ -1,4 +1,18 @@
 
+# related projects
+
+* https://browser.mt (bergamot project)
+* https://nteu.eu
+* https://gourmet-project.eu
+* https://elitr.eu
+* https://www.european-language-grid.eu
+
+Multilingual data:
+
+* http://lr-coordination.eu (ELRC)
+* https://www.pret-a-llod.eu
+* https://www.taus.net
+
 further resources:
 (from http://techiaith.cymru/translation/demo/?lang=en)
 contact: Dewi Jones (d.b.jones@bangor.ac.uk)
diff --git a/lib/config.mk b/lib/config.mk
index 1c41ee9c..d4616745 100644
--- a/lib/config.mk
+++ b/lib/config.mk
@@ -3,28 +3,62 @@
 # model configurations
 #
 
+## various ways of setting the model languages
 
-# SRCLANGS = da no sv
-# TRGLANGS = fi
+## (1) explicitly set source and target languages, for example:
+## SRCLANGS="da no sv" TRGLANGS="fi da"
+##
+## (2) specify language pairs, for example:
+## LANGPAIRS="de-en fi-sv da-es"
+## this will set SRCLANGS="da de fi" TRGLANGS="en es sv" (sorted, de-duplicated)
+##
+## (3) specify language pairs but make a symmetric model, for example:
+## LANGPAIRS="de-en fi-sv da-es" SYMMETRIC=1
+## this will set SRCLANGS="da de en es fi sv" TRGLANGS="da de en es fi sv"
+##
+## (4) only specify LANGS, for example
+## LANGS="de en sv"
+## this will set SRCLANGS="de en sv" TRGLANGS="de en sv"
 
-## if LANGS is set with more than one language
-## --> assume multilingual model with the same languages on both sides
-## unless SRCLANGS and TRGLANGS are set to something else
-ifeq (${words ${LANGS}},1)
+
+
+## if LANGPAIRS is not set but SRC and TRG are set
+## then set LANGPAIRS to SRC-TRG
+ifndef LANGPAIRS
+ifdef SRC
+ifdef TRG
+  LANGPAIRS := ${SRC}-${TRG}
+endif
+endif
+endif
+
+## if LANGPAIRS are set and the model is not supposed to be SYMMETRIC
+## then set SRCLANGS and TRGLANGS to the languages in LANGPAIRS
+ifdef LANGPAIRS
+ifneq (${SYMMETRIC},1)
+  SRCLANGS ?= ${sort ${foreach p,${LANGPAIRS},${firstword ${subst -, ,${p}}}}}
+  TRGLANGS ?= ${sort ${foreach p,${LANGPAIRS},${word 2,${subst -, ,${p}}}}}
+endif
+endif
+
+## if LANGPAIRS is set and LANGS is not
+## then get all languages in LANGPAIRS
+ifdef LANGPAIRS
+  LANGS ?= ${sort ${subst -, ,${LANGPAIRS}}}
+endif
+
+## if more than one language is in LANGS, assume a symmetric multilingual model
+## (an empty LANGS must not define SRCLANGS/TRGLANGS: '?=' skips defined-but-empty variables)
+ifneq (${filter-out 0 1,${words ${LANGS}}},)
   SRCLANGS ?= ${LANGS}
   TRGLANGS ?= ${LANGS}
 endif
 
-## set to SRC and TRG if necessary
-ifndef SRCLANGS
-  SRCLANGS := ${SRC}
-  TRGLANGS := ${TRG}
-endif
-
-## Swedish - Finnish as default if nothing is set
+## final default is sv-fi
 SRCLANGS ?= sv
 TRGLANGS ?= fi
 
+
 ## set SRC and TRG unless they are specified already
 SRC ?= ${firstword ${SRCLANGS}}
 TRG ?= ${lastword ${TRGLANGS}}
@@ -59,6 +93,13 @@ SKIP_LANGPAIRS ?= "nothing"
 
 MAX_OVER_SAMPLING ?= 50
 
+## set CHECK_TRAINDATA_SIZE if you want to check that each
+## bitext has equal number of lines in source and target
+## ---> this only prints a warning if not
+##
+# CHECK_TRAINDATA_SIZE
+
+
 # sorted languages and langpair used to match resources in OPUS
 SORTLANGS = $(sort ${SRC} ${TRG})
 SPACE = $(empty) $(empty)
@@ -419,7 +460,6 @@ ${WORKDIR}/config.mk:
 	  s=`head -10000001 ${LOCAL_TRAIN_SRC} | wc -l`; \
 	  S=`cat ${LOCAL_TRAIN_SRC}.charfreq | wc -l`; \
 	  T=`cat ${LOCAL_TRAIN_TRG}.charfreq | wc -l`; \
-	  rm -f ${LOCAL_TRAIN_SRC} ${LOCAL_TRAIN_TRG}; \
 	fi; \
 	if [ $$s -gt ${LARGEST_TRAINSIZE} ]; then \
 	  echo "# ${LANGPAIRSTR} training data bigger than 10 million" > $@; \
diff --git a/lib/data.mk b/lib/data.mk
index 296d2d08..1f0b7a1d 100644
--- a/lib/data.mk
+++ b/lib/data.mk
@@ -169,7 +169,7 @@ clean-data-source: ${DATA_SRC} ${DATA_TRG}
 
 
 ## monolingual data sets (for sentence piece models)
-.INTERMEDIATE: ${LOCAL_MONO_DATA}.${PRE} ${LOCAL_MONO_DATA}.raw ${LOCAL_MONO_DATA}.${PRE}.charfreq
+.INTERMEDIATE: ${LOCAL_MONO_DATA}.${PRE} ${LOCAL_MONO_DATA}.raw
 
 mono-data: ${LOCAL_MONO_DATA}.${PRE}
 
@@ -265,7 +265,7 @@ ${TRAIN_ALG}: ${TRAIN_SRC}.clean.${PRE_SRC}${TRAINSIZE}.gz \
 
 ## TODO: this causes to frequently redo the same data over and over again, does it?
 ##
-.INTERMEDIATE: ${LOCAL_TRAIN_SRC} ${LOCAL_TRAIN_TRG} ${LOCAL_TRAIN_SRC}.charfreq ${LOCAL_TRAIN_TRG}.charfreq
+.INTERMEDIATE: ${LOCAL_TRAIN_SRC} ${LOCAL_TRAIN_TRG}
 
 ifeq (${USE_REST_DEVDATA},1)
   LOCAL_TRAINDATA_DEPENDENCIES = ${DEV_SRC} ${DEV_TRG}
@@ -274,6 +274,7 @@ endif
 ## add training data for each language combination
 ## and put it together in local space
 ${LOCAL_TRAIN_SRC}: ${LOCAL_TRAINDATA_DEPENDENCIES}
+# ifeq (${wildcard $@},)
 	mkdir -p ${dir $@}
 	echo "" > ${dir $@}README.md
 	echo "# ${notdir ${TRAIN_BASE}}" >> ${dir $@}README.md
@@ -295,23 +296,43 @@ ifeq (${USE_REST_DEVDATA},1)
 	  ${GZIP} -cd < ${DEV_TRG}.notused.gz >> ${LOCAL_TRAIN_TRG}; \
 	fi
 endif
+# else
+#	@echo "*****************************************"
+#	@echo "local training data $@ exists already!"
+#	@echo "delete if it needs to be re-done!!!"
+#	@echo "*****************************************"
+# endif
 
 ${LOCAL_TRAIN_TRG}: ${LOCAL_TRAIN_SRC}
 	@echo "done!"
 
 
+
+
+
+## cut the data sets immediately if we don't have
+## to shuffle first! This saves a lot of time!
+
+ifndef SHUFFLE_DATA
+ifdef FIT_DATA_SIZE
+  CUT_DATA_SETS = | head -${FIT_DATA_SIZE}
+endif
+endif
+
+
 ## add to the training data
 ## NEW: take away dependence on the clean pre-processed data
 ## to avoid re-doing existing data and also avoid problems
 ## of extra data that do not exist for a particular language pair
 ## in multilingual data sets
-## TODO: introduce under and over-sampling for multilingual data sets ...
+
 
 add-to-local-train-data:
 ifneq (${CLEAN_TRAIN_SRC},)
 	${MAKE} ${CLEAN_TRAIN_SRC} ${CLEAN_TRAIN_TRG}
 endif
 ifneq (${wildcard ${CLEAN_TRAIN_SRC}},)
+ifdef CHECK_TRAINDATA_SIZE
 	@if [ `${GZIP} -cd < ${wildcard ${CLEAN_TRAIN_SRC}} | wc -l` != `${GZIP} -cd < ${wildcard ${CLEAN_TRAIN_TRG}} | wc -l` ]; then \
 	  echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"; \
 	  echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"; \
@@ -323,9 +344,10 @@ ifneq (${wildcard ${CLEAN_TRAIN_SRC}},)
 	  echo ${CLEAN_TRAIN_SRC}; \
 	  echo ${CLEAN_TRAIN_TRG}; \
 	fi
+endif
 	echo -n "* ${SRC}-${TRG}: " >> ${dir ${LOCAL_TRAIN_SRC}}README.md
 	for d in ${wildcard ${CLEAN_TRAIN_SRC}}; do \
-	  l=`${GZIP} -cd < $$d | wc -l`; \
+	  l=`${GZIP} -cd < $$d ${CUT_DATA_SETS} | wc -l`; \
 	  if [ $$l -gt 0 ]; then \
 	    echo "$$d" | xargs basename | \
 	    sed -e 's#.${SRC}.gz$$##' \
@@ -340,13 +362,13 @@ ifneq (${wildcard ${CLEAN_TRAIN_SRC}},)
 ######################################
 ifeq (${USE_TARGET_LABELS},1)
 	echo "set target language labels";
-	${GZIP} -cd < ${wildcard ${CLEAN_TRAIN_SRC}} |\
+	${GZIP} -cd < ${wildcard ${CLEAN_TRAIN_SRC}} ${CUT_DATA_SETS} |\
 	sed "s/^/>>${TRG}<< /" > ${LOCAL_TRAIN_SRC}.${LANGPAIR}.src
 else
 	echo "only one target language"
-	${GZIP} -cd < ${wildcard ${CLEAN_TRAIN_SRC}} > ${LOCAL_TRAIN_SRC}.${LANGPAIR}.src
+	${GZIP} -cd < ${wildcard ${CLEAN_TRAIN_SRC}} ${CUT_DATA_SETS} > ${LOCAL_TRAIN_SRC}.${LANGPAIR}.src
 endif
-	${GZIP} -cd < ${wildcard ${CLEAN_TRAIN_TRG}} > ${LOCAL_TRAIN_TRG}.${LANGPAIR}.trg
+	${GZIP} -cd < ${wildcard ${CLEAN_TRAIN_TRG}} ${CUT_DATA_SETS} > ${LOCAL_TRAIN_TRG}.${LANGPAIR}.trg
 ######################################
 # SHUFFLE_DATA is set?
 # --> shuffle data for each langpair
diff --git a/lib/env.mk b/lib/env.mk
index 36ee748d..c4d917ed 100644
--- a/lib/env.mk
+++ b/lib/env.mk
@@ -111,6 +111,7 @@ endif
 
 ## other tools and their locations
 
+SCRIPTDIR = ${PWD}/scripts
 WORDALIGN = ${EFLOMAL_HOME}align.py
 ATOOLS = ${FASTALIGN_HOME}atools
 
diff --git a/lib/models/tatoeba.mk b/lib/models/tatoeba.mk
index 9c194e92..f3ed83cd 100644
--- a/lib/models/tatoeba.mk
+++ b/lib/models/tatoeba.mk
@@ -46,6 +46,7 @@ TATOEBA_RAWGIT = https://raw.githubusercontent.com/Helsinki-NLP/Tatoeba-Challen
 TATOEBA_WORK = ${PWD}/work-tatoeba
 TATOEBA_DATA = ${TATOEBA_WORK}/data/${PRE}
 
+
 tatoeba-job:
 	${MAKE} tatoeba-prepare
 	${MAKE} all-job-tatoeba
@@ -60,7 +61,6 @@ ifneq (${SRCLANGS},${TRGLANGS})
 endif
 
 
-
 tatoeba-prepare: ${TATOEBA_DATA}/Tatoeba-train.${LANGPAIR}.clean.${SRCEXT}.gz
 	${MAKE} local-config-tatoeba
 	${MAKE} data-tatoeba
@@ -74,7 +74,13 @@ tatoeba-eval:
 
 tatoeba-data: ${TATOEBA_DATA}/Tatoeba-train.${LANGPAIR}.clean.${SRCEXT}.gz
 tatoeba-labels: ${TATOEBA_DATA}/Tatoeba-train.${LANGPAIRSTR}.clean.${SRCEXT}.labels
-
+tatoeba-results-md:
+	${MAKE} tatoeba-results-BLEU-sorted.md \
+		tatoeba-results-BLEU-sorted-model.md \
+		tatoeba-results-BLEU-sorted-langpair.md \
+		tatoeba-results-chrF2-sorted.md \
+		tatoeba-results-chrF2-sorted-model.md \
+		tatoeba-results-chrF2-sorted-langpair.md
 
 
 
@@ -89,21 +95,31 @@ tatoeba-subset-%: tatoeba-%.md
 	done
 
 
-## set FIT_DATA_SIZE for under/over-sampling of data!
+###############################################################################
+## multilingual models from an entire subset
+###############################################################################
+
+## training:
+## set FIT_DATA_SIZE to biggest one in subset but at least 10000
 ## set of language pairs is directly taken from the markdown page at github
-tatoeba-multilingual-subset-%: tatoeba-%.md
-	for l in `grep '\[' $< | cut -f2 -d '[' | cut -f1 -d ']'`; do \
-	  s=`echo $$l | cut -f1 -d '-'`; \
-	  t=`echo $$l | cut -f2 -d '-'`; \
-	  ${MAKE} SRCLANGS=$$s TRGLANGS=$$t tatoeba-data; \
-	done
-	${MAKE} ${patsubst tatoeba-%.md,tatoeba-trainsize-%.txt,$<}
-	( l=`grep '\[' $< | cut -f2 -d '[' | cut -f1 -d ']' | tr ' -' "\n\n" | sort -u | tr "\n" ' ' | sed 's/ *$$//'`; \
-	  s=`sort -k2,2nr ${patsubst tatoeba-%.md,tatoeba-trainsize-%.txt,$<} | head -1 | cut -f2 -d' '`; \
+tatoeba-multilingual-subset-%: tatoeba-%.md tatoeba-trainsize-%.txt
+	( l="${shell grep '\[' $< | cut -f2 -d '[' | cut -f1 -d ']' | tr "\n" ' ' | sed 's/ *$$//'}"; \
+	  s=${shell sort -k2,2nr $(word 2,$^) | head -1 | cut -f2 -d' '}; \
 	  if [ $$s -lt 10000 ]; then s=10000; fi; \
-	  ${MAKE} FIT_DATA_SIZE=$$s \
-		SRCLANGS="$$l" TRGLANGS="$$l" \
-		LANGPAIRSTR=${<:.md=} tatoeba-job )
+	  ${MAKE} LANGPAIRS="$$l" \
+		SYMMETRIC=1 \
+		FIT_DATA_SIZE=$$s \
+		LANGPAIRSTR=${<:.md=} \
+	  tatoeba-multilingual-train; )
+
+
+## evaluate all language pairs in both directions
+tatoeba-multilingual-evalsubset-%: tatoeba-%.md
+	${MAKE} LANGPAIRS="`grep '\[' $< | cut -f2 -d '[' | cut -f1 -d ']' | tr \"\n\" ' '`" \
+		LANGPAIRSTR=${<:.md=} tatoeba-multilingual-testsets
+	${MAKE} LANGPAIRS="`grep '\[' $< | cut -f2 -d '[' | cut -f1 -d ']' | tr \"\n\" ' '`" \
+		LANGPAIRSTR=${<:.md=} SYMMETRIC=1 tatoeba-multilingual-eval
+
 
 ## print all data sizes in this set
 tatoeba-trainsize-%.txt: tatoeba-%.md
@@ -120,11 +136,86 @@ tatoeba-%.md:
 
 
 
+
+###############################################################################
+## train and evaluate multilingual models for various language pairs
+###############################################################################
+
+
+tatoeba-multilingual-train:
+	for s in ${SRCLANGS}; do \
+	  for t in ${TRGLANGS}; do \
+	    if [ $$s \< $$t ]; then \
+	      ${MAKE} SRCLANGS=$$s TRGLANGS=$$t tatoeba-data; \
+	    else \
+	      ${MAKE} SRCLANGS=$$t TRGLANGS=$$s tatoeba-data; \
+	    fi \
+	  done \
+	done
+	${MAKE} tatoeba-job
+
+
+## evaluate all individual language pairs for a multilingual model
+tatoeba-multilingual-eval:
+	${MAKE} tatoeba-multilingual-testsets
+	for s in ${SRCLANGS}; do \
+	  for t in ${TRGLANGS}; do \
+	    if [ -e ${TATOEBA_WORK}/${LANGPAIRSTR}/test/Tatoeba-test.$$s-$$t.src ]; then \
+	      ${MAKE} SRC=$$s TRG=$$t \
+		TRAINSET=Tatoeba-train \
+		DEVSET=Tatoeba-dev \
+		TESTSET=Tatoeba-test.$$s-$$t \
+		TESTSET_NAME=Tatoeba-test.$$s-$$t \
+		USE_REST_DEVDATA=0 \
+		HELDOUTSIZE=0 \
+		DEVSIZE=5000 \
+		TESTSIZE=10000 \
+		DEVMINSIZE=200 \
+		WORKHOME=${TATOEBA_WORK} \
+		USE_TARGET_LABELS=1 \
+	      compare; \
+	    fi \
+	  done \
+	done
+
+# print-info:
+
+
+## copy testsets into the multilingual test directory
+tatoeba-multilingual-testsets:
+	for s in ${SRCLANGS}; do \
+	  for t in ${TRGLANGS}; do \
+	    if [ ! -e ${TATOEBA_WORK}/${LANGPAIRSTR}/test/Tatoeba-test.$$s-$$t.src ]; then \
+	      wget -q -O ${TATOEBA_WORK}/${LANGPAIRSTR}/test/Tatoeba-test.$$s-$$t.txt ${TATOEBA_RAWGIT}/data/test/$$s-$$t/test.txt; \
+	      if [ -s ${TATOEBA_WORK}/${LANGPAIRSTR}/test/Tatoeba-test.$$s-$$t.txt ]; then \
+		echo "make Tatoeba-test.$$s-$$t"; \
+		cut -f2,3 ${TATOEBA_WORK}/${LANGPAIRSTR}/test/Tatoeba-test.$$s-$$t.txt | sed 's/^\([^	]*\)	/>>\1<< /' \
+		> ${TATOEBA_WORK}/${LANGPAIRSTR}/test/Tatoeba-test.$$s-$$t.src; \
+		cut -f4 ${TATOEBA_WORK}/${LANGPAIRSTR}/test/Tatoeba-test.$$s-$$t.txt \
+		> ${TATOEBA_WORK}/${LANGPAIRSTR}/test/Tatoeba-test.$$s-$$t.trg; \
+	      else \
+		wget -q -O ${TATOEBA_WORK}/${LANGPAIRSTR}/test/Tatoeba-test.$$s-$$t.txt ${TATOEBA_RAWGIT}/data/test/$$t-$$s/test.txt; \
+		if [ -s ${TATOEBA_WORK}/${LANGPAIRSTR}/test/Tatoeba-test.$$s-$$t.txt ]; then \
+		  echo "make Tatoeba-test.$$s-$$t"; \
+		  cut -f1,4 ${TATOEBA_WORK}/${LANGPAIRSTR}/test/Tatoeba-test.$$s-$$t.txt | sed 's/^\([^	]*\)	/>>\1<< /' \
+		  > ${TATOEBA_WORK}/${LANGPAIRSTR}/test/Tatoeba-test.$$s-$$t.src; \
+		  cut -f3 ${TATOEBA_WORK}/${LANGPAIRSTR}/test/Tatoeba-test.$$s-$$t.txt \
+		  > ${TATOEBA_WORK}/${LANGPAIRSTR}/test/Tatoeba-test.$$s-$$t.trg; \
+		fi \
+	      fi; \
+	      rm -f ${TATOEBA_WORK}/${LANGPAIRSTR}/test/Tatoeba-test.$$s-$$t.txt; \
+	    fi \
+	  done \
+	done
+
+
+
+###############################################################################
+## generic targets for tatoeba models
+###############################################################################
+
+
 ## generic target for tatoeba challenge jobs
-# %-tatoeba: ${TATOEBA_DATA}/Tatoeba-train.${LANGPAIR}.clean.${SRCEXT}.gz
-# %-tatoeba: ${TATOEBA_DATA}/Tatoeba-train.${LANGPAIRSTR}.clean.${SRCEXT}.labels \
-# ${TATOEBA_DATA}/Tatoeba-train.${LANGPAIRSTR}.clean.${TRGEXT}.labels
-# %-tatoeba: ${TATOEBA_DATA}/Tatoeba-train.${LANGPAIR}.clean.${SRCEXT}.gz
 %-tatoeba: ${TATOEBA_DATA}/Tatoeba-train.${LANGPAIRSTR}.clean.${SRCEXT}.labels
 	${MAKE} TRAINSET=Tatoeba-train \
 		DEVSET=Tatoeba-dev \
@@ -139,11 +230,15 @@ tatoeba-%.md:
 		WORKHOME=${TATOEBA_WORK} \
 		SRCLANGS="${shell cat $< | sed 's/ *$$//'}" \
 		TRGLANGS="${shell cat $(<:.${SRCEXT}.labels=.${TRGEXT}.labels) | sed 's/ *$$//'}" \
-		LANGPAIRSTR=${LANGPAIRSTR} \
 		EMAIL= \
 	${@:-tatoeba=}
 
+
+## all language labels in all language pairs
+## (each language pair may include several language variants)
+## --> this is necessary to set the languages that are present in a model
+
 
 ${TATOEBA_DATA}/Tatoeba-train.${LANGPAIRSTR}.clean.${SRCEXT}.labels:
 	for s in ${SRCLANGS}; do \
 	  for t in ${TRGLANGS}; do \
@@ -187,6 +282,10 @@ ${TATOEBA_DATA}/Tatoeba-train.${LANGPAIRSTR}.clean.${SRCEXT}.labels:
 
 
 
+###############################################################################
+## generate data files
+###############################################################################
+
 
 ## don't delete those files
 .SECONDARY: ${TATOEBA_DATA}/Tatoeba-train.${LANGPAIR}.clean.${SRCEXT}.gz \
@@ -322,3 +421,59 @@ FIXLANGIDS = | sed 's/zho\(\)_HK/yue\1/;s/zho\(\)_CN/cmn\1/;s/zho\(\)_TW/cmn\1/
 
 %/Tatoeba-test.${LANGPAIR}.clean.${SRCEXT}.gz %/Tatoeba-test.${LANGPAIR}.clean.${TRGEXT}.gz: %/Tatoeba-train.${LANGPAIR}.clean.${SRCEXT}.gz
 	echo "done!"
+
+
+## make Tatoeba test files available in testset collection
+## --> useful for testing various languages when creating multilingual models
+testsets/${LANGPAIR}/Tatoeba-test.${LANGPAIR}.%: ${TATOEBA_DATA}/Tatoeba-test.${LANGPAIR}.clean.%
+	mkdir -p ${dir $@}
+	cp $< $@
+
+
+
+tatoeba-results%.md: tatoeba-results%
+	echo "# Tatoeba translation results" >$@
+	echo "" >>$@
+	echo "| Model | LangPair | Score | Details |" >> $@
+	echo "|-----------------:|------------|-----------:|---------:|" >> $@
+	cat $< | sed 's/ / | /g;s/^/| /;s/$$/ |/' >> $@
+
+tatoeba-results-BLEU-sorted:
+	grep BLEU work-tatoeba/*/*eval | \
+	sed 's/BLEU.*1.4.2//' | cut -f2- -d'/' |sort -k3,3n | \
+	sed 's/Tatoeba.*-align//' | sed "s#/.#\t#" | \
+	sed 's#.eval: = #\t#' | sed 's/\([0-9]\) /\1 /' | grep -v eval > $@
+
+tatoeba-results-BLEU-sorted-model:
+	grep BLEU work-tatoeba/*/*eval | \
+	sed 's/BLEU.*1.4.2//' | cut -f2- -d'/' | \
+	sed 's/Tatoeba.*-align//' | sed "s#/.#\t#" | \
+	sed 's#.eval: = #\t#' | sed 's/\([0-9]\) /\1 /' | \
+	grep -v eval | sort -k1,1 -k3,3n > $@
+
+tatoeba-results-BLEU-sorted-langpair:
+	grep BLEU work-tatoeba/*/*eval | \
+	sed 's/BLEU.*1.4.2//' | cut -f2- -d'/' | \
+	sed 's/Tatoeba.*-align//' | sed "s#/.#\t#" | \
+	sed 's#.eval: = #\t#' | sed 's/\([0-9]\) /\1 /' | \
+	grep -v eval | sort -k2,2 -k3,3n > $@
+
+tatoeba-results-chrF2-sorted:
+	grep chrF2 work-tatoeba/*/*eval | \
+	sed 's/chrF2.*1.4.2//' | cut -f2- -d'/' |sort -k3,3n | \
+	sed 's/Tatoeba.*-align//' | sed "s#/.#\t#" | \
+	sed 's#.eval: = #\t#' > $@
+
+tatoeba-results-chrF2-sorted-model:
+	grep chrF2 work-tatoeba/*/*eval | \
+	sed 's/chrF2.*1.4.2//' | cut -f2- -d'/' | \
+	sed 's/Tatoeba.*-align//' | sed "s#/.#\t#" | \
+	sed 's#.eval: = #\t#' | sort -k1,1 -k3,3n > $@
+
+tatoeba-results-chrF2-sorted-langpair:
+	grep chrF2 work-tatoeba/*/*eval | \
+	sed 's/chrF2.*1.4.2//' | cut -f2- -d'/' | \
+	sed 's/Tatoeba.*-align//' | sed "s#/.#\t#" | \
+	sed 's#.eval: = #\t#' | sort -k2,2 -k3,3n > $@
+
+
diff --git a/lib/sentencepiece.mk b/lib/sentencepiece.mk
index f117c774..134c5eb7 100644
--- a/lib/sentencepiece.mk
+++ b/lib/sentencepiece.mk
@@ -13,6 +13,8 @@
 # ---> don't need to re-create models for each language pair
 #
 
+.INTERMEDIATE: ${LOCAL_MONO_DATA}.${PRE}.charfreq
+.INTERMEDIATE: ${LOCAL_TRAIN_SRC}.charfreq ${LOCAL_TRAIN_TRG}.charfreq
 
 ##----------------------------------------------
 ## sentence piece
@@ -240,7 +242,7 @@ endif
 ## document-level models (with guided alignment)
 %.src.spm${TRGBPESIZE:000=}k.doc${CONTEXT_SIZE}.gz:
 	${MAKE} PRE_SRC=spm${SRCBPESIZE:000=}k PRE_TRG=spm${TRGBPESIZE:000=}k wordalign
-	./large-context.pl -l ${CONTEXT_SIZE} \
+	${SCRIPTDIR}/large-context.pl -l ${CONTEXT_SIZE} \
 	${patsubst %.src.spm${TRGBPESIZE:000=}k.doc${CONTEXT_SIZE}.gz,%.src.spm${SRCBPESIZE:000=}k.gz,$@} \
 	${patsubst %.src.spm${TRGBPESIZE:000=}k.doc${CONTEXT_SIZE}.gz,%.trg.spm${TRGBPESIZE:000=}k.gz,$@} \
 	${patsubst %.src.spm${TRGBPESIZE:000=}k.doc${CONTEXT_SIZE}.gz,%.spm${SRCBPESIZE:000=}k-spm${TRGBPESIZE:000=}k.src-trg.alg.gz,$@} \
diff --git a/large-context.pl b/scripts/large-context.pl
similarity index 100%
rename from large-context.pl
rename to scripts/large-context.pl
diff --git a/verify-wordalign.pl b/scripts/verify-wordalign.pl
old mode 100644
new mode 100755
similarity index 100%
rename from verify-wordalign.pl
rename to scripts/verify-wordalign.pl