mirror of
https://github.com/Helsinki-NLP/OPUS-MT-train.git
synced 2024-07-14 15:40:33 +03:00
fixed a problem with langlabel files
This commit is contained in:
parent
72e1bcb7ec
commit
6db5b3b716
@ -222,7 +222,7 @@ tatoeba-print-reliable-trg:
|
||||
@echo ${TATOEBA_RELIABLE_TRG}
|
||||
|
||||
|
||||
# RELEASED_BT_ALL := ${shell wget -qq -O - ${TATOEBA_RELEASED_BT}}
|
||||
RELEASED_BT_ALL := ${shell wget -qq -O - ${TATOEBA_RELEASED_BT}}
|
||||
RELEASED_BT := ${shell wget -qq -O - ${TATOEBA_RELEASED_BT} | grep '^${LANGPAIR}/'}
|
||||
|
||||
fetch-bt:
|
||||
@ -232,6 +232,13 @@ fetch-bt:
|
||||
wget -qq -O $$d https://object.pouta.csc.fi/Tatoeba-MT-bt/$$d; \
|
||||
done
|
||||
|
||||
## fetch-all-bt: download every released back-translation file listed in
## ${RELEASED_BT_ALL} (the release list without the per-language-pair filter
## that ${RELEASED_BT} applies) from the Tatoeba-MT-bt container.
## Each list entry is used twice: as the local output path (its directory is
## created first) and as the path appended to the container URL.
## NOTE(review): the commands in the loop are chained with ';', so a failed
##   wget does not stop the loop -- presumably best-effort on purpose; confirm.
fetch-all-bt:
|
||||
for d in ${RELEASED_BT_ALL}; do \
|
||||
echo "fetch $$d"; \
|
||||
mkdir -p `dirname $$d`; \
|
||||
wget -qq -O $$d https://object.pouta.csc.fi/Tatoeba-MT-bt/$$d; \
|
||||
done
|
||||
|
||||
|
||||
#---------------------------------------------------------------
|
||||
# release data
|
||||
|
@ -112,8 +112,10 @@ MAX_OVER_SAMPLING ?= 50
|
||||
|
||||
# sorted languages and langpair used to match resources in OPUS
|
||||
SORTLANGS = $(sort ${SRC} ${TRG})
|
||||
SORTSRC = ${firstword ${SORTLANGS}}
|
||||
SORTTRG = ${lastword ${SORTLANGS}}
|
||||
LANGPAIR = ${SORTSRC}-${SORTTRG}
|
||||
SPACE = $(empty) $(empty)
|
||||
LANGPAIR = ${firstword ${SORTLANGS}}-${lastword ${SORTLANGS}}
|
||||
LANGSRCSTR = ${subst ${SPACE},+,$(SRCLANGS)}
|
||||
LANGTRGSTR = ${subst ${SPACE},+,$(TRGLANGS)}
|
||||
LANGPAIRSTR = ${LANGSRCSTR}-${LANGTRGSTR}
|
||||
@ -128,11 +130,15 @@ LANGSTR ?= ${subst ${SPACE},+,$(LANGS)}
|
||||
## for same language pairs: add numeric extension
|
||||
## (this is necessary to keep source and target files separate)
|
||||
ifeq (${SRC},$(TRG))
|
||||
SRCEXT = ${SRC}1
|
||||
TRGEXT = ${SRC}2
|
||||
SRCEXT = ${SRC}1
|
||||
TRGEXT = ${SRC}2
|
||||
SORTSRCEXT = ${SORTSRC}1
|
||||
SORTTRGEXT = ${SORTSRC}2
|
||||
else
|
||||
SRCEXT = ${SRC}
|
||||
TRGEXT = ${TRG}
|
||||
SRCEXT = ${SRC}
|
||||
TRGEXT = ${TRG}
|
||||
SORTSRCEXT = ${SORTSRC}
|
||||
SORTTRGEXT = ${SORTTRG}
|
||||
endif
|
||||
|
||||
## set a flag to use target language labels
|
||||
|
@ -422,7 +422,7 @@ ifdef SHUFFLE_DATA
|
||||
endif
|
||||
######################################
|
||||
# FIT_DATA_SIZE is set?
|
||||
# --> fit data to speciic size
|
||||
# --> fit data to specific size
|
||||
# --> under/over sampling!
|
||||
######################################
|
||||
@echo -n "* ${SRC}-${TRG}: total size = " >> ${dir ${LOCAL_TRAIN_SRC}}README.md
|
||||
|
@ -121,7 +121,8 @@ else ifneq ($(wildcard /wrk/tiedeman/research),)
|
||||
LOADGPU = module load ${GPU_MODULES}
|
||||
LOADMODS = ${LOADGPU}
|
||||
else ifeq (${shell hostname --domain 2>/dev/null},bullx)
|
||||
CSCPROJECT = project_2002688
|
||||
# CSCPROJECT = project_2002688
|
||||
CSCPROJECT = project_2000309
|
||||
# CSCPROJECT = project_2002982
|
||||
WORKHOME = ${shell realpath ${PWD}/work}
|
||||
APPLHOME = /projappl/project_2001194
|
||||
|
@ -50,6 +50,12 @@ fi-zh:
|
||||
train-dynamic.submitcpu
|
||||
|
||||
|
||||
|
||||
# Tatoeba: more than 100 test sentences:
|
||||
# ain dan deu eng enm epo est fkv fra heb hun ita jpn kor kur lat lit nld nor pol por rus spa swe tur zho
|
||||
|
||||
|
||||
|
||||
#-------------------------------------------------------------------
|
||||
# add THL backtranslation data (and also all other backtranslations)
|
||||
#-------------------------------------------------------------------
|
||||
|
@ -3,10 +3,6 @@
|
||||
# Makefile for running models with data from the Tatoeba Translation Challenge
|
||||
# https://github.com/Helsinki-NLP/Tatoeba-Challenge
|
||||
#
|
||||
# NEWS
|
||||
#
|
||||
# - MODELTYPE=transformer is now default for all Tatoeba models
|
||||
# (no guided alignment!)
|
||||
#
|
||||
#---------------------------------------------------------------------
|
||||
# train and evaluate a single translation pair, for example:
|
||||
@ -90,8 +86,7 @@
|
||||
|
||||
## general parameters for Tatoeba models
|
||||
|
||||
|
||||
## NEW: release
|
||||
## Tatoeba Challenge Data release number
|
||||
# TATOEBA_VERSION ?= v2020-07-28
|
||||
TATOEBA_VERSION ?= v2021-08-07
|
||||
|
||||
@ -99,17 +94,17 @@ TATOEBA_DATAURL := https://object.pouta.csc.fi/Tatoeba-Challenge
|
||||
TATOEBA_TEST_URL := ${TATOEBA_DATAURL}-${TATOEBA_VERSION}
|
||||
TATOEBA_TRAIN_URL := ${TATOEBA_DATAURL}-${TATOEBA_VERSION}
|
||||
TATOEBA_MONO_URL := ${TATOEBA_DATAURL}-${TATOEBA_VERSION}
|
||||
# TATOEBA_TEST_URL := ${TATOEBA_DATAURL}
|
||||
# TATOEBA_TRAIN_URL := ${TATOEBA_DATAURL}
|
||||
# TATOEBA_MONO_URL := ${TATOEBA_DATAURL}
|
||||
TATOEBA_RAWGIT := https://raw.githubusercontent.com/Helsinki-NLP/Tatoeba-Challenge/master
|
||||
TATOEBA_WORK ?= ${PWD}/work-tatoeba
|
||||
TATOEBA_DATA ?= ${TATOEBA_WORK}/data/${PRE}
|
||||
TATOEBA_MONO ?= ${TATOEBA_WORK}/data/mono
|
||||
|
||||
# TATOEBA_RAWGIT := https://raw.githubusercontent.com/Helsinki-NLP/Tatoeba-Challenge/master
|
||||
TATOEBA_RAWGIT_MASTER := https://raw.githubusercontent.com/Helsinki-NLP/Tatoeba-Challenge/master
|
||||
TATOEBA_RAWGIT_RELEASE := https://raw.githubusercontent.com/Helsinki-NLP/Tatoeba-Challenge/${TATOEBA_VERSION}
|
||||
|
||||
|
||||
## data count files (file basename)
|
||||
TATOEBA_DATA_COUNT_BASE = ${TATOEBA_RAWGIT}/data/release/${TATOEBA_VERSION}/released-bitexts
|
||||
TATOEBA_DATA_COUNT_BASE = ${TATOEBA_RAWGIT_MASTER}/data/release/${TATOEBA_VERSION}/released-bitexts
|
||||
|
||||
## all released language pairs with test sets > 200 test pairs
|
||||
## also extract all source languages that are available for a given target language
|
||||
@ -132,7 +127,7 @@ WIKIMACROLANGS ?= $(sort ${shell ${GET_ISO_CODE} ${WIKILANGS}})
|
||||
TATOEBA_MODEL_CONTAINER := Tatoeba-MT-models
|
||||
|
||||
## this will be the base name of the model file
|
||||
TATOEBA_DATASET := opusTC$(subst -,,${TATOEBA_VERSION})
|
||||
TATOEBA_DATASET := ${DATASET}TC$(subst -,,${TATOEBA_VERSION})
|
||||
|
||||
TATOEBA_TRAINSET := Tatoeba-train-${TATOEBA_VERSION}
|
||||
TATOEBA_DEVSET := Tatoeba-dev-${TATOEBA_VERSION}
|
||||
@ -525,21 +520,21 @@ all-tatoeba-langgroups:
|
||||
|
||||
#### language-group to English
|
||||
|
||||
GROUP2ENG_TRAIN := $(patsubst %,tatoeba-%2eng-train,${OPUS_LANG_GROUPS})
|
||||
GROUP2ENG_TRAIN := $(patsubst %,tatoeba-%2eng-trainjob,${OPUS_LANG_GROUPS})
|
||||
GROUP2ENG_EVAL := $(patsubst %,tatoeba-%2eng-eval,${OPUS_LANG_GROUPS})
|
||||
GROUP2ENG_EVALALL := $(patsubst %,tatoeba-%2eng-evalall,${OPUS_LANG_GROUPS})
|
||||
GROUP2ENG_DIST := $(patsubst %,tatoeba-%2eng-dist,${OPUS_LANG_GROUPS})
|
||||
|
||||
#### English to language group
|
||||
|
||||
ENG2GROUP_TRAIN := $(patsubst %,tatoeba-eng2%-train,${OPUS_LANG_GROUPS})
|
||||
ENG2GROUP_TRAIN := $(patsubst %,tatoeba-eng2%-trainjob,${OPUS_LANG_GROUPS})
|
||||
ENG2GROUP_EVAL := $(patsubst %,tatoeba-eng2%-eval,${OPUS_LANG_GROUPS})
|
||||
ENG2GROUP_EVALALL := $(patsubst %,tatoeba-eng2%-evalall,${OPUS_LANG_GROUPS})
|
||||
ENG2GROUP_DIST := $(patsubst %,tatoeba-eng2%-dist,${OPUS_LANG_GROUPS})
|
||||
|
||||
#### multilingual language-group (bi-directional)
|
||||
|
||||
LANGGROUP_TRAIN := $(foreach G,${OPUS_LANG_GROUPS},tatoeba-${G}2${G}-train)
|
||||
LANGGROUP_TRAIN := $(foreach G,${OPUS_LANG_GROUPS},tatoeba-${G}2${G}-trainjob)
|
||||
LANGGROUP_EVAL := $(patsubst %-train,%-eval,${LANGGROUP_TRAIN})
|
||||
LANGGROUP_EVALALL := $(patsubst %-train,%-evalall,${LANGGROUP_TRAIN})
|
||||
LANGGROUP_DIST := $(patsubst %-train,%-dist,${LANGGROUP_TRAIN})
|
||||
@ -547,17 +542,17 @@ LANGGROUP_DIST := $(patsubst %-train,%-dist,${LANGGROUP_TRAIN})
|
||||
LANGGROUP_FIT_DATA_SIZE=1000000
|
||||
|
||||
## start all jobs with 1 million sampled sentence pairs per language pair
|
||||
## (OLD: MODELTYPE=transformer)
|
||||
all-tatoeba-group2eng:
|
||||
${MAKE} MIN_SRCLANGS=2 MODELTYPE=transformer \
|
||||
${MAKE} MIN_SRCLANGS=2 SKIP_LANGPAIRS="eng-eng" \
|
||||
FIT_DATA_SIZE=${LANGGROUP_FIT_DATA_SIZE} ${GROUP2ENG_TRAIN}
|
||||
|
||||
all-tatoeba-eng2group:
|
||||
${MAKE} MIN_TRGLANGS=2 MODELTYPE=transformer \
|
||||
${MAKE} MIN_TRGLANGS=2 SKIP_LANGPAIRS="eng-eng" \
|
||||
FIT_DATA_SIZE=${LANGGROUP_FIT_DATA_SIZE} ${ENG2GROUP_TRAIN}
|
||||
|
||||
all-tatoeba-langgroup:
|
||||
${MAKE} MIN_SRCLANGS=2 MAX_SRCLANGS=30 PIVOT=eng \
|
||||
MODELTYPE=transformer \
|
||||
${MAKE} MIN_SRCLANGS=2 MAX_SRCLANGS=30 PIVOT=eng SKIP_LANGPAIRS="eng-eng" \
|
||||
FIT_DATA_SIZE=${LANGGROUP_FIT_DATA_SIZE} ${LANGGROUP_TRAIN}
|
||||
|
||||
all-tatoeba-cross-langgroups:
|
||||
@ -566,7 +561,6 @@ all-tatoeba-cross-langgroups:
|
||||
if [ "$$s" != "$$t" ]; then \
|
||||
${MAKE} MIN_SRCLANGS=2 MIN_TRGLANGS=2 \
|
||||
MAX_SRCLANGS=30 MAX_TRGLANGS=30 \
|
||||
MODELTYPE=transformer \
|
||||
FIT_DATA_SIZE=${LANGGROUP_FIT_DATA_SIZE} \
|
||||
tatoeba-$${s}2$${t}-train; \
|
||||
fi \
|
||||
@ -787,6 +781,10 @@ tatoeba-%-train:
|
||||
fi \
|
||||
fi )
|
||||
|
||||
## test-fiu2eng: print the results of the find-srclanggroup and
## find-trglanggroup macros. Both are called with the target name stripped
## of its "test-" prefix (here: "fiu2eng") and the current ${PIVOT}.
## NOTE(review): looks like a manual debugging aid for the language-group
##   lookup -- the macros themselves are defined outside this hunk.
test-fiu2eng:
|
||||
echo "${call find-srclanggroup,${patsubst test-%,%,$@},${PIVOT}}"
|
||||
echo "${call find-trglanggroup,${patsubst test-%,%,$@},${PIVOT}}"
|
||||
|
||||
|
||||
## start the training job
|
||||
## - create config file
|
||||
@ -1129,7 +1127,7 @@ tatoeba-%-langtunealljobs:
|
||||
|
||||
## get the markdown page for a specific subset
|
||||
tatoeba-%.md:
|
||||
wget -O $@ ${TATOEBA_RAWGIT}/subsets/${patsubst tatoeba-%,%,$@}
|
||||
wget -O $@ ${TATOEBA_RAWGIT_MASTER}/subsets/${TATOEBA_VERSION}/${patsubst tatoeba-%,%,$@}
|
||||
|
||||
|
||||
## run all language pairs for a given subset
|
||||
@ -1340,7 +1338,7 @@ ${TATOEBA_WORK}/${LANGPAIRSTR}/test/Tatoeba-testsets.done:
|
||||
@for s in ${SRCLANGS}; do \
|
||||
for t in ${TRGLANGS}; do \
|
||||
wget -q -O ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.tmp \
|
||||
${TATOEBA_RAWGIT}/data/test/$$s-$$t/test.txt; \
|
||||
${TATOEBA_RAWGIT_RELEASE}/data/test/$$s-$$t/test.txt; \
|
||||
if [ -s ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.tmp ]; then \
|
||||
cat ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.tmp $(FIXLANGIDS) \
|
||||
> ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.txt; \
|
||||
@ -1387,7 +1385,7 @@ ${TATOEBA_WORK}/${LANGPAIRSTR}/test/Tatoeba-testsets.done:
|
||||
fi; \
|
||||
else \
|
||||
wget -q -O ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.tmp \
|
||||
${TATOEBA_RAWGIT}/data/test/$$t-$$s/test.txt; \
|
||||
${TATOEBA_RAWGIT_RELEASE}/data/test/$$t-$$s/test.txt; \
|
||||
if [ -s ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.tmp ]; then \
|
||||
cat ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.tmp $(FIXLANGIDS) \
|
||||
> ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.txt; \
|
||||
@ -1726,7 +1724,7 @@ KEEP_LANGIDS = bos_Cyrl cmn cnr cnr_Latn csb diq dnj dty fas fqs ful fur
|
||||
nor nor_Latn oss_Latn pan plt pnb_Guru pob prs qug quw quy quz qvi rmn rmy ruk san swa swc \
|
||||
syr syr_Syrc tgk_Latn thy tlh tmh toi tuk_Cyrl urd_Deva xal_Latn yid_Latn zho zlm
|
||||
SKIP_LANGIDS = ${filter-out ${KEEP_LANGIDS},${TRAIN_ONLY_LANGIDS}} \
|
||||
ang ara_Latn arq_Latn apc_Latn bul_Latn ell_Latn heb_Latn nob_Hebr rus_Latn
|
||||
ang ara_Latn arq_Latn apc_Latn bul_Latn ell_Latn eng_Tibt eng_Zinh heb_Latn hun_Zinh nob_Hebr rus_Latn
|
||||
SKIP_LANGIDS_PATTERN = ^\(${subst ${SPACE},\|,${SKIP_LANGIDS}}\)$$
|
||||
|
||||
## modify language IDs in training data to adjust them to test sets
|
||||
@ -1738,6 +1736,9 @@ SKIP_LANGIDS_PATTERN = ^\(${subst ${SPACE},\|,${SKIP_LANGIDS}}\)$$
|
||||
FIXLANGIDS = | sed 's/zho\(.*\)_HK/yue\1/g;s/zho\(.*\)_CN/cmn\1/g;s/zho\(.*\)_TW/cmn\1/g;s/zho/cmn/g;' \
|
||||
| sed 's/\_[A-Z][A-Z]//g' \
|
||||
| sed 's/\-[a-z]*//g' \
|
||||
| sed 's/\_Brai//g' \
|
||||
| sed 's/\_Zinh//g' \
|
||||
| sed 's/\_Tibt//g' \
|
||||
| sed 's/jpn_[A-Za-z]*/jpn/g' \
|
||||
| sed 's/kor_[A-Za-z]*/kor/g' \
|
||||
| sed 's/nor_Latn/nor/g' \
|
||||
@ -1807,7 +1808,8 @@ TATOEBA_TMPDATADIR = data/release/${TATOEBA_VERSION}/${LANGPAIR}
|
||||
mv $@.d/${TATOEBA_TMPDATADIR}/test.trg ${dir $@}${TATOEBA_TESTSET}.${LANGPAIR}.clean.${TRGEXT}; \
|
||||
cat $@.d/${TATOEBA_TMPDATADIR}/test.id $(FIXLANGIDS) > ${dir $@}${TATOEBA_TESTSET}.${LANGPAIR}.clean.id; \
|
||||
fi
|
||||
@if [ -e $@.d/${TATOEBA_TMPDATADIR}/dev.src ]; then \
|
||||
@if [ -e $@.d/${TATOEBA_TMPDATADIR}/dev.src ] && \
|
||||
[ `cat $@.d/${TATOEBA_TMPDATADIR}/dev.src | wc -l` -gt 50 ]; then \
|
||||
echo "........ move dev files to ${dir $@}${TATOEBA_DEVSET}.${LANGPAIR}.clean.*"; \
|
||||
mv $@.d/${TATOEBA_TMPDATADIR}/dev.src ${dir $@}${TATOEBA_DEVSET}.${LANGPAIR}.clean.${SRCEXT}; \
|
||||
mv $@.d/${TATOEBA_TMPDATADIR}/dev.trg ${dir $@}${TATOEBA_DEVSET}.${LANGPAIR}.clean.${TRGEXT}; \
|
||||
@ -1821,7 +1823,7 @@ TATOEBA_TMPDATADIR = data/release/${TATOEBA_VERSION}/${LANGPAIR}
|
||||
fi; \
|
||||
else \
|
||||
if [ -e $@.d/${TATOEBA_TMPDATADIR}/train.src.gz ]; then \
|
||||
echo "no devdata available - get top 1000 from training data!"; \
|
||||
echo "........ too little devdata available - get top 1000 from training data!"; \
|
||||
${GZCAT} $@.d/${TATOEBA_TMPDATADIR}/train.src.gz | head -1000 > ${dir $@}${TATOEBA_DEVSET}.${LANGPAIR}.clean.${SRCEXT}; \
|
||||
${GZCAT} $@.d/${TATOEBA_TMPDATADIR}/train.trg.gz | head -1000 > ${dir $@}${TATOEBA_DEVSET}.${LANGPAIR}.clean.${TRGEXT}; \
|
||||
${GZCAT} $@.d/${TATOEBA_TMPDATADIR}/train.id.gz | head -1000 | cut -f2,3 $(FIXLANGIDS) > ${dir $@}${TATOEBA_DEVSET}.${LANGPAIR}.clean.id; \
|
||||
@ -1830,6 +1832,12 @@ TATOEBA_TMPDATADIR = data/release/${TATOEBA_VERSION}/${LANGPAIR}
|
||||
${GZCAT} $@.d/${TATOEBA_TMPDATADIR}/train.trg.gz | tail -n +1001 > ${dir $@}${TATOEBA_TRAINSET}.${LANGPAIR}.clean.${TRGEXT}; \
|
||||
${GZCAT} $@.d/${TATOEBA_TMPDATADIR}/train.id.gz | tail -n +1001 | cut -f2,3 $(FIXLANGIDS) > ${dir $@}${TATOEBA_TRAINSET}.${LANGPAIR}.clean.id; \
|
||||
${GZCAT} $@.d/${TATOEBA_TMPDATADIR}/train.id.gz | tail -n +1001 | cut -f1 > ${dir $@}${TATOEBA_TRAINSET}.${LANGPAIR}.clean.domain; \
|
||||
fi; \
|
||||
if [ -e $@.d/${TATOEBA_TMPDATADIR}/dev.src ]; then \
|
||||
echo "........ add dev files to ${dir $@}${TATOEBA_DEVSET}.${LANGPAIR}.clean.*"; \
|
||||
cat $@.d/${TATOEBA_TMPDATADIR}/dev.src >> ${dir $@}${TATOEBA_DEVSET}.${LANGPAIR}.clean.${SRCEXT}; \
|
||||
cat $@.d/${TATOEBA_TMPDATADIR}/dev.trg >> ${dir $@}${TATOEBA_DEVSET}.${LANGPAIR}.clean.${TRGEXT}; \
|
||||
cat $@.d/${TATOEBA_TMPDATADIR}/dev.id $(FIXLANGIDS) >> ${dir $@}${TATOEBA_DEVSET}.${LANGPAIR}.clean.id; \
|
||||
fi \
|
||||
fi
|
||||
## make sure that training data file exists even if it is empty
|
||||
@ -1842,10 +1850,10 @@ TATOEBA_TMPDATADIR = data/release/${TATOEBA_VERSION}/${LANGPAIR}
|
||||
#######################################
|
||||
@cut -f1 ${dir $@}Tatoeba-*.${LANGPAIR}.clean.id | sort -u | \
|
||||
grep -v '${SKIP_LANGIDS_PATTERN}' | \
|
||||
tr "\n" ' ' | sed 's/^ *//;s/ *$$//' > $(@:.${SRCEXT}.gz=.${SRCEXT}.labels)
|
||||
tr "\n" ' ' | sed 's/^ *//;s/ *$$//' > $(@:.${SRCEXT}.gz=.${SORTSRCEXT}.labels)
|
||||
@cut -f2 ${dir $@}Tatoeba-*.${LANGPAIR}.clean.id | sort -u | \
|
||||
grep -v '${SKIP_LANGIDS_PATTERN}' | \
|
||||
tr "\n" ' ' | sed 's/^ *//;s/ *$$//' > $(@:.${SRCEXT}.gz=.${TRGEXT}.labels)
|
||||
tr "\n" ' ' | sed 's/^ *//;s/ *$$//' > $(@:.${SRCEXT}.gz=.${SORTTRGEXT}.labels)
|
||||
@cat ${dir $@}Tatoeba-*.${LANGPAIR}.clean.domain | sort -u |\
|
||||
tr "\n" ' ' | sed 's/^ *//;s/ *$$//' > $(@:.${SRCEXT}.gz=.domains)
|
||||
#######################################
|
||||
|
@ -13,7 +13,9 @@ ifeq (${SUBWORDS},spm)
|
||||
|
||||
${MODEL_VOCAB}: ${SPMSRCMODEL} ${SPMTRGMODEL}
|
||||
ifneq (${MODEL_LATEST_VOCAB},)
|
||||
ifneq (${MODEL_LATEST_VOCAB},${MODEL_VOCAB})
|
||||
cp ${MODEL_LATEST_VOCAB} ${MODEL_VOCAB}
|
||||
endif
|
||||
else
|
||||
cut -f1 < ${word 1,$^}.vocab > ${@:.vocab.yml=.src.vocab}
|
||||
cut -f1 < ${word 2,$^}.vocab > ${@:.vocab.yml=.trg.vocab}
|
||||
@ -39,7 +41,9 @@ ${MODEL_VOCAB}: ${TRAIN_SRC}.clean.${PRE_SRC}${TRAINSIZE}.gz \
|
||||
${TRAIN_TRG}.clean.${PRE_TRG}${TRAINSIZE}.gz
|
||||
ifeq ($(wildcard ${MODEL_SRCVOCAB} ${MODEL_TRGVOCAB}),)
|
||||
ifneq (${MODEL_LATEST_VOCAB},)
|
||||
ifneq (${MODEL_LATEST_VOCAB},${MODEL_VOCAB})
|
||||
cp ${MODEL_LATEST_VOCAB} ${MODEL_VOCAB}
|
||||
endif
|
||||
else
|
||||
mkdir -p ${dir $@}
|
||||
${LOADMODS} && ${ZCAT} $^ | ${MARIAN_VOCAB} --max-size ${VOCABSIZE} > $@
|
||||
@ -184,11 +188,15 @@ ${MARIAN_MODELS_DONE}: ${MARIAN_TRAIN_PREREQS}
|
||||
ifeq (${wildcard ${MODEL_START}},)
|
||||
ifneq (${MODEL_LATEST},)
|
||||
ifneq (${MODEL_LATEST_VOCAB},)
|
||||
ifneq (${MODEL_LATEST_VOCAB},${MODEL_VOCAB})
|
||||
cp ${MODEL_LATEST_VOCAB} ${MODEL_VOCAB}
|
||||
endif
|
||||
ifneq (${MODEL_LATEST},${MODEL_START})
|
||||
cp ${MODEL_LATEST} ${MODEL_START}
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
##--------------------------------------------------------------------
|
||||
${MAKE} ${MODEL_SRCVOCAB} ${MODEL_TRGVOCAB}
|
||||
${LOADMODS} && ${MARIAN_TRAIN} ${MARIAN_EXTRA} \
|
||||
|
124
scripts/evaluate/check-overlap.pl
Executable file
124
scripts/evaluate/check-overlap.pl
Executable file
@ -0,0 +1,124 @@
|
||||
#!/usr/bin/env perl
|
||||
|
||||
use utf8;
|
||||
use strict;
|
||||
use open qw/:std :utf8/;
|
||||
use Getopt::Long;
|
||||
|
||||
# Normalisation switches (all off by default); set from the command line
# and read by normalise().
my $AlphaOnly = 0;    # --alpha / -a      : keep alphabetic characters only
my $LowerCase = 0;    # --lower-case / -l : lower-case every sentence
my $DecodeSpm = 0;    # --decode-spm / -d : undo sentence-piece segmentation
my $verbose   = 0;    # --verbose / -v    : print matching pairs to STDERR

my $usage = "usage: $0 [-a] [-l] [-d] [-v] "
          . "train.src train.trg [devtest.src devtest.trg ...]\n";

# stop on unknown options instead of silently ignoring them
GetOptions(
    "alpha|a"      => \$AlphaOnly,
    "lower-case|l" => \$LowerCase,
    "decode-spm|d" => \$DecodeSpm,
    "verbose|v"    => \$verbose ) or die $usage;

# The first two arguments are the big (training) bitext; everything after
# that has to come in source/target pairs. Without this check a missing
# argument ends up as an undefined file name further down.
die $usage if (@ARGV < 2 || @ARGV % 2);

my $BigSrcFile = shift(@ARGV);
my $BigTrgFile = shift(@ARGV);

# sentences (and sentence pairs) seen in the small dev/test sets,
# filled by read_pairs()
my %SrcSents  = ();
my %TrgSents  = ();
my %SentPairs = ();

# load all remaining file pairs (the dev/test data) into memory
while (@ARGV){
    my $SrcFile = shift(@ARGV);
    my $TrgFile = shift(@ARGV);
    read_pairs($SrcFile,$TrgFile);
}
|
||||
|
||||
|
||||
# Stream through the big (training) bitext line by line and count how many
# of its sentences / sentence pairs were also seen in the dev/test data
# loaded by read_pairs(). Source and target files are read in parallel.
my $S = open_file($BigSrcFile);
my $T = open_file($BigTrgFile);

my $total = 0;                                       # lines read from the source file
my ($SrcExists,$TrgExists,$PairExists) = (0,0,0);    # token counts (with repetitions)
my %SrcUniqueExists  = ();                           # distinct matching source sentences
my %TrgUniqueExists  = ();                           # distinct matching target sentences
my %PairUniqueExists = ();                           # distinct matching sentence pairs

while (<$S>){
    my $trg = <$T>;
    # target file shorter than source file: treat the missing line as empty
    # instead of working on an undefined value
    $trg = '' unless defined $trg;
    &normalise($_);
    &normalise($trg);
    $total++;
    if (exists $SrcSents{$_}){
        $SrcExists++;
        $SrcUniqueExists{$_}++;
    }
    if (exists $TrgSents{$trg}){
        $TrgExists++;
        $TrgUniqueExists{$trg}++;
    }
    # use the very same key format as read_pairs() for both look-up and
    # book-keeping (no chomp in between)
    my $pair = "$_\t$trg";
    if (exists $SentPairs{$pair}){
        $PairExists++;
        unless ($PairUniqueExists{$pair}++){
            if ($verbose){
                # strip line breaks for display only, so that each match
                # is reported on exactly one line
                (my $show = $pair) =~ tr/\n//d;
                print STDERR "exists: $show\n";
            }
        }
    }
}

close $S;
close $T;
|
||||
|
||||
# Print the overlap statistics as tab-separated lines:
#   label <TAB> absolute count <TAB> percentage
# First relative to the size of the training data, then relative to the
# number of distinct sentence pairs in the dev/test data.
# A literal percent sign in a printf format has to be written as '%%'.
my $TotalSmall = scalar keys %SentPairs;

if ($total){
    printf "source sentences from train found in devtest\t%d\t%5.2f%%\n",
        $SrcExists, 100*$SrcExists/$total;
    printf "target sentences from train found in devtest\t%d\t%5.2f%%\n",
        $TrgExists, 100*$TrgExists/$total;
    printf "  sentence pairs from train found in devtest\t%d\t%5.2f%%\n",
        $PairExists, 100*$PairExists/$total;
    print "total size of training data\t",$total,"\n";
}

if ($TotalSmall){
    # number of distinct dev/test items that showed up in the training data
    my $SrcExistsSmall  = scalar keys %SrcUniqueExists;
    my $TrgExistsSmall  = scalar keys %TrgUniqueExists;
    my $PairExistsSmall = scalar keys %PairUniqueExists;
    printf "source sentences from devtest found in train\t%d\t%5.2f%%\n",
        $SrcExistsSmall, 100*$SrcExistsSmall/$TotalSmall;
    printf "target sentences from devtest found in train\t%d\t%5.2f%%\n",
        $TrgExistsSmall, 100*$TrgExistsSmall/$TotalSmall;
    printf "  sentence pairs from devtest found in train\t%d\t%5.2f%%\n",
        $PairExistsSmall, 100*$PairExistsSmall/$TotalSmall;
    print "total size of devtest data\t",$TotalSmall,"\n";
}
|
||||
|
||||
|
||||
# read_pairs(SRCFILE, TRGFILE)
# Read a parallel file pair line by line, normalise both sides and record
# every source sentence, target sentence and tab-joined sentence pair in
# the global look-up tables %SrcSents, %TrgSents and %SentPairs.
sub read_pairs{
    my ($SrcFile,$TrgFile) = @_;
    my $src_fh = open_file($SrcFile);
    my $trg_fh = open_file($TrgFile);
    while (<$src_fh>){
        my $trg_line = <$trg_fh>;
        &normalise($_);
        &normalise($trg_line);
        # key format has to match the look-ups done on the training data
        my $pair_key = "$_\t$trg_line";
        $SrcSents{$_}         = 1;
        $TrgSents{$trg_line}  = 1;
        $SentPairs{$pair_key} = 1;
    }
    close $src_fh;
    close $trg_fh;
}
|
||||
|
||||
|
||||
# open_file(FILE)
# Open FILE for reading and return the file handle. Files ending in '.gz'
# are read through 'gzip -cd'. Dies with a "cannot open ..." message if the
# file cannot be opened.
#
# Uses 'or die' (not '|| die'): '||' binds tighter than the comma operator,
# so 'open $h,"..." || die' only ever tests the (always true) string and
# never reports a failed open. The three-argument / list form of open also
# keeps the file name away from the shell.
sub open_file{
    my $file = shift;
    die "cannot open file: no file name given\n"
        unless (defined $file && length $file);
    my $handle;
    if ($file=~/\.gz$/){
        # starting the pipe succeeds even if gzip later fails on a missing
        # file, so check for the file up front
        -e $file or die "cannot open $file: no such file\n";
        open($handle, '-|', 'gzip', '-cd', '--', $file)
            or die "cannot open $file: $!\n";
        return $handle;
    }
    open($handle, '<', $file) or die "cannot open $file: $!\n";
    return $handle;
}
|
||||
|
||||
|
||||
# normalise(STRING)
# Normalise STRING in place (the caller's variable is modified through the
# @_ alias), depending on the global option flags:
#   $AlphaOnly - delete every non-alphabetic character
#   $LowerCase - convert to lower case
#   $DecodeSpm - if the string contains the sentence-piece marker '▁',
#                turn each marker into a space and then delete all spaces
# NOTE(review): with $DecodeSpm the result contains no spaces at all, which
#   is not a regular sentence-piece decoding -- confirm that this is the
#   intended comparison key.
sub normalise{
    my $text = \$_[0];    # reference to the caller's string

    $$text =~ s/\P{IsAlpha}//gs if ($AlphaOnly);
    $$text = lc($$text)         if ($LowerCase);
    return unless ($DecodeSpm);

    # spaces are only removed if at least one marker was replaced
    my $had_marker = ($$text =~ s/▁/ /g);
    $$text =~ s/ //g if ($had_marker);
}
|
1
testsets/en-kk/newsdev2019-enkk.kaz_Cyrl.gz
Symbolic link
1
testsets/en-kk/newsdev2019-enkk.kaz_Cyrl.gz
Symbolic link
@ -0,0 +1 @@
|
||||
newsdev2019-enkk.kaz.gz
|
1
testsets/en-kk/newstest2019-enkk.kaz_Cyrl.gz
Symbolic link
1
testsets/en-kk/newstest2019-enkk.kaz_Cyrl.gz
Symbolic link
@ -0,0 +1 @@
|
||||
newstest2019-enkk.kaz.gz
|
1
testsets/eng-kaz_Cyrl
Symbolic link
1
testsets/eng-kaz_Cyrl
Symbolic link
@ -0,0 +1 @@
|
||||
eng-kaz
|
1
testsets/fi-de/goethe-institute-test1.de.gz
Symbolic link
1
testsets/fi-de/goethe-institute-test1.de.gz
Symbolic link
@ -0,0 +1 @@
|
||||
../de-fi/goethe-institute-test1.de.gz
|
1
testsets/fi-de/goethe-institute-test1.fi.gz
Symbolic link
1
testsets/fi-de/goethe-institute-test1.fi.gz
Symbolic link
@ -0,0 +1 @@
|
||||
../de-fi/goethe-institute-test1.fi.gz
|
1
testsets/fi-de/goethe-institute-test2.de.gz
Symbolic link
1
testsets/fi-de/goethe-institute-test2.de.gz
Symbolic link
@ -0,0 +1 @@
|
||||
../de-fi/goethe-institute-test2.de.gz
|
1
testsets/fi-de/goethe-institute-test2.fi.gz
Symbolic link
1
testsets/fi-de/goethe-institute-test2.fi.gz
Symbolic link
@ -0,0 +1 @@
|
||||
../de-fi/goethe-institute-test2.fi.gz
|
1
testsets/fin-deu/goethe-institute-test1.deu.gz
Symbolic link
1
testsets/fin-deu/goethe-institute-test1.deu.gz
Symbolic link
@ -0,0 +1 @@
|
||||
../deu-fin/goethe-institute-test1.deu.gz
|
1
testsets/fin-deu/goethe-institute-test1.fin.gz
Symbolic link
1
testsets/fin-deu/goethe-institute-test1.fin.gz
Symbolic link
@ -0,0 +1 @@
|
||||
../deu-fin/goethe-institute-test1.fin.gz
|
1
testsets/fin-deu/goethe-institute-test2.deu.gz
Symbolic link
1
testsets/fin-deu/goethe-institute-test2.deu.gz
Symbolic link
@ -0,0 +1 @@
|
||||
../deu-fin/goethe-institute-test2.deu.gz
|
1
testsets/fin-deu/goethe-institute-test2.fin.gz
Symbolic link
1
testsets/fin-deu/goethe-institute-test2.fin.gz
Symbolic link
@ -0,0 +1 @@
|
||||
../deu-fin/goethe-institute-test2.fin.gz
|
Loading…
Reference in New Issue
Block a user