fixed a problem with langlabel files

This commit is contained in:
Joerg Tiedemann 2021-09-13 00:07:51 +03:00
parent 72e1bcb7ec
commit 6db5b3b716
19 changed files with 207 additions and 36 deletions

View File

@ -222,7 +222,7 @@ tatoeba-print-reliable-trg:
@echo ${TATOEBA_RELIABLE_TRG}
# RELEASED_BT_ALL := ${shell wget -qq -O - ${TATOEBA_RELEASED_BT}}
RELEASED_BT_ALL := ${shell wget -qq -O - ${TATOEBA_RELEASED_BT}}
RELEASED_BT := ${shell wget -qq -O - ${TATOEBA_RELEASED_BT} | grep '^${LANGPAIR}/'}
fetch-bt:
@ -232,6 +232,13 @@ fetch-bt:
wget -qq -O $$d https://object.pouta.csc.fi/Tatoeba-MT-bt/$$d; \
done
fetch-all-bt:
for d in ${RELEASED_BT_ALL}; do \
echo "fetch $$d"; \
mkdir -p `dirname $$d`; \
wget -qq -O $$d https://object.pouta.csc.fi/Tatoeba-MT-bt/$$d; \
done
#---------------------------------------------------------------
# release data

View File

@ -112,8 +112,10 @@ MAX_OVER_SAMPLING ?= 50
# sorted languages and langpair used to match resources in OPUS
SORTLANGS = $(sort ${SRC} ${TRG})
SORTSRC = ${firstword ${SORTLANGS}}
SORTTRG = ${lastword ${SORTLANGS}}
LANGPAIR = ${SORTSRC}-${SORTTRG}
SPACE = $(empty) $(empty)
LANGPAIR = ${firstword ${SORTLANGS}}-${lastword ${SORTLANGS}}
LANGSRCSTR = ${subst ${SPACE},+,$(SRCLANGS)}
LANGTRGSTR = ${subst ${SPACE},+,$(TRGLANGS)}
LANGPAIRSTR = ${LANGSRCSTR}-${LANGTRGSTR}
@ -128,11 +130,15 @@ LANGSTR ?= ${subst ${SPACE},+,$(LANGS)}
## for same language pairs: add numeric extension
## (this is necessary to keep source and target files separate)
ifeq (${SRC},$(TRG))
SRCEXT = ${SRC}1
TRGEXT = ${SRC}2
SRCEXT = ${SRC}1
TRGEXT = ${SRC}2
SORTSRCEXT = ${SORTSRC}1
SORTTRGEXT = ${SORTSRC}2
else
SRCEXT = ${SRC}
TRGEXT = ${TRG}
SRCEXT = ${SRC}
TRGEXT = ${TRG}
SORTSRCEXT = ${SORTSRC}
SORTTRGEXT = ${SORTTRG}
endif
## set a flag to use target language labels

View File

@ -422,7 +422,7 @@ ifdef SHUFFLE_DATA
endif
######################################
# FIT_DATA_SIZE is set?
# --> fit data to speciic size
# --> fit data to specific size
# --> under/over sampling!
######################################
@echo -n "* ${SRC}-${TRG}: total size = " >> ${dir ${LOCAL_TRAIN_SRC}}README.md

View File

@ -121,7 +121,8 @@ else ifneq ($(wildcard /wrk/tiedeman/research),)
LOADGPU = module load ${GPU_MODULES}
LOADMODS = ${LOADGPU}
else ifeq (${shell hostname --domain 2>/dev/null},bullx)
CSCPROJECT = project_2002688
# CSCPROJECT = project_2002688
CSCPROJECT = project_2000309
# CSCPROJECT = project_2002982
WORKHOME = ${shell realpath ${PWD}/work}
APPLHOME = /projappl/project_2001194

View File

@ -50,6 +50,12 @@ fi-zh:
train-dynamic.submitcpu
# Tatoeba: more than 100 test sentences:
# ain dan deu eng enm epo est fkv fra heb hun ita jpn kor kur lat lit nld nor pol por rus spa swe tur zho
#-------------------------------------------------------------------
# add THL backtranslation data (and also all other backtranslations)
#-------------------------------------------------------------------

View File

@ -3,10 +3,6 @@
# Makefile for running models with data from the Tatoeba Translation Challenge
# https://github.com/Helsinki-NLP/Tatoeba-Challenge
#
# NEWS
#
# - MODELTYPE=transformer is now default for all Tatoeba models
# (no guided alignment!)
#
#---------------------------------------------------------------------
# train and evaluate a single translation pair, for example:
@ -90,8 +86,7 @@
## general parameters for Tatoeba models
## NEW: release
## Tatoeba Challenge Data release number
# TATOEBA_VERSION ?= v2020-07-28
TATOEBA_VERSION ?= v2021-08-07
@ -99,17 +94,17 @@ TATOEBA_DATAURL := https://object.pouta.csc.fi/Tatoeba-Challenge
TATOEBA_TEST_URL := ${TATOEBA_DATAURL}-${TATOEBA_VERSION}
TATOEBA_TRAIN_URL := ${TATOEBA_DATAURL}-${TATOEBA_VERSION}
TATOEBA_MONO_URL := ${TATOEBA_DATAURL}-${TATOEBA_VERSION}
# TATOEBA_TEST_URL := ${TATOEBA_DATAURL}
# TATOEBA_TRAIN_URL := ${TATOEBA_DATAURL}
# TATOEBA_MONO_URL := ${TATOEBA_DATAURL}
TATOEBA_RAWGIT := https://raw.githubusercontent.com/Helsinki-NLP/Tatoeba-Challenge/master
TATOEBA_WORK ?= ${PWD}/work-tatoeba
TATOEBA_DATA ?= ${TATOEBA_WORK}/data/${PRE}
TATOEBA_MONO ?= ${TATOEBA_WORK}/data/mono
# TATOEBA_RAWGIT := https://raw.githubusercontent.com/Helsinki-NLP/Tatoeba-Challenge/master
TATOEBA_RAWGIT_MASTER := https://raw.githubusercontent.com/Helsinki-NLP/Tatoeba-Challenge/master
TATOEBA_RAWGIT_RELEASE := https://raw.githubusercontent.com/Helsinki-NLP/Tatoeba-Challenge/${TATOEBA_VERSION}
## data count files (file basename)
TATOEBA_DATA_COUNT_BASE = ${TATOEBA_RAWGIT}/data/release/${TATOEBA_VERSION}/released-bitexts
TATOEBA_DATA_COUNT_BASE = ${TATOEBA_RAWGIT_MASTER}/data/release/${TATOEBA_VERSION}/released-bitexts
## all released language pairs with test sets > 200 test pairs
## also extract all source languages that are available for a given target language
@ -132,7 +127,7 @@ WIKIMACROLANGS ?= $(sort ${shell ${GET_ISO_CODE} ${WIKILANGS}})
TATOEBA_MODEL_CONTAINER := Tatoeba-MT-models
## this will be the base name of the model file
TATOEBA_DATASET := opusTC$(subst -,,${TATOEBA_VERSION})
TATOEBA_DATASET := ${DATASET}TC$(subst -,,${TATOEBA_VERSION})
TATOEBA_TRAINSET := Tatoeba-train-${TATOEBA_VERSION}
TATOEBA_DEVSET := Tatoeba-dev-${TATOEBA_VERSION}
@ -525,21 +520,21 @@ all-tatoeba-langgroups:
#### language-group to English
GROUP2ENG_TRAIN := $(patsubst %,tatoeba-%2eng-train,${OPUS_LANG_GROUPS})
GROUP2ENG_TRAIN := $(patsubst %,tatoeba-%2eng-trainjob,${OPUS_LANG_GROUPS})
GROUP2ENG_EVAL := $(patsubst %,tatoeba-%2eng-eval,${OPUS_LANG_GROUPS})
GROUP2ENG_EVALALL := $(patsubst %,tatoeba-%2eng-evalall,${OPUS_LANG_GROUPS})
GROUP2ENG_DIST := $(patsubst %,tatoeba-%2eng-dist,${OPUS_LANG_GROUPS})
#### English to language group
ENG2GROUP_TRAIN := $(patsubst %,tatoeba-eng2%-train,${OPUS_LANG_GROUPS})
ENG2GROUP_TRAIN := $(patsubst %,tatoeba-eng2%-trainjob,${OPUS_LANG_GROUPS})
ENG2GROUP_EVAL := $(patsubst %,tatoeba-eng2%-eval,${OPUS_LANG_GROUPS})
ENG2GROUP_EVALALL := $(patsubst %,tatoeba-eng2%-evalall,${OPUS_LANG_GROUPS})
ENG2GROUP_DIST := $(patsubst %,tatoeba-eng2%-dist,${OPUS_LANG_GROUPS})
#### multilingual language-group (bi-directional)
LANGGROUP_TRAIN := $(foreach G,${OPUS_LANG_GROUPS},tatoeba-${G}2${G}-train)
LANGGROUP_TRAIN := $(foreach G,${OPUS_LANG_GROUPS},tatoeba-${G}2${G}-trainjob)
LANGGROUP_EVAL := $(patsubst %-train,%-eval,${LANGGROUP_TRAIN})
LANGGROUP_EVALALL := $(patsubst %-train,%-evalall,${LANGGROUP_TRAIN})
LANGGROUP_DIST := $(patsubst %-train,%-dist,${LANGGROUP_TRAIN})
@ -547,17 +542,17 @@ LANGGROUP_DIST := $(patsubst %-train,%-dist,${LANGGROUP_TRAIN})
LANGGROUP_FIT_DATA_SIZE=1000000
## start all jobs with 1 million sampled sentence pairs per language pair
## (OLD: MODELTYPE=transformer)
all-tatoeba-group2eng:
${MAKE} MIN_SRCLANGS=2 MODELTYPE=transformer \
${MAKE} MIN_SRCLANGS=2 SKIP_LANGPAIRS="eng-eng" \
FIT_DATA_SIZE=${LANGGROUP_FIT_DATA_SIZE} ${GROUP2ENG_TRAIN}
all-tatoeba-eng2group:
${MAKE} MIN_TRGLANGS=2 MODELTYPE=transformer \
${MAKE} MIN_TRGLANGS=2 SKIP_LANGPAIRS="eng-eng" \
FIT_DATA_SIZE=${LANGGROUP_FIT_DATA_SIZE} ${ENG2GROUP_TRAIN}
all-tatoeba-langgroup:
${MAKE} MIN_SRCLANGS=2 MAX_SRCLANGS=30 PIVOT=eng \
MODELTYPE=transformer \
${MAKE} MIN_SRCLANGS=2 MAX_SRCLANGS=30 PIVOT=eng SKIP_LANGPAIRS="eng-eng" \
FIT_DATA_SIZE=${LANGGROUP_FIT_DATA_SIZE} ${LANGGROUP_TRAIN}
all-tatoeba-cross-langgroups:
@ -566,7 +561,6 @@ all-tatoeba-cross-langgroups:
if [ "$$s" != "$$t" ]; then \
${MAKE} MIN_SRCLANGS=2 MIN_TRGLANGS=2 \
MAX_SRCLANGS=30 MAX_TRGLANGS=30 \
MODELTYPE=transformer \
FIT_DATA_SIZE=${LANGGROUP_FIT_DATA_SIZE} \
tatoeba-$${s}2$${t}-train; \
fi \
@ -787,6 +781,10 @@ tatoeba-%-train:
fi \
fi )
test-fiu2eng:
echo "${call find-srclanggroup,${patsubst test-%,%,$@},${PIVOT}}"
echo "${call find-trglanggroup,${patsubst test-%,%,$@},${PIVOT}}"
## start the training job
## - create config file
@ -1129,7 +1127,7 @@ tatoeba-%-langtunealljobs:
## get the markdown page for a specific subset
tatoeba-%.md:
wget -O $@ ${TATOEBA_RAWGIT}/subsets/${patsubst tatoeba-%,%,$@}
wget -O $@ ${TATOEBA_RAWGIT_MASTER}/subsets/${TATOEBA_VERSION}/${patsubst tatoeba-%,%,$@}
## run all language pairs for a given subset
@ -1340,7 +1338,7 @@ ${TATOEBA_WORK}/${LANGPAIRSTR}/test/Tatoeba-testsets.done:
@for s in ${SRCLANGS}; do \
for t in ${TRGLANGS}; do \
wget -q -O ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.tmp \
${TATOEBA_RAWGIT}/data/test/$$s-$$t/test.txt; \
${TATOEBA_RAWGIT_RELEASE}/data/test/$$s-$$t/test.txt; \
if [ -s ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.tmp ]; then \
cat ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.tmp $(FIXLANGIDS) \
> ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.txt; \
@ -1387,7 +1385,7 @@ ${TATOEBA_WORK}/${LANGPAIRSTR}/test/Tatoeba-testsets.done:
fi; \
else \
wget -q -O ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.tmp \
${TATOEBA_RAWGIT}/data/test/$$t-$$s/test.txt; \
${TATOEBA_RAWGIT_RELEASE}/data/test/$$t-$$s/test.txt; \
if [ -s ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.tmp ]; then \
cat ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.tmp $(FIXLANGIDS) \
> ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.txt; \
@ -1726,7 +1724,7 @@ KEEP_LANGIDS = bos_Cyrl cmn cnr cnr_Latn csb diq dnj dty fas fqs ful fur
nor nor_Latn oss_Latn pan plt pnb_Guru pob prs qug quw quy quz qvi rmn rmy ruk san swa swc \
syr syr_Syrc tgk_Latn thy tlh tmh toi tuk_Cyrl urd_Deva xal_Latn yid_Latn zho zlm
SKIP_LANGIDS = ${filter-out ${KEEP_LANGIDS},${TRAIN_ONLY_LANGIDS}} \
ang ara_Latn arq_Latn apc_Latn bul_Latn ell_Latn heb_Latn nob_Hebr rus_Latn
ang ara_Latn arq_Latn apc_Latn bul_Latn ell_Latn eng_Tibt eng_Zinh heb_Latn hun_Zinh nob_Hebr rus_Latn
SKIP_LANGIDS_PATTERN = ^\(${subst ${SPACE},\|,${SKIP_LANGIDS}}\)$$
## modify language IDs in training data to adjust them to test sets
@ -1738,6 +1736,9 @@ SKIP_LANGIDS_PATTERN = ^\(${subst ${SPACE},\|,${SKIP_LANGIDS}}\)$$
FIXLANGIDS = | sed 's/zho\(.*\)_HK/yue\1/g;s/zho\(.*\)_CN/cmn\1/g;s/zho\(.*\)_TW/cmn\1/g;s/zho/cmn/g;' \
| sed 's/\_[A-Z][A-Z]//g' \
| sed 's/\-[a-z]*//g' \
| sed 's/\_Brai//g' \
| sed 's/\_Zinh//g' \
| sed 's/\_Tibt//g' \
| sed 's/jpn_[A-Za-z]*/jpn/g' \
| sed 's/kor_[A-Za-z]*/kor/g' \
| sed 's/nor_Latn/nor/g' \
@ -1807,7 +1808,8 @@ TATOEBA_TMPDATADIR = data/release/${TATOEBA_VERSION}/${LANGPAIR}
mv $@.d/${TATOEBA_TMPDATADIR}/test.trg ${dir $@}${TATOEBA_TESTSET}.${LANGPAIR}.clean.${TRGEXT}; \
cat $@.d/${TATOEBA_TMPDATADIR}/test.id $(FIXLANGIDS) > ${dir $@}${TATOEBA_TESTSET}.${LANGPAIR}.clean.id; \
fi
@if [ -e $@.d/${TATOEBA_TMPDATADIR}/dev.src ]; then \
@if [ -e $@.d/${TATOEBA_TMPDATADIR}/dev.src ] && \
[ `cat $@.d/${TATOEBA_TMPDATADIR}/dev.src | wc -l` -gt 50 ]; then \
echo "........ move dev files to ${dir $@}${TATOEBA_DEVSET}.${LANGPAIR}.clean.*"; \
mv $@.d/${TATOEBA_TMPDATADIR}/dev.src ${dir $@}${TATOEBA_DEVSET}.${LANGPAIR}.clean.${SRCEXT}; \
mv $@.d/${TATOEBA_TMPDATADIR}/dev.trg ${dir $@}${TATOEBA_DEVSET}.${LANGPAIR}.clean.${TRGEXT}; \
@ -1821,7 +1823,7 @@ TATOEBA_TMPDATADIR = data/release/${TATOEBA_VERSION}/${LANGPAIR}
fi; \
else \
if [ -e $@.d/${TATOEBA_TMPDATADIR}/train.src.gz ]; then \
echo "no devdata available - get top 1000 from training data!"; \
echo "........ too little devdata available - get top 1000 from training data!"; \
${GZCAT} $@.d/${TATOEBA_TMPDATADIR}/train.src.gz | head -1000 > ${dir $@}${TATOEBA_DEVSET}.${LANGPAIR}.clean.${SRCEXT}; \
${GZCAT} $@.d/${TATOEBA_TMPDATADIR}/train.trg.gz | head -1000 > ${dir $@}${TATOEBA_DEVSET}.${LANGPAIR}.clean.${TRGEXT}; \
${GZCAT} $@.d/${TATOEBA_TMPDATADIR}/train.id.gz | head -1000 | cut -f2,3 $(FIXLANGIDS) > ${dir $@}${TATOEBA_DEVSET}.${LANGPAIR}.clean.id; \
@ -1830,6 +1832,12 @@ TATOEBA_TMPDATADIR = data/release/${TATOEBA_VERSION}/${LANGPAIR}
${GZCAT} $@.d/${TATOEBA_TMPDATADIR}/train.trg.gz | tail -n +1001 > ${dir $@}${TATOEBA_TRAINSET}.${LANGPAIR}.clean.${TRGEXT}; \
${GZCAT} $@.d/${TATOEBA_TMPDATADIR}/train.id.gz | tail -n +1001 | cut -f2,3 $(FIXLANGIDS) > ${dir $@}${TATOEBA_TRAINSET}.${LANGPAIR}.clean.id; \
${GZCAT} $@.d/${TATOEBA_TMPDATADIR}/train.id.gz | tail -n +1001 | cut -f1 > ${dir $@}${TATOEBA_TRAINSET}.${LANGPAIR}.clean.domain; \
fi; \
if [ -e $@.d/${TATOEBA_TMPDATADIR}/dev.src ]; then \
echo "........ add dev files to ${dir $@}${TATOEBA_DEVSET}.${LANGPAIR}.clean.*"; \
cat $@.d/${TATOEBA_TMPDATADIR}/dev.src >> ${dir $@}${TATOEBA_DEVSET}.${LANGPAIR}.clean.${SRCEXT}; \
cat $@.d/${TATOEBA_TMPDATADIR}/dev.trg >> ${dir $@}${TATOEBA_DEVSET}.${LANGPAIR}.clean.${TRGEXT}; \
cat $@.d/${TATOEBA_TMPDATADIR}/dev.id $(FIXLANGIDS) >> ${dir $@}${TATOEBA_DEVSET}.${LANGPAIR}.clean.id; \
fi \
fi
## make sure that training data file exists even if it is empty
@ -1842,10 +1850,10 @@ TATOEBA_TMPDATADIR = data/release/${TATOEBA_VERSION}/${LANGPAIR}
#######################################
@cut -f1 ${dir $@}Tatoeba-*.${LANGPAIR}.clean.id | sort -u | \
grep -v '${SKIP_LANGIDS_PATTERN}' | \
tr "\n" ' ' | sed 's/^ *//;s/ *$$//' > $(@:.${SRCEXT}.gz=.${SRCEXT}.labels)
tr "\n" ' ' | sed 's/^ *//;s/ *$$//' > $(@:.${SRCEXT}.gz=.${SORTSRCEXT}.labels)
@cut -f2 ${dir $@}Tatoeba-*.${LANGPAIR}.clean.id | sort -u | \
grep -v '${SKIP_LANGIDS_PATTERN}' | \
tr "\n" ' ' | sed 's/^ *//;s/ *$$//' > $(@:.${SRCEXT}.gz=.${TRGEXT}.labels)
tr "\n" ' ' | sed 's/^ *//;s/ *$$//' > $(@:.${SRCEXT}.gz=.${SORTTRGEXT}.labels)
@cat ${dir $@}Tatoeba-*.${LANGPAIR}.clean.domain | sort -u |\
tr "\n" ' ' | sed 's/^ *//;s/ *$$//' > $(@:.${SRCEXT}.gz=.domains)
#######################################

View File

@ -13,7 +13,9 @@ ifeq (${SUBWORDS},spm)
${MODEL_VOCAB}: ${SPMSRCMODEL} ${SPMTRGMODEL}
ifneq (${MODEL_LATEST_VOCAB},)
ifneq (${MODEL_LATEST_VOCAB},${MODEL_VOCAB})
cp ${MODEL_LATEST_VOCAB} ${MODEL_VOCAB}
endif
else
cut -f1 < ${word 1,$^}.vocab > ${@:.vocab.yml=.src.vocab}
cut -f1 < ${word 2,$^}.vocab > ${@:.vocab.yml=.trg.vocab}
@ -39,7 +41,9 @@ ${MODEL_VOCAB}: ${TRAIN_SRC}.clean.${PRE_SRC}${TRAINSIZE}.gz \
${TRAIN_TRG}.clean.${PRE_TRG}${TRAINSIZE}.gz
ifeq ($(wildcard ${MODEL_SRCVOCAB} ${MODEL_TRGVOCAB}),)
ifneq (${MODEL_LATEST_VOCAB},)
ifneq (${MODEL_LATEST_VOCAB},${MODEL_VOCAB})
cp ${MODEL_LATEST_VOCAB} ${MODEL_VOCAB}
endif
else
mkdir -p ${dir $@}
${LOADMODS} && ${ZCAT} $^ | ${MARIAN_VOCAB} --max-size ${VOCABSIZE} > $@
@ -184,11 +188,15 @@ ${MARIAN_MODELS_DONE}: ${MARIAN_TRAIN_PREREQS}
ifeq (${wildcard ${MODEL_START}},)
ifneq (${MODEL_LATEST},)
ifneq (${MODEL_LATEST_VOCAB},)
ifneq (${MODEL_LATEST_VOCAB},${MODEL_VOCAB})
cp ${MODEL_LATEST_VOCAB} ${MODEL_VOCAB}
endif
ifneq (${MODEL_LATEST},${MODEL_START})
cp ${MODEL_LATEST} ${MODEL_START}
endif
endif
endif
endif
##--------------------------------------------------------------------
${MAKE} ${MODEL_SRCVOCAB} ${MODEL_TRGVOCAB}
${LOADMODS} && ${MARIAN_TRAIN} ${MARIAN_EXTRA} \

124
scripts/evaluate/check-overlap.pl Executable file
View File

@ -0,0 +1,124 @@
#!/usr/bin/env perl
#
# check-overlap.pl - report the overlap between a large bitext (e.g.
# training data) and one or more smaller bitexts (e.g. dev/test sets).
#
# Usage:
#   check-overlap.pl [OPTIONS] big.src big.trg small1.src small1.trg [small2.src small2.trg ...]
#
# Options:
#   -a|--alpha       keep only alphabetic characters before comparing
#   -l|--lower-case  lower-case all text before comparing
#   -d|--decode-spm  undo sentencepiece segmentation before comparing
#   -v|--verbose     print overlapping sentence pairs to STDERR

use utf8;
use strict;
use warnings;
use open qw/:std :utf8/;

use Getopt::Long;

# normalisation flags applied to every sentence (see normalise() below)
my $AlphaOnly = 0;
my $LowerCase = 0;
my $DecodeSpm = 0;
my $verbose   = 0;

GetOptions(
    "alpha|a"      => \$AlphaOnly,
    "lower-case|l" => \$LowerCase,
    "decode-spm|d" => \$DecodeSpm,
    "verbose|v"    => \$verbose );

# the big bitext that is checked against the collection of small ones
my $BigSrcFile = shift(@ARGV);
my $BigTrgFile = shift(@ARGV);

# lookup tables filled from the small bitexts:
# individual source sentences, target sentences and full sentence pairs
my %SrcSents  = ();
my %TrgSents  = ();
my %SentPairs = ();

# all remaining arguments are (src,trg) file pairs of small bitexts
while (@ARGV){
    my $SrcFile = shift(@ARGV);
    my $TrgFile = shift(@ARGV);
    read_pairs($SrcFile,$TrgFile);
}

my $S = open_file($BigSrcFile);
my $T = open_file($BigTrgFile);

# counters over the big bitext (total lines and overlap hits)
my $total = 0;
my ($SrcExists,$TrgExists,$PairExists) = (0,0,0);
my %SrcUniqueExists  = ();
my %TrgUniqueExists  = ();
my %PairUniqueExists = ();
# Read the big bitext line by line (source and target in parallel) and
# count how many sentences / sentence pairs also occur in the small
# bitexts collected by read_pairs().
while (<$S>){
my $trg = <$T>;
# apply the same normalisation that was used when filling the tables
&normalise($_);
&normalise($trg);
$total++;
# source sentence also present in the devtest data?
if (exists $SrcSents{$_}){
$SrcExists++;
$SrcUniqueExists{$_}++;
}
# target sentence also present in the devtest data?
if (exists $TrgSents{$trg}){
$TrgExists++;
$TrgUniqueExists{$trg}++;
}
# complete sentence pair also present in the devtest data?
if (exists $SentPairs{"$_\t$trg"}){
$PairExists++;
# NOTE(review): chomp modifies $_ here, so the key used for
# %PairUniqueExists below differs from the %SentPairs key above;
# it is used consistently within this branch, but confirm this
# asymmetry is intended.
chomp;
unless (exists $PairUniqueExists{"$_\t$trg"}){
print STDERR "exists: $_\t$trg\n" if ($verbose);
$PairUniqueExists{"$_\t$trg"}++;
}
}
}

# number of unique sentence pairs collected from the small bitexts
my $TotalSmall = scalar keys %SentPairs;

# report overlap relative to the size of the big (training) bitext
if ($total){
printf "source sentences from train found in devtest\t%d\t%5.2f\%\n",$SrcExists,100*$SrcExists/$total;
printf "target sentences from train found in devtest\t%d\t%5.2f\%\n",$TrgExists,100*$TrgExists/$total;
printf " sentence pairs from train found in devtest\t%d\t%5.2f\%\n",$PairExists,100*$PairExists/$total;
print "total size of training data\t",$total,"\n";
}

# report overlap relative to the size of the small (devtest) bitexts
if ($TotalSmall){
my $SrcExistsSmall = scalar keys %SrcUniqueExists;
my $TrgExistsSmall = scalar keys %TrgUniqueExists;
my $PairExistsSmall = scalar keys %PairUniqueExists;
printf "source sentences from devtest found in train\t%d\t%5.2f\%\n",$SrcExistsSmall,100*$SrcExistsSmall/$TotalSmall;
printf "target sentences from devtest found in train\t%d\t%5.2f\%\n",$TrgExistsSmall,100*$TrgExistsSmall/$TotalSmall;
printf " sentence pairs from devtest found in train\t%d\t%5.2f\%\n",$PairExistsSmall,100*$PairExistsSmall/$TotalSmall;
print "total size of devtest data\t",$TotalSmall,"\n";
}
# Fill the global lookup tables %SrcSents, %TrgSents and %SentPairs with
# the (normalised) sentences and sentence pairs of one small bitext.
sub read_pairs{
    my ($src_file, $trg_file) = @_;
    my $src_fh = open_file($src_file);
    my $trg_fh = open_file($trg_file);
    while (my $src = <$src_fh>){
        my $trg = <$trg_fh>;
        normalise($src);
        normalise($trg);
        $SrcSents{$src}          = 1;
        $TrgSents{$trg}          = 1;
        $SentPairs{"$src\t$trg"} = 1;
    }
    close $src_fh;
    close $trg_fh;
}
# Open a (possibly gzip-compressed) file for reading and return a
# lexical filehandle; dies when the file cannot be opened.
#
# BUGFIX: the original used 2-argument open with '|| die'; '||' binds to
# the (always true) filename string, so the die could never fire and
# open failures were silently ignored. Use 3-argument / list-form open
# with the low-precedence 'or' instead (also avoids shell interpolation
# of the filename in the gzip case).
sub open_file{
    my $file = shift;
    my $handle;
    if ($file =~ /\.gz$/){
        # decompress on the fly; list-form pipe open bypasses the shell
        open $handle, '-|', 'gzip', '-cd', $file
            or die "cannot open $file\n";
        return $handle;
    }
    open $handle, '<', $file or die "cannot open $file\n";
    return $handle;
}
# Normalise one sentence in place (the argument is modified through the
# @_ alias). Behaviour is controlled by the global command-line flags
# $AlphaOnly, $LowerCase and $DecodeSpm.
sub normalise{
    if ($AlphaOnly){
        $_[0] =~ s/\P{IsAlpha}//gs;
    }
    if ($LowerCase){
        $_[0] = lc($_[0]);
    }
    if ($DecodeSpm){
        # undo sentencepiece segmentation
        # NOTE(review): this first turns '▁' markers into spaces and then
        # deletes ALL spaces, so the result contains no spaces at all --
        # confirm that this is the intended comparison key.
        if ($_[0] =~ s/▁/ /g){
            $_[0] =~ s/ //g;
        }
    }
}

View File

@ -0,0 +1 @@
newsdev2019-enkk.kaz.gz

View File

@ -0,0 +1 @@
newstest2019-enkk.kaz.gz

1
testsets/eng-kaz_Cyrl Symbolic link
View File

@ -0,0 +1 @@
eng-kaz

View File

@ -0,0 +1 @@
../de-fi/goethe-institute-test1.de.gz

View File

@ -0,0 +1 @@
../de-fi/goethe-institute-test1.fi.gz

View File

@ -0,0 +1 @@
../de-fi/goethe-institute-test2.de.gz

View File

@ -0,0 +1 @@
../de-fi/goethe-institute-test2.fi.gz

View File

@ -0,0 +1 @@
../deu-fin/goethe-institute-test1.deu.gz

View File

@ -0,0 +1 @@
../deu-fin/goethe-institute-test1.fin.gz

View File

@ -0,0 +1 @@
../deu-fin/goethe-institute-test2.deu.gz

View File

@ -0,0 +1 @@
../deu-fin/goethe-institute-test2.fin.gz