diff --git a/bt-tatoeba/Makefile b/bt-tatoeba/Makefile index a28f1363..b37f1327 100644 --- a/bt-tatoeba/Makefile +++ b/bt-tatoeba/Makefile @@ -222,7 +222,7 @@ tatoeba-print-reliable-trg: @echo ${TATOEBA_RELIABLE_TRG} -# RELEASED_BT_ALL := ${shell wget -qq -O - ${TATOEBA_RELEASED_BT}} +RELEASED_BT_ALL := ${shell wget -qq -O - ${TATOEBA_RELEASED_BT}} RELEASED_BT := ${shell wget -qq -O - ${TATOEBA_RELEASED_BT} | grep '^${LANGPAIR}/'} fetch-bt: @@ -232,6 +232,13 @@ fetch-bt: wget -qq -O $$d https://object.pouta.csc.fi/Tatoeba-MT-bt/$$d; \ done +fetch-all-bt: + for d in ${RELEASED_BT_ALL}; do \ + echo "fetch $$d"; \ + mkdir -p `dirname $$d`; \ + wget -qq -O $$d https://object.pouta.csc.fi/Tatoeba-MT-bt/$$d; \ + done + #--------------------------------------------------------------- # release data diff --git a/lib/config.mk b/lib/config.mk index 13f7f0b8..bf068a91 100644 --- a/lib/config.mk +++ b/lib/config.mk @@ -112,8 +112,10 @@ MAX_OVER_SAMPLING ?= 50 # sorted languages and langpair used to match resources in OPUS SORTLANGS = $(sort ${SRC} ${TRG}) +SORTSRC = ${firstword ${SORTLANGS}} +SORTTRG = ${lastword ${SORTLANGS}} +LANGPAIR = ${SORTSRC}-${SORTTRG} SPACE = $(empty) $(empty) -LANGPAIR = ${firstword ${SORTLANGS}}-${lastword ${SORTLANGS}} LANGSRCSTR = ${subst ${SPACE},+,$(SRCLANGS)} LANGTRGSTR = ${subst ${SPACE},+,$(TRGLANGS)} LANGPAIRSTR = ${LANGSRCSTR}-${LANGTRGSTR} @@ -128,11 +130,15 @@ LANGSTR ?= ${subst ${SPACE},+,$(LANGS)} ## for same language pairs: add numeric extension ## (this is neccessary to keep source and target files separate) ifeq (${SRC},$(TRG)) - SRCEXT = ${SRC}1 - TRGEXT = ${SRC}2 + SRCEXT = ${SRC}1 + TRGEXT = ${SRC}2 + SORTSRCEXT = ${SORTSRC}1 + SORTTRGEXT = ${SORTSRC}2 else - SRCEXT = ${SRC} - TRGEXT = ${TRG} + SRCEXT = ${SRC} + TRGEXT = ${TRG} + SORTSRCEXT = ${SORTSRC} + SORTTRGEXT = ${SORTTRG} endif ## set a flag to use target language labels diff --git a/lib/data.mk b/lib/data.mk index 7a8382ae..44796350 100644 --- a/lib/data.mk +++ 
b/lib/data.mk @@ -422,7 +422,7 @@ ifdef SHUFFLE_DATA endif ###################################### # FIT_DATA_SIZE is set? -# --> fit data to speciic size +# --> fit data to specific size # --> under/over sampling! ###################################### @echo -n "* ${SRC}-${TRG}: total size = " >> ${dir ${LOCAL_TRAIN_SRC}}README.md diff --git a/lib/env.mk b/lib/env.mk index b595b86e..1fb3208f 100644 --- a/lib/env.mk +++ b/lib/env.mk @@ -121,7 +121,8 @@ else ifneq ($(wildcard /wrk/tiedeman/research),) LOADGPU = module load ${GPU_MODULES} LOADMODS = ${LOADGPU} else ifeq (${shell hostname --domain 2>/dev/null},bullx) - CSCPROJECT = project_2002688 +# CSCPROJECT = project_2002688 + CSCPROJECT = project_2000309 # CSCPROJECT = project_2002982 WORKHOME = ${shell realpath ${PWD}/work} APPLHOME = /projappl/project_2001194 diff --git a/lib/projects/finland.mk b/lib/projects/finland.mk index dee5f3be..5ee852aa 100644 --- a/lib/projects/finland.mk +++ b/lib/projects/finland.mk @@ -50,6 +50,12 @@ fi-zh: train-dynamic.submitcpu + +# Tatoeba: more than 100 test sentences: +# ain dan deu eng enm epo est fkv fra heb hun ita jpn kor kur lat lit nld nor pol por rus spa swe tur zho + + + #------------------------------------------------------------------- # add THL backtranslation data (and also all other backtranslations) #------------------------------------------------------------------- diff --git a/lib/projects/tatoeba.mk b/lib/projects/tatoeba.mk index d985ba87..c33e82ae 100644 --- a/lib/projects/tatoeba.mk +++ b/lib/projects/tatoeba.mk @@ -3,10 +3,6 @@ # Makefile for running models with data from the Tatoeba Translation Challenge # https://github.com/Helsinki-NLP/Tatoeba-Challenge # -# NEWS -# -# - MODELTYPE=transformer is now default for all Tatoeba models -# (no guided alignment!) 
# #--------------------------------------------------------------------- # train and evaluate a single translation pair, for example: @@ -90,8 +86,7 @@ ## general parameters for Tatoeba models - -## NEW: release +## Tatoeba Challenge Data release number # TATOEBA_VERSION ?= v2020-07-28 TATOEBA_VERSION ?= v2021-08-07 @@ -99,17 +94,17 @@ TATOEBA_DATAURL := https://object.pouta.csc.fi/Tatoeba-Challenge TATOEBA_TEST_URL := ${TATOEBA_DATAURL}-${TATOEBA_VERSION} TATOEBA_TRAIN_URL := ${TATOEBA_DATAURL}-${TATOEBA_VERSION} TATOEBA_MONO_URL := ${TATOEBA_DATAURL}-${TATOEBA_VERSION} -# TATOEBA_TEST_URL := ${TATOEBA_DATAURL} -# TATOEBA_TRAIN_URL := ${TATOEBA_DATAURL} -# TATOEBA_MONO_URL := ${TATOEBA_DATAURL} -TATOEBA_RAWGIT := https://raw.githubusercontent.com/Helsinki-NLP/Tatoeba-Challenge/master TATOEBA_WORK ?= ${PWD}/work-tatoeba TATOEBA_DATA ?= ${TATOEBA_WORK}/data/${PRE} TATOEBA_MONO ?= ${TATOEBA_WORK}/data/mono +# TATOEBA_RAWGIT := https://raw.githubusercontent.com/Helsinki-NLP/Tatoeba-Challenge/master +TATOEBA_RAWGIT_MASTER := https://raw.githubusercontent.com/Helsinki-NLP/Tatoeba-Challenge/master +TATOEBA_RAWGIT_RELEASE := https://raw.githubusercontent.com/Helsinki-NLP/Tatoeba-Challenge/${TATOEBA_VERSION} + ## data count files (file basename) -TATOEBA_DATA_COUNT_BASE = ${TATOEBA_RAWGIT}/data/release/${TATOEBA_VERSION}/released-bitexts +TATOEBA_DATA_COUNT_BASE = ${TATOEBA_RAWGIT_MASTER}/data/release/${TATOEBA_VERSION}/released-bitexts ## all released language pairs with test sets > 200 test pairs ## also extract all source languages that are available for a give target language @@ -132,7 +127,7 @@ WIKIMACROLANGS ?= $(sort ${shell ${GET_ISO_CODE} ${WIKILANGS}}) TATOEBA_MODEL_CONTAINER := Tatoeba-MT-models ## this will be the base name of the model file -TATOEBA_DATASET := opusTC$(subst -,,${TATOEBA_VERSION}) +TATOEBA_DATASET := ${DATASET}TC$(subst -,,${TATOEBA_VERSION}) TATOEBA_TRAINSET := Tatoeba-train-${TATOEBA_VERSION} TATOEBA_DEVSET := Tatoeba-dev-${TATOEBA_VERSION} 
@@ -525,21 +520,21 @@ all-tatoeba-langgroups:
 
 #### language-group to English
 
-GROUP2ENG_TRAIN := $(patsubst %,tatoeba-%2eng-train,${OPUS_LANG_GROUPS})
+GROUP2ENG_TRAIN := $(patsubst %,tatoeba-%2eng-trainjob,${OPUS_LANG_GROUPS})
 GROUP2ENG_EVAL := $(patsubst %,tatoeba-%2eng-eval,${OPUS_LANG_GROUPS})
 GROUP2ENG_EVALALL := $(patsubst %,tatoeba-%2eng-evalall,${OPUS_LANG_GROUPS})
 GROUP2ENG_DIST := $(patsubst %,tatoeba-%2eng-dist,${OPUS_LANG_GROUPS})
 
 #### English to language group
 
-ENG2GROUP_TRAIN := $(patsubst %,tatoeba-eng2%-train,${OPUS_LANG_GROUPS})
+ENG2GROUP_TRAIN := $(patsubst %,tatoeba-eng2%-trainjob,${OPUS_LANG_GROUPS})
 ENG2GROUP_EVAL := $(patsubst %,tatoeba-eng2%-eval,${OPUS_LANG_GROUPS})
 ENG2GROUP_EVALALL := $(patsubst %,tatoeba-eng2%-evalall,${OPUS_LANG_GROUPS})
 ENG2GROUP_DIST := $(patsubst %,tatoeba-eng2%-dist,${OPUS_LANG_GROUPS})
 
-#### multilingual language-group (bi-directional
+#### multilingual language-group (bi-directional); EVAL/EVALALL/DIST names are rewritten from the *-trainjob names in LANGGROUP_TRAIN
 
-LANGGROUP_TRAIN := $(foreach G,${OPUS_LANG_GROUPS},tatoeba-${G}2${G}-train)
-LANGGROUP_EVAL := $(patsubst %-train,%-eval,${LANGGROUP_TRAIN})
-LANGGROUP_EVALALL := $(patsubst %-train,%-evalall,${LANGGROUP_TRAIN})
-LANGGROUP_DIST := $(patsubst %-train,%-dist,${LANGGROUP_TRAIN})
+LANGGROUP_TRAIN := $(foreach G,${OPUS_LANG_GROUPS},tatoeba-${G}2${G}-trainjob)
+LANGGROUP_EVAL := $(patsubst %-trainjob,%-eval,${LANGGROUP_TRAIN})
+LANGGROUP_EVALALL := $(patsubst %-trainjob,%-evalall,${LANGGROUP_TRAIN})
+LANGGROUP_DIST := $(patsubst %-trainjob,%-dist,${LANGGROUP_TRAIN})
@@ -547,17 +542,17 @@ LANGGROUP_DIST := $(patsubst %-train,%-dist,${LANGGROUP_TRAIN})
 LANGGROUP_FIT_DATA_SIZE=1000000
 
 ## start all jobs with 1 million sampled sentence pairs per language pair
+## (OLD: MODELTYPE=transformer)
 all-tatoeba-group2eng:
-	${MAKE} MIN_SRCLANGS=2 MODELTYPE=transformer \
+	${MAKE} MIN_SRCLANGS=2 SKIP_LANGPAIRS="eng-eng" \
 		FIT_DATA_SIZE=${LANGGROUP_FIT_DATA_SIZE} ${GROUP2ENG_TRAIN}
 
 all-tatoeba-eng2group:
-	${MAKE} MIN_TRGLANGS=2 MODELTYPE=transformer \
+	${MAKE} MIN_TRGLANGS=2 SKIP_LANGPAIRS="eng-eng" \
 		FIT_DATA_SIZE=${LANGGROUP_FIT_DATA_SIZE} ${ENG2GROUP_TRAIN}
 
 all-tatoeba-langgroup:
-	${MAKE} MIN_SRCLANGS=2 MAX_SRCLANGS=30 PIVOT=eng \
-		MODELTYPE=transformer \
+	${MAKE} MIN_SRCLANGS=2 MAX_SRCLANGS=30 PIVOT=eng 
SKIP_LANGPAIRS="eng-eng" \ FIT_DATA_SIZE=${LANGGROUP_FIT_DATA_SIZE} ${LANGGROUP_TRAIN} all-tatoeba-cross-langgroups: @@ -566,7 +561,6 @@ all-tatoeba-cross-langgroups: if [ "$$s" != "$$t" ]; then \ ${MAKE} MIN_SRCLANGS=2 MIN_TRGLANGS=2 \ MAX_SRCLANGS=30 MAX_TRGLANGS=30 \ - MODELTYPE=transformer \ FIT_DATA_SIZE=${LANGGROUP_FIT_DATA_SIZE} \ tatoeba-$${s}2$${t}-train; \ fi \ @@ -787,6 +781,10 @@ tatoeba-%-train: fi \ fi ) +test-fiu2eng: + echo "${call find-srclanggroup,${patsubst test-%,%,$@},${PIVOT}}" + echo "${call find-trglanggroup,${patsubst test-%,%,$@},${PIVOT}}" + ## start the training job ## - create config file @@ -1129,7 +1127,7 @@ tatoeba-%-langtunealljobs: ## get the markdown page for a specific subset tatoeba-%.md: - wget -O $@ ${TATOEBA_RAWGIT}/subsets/${patsubst tatoeba-%,%,$@} + wget -O $@ ${TATOEBA_RAWGIT_MASTER}/subsets/${TATOEBA_VERSION}/${patsubst tatoeba-%,%,$@} ## run all language pairs for a given subset @@ -1340,7 +1338,7 @@ ${TATOEBA_WORK}/${LANGPAIRSTR}/test/Tatoeba-testsets.done: @for s in ${SRCLANGS}; do \ for t in ${TRGLANGS}; do \ wget -q -O ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.tmp \ - ${TATOEBA_RAWGIT}/data/test/$$s-$$t/test.txt; \ + ${TATOEBA_RAWGIT_RELEASE}/data/test/$$s-$$t/test.txt; \ if [ -s ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.tmp ]; then \ cat ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.tmp $(FIXLANGIDS) \ > ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.txt; \ @@ -1387,7 +1385,7 @@ ${TATOEBA_WORK}/${LANGPAIRSTR}/test/Tatoeba-testsets.done: fi; \ else \ wget -q -O ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.tmp \ - ${TATOEBA_RAWGIT}/data/test/$$t-$$s/test.txt; \ + ${TATOEBA_RAWGIT_RELEASE}/data/test/$$t-$$s/test.txt; \ if [ -s ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.tmp ]; then \ cat ${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.tmp $(FIXLANGIDS) \ > 
${TATOEBA_WORK}/${LANGPAIRSTR}/test/${TATOEBA_TESTSET}.$$s-$$t.txt; \ @@ -1726,7 +1724,7 @@ KEEP_LANGIDS = bos_Cyrl cmn cnr cnr_Latn csb diq dnj dty fas fqs ful fur nor nor_Latn oss_Latn pan plt pnb_Guru pob prs qug quw quy quz qvi rmn rmy ruk san swa swc \ syr syr_Syrc tgk_Latn thy tlh tmh toi tuk_Cyrl urd_Deva xal_Latn yid_Latn zho zlm SKIP_LANGIDS = ${filter-out ${KEEP_LANGIDS},${TRAIN_ONLY_LANGIDS}} \ - ang ara_Latn arq_Latn apc_Latn bul_Latn ell_Latn heb_Latn nob_Hebr rus_Latn + ang ara_Latn arq_Latn apc_Latn bul_Latn ell_Latn eng_Tibt eng_Zinh heb_Latn hun_Zinh nob_Hebr rus_Latn SKIP_LANGIDS_PATTERN = ^\(${subst ${SPACE},\|,${SKIP_LANGIDS}}\)$$ ## modify language IDs in training data to adjust them to test sets @@ -1738,6 +1736,9 @@ SKIP_LANGIDS_PATTERN = ^\(${subst ${SPACE},\|,${SKIP_LANGIDS}}\)$$ FIXLANGIDS = | sed 's/zho\(.*\)_HK/yue\1/g;s/zho\(.*\)_CN/cmn\1/g;s/zho\(.*\)_TW/cmn\1/g;s/zho/cmn/g;' \ | sed 's/\_[A-Z][A-Z]//g' \ | sed 's/\-[a-z]*//g' \ + | sed 's/\_Brai//g' \ + | sed 's/\_Zinh//g' \ + | sed 's/\_Tibt//g' \ | sed 's/jpn_[A-Za-z]*/jpn/g' \ | sed 's/kor_[A-Za-z]*/kor/g' \ | sed 's/nor_Latn/nor/g' \ @@ -1807,7 +1808,8 @@ TATOEBA_TMPDATADIR = data/release/${TATOEBA_VERSION}/${LANGPAIR} mv $@.d/${TATOEBA_TMPDATADIR}/test.trg ${dir $@}${TATOEBA_TESTSET}.${LANGPAIR}.clean.${TRGEXT}; \ cat $@.d/${TATOEBA_TMPDATADIR}/test.id $(FIXLANGIDS) > ${dir $@}${TATOEBA_TESTSET}.${LANGPAIR}.clean.id; \ fi - @if [ -e $@.d/${TATOEBA_TMPDATADIR}/dev.src ]; then \ + @if [ -e $@.d/${TATOEBA_TMPDATADIR}/dev.src ] && \ + [ `cat $@.d/${TATOEBA_TMPDATADIR}/dev.src | wc -l` -gt 50 ]; then \ echo "........ 
move dev files to ${dir $@}${TATOEBA_DEVSET}.${LANGPAIR}.clean.*"; \ mv $@.d/${TATOEBA_TMPDATADIR}/dev.src ${dir $@}${TATOEBA_DEVSET}.${LANGPAIR}.clean.${SRCEXT}; \ mv $@.d/${TATOEBA_TMPDATADIR}/dev.trg ${dir $@}${TATOEBA_DEVSET}.${LANGPAIR}.clean.${TRGEXT}; \ @@ -1821,7 +1823,7 @@ TATOEBA_TMPDATADIR = data/release/${TATOEBA_VERSION}/${LANGPAIR} fi; \ else \ if [ -e $@.d/${TATOEBA_TMPDATADIR}/train.src.gz ]; then \ - echo "no devdata available - get top 1000 from training data!"; \ + echo "........ too little devdata available - get top 1000 from training data!"; \ ${GZCAT} $@.d/${TATOEBA_TMPDATADIR}/train.src.gz | head -1000 > ${dir $@}${TATOEBA_DEVSET}.${LANGPAIR}.clean.${SRCEXT}; \ ${GZCAT} $@.d/${TATOEBA_TMPDATADIR}/train.trg.gz | head -1000 > ${dir $@}${TATOEBA_DEVSET}.${LANGPAIR}.clean.${TRGEXT}; \ ${GZCAT} $@.d/${TATOEBA_TMPDATADIR}/train.id.gz | head -1000 | cut -f2,3 $(FIXLANGIDS) > ${dir $@}${TATOEBA_DEVSET}.${LANGPAIR}.clean.id; \ @@ -1830,6 +1832,12 @@ TATOEBA_TMPDATADIR = data/release/${TATOEBA_VERSION}/${LANGPAIR} ${GZCAT} $@.d/${TATOEBA_TMPDATADIR}/train.trg.gz | tail -n +1001 > ${dir $@}${TATOEBA_TRAINSET}.${LANGPAIR}.clean.${TRGEXT}; \ ${GZCAT} $@.d/${TATOEBA_TMPDATADIR}/train.id.gz | tail -n +1001 | cut -f2,3 $(FIXLANGIDS) > ${dir $@}${TATOEBA_TRAINSET}.${LANGPAIR}.clean.id; \ ${GZCAT} $@.d/${TATOEBA_TMPDATADIR}/train.id.gz | tail -n +1001 | cut -f1 > ${dir $@}${TATOEBA_TRAINSET}.${LANGPAIR}.clean.domain; \ + fi; \ + if [ -e $@.d/${TATOEBA_TMPDATADIR}/dev.src ]; then \ + echo "........ 
add dev files to ${dir $@}${TATOEBA_DEVSET}.${LANGPAIR}.clean.*"; \ + cat $@.d/${TATOEBA_TMPDATADIR}/dev.src >> ${dir $@}${TATOEBA_DEVSET}.${LANGPAIR}.clean.${SRCEXT}; \ + cat $@.d/${TATOEBA_TMPDATADIR}/dev.trg >> ${dir $@}${TATOEBA_DEVSET}.${LANGPAIR}.clean.${TRGEXT}; \ + cat $@.d/${TATOEBA_TMPDATADIR}/dev.id $(FIXLANGIDS) >> ${dir $@}${TATOEBA_DEVSET}.${LANGPAIR}.clean.id; \ fi \ fi ## make sure that training data file exists even if it is empty @@ -1842,10 +1850,10 @@ TATOEBA_TMPDATADIR = data/release/${TATOEBA_VERSION}/${LANGPAIR} ####################################### @cut -f1 ${dir $@}Tatoeba-*.${LANGPAIR}.clean.id | sort -u | \ grep -v '${SKIP_LANGIDS_PATTERN}' | \ - tr "\n" ' ' | sed 's/^ *//;s/ *$$//' > $(@:.${SRCEXT}.gz=.${SRCEXT}.labels) + tr "\n" ' ' | sed 's/^ *//;s/ *$$//' > $(@:.${SRCEXT}.gz=.${SORTSRCEXT}.labels) @cut -f2 ${dir $@}Tatoeba-*.${LANGPAIR}.clean.id | sort -u | \ grep -v '${SKIP_LANGIDS_PATTERN}' | \ - tr "\n" ' ' | sed 's/^ *//;s/ *$$//' > $(@:.${SRCEXT}.gz=.${TRGEXT}.labels) + tr "\n" ' ' | sed 's/^ *//;s/ *$$//' > $(@:.${SRCEXT}.gz=.${SORTTRGEXT}.labels) @cat ${dir $@}Tatoeba-*.${LANGPAIR}.clean.domain | sort -u |\ tr "\n" ' ' | sed 's/^ *//;s/ *$$//' > $(@:.${SRCEXT}.gz=.domains) ####################################### diff --git a/lib/train.mk b/lib/train.mk index d5d77ca7..a66d50ae 100644 --- a/lib/train.mk +++ b/lib/train.mk @@ -13,7 +13,9 @@ ifeq (${SUBWORDS},spm) ${MODEL_VOCAB}: ${SPMSRCMODEL} ${SPMTRGMODEL} ifneq (${MODEL_LATEST_VOCAB},) +ifneq (${MODEL_LATEST_VOCAB},${MODEL_VOCAB}) cp ${MODEL_LATEST_VOCAB} ${MODEL_VOCAB} +endif else cut -f1 < ${word 1,$^}.vocab > ${@:.vocab.yml=.src.vocab} cut -f1 < ${word 2,$^}.vocab > ${@:.vocab.yml=.trg.vocab} @@ -39,7 +41,9 @@ ${MODEL_VOCAB}: ${TRAIN_SRC}.clean.${PRE_SRC}${TRAINSIZE}.gz \ ${TRAIN_TRG}.clean.${PRE_TRG}${TRAINSIZE}.gz ifeq ($(wildcard ${MODEL_SRCVOCAB} ${MODEL_TRGVOCAB}),) ifneq (${MODEL_LATEST_VOCAB},) +ifneq (${MODEL_LATEST_VOCAB},${MODEL_VOCAB}) cp ${MODEL_LATEST_VOCAB} 
${MODEL_VOCAB} +endif else mkdir -p ${dir $@} ${LOADMODS} && ${ZCAT} $^ | ${MARIAN_VOCAB} --max-size ${VOCABSIZE} > $@ @@ -184,11 +188,15 @@ ${MARIAN_MODELS_DONE}: ${MARIAN_TRAIN_PREREQS} ifeq (${wildcard ${MODEL_START}},) ifneq (${MODEL_LATEST},) ifneq (${MODEL_LATEST_VOCAB},) +ifneq (${MODEL_LATEST_VOCAB},${MODEL_VOCAB}) cp ${MODEL_LATEST_VOCAB} ${MODEL_VOCAB} +endif +ifneq (${MODEL_LATEST},${MODEL_START}) cp ${MODEL_LATEST} ${MODEL_START} endif endif endif +endif ##-------------------------------------------------------------------- ${MAKE} ${MODEL_SRCVOCAB} ${MODEL_TRGVOCAB} ${LOADMODS} && ${MARIAN_TRAIN} ${MARIAN_EXTRA} \ diff --git a/scripts/evaluate/check-overlap.pl b/scripts/evaluate/check-overlap.pl new file mode 100755 index 00000000..8cb9c962 --- /dev/null +++ b/scripts/evaluate/check-overlap.pl @@ -0,0 +1,124 @@ +#!/usr/bin/env perl + +use utf8; +use strict; +use open qw/:std :utf8/; +use Getopt::Long; + +my $AlphaOnly = 0; +my $LowerCase = 0; +my $DecodeSpm = 0; +my $verbose = 0; + +GetOptions( + "alpha|a" => \$AlphaOnly, + "lower-case|l" => \$LowerCase, + "decode-spm|d" => \$DecodeSpm, + "verbose|v" => \$verbose ); + +my $BigSrcFile = shift(@ARGV); +my $BigTrgFile = shift(@ARGV); + +my %SrcSents = (); +my %TrgSents = (); +my %SentPairs = (); + + + +while (@ARGV){ + my $SrcFile = shift(@ARGV); + my $TrgFile = shift(@ARGV); + read_pairs($SrcFile,$TrgFile); +} + + +my $S = open_file($BigSrcFile); +my $T = open_file($BigTrgFile); + + +my $total = 0; +my ($SrcExists,$TrgExists,$PairExists) = (0,0,0); +my %SrcUniqueExists = (); +my %TrgUniqueExists = (); +my %PairUniqueExists = (); + + +while (<$S>){ + my $trg = <$T>; + &normalise($_); + &normalise($trg); + $total++; + if (exists $SrcSents{$_}){ + $SrcExists++; + $SrcUniqueExists{$_}++; + } + if (exists $TrgSents{$trg}){ + $TrgExists++; + $TrgUniqueExists{$trg}++; + } + if (exists $SentPairs{"$_\t$trg"}){ + $PairExists++; + chomp; + unless (exists $PairUniqueExists{"$_\t$trg"}){ + print STDERR "exists: 
$_\t$trg\n" if ($verbose);
+            $PairUniqueExists{"$_\t$trg"}++;
+        }
+    }
+}
+
+my $TotalSmall = scalar keys %SentPairs;
+if ($total){    # share of training lines that also occur in the dev/test files ('%%' is printf's literal percent sign)
+    printf "source sentences from train found in devtest\t%d\t%5.2f%%\n",$SrcExists,100*$SrcExists/$total;
+    printf "target sentences from train found in devtest\t%d\t%5.2f%%\n",$TrgExists,100*$TrgExists/$total;
+    printf " sentence pairs from train found in devtest\t%d\t%5.2f%%\n",$PairExists,100*$PairExists/$total;
+    print "total size of training data\t",$total,"\n";
+}
+if ($TotalSmall){    # share of distinct dev/test entries that were seen in the training data
+    my $SrcExistsSmall = scalar keys %SrcUniqueExists;
+    my $TrgExistsSmall = scalar keys %TrgUniqueExists;
+    my $PairExistsSmall = scalar keys %PairUniqueExists;
+    printf "source sentences from devtest found in train\t%d\t%5.2f%%\n",$SrcExistsSmall,100*$SrcExistsSmall/$TotalSmall;
+    printf "target sentences from devtest found in train\t%d\t%5.2f%%\n",$TrgExistsSmall,100*$TrgExistsSmall/$TotalSmall;
+    printf " sentence pairs from devtest found in train\t%d\t%5.2f%%\n",$PairExistsSmall,100*$PairExistsSmall/$TotalSmall;
+    print "total size of devtest data\t",$TotalSmall,"\n";
+}
+
+
+sub read_pairs{    # load one dev/test file pair into %SrcSents, %TrgSents and %SentPairs (normalised lines as keys)
+    my ($SrcFile,$TrgFile) = @_;
+    my $S = open_file($SrcFile);
+    my $T = open_file($TrgFile);
+    while (<$S>){
+        my $trg = <$T>;
+        &normalise($_);
+        &normalise($trg);
+        $SrcSents{$_} = 1;
+        $TrgSents{$trg} = 1;
+        $SentPairs{"$_\t$trg"} = 1;
+    }
+    close $S;
+    close $T;
+}
+
+
+sub open_file{    # return a read handle for $file (*.gz is read through a gzip pipe); dies if the file cannot be opened
+    my $file = shift;
+    my $handle;
+    if ($file=~/\.gz$/){
+        (-r $file and open($handle,'-|','gzip','-cd','--',$file)) or die "cannot open $file: $!\n";    # list form: no shell involved; 'or' (not '||') so die is reachable
+        return $handle;
+    }
+    open($handle,'<',$file) or die "cannot open $file: $!\n";    # '||' would bind to the file-name string and never trigger die
+    return $handle;
+}
+
+
+sub normalise{    # in-place normalisation of $_[0] according to the command-line flags
+    $_[0]=~s/\P{IsAlpha}//gs if ($AlphaOnly);
+    $_[0] = lc($_[0]) if ($LowerCase);
+    if ($DecodeSpm){
+        if ($_[0]=~s/▁/ /g){
+            $_[0]=~s/ //g;
+        }
+    }
+}
diff --git a/testsets/en-kk/newsdev2019-enkk.kaz_Cyrl.gz b/testsets/en-kk/newsdev2019-enkk.kaz_Cyrl.gz
new file mode 120000
index 00000000..b54298e1
--- /dev/null
+++ 
b/testsets/en-kk/newsdev2019-enkk.kaz_Cyrl.gz @@ -0,0 +1 @@ +newsdev2019-enkk.kaz.gz \ No newline at end of file diff --git a/testsets/en-kk/newstest2019-enkk.kaz_Cyrl.gz b/testsets/en-kk/newstest2019-enkk.kaz_Cyrl.gz new file mode 120000 index 00000000..ce8b0d7e --- /dev/null +++ b/testsets/en-kk/newstest2019-enkk.kaz_Cyrl.gz @@ -0,0 +1 @@ +newstest2019-enkk.kaz.gz \ No newline at end of file diff --git a/testsets/eng-kaz_Cyrl b/testsets/eng-kaz_Cyrl new file mode 120000 index 00000000..fbf73208 --- /dev/null +++ b/testsets/eng-kaz_Cyrl @@ -0,0 +1 @@ +eng-kaz \ No newline at end of file diff --git a/testsets/fi-de/goethe-institute-test1.de.gz b/testsets/fi-de/goethe-institute-test1.de.gz new file mode 120000 index 00000000..1e544ad0 --- /dev/null +++ b/testsets/fi-de/goethe-institute-test1.de.gz @@ -0,0 +1 @@ +../de-fi/goethe-institute-test1.de.gz \ No newline at end of file diff --git a/testsets/fi-de/goethe-institute-test1.fi.gz b/testsets/fi-de/goethe-institute-test1.fi.gz new file mode 120000 index 00000000..638b47f5 --- /dev/null +++ b/testsets/fi-de/goethe-institute-test1.fi.gz @@ -0,0 +1 @@ +../de-fi/goethe-institute-test1.fi.gz \ No newline at end of file diff --git a/testsets/fi-de/goethe-institute-test2.de.gz b/testsets/fi-de/goethe-institute-test2.de.gz new file mode 120000 index 00000000..f18422a2 --- /dev/null +++ b/testsets/fi-de/goethe-institute-test2.de.gz @@ -0,0 +1 @@ +../de-fi/goethe-institute-test2.de.gz \ No newline at end of file diff --git a/testsets/fi-de/goethe-institute-test2.fi.gz b/testsets/fi-de/goethe-institute-test2.fi.gz new file mode 120000 index 00000000..fb83dc69 --- /dev/null +++ b/testsets/fi-de/goethe-institute-test2.fi.gz @@ -0,0 +1 @@ +../de-fi/goethe-institute-test2.fi.gz \ No newline at end of file diff --git a/testsets/fin-deu/goethe-institute-test1.deu.gz b/testsets/fin-deu/goethe-institute-test1.deu.gz new file mode 120000 index 00000000..4177dad5 --- /dev/null +++ b/testsets/fin-deu/goethe-institute-test1.deu.gz @@ 
-0,0 +1 @@ +../deu-fin/goethe-institute-test1.deu.gz \ No newline at end of file diff --git a/testsets/fin-deu/goethe-institute-test1.fin.gz b/testsets/fin-deu/goethe-institute-test1.fin.gz new file mode 120000 index 00000000..838e75f9 --- /dev/null +++ b/testsets/fin-deu/goethe-institute-test1.fin.gz @@ -0,0 +1 @@ +../deu-fin/goethe-institute-test1.fin.gz \ No newline at end of file diff --git a/testsets/fin-deu/goethe-institute-test2.deu.gz b/testsets/fin-deu/goethe-institute-test2.deu.gz new file mode 120000 index 00000000..794330f9 --- /dev/null +++ b/testsets/fin-deu/goethe-institute-test2.deu.gz @@ -0,0 +1 @@ +../deu-fin/goethe-institute-test2.deu.gz \ No newline at end of file diff --git a/testsets/fin-deu/goethe-institute-test2.fin.gz b/testsets/fin-deu/goethe-institute-test2.fin.gz new file mode 120000 index 00000000..31a6a24f --- /dev/null +++ b/testsets/fin-deu/goethe-institute-test2.fin.gz @@ -0,0 +1 @@ +../deu-fin/goethe-institute-test2.fin.gz \ No newline at end of file