tatoeba models added

This commit is contained in:
Joerg Tiedemann 2020-06-03 00:16:21 +03:00
parent ec43fcd30a
commit eeaef7768c
5 changed files with 123 additions and 5 deletions

View File

@ -181,6 +181,7 @@ include lib/models/wikimatrix.mk
include lib/models/doclevel.mk
include lib/models/simplify.mk
include lib/models/tatoeba.mk
.PHONY: all

View File

@ -354,6 +354,12 @@ endif
local-config: ${WORKDIR}/config.mk
SMALLEST_TRAINSIZE = 10000
SMALL_TRAINSIZE = 100000
MEDIUM_TRAINSIZE = 500000
LARGE_TRAINSIZE = 1000000
LARGEST_TRAINSIZE = 10000000
${WORKDIR}/config.mk:
mkdir -p ${dir $@}
if [ -e ${TRAIN_SRC}.clean.${PRE_SRC}${TRAINSIZE}.gz ]; then \
@ -363,23 +369,23 @@ ${WORKDIR}/config.mk:
s=`head -10000001 ${LOCAL_TRAIN_SRC} | wc -l`; \
rm -f ${LOCAL_TRAIN_SRC} ${LOCAL_TRAIN_TRG}; \
fi; \
if [ $$s -gt 10000000 ]; then \
if [ $$s -gt ${LARGEST_TRAINSIZE} ]; then \
echo "# ${LANGPAIRSTR} training data bigger than 10 million" > $@; \
echo "GPUJOB_HPC_MEM = 8g" >> $@; \
echo "GPUJOB_SUBMIT = -multigpu" >> $@; \
elif [ $$s -gt 1000000 ]; then \
elif [ $$s -gt ${LARGE_TRAINSIZE} ]; then \
echo "# ${LANGPAIRSTR} training data bigger than 1 million" > $@; \
echo "GPUJOB_HPC_MEM = 8g" >> $@; \
echo "GPUJOB_SUBMIT = " >> $@; \
echo "MARIAN_VALID_FREQ = 2500" >> $@; \
elif [ $$s -gt 500000 ]; then \
elif [ $$s -gt ${MEDIUM_TRAINSIZE} ]; then \
echo "# ${LANGPAIRSTR} training data bigger than 500k" > $@; \
echo "GPUJOB_HPC_MEM = 4g" >> $@; \
echo "GPUJOB_SUBMIT = " >> $@; \
echo "MARIAN_VALID_FREQ = 2500" >> $@; \
echo "MARIAN_WORKSPACE = 10000" >> $@; \
echo "BPESIZE = 12000" >> $@; \
elif [ $$s -gt 100000 ]; then \
elif [ $$s -gt ${SMALL_TRAINSIZE} ]; then \
echo "# ${LANGPAIRSTR} training data bigger than 100k" > $@; \
echo "GPUJOB_HPC_MEM = 4g" >> $@; \
echo "GPUJOB_SUBMIT = " >> $@; \
@ -391,7 +397,7 @@ ${WORKDIR}/config.mk:
echo "DEVSIZE = 1000" >> $@; \
echo "TESTSIZE = 1000" >> $@; \
echo "DEVMINSIZE = 250" >> $@; \
elif [ $$s -gt 10000 ]; then \
elif [ $$s -gt ${SMALLEST_TRAINSIZE} ]; then \
echo "# ${LANGPAIRSTR} training data less than 100k" > $@; \
echo "GPUJOB_HPC_MEM = 4g" >> $@; \
echo "GPUJOB_SUBMIT = " >> $@; \

View File

@ -557,6 +557,7 @@ ${DEV_TRG}: ${DEV_SRC}
add-to-dev-data: ${CLEAN_DEV_SRC} ${CLEAN_DEV_TRG}
mkdir -p ${dir ${DEV_SRC}}
ifneq (${wildcard ${CLEAN_DEV_SRC}},)
echo -n "* ${LANGPAIR}: ${DEVSET}, " >> ${dir ${DEV_SRC}}README.md
zcat ${CLEAN_DEV_SRC} | wc -l >> ${dir ${DEV_SRC}}README.md

92
lib/models/tatoeba.mk Normal file
View File

@ -0,0 +1,92 @@
# -*-makefile-*-
#
# Targets and helpers for training models on the Tatoeba MT Challenge data.

# base URL of the packaged Tatoeba Challenge release data (one tarball per language pair)
# := (simple expansion): these are constants, no need to re-expand on every use
TATOEBA_DATA := https://object.pouta.csc.fi/Tatoeba-Challenge

# raw GitHub content of the Tatoeba-Challenge repo (the subset markdown pages live here)
TATOEBA_RAWGIT := https://raw.githubusercontent.com/Helsinki-NLP/Tatoeba-Challenge/master

# separate work tree so Tatoeba runs don't collide with the default WORKHOME
# NOTE(review): ${PWD} comes from the environment; $(CURDIR) would be more robust
# under `make -C` — kept as ${PWD} to preserve current behavior, confirm before changing
TATOEBA_WORK := ${PWD}/work-tatoeba
## convenience wrappers around the generic %-tatoeba target below;
## these are commands, not files — declare them phony so a stray file
## named e.g. "tatoeba-train" can never shadow them
.PHONY: tatoeba-prepare tatoeba-train tatoeba-eval

## fetch the data and generate the local configuration
tatoeba-prepare:
	${MAKE} local-config-tatoeba
	${MAKE} data-tatoeba

## train a model on the prepared Tatoeba data
tatoeba-train:
	${MAKE} train-tatoeba

## evaluate: compare system output against the Tatoeba test set
tatoeba-eval:
	${MAKE} compare-tatoeba
## run every language pair listed in a given subset page
## (the markdown file contains one "[src-trg]" bracketed link per released pair);
## for each pair: prepare data, run the full job, then also run the reverse direction
tatoeba-%: tatoeba-%.md
	for pair in `grep '\[' $< | cut -f2 -d '[' | cut -f1 -d ']'`; do \
	  src=`echo $$pair | cut -f1 -d '-'`; \
	  trg=`echo $$pair | cut -f2 -d '-'`; \
	  ${MAKE} SRCLANGS=$$src TRGLANGS=$$trg tatoeba-prepare; \
	  ${MAKE} SRCLANGS=$$src TRGLANGS=$$trg all-job-tatoeba; \
	  ${MAKE} SRCLANGS=$$src TRGLANGS=$$trg reverse-data-tatoeba; \
	  ${MAKE} SRCLANGS=$$trg TRGLANGS=$$src all-job-tatoeba; \
	done
## fetch the markdown page listing a specific subset of the challenge
## (e.g. `make tatoeba-lowest.md`); the pattern stem $* is the subset name,
## so the remote path is simply subsets/$*.md
tatoeba-%.md:
	wget -O $@ ${TATOEBA_RAWGIT}/subsets/$*.md
## generic dispatcher for tatoeba challenge jobs:
## `make <goal>-tatoeba` ensures the Tatoeba training data exists, then
## re-invokes make on <goal> with Tatoeba data sets and work dir substituted
## (the substitution reference ${@:-tatoeba=} strips the "-tatoeba" suffix).
##
## fix: the prerequisite previously hard-coded ${PWD}/work-tatoeba; it now
## uses ${TATOEBA_WORK} so an overridden work dir and WORKHOME always agree.
%-tatoeba: ${TATOEBA_WORK}/data/${PRE}/Tatoeba-train.${LANGPAIR}.clean.${SRCEXT}.gz
	echo $<
	${MAKE} TRAINSET=Tatoeba-train \
		DEVSET=Tatoeba-dev \
		TESTSET=Tatoeba-test \
		TESTSET_NAME=Tatoeba-test \
		SMALLEST_TRAINSIZE=1000 \
		USE_REST_DEVDATA=0 \
		HELDOUTSIZE=0 \
		DEVSIZE=5000 \
		TESTSIZE=10000 \
		DEVMINSIZE=200 \
		WORKHOME=${TATOEBA_WORK} \
	${@:-tatoeba=}
## keep the downloaded/extracted corpus files: .SECONDARY stops make from
## deleting them as intermediates after a chained build (unlike .PRECIOUS,
## it does not also keep half-written files on interrupt)
.SECONDARY: ${TATOEBA_WORK}/data/${PRE}/Tatoeba-train.${LANGPAIR}.clean.${SRCEXT}.gz \
	${TATOEBA_WORK}/data/${PRE}/Tatoeba-train.${LANGPAIR}.clean.${TRGEXT}.gz \
	${TATOEBA_WORK}/data/${PRE}/Tatoeba-dev.${LANGPAIR}.clean.${SRCEXT}.gz \
	${TATOEBA_WORK}/data/${PRE}/Tatoeba-dev.${LANGPAIR}.clean.${TRGEXT}.gz \
	${TATOEBA_WORK}/data/${PRE}/Tatoeba-test.${LANGPAIR}.clean.${SRCEXT}.gz \
	${TATOEBA_WORK}/data/${PRE}/Tatoeba-test.${LANGPAIR}.clean.${TRGEXT}.gz
## download and unpack the release tarball for one language pair, producing
## train/dev/test .gz files next to the target; when the release ships no dev
## set, the first 1000 training lines are split off as dev data instead.
## TODO: should we do some filtering like bitext-match, OPUS-filter ...
##
## bugfix: the fallback dev split wrote BOTH sides to the ${SRCEXT} file,
## clobbering the source half and never creating the ${TRGEXT} dev file;
## the target-side write now correctly goes to ${TRGEXT}.
##
## cleanup uses rm -f on known files + rmdir (not rm -r) so an unexpectedly
## non-empty extraction tree fails loudly instead of being deleted blindly
%/Tatoeba-train.${LANGPAIR}.clean.${SRCEXT}.gz:
	mkdir -p $@.d
	wget -q -O $@.d/train.tar ${TATOEBA_DATA}/${LANGPAIR}.tar
	tar -C $@.d -xf $@.d/train.tar
	gzip -c < $@.d/data/${LANGPAIR}/test.src > ${dir $@}Tatoeba-test.${LANGPAIR}.clean.${SRCEXT}.gz
	gzip -c < $@.d/data/${LANGPAIR}/test.trg > ${dir $@}Tatoeba-test.${LANGPAIR}.clean.${TRGEXT}.gz
	if [ -e $@.d/data/${LANGPAIR}/dev.src ]; then \
	  gzip -c < $@.d/data/${LANGPAIR}/dev.src > ${dir $@}Tatoeba-dev.${LANGPAIR}.clean.${SRCEXT}.gz; \
	  gzip -c < $@.d/data/${LANGPAIR}/dev.trg > ${dir $@}Tatoeba-dev.${LANGPAIR}.clean.${TRGEXT}.gz; \
	  mv $@.d/data/${LANGPAIR}/train.src.gz ${dir $@}Tatoeba-train.${LANGPAIR}.clean.${SRCEXT}.gz; \
	  mv $@.d/data/${LANGPAIR}/train.trg.gz ${dir $@}Tatoeba-train.${LANGPAIR}.clean.${TRGEXT}.gz; \
	else \
	  echo "no devdata available - get top 1000 from training data!"; \
	  zcat $@.d/data/${LANGPAIR}/train.src.gz | tail -n +1001 | gzip -c > ${dir $@}Tatoeba-train.${LANGPAIR}.clean.${SRCEXT}.gz; \
	  zcat $@.d/data/${LANGPAIR}/train.trg.gz | tail -n +1001 | gzip -c > ${dir $@}Tatoeba-train.${LANGPAIR}.clean.${TRGEXT}.gz; \
	  zcat $@.d/data/${LANGPAIR}/train.src.gz | head -1000 | gzip -c > ${dir $@}Tatoeba-dev.${LANGPAIR}.clean.${SRCEXT}.gz; \
	  zcat $@.d/data/${LANGPAIR}/train.trg.gz | head -1000 | gzip -c > ${dir $@}Tatoeba-dev.${LANGPAIR}.clean.${TRGEXT}.gz; \
	fi
	rm -f $@.d/data/${LANGPAIR}/*
	rmdir $@.d/data/${LANGPAIR}
	rmdir $@.d/data
	rm -f $@.d/train.tar
	rmdir $@.d
## the remaining corpus files are created as side effects of the
## Tatoeba-train.${SRCEXT} download rule above; these rules only record
## that dependency so make knows how each file comes into existence
## (the recipes are no-ops once the download rule has run)
%/Tatoeba-train.${LANGPAIR}.clean.${TRGEXT}.gz: %/Tatoeba-train.${LANGPAIR}.clean.${SRCEXT}.gz
	echo "done!"

%/Tatoeba-dev.${LANGPAIR}.clean.${SRCEXT}.gz %/Tatoeba-dev.${LANGPAIR}.clean.${TRGEXT}.gz: %/Tatoeba-train.${LANGPAIR}.clean.${SRCEXT}.gz
	echo "done!"

%/Tatoeba-test.${LANGPAIR}.clean.${SRCEXT}.gz %/Tatoeba-test.${LANGPAIR}.clean.${TRGEXT}.gz: %/Tatoeba-train.${LANGPAIR}.clean.${SRCEXT}.gz
	echo "done!"

View File

@ -3,6 +3,7 @@
import pycld2 as cld2
from iso639 import languages
import sys
import argparse
@ -65,6 +66,23 @@ if args.checklang:
quit()
if not supported_language(args.srclang):
if len(args.srclang) == 3:
# print(args.srclang + " is 3 characters long")
langid = languages.get(part3=args.srclang).part1
if langid:
args.srclang = langid
# print("set to " + args.srclang)
if not supported_language(args.trglang):
if len(args.trglang) == 3:
# print(args.trglang + " is 3 characters long")
langid = languages.get(part3=args.trglang).part1
if langid:
args.trglang = langid
# print("set to " + args.trglang)
if not supported_language(args.srclang):
# print(args.srclang + " is not supported")
srcreject = 'en'