mirror of
https://github.com/Helsinki-NLP/OPUS-MT-train.git
synced 2024-09-11 20:27:19 +03:00
tatoeba models added
This commit is contained in:
parent
ec43fcd30a
commit
eeaef7768c
1
Makefile
1
Makefile
@ -181,6 +181,7 @@ include lib/models/wikimatrix.mk
|
||||
include lib/models/doclevel.mk
|
||||
include lib/models/simplify.mk
|
||||
|
||||
include lib/models/tatoeba.mk
|
||||
|
||||
|
||||
.PHONY: all
|
||||
|
@ -354,6 +354,12 @@ endif
|
||||
|
||||
local-config: ${WORKDIR}/config.mk
|
||||
|
||||
SMALLEST_TRAINSIZE = 10000
|
||||
SMALL_TRAINSIZE = 100000
|
||||
MEDIUM_TRAINSIZE = 500000
|
||||
LARGE_TRAINSIZE = 1000000
|
||||
LARGEST_TRAINSIZE = 10000000
|
||||
|
||||
${WORKDIR}/config.mk:
|
||||
mkdir -p ${dir $@}
|
||||
if [ -e ${TRAIN_SRC}.clean.${PRE_SRC}${TRAINSIZE}.gz ]; then \
|
||||
@ -363,23 +369,23 @@ ${WORKDIR}/config.mk:
|
||||
s=`head -10000001 ${LOCAL_TRAIN_SRC} | wc -l`; \
|
||||
rm -f ${LOCAL_TRAIN_SRC} ${LOCAL_TRAIN_TRG}; \
|
||||
fi; \
|
||||
if [ $$s -gt 10000000 ]; then \
|
||||
if [ $$s -gt ${LARGEST_TRAINSIZE} ]; then \
|
||||
echo "# ${LANGPAIRSTR} training data bigger than 10 million" > $@; \
|
||||
echo "GPUJOB_HPC_MEM = 8g" >> $@; \
|
||||
echo "GPUJOB_SUBMIT = -multigpu" >> $@; \
|
||||
elif [ $$s -gt 1000000 ]; then \
|
||||
elif [ $$s -gt ${LARGE_TRAINSIZE} ]; then \
|
||||
echo "# ${LANGPAIRSTR} training data bigger than 1 million" > $@; \
|
||||
echo "GPUJOB_HPC_MEM = 8g" >> $@; \
|
||||
echo "GPUJOB_SUBMIT = " >> $@; \
|
||||
echo "MARIAN_VALID_FREQ = 2500" >> $@; \
|
||||
elif [ $$s -gt 500000 ]; then \
|
||||
elif [ $$s -gt ${MEDIUM_TRAINSIZE} ]; then \
|
||||
echo "# ${LANGPAIRSTR} training data bigger than 500k" > $@; \
|
||||
echo "GPUJOB_HPC_MEM = 4g" >> $@; \
|
||||
echo "GPUJOB_SUBMIT = " >> $@; \
|
||||
echo "MARIAN_VALID_FREQ = 2500" >> $@; \
|
||||
echo "MARIAN_WORKSPACE = 10000" >> $@; \
|
||||
echo "BPESIZE = 12000" >> $@; \
|
||||
elif [ $$s -gt 100000 ]; then \
|
||||
elif [ $$s -gt ${SMALL_TRAINSIZE} ]; then \
|
||||
echo "# ${LANGPAIRSTR} training data bigger than 100k" > $@; \
|
||||
echo "GPUJOB_HPC_MEM = 4g" >> $@; \
|
||||
echo "GPUJOB_SUBMIT = " >> $@; \
|
||||
@ -391,7 +397,7 @@ ${WORKDIR}/config.mk:
|
||||
echo "DEVSIZE = 1000" >> $@; \
|
||||
echo "TESTSIZE = 1000" >> $@; \
|
||||
echo "DEVMINSIZE = 250" >> $@; \
|
||||
elif [ $$s -gt 10000 ]; then \
|
||||
elif [ $$s -gt ${SMALLEST_TRAINSIZE} ]; then \
|
||||
echo "# ${LANGPAIRSTR} training data less than 100k" > $@; \
|
||||
echo "GPUJOB_HPC_MEM = 4g" >> $@; \
|
||||
echo "GPUJOB_SUBMIT = " >> $@; \
|
||||
|
@ -557,6 +557,7 @@ ${DEV_TRG}: ${DEV_SRC}
|
||||
|
||||
|
||||
add-to-dev-data: ${CLEAN_DEV_SRC} ${CLEAN_DEV_TRG}
|
||||
mkdir -p ${dir ${DEV_SRC}}
|
||||
ifneq (${wildcard ${CLEAN_DEV_SRC}},)
|
||||
echo -n "* ${LANGPAIR}: ${DEVSET}, " >> ${dir ${DEV_SRC}}README.md
|
||||
zcat ${CLEAN_DEV_SRC} | wc -l >> ${dir ${DEV_SRC}}README.md
|
||||
|
92
lib/models/tatoeba.mk
Normal file
92
lib/models/tatoeba.mk
Normal file
@ -0,0 +1,92 @@
|
||||
# -*-makefile-*-
|
||||
|
||||
TATOEBA_DATA = https://object.pouta.csc.fi/Tatoeba-Challenge
|
||||
TATOEBA_RAWGIT = https://raw.githubusercontent.com/Helsinki-NLP/Tatoeba-Challenge/master
|
||||
TATOEBA_WORK = ${PWD}/work-tatoeba
|
||||
|
||||
|
||||
tatoeba-prepare:
|
||||
${MAKE} local-config-tatoeba
|
||||
${MAKE} data-tatoeba
|
||||
|
||||
tatoeba-train:
|
||||
${MAKE} train-tatoeba
|
||||
|
||||
tatoeba-eval:
|
||||
${MAKE} compare-tatoeba
|
||||
|
||||
|
||||
## run all language pairs for a given subset
|
||||
tatoeba-%: tatoeba-%.md
|
||||
for l in `grep '\[' $< | cut -f2 -d '[' | cut -f1 -d ']'`; do \
|
||||
s=`echo $$l | cut -f1 -d '-'`; \
|
||||
t=`echo $$l | cut -f2 -d '-'`; \
|
||||
${MAKE} SRCLANGS=$$s TRGLANGS=$$t tatoeba-prepare; \
|
||||
${MAKE} SRCLANGS=$$s TRGLANGS=$$t all-job-tatoeba; \
|
||||
${MAKE} SRCLANGS=$$s TRGLANGS=$$t reverse-data-tatoeba; \
|
||||
${MAKE} SRCLANGS=$$t TRGLANGS=$$s all-job-tatoeba; \
|
||||
done
|
||||
|
||||
## get the markdown page for a specific subset
|
||||
tatoeba-%.md:
|
||||
wget -O $@ ${TATOEBA_RAWGIT}/subsets/${patsubst tatoeba-%,%,$@}
|
||||
|
||||
## generic target for tatoeba challenge jobs
|
||||
%-tatoeba: ${PWD}/work-tatoeba/data/${PRE}/Tatoeba-train.${LANGPAIR}.clean.${SRCEXT}.gz
|
||||
echo $<
|
||||
${MAKE} TRAINSET=Tatoeba-train \
|
||||
DEVSET=Tatoeba-dev \
|
||||
TESTSET=Tatoeba-test \
|
||||
TESTSET_NAME=Tatoeba-test \
|
||||
SMALLEST_TRAINSIZE=1000 \
|
||||
USE_REST_DEVDATA=0 \
|
||||
HELDOUTSIZE=0 \
|
||||
DEVSIZE=5000 \
|
||||
TESTSIZE=10000 \
|
||||
DEVMINSIZE=200 \
|
||||
WORKHOME=${TATOEBA_WORK} \
|
||||
${@:-tatoeba=}
|
||||
|
||||
|
||||
|
||||
## don't delete those files
|
||||
.SECONDARY: ${TATOEBA_WORK}/data/${PRE}/Tatoeba-train.${LANGPAIR}.clean.${SRCEXT}.gz \
|
||||
${TATOEBA_WORK}/data/${PRE}/Tatoeba-train.${LANGPAIR}.clean.${TRGEXT}.gz \
|
||||
${TATOEBA_WORK}/data/${PRE}/Tatoeba-dev.${LANGPAIR}.clean.${SRCEXT}.gz \
|
||||
${TATOEBA_WORK}/data/${PRE}/Tatoeba-dev.${LANGPAIR}.clean.${TRGEXT}.gz \
|
||||
${TATOEBA_WORK}/data/${PRE}/Tatoeba-test.${LANGPAIR}.clean.${SRCEXT}.gz \
|
||||
${TATOEBA_WORK}/data/${PRE}/Tatoeba-test.${LANGPAIR}.clean.${TRGEXT}.gz
|
||||
|
||||
## TODO: should we do some filtering like bitext-match, OPUS-filter ...
|
||||
%/Tatoeba-train.${LANGPAIR}.clean.${SRCEXT}.gz:
|
||||
mkdir -p $@.d
|
||||
wget -q -O $@.d/train.tar ${TATOEBA_DATA}/${LANGPAIR}.tar
|
||||
tar -C $@.d -xf $@.d/train.tar
|
||||
gzip -c < $@.d/data/${LANGPAIR}/test.src > ${dir $@}Tatoeba-test.${LANGPAIR}.clean.${SRCEXT}.gz
|
||||
gzip -c < $@.d/data/${LANGPAIR}/test.trg > ${dir $@}Tatoeba-test.${LANGPAIR}.clean.${TRGEXT}.gz
|
||||
if [ -e $@.d/data/${LANGPAIR}/dev.src ]; then \
|
||||
gzip -c < $@.d/data/${LANGPAIR}/dev.src > ${dir $@}Tatoeba-dev.${LANGPAIR}.clean.${SRCEXT}.gz; \
|
||||
gzip -c < $@.d/data/${LANGPAIR}/dev.trg > ${dir $@}Tatoeba-dev.${LANGPAIR}.clean.${TRGEXT}.gz; \
|
||||
mv $@.d/data/${LANGPAIR}/train.src.gz ${dir $@}Tatoeba-train.${LANGPAIR}.clean.${SRCEXT}.gz; \
|
||||
mv $@.d/data/${LANGPAIR}/train.trg.gz ${dir $@}Tatoeba-train.${LANGPAIR}.clean.${TRGEXT}.gz; \
|
||||
else \
|
||||
echo "no devdata available - get top 1000 from training data!"; \
|
||||
zcat $@.d/data/${LANGPAIR}/train.src.gz | tail -n +1001 | gzip -c > ${dir $@}Tatoeba-train.${LANGPAIR}.clean.${SRCEXT}.gz; \
|
||||
zcat $@.d/data/${LANGPAIR}/train.trg.gz | tail -n +1001 | gzip -c > ${dir $@}Tatoeba-train.${LANGPAIR}.clean.${TRGEXT}.gz; \
|
||||
zcat $@.d/data/${LANGPAIR}/train.src.gz | head -1000 | gzip -c > ${dir $@}Tatoeba-dev.${LANGPAIR}.clean.${SRCEXT}.gz; \
|
||||
zcat $@.d/data/${LANGPAIR}/train.trg.gz | head -1000 | gzip -c > ${dir $@}Tatoeba-dev.${LANGPAIR}.clean.${SRCEXT}.gz; \
|
||||
fi
|
||||
rm -f $@.d/data/${LANGPAIR}/*
|
||||
rmdir $@.d/data/${LANGPAIR}
|
||||
rmdir $@.d/data
|
||||
rm -f $@.d/train.tar
|
||||
rmdir $@.d
|
||||
|
||||
%/Tatoeba-train.${LANGPAIR}.clean.${TRGEXT}.gz: %/Tatoeba-train.${LANGPAIR}.clean.${SRCEXT}.gz
|
||||
echo "done!"
|
||||
|
||||
%/Tatoeba-dev.${LANGPAIR}.clean.${SRCEXT}.gz %/Tatoeba-dev.${LANGPAIR}.clean.${TRGEXT}.gz: %/Tatoeba-train.${LANGPAIR}.clean.${SRCEXT}.gz
|
||||
echo "done!"
|
||||
|
||||
%/Tatoeba-test.${LANGPAIR}.clean.${SRCEXT}.gz %/Tatoeba-test.${LANGPAIR}.clean.${TRGEXT}.gz: %/Tatoeba-train.${LANGPAIR}.clean.${SRCEXT}.gz
|
||||
echo "done!"
|
@ -3,6 +3,7 @@
|
||||
|
||||
|
||||
import pycld2 as cld2
|
||||
from iso639 import languages
|
||||
import sys
|
||||
import argparse
|
||||
|
||||
@ -65,6 +66,23 @@ if args.checklang:
|
||||
quit()
|
||||
|
||||
|
||||
if not supported_language(args.srclang):
|
||||
if len(args.srclang) == 3:
|
||||
# print(args.srclang + " is 3 characters long")
|
||||
langid = languages.get(part3=args.srclang).part1
|
||||
if langid:
|
||||
args.srclang = langid
|
||||
# print("set to " + args.srclang)
|
||||
|
||||
if not supported_language(args.trglang):
|
||||
if len(args.trglang) == 3:
|
||||
# print(args.trglang + " is 3 characters long")
|
||||
langid = languages.get(part3=args.trglang).part1
|
||||
if langid:
|
||||
args.trglang = langid
|
||||
# print("set to " + args.trglang)
|
||||
|
||||
|
||||
if not supported_language(args.srclang):
|
||||
# print(args.srclang + " is not supported")
|
||||
srcreject = 'en'
|
||||
|
Loading…
Reference in New Issue
Block a user