fixes with pivoting

Joerg Tiedemann 2020-05-18 21:36:53 +03:00
parent b01b4f22c3
commit 04d72ff8ed
5 changed files with 110 additions and 17 deletions


@@ -164,6 +164,13 @@ all-wikilangs: index.html
${MAKE} LANGID=$$l extract-text; \
done
all-wikis-all-langs: index.html
for l in ${WIKILANGS}; do \
for w in ${WIKISOURCES}; do \
${MAKE} WIKISOURCE=$$w LANGID=$$l extract-text; \
done \
done
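A hedged sketch of how the nested loop above expands, using hypothetical values for the two list variables (the real lists are set elsewhere in the Makefile):

# WIKILANGS   = br cy             (hypothetical)
# WIKISOURCES = wiki wiktionary   (hypothetical)
# the recipe then runs, in order:
#   make WIKISOURCE=wiki LANGID=br extract-text
#   make WIKISOURCE=wiktionary LANGID=br extract-text
#   make WIKISOURCE=wiki LANGID=cy extract-text
#   make WIKISOURCE=wiktionary LANGID=cy extract-text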
## aux function to print the selected modelname
.PHONY: print-modelname
@@ -799,3 +806,5 @@ check-length:
# enwiktionary-20191209-cirrussearch-content.json..> 10-Dec-2019 17:23 918503424
# enwiktionary-20191209-cirrussearch-general.json.gz 10-Dec-2019 17:42 848846623
# enwiktionary-20191209-cirrussearch-general.json..> 10-Dec-2019 17:40 661585920


@@ -360,13 +360,20 @@ add-to-local-train-data: ${CLEAN_TRAIN_SRC} ${CLEAN_TRAIN_TRG}
echo ${CLEAN_TRAIN_TRG}; \
fi
ifneq (${CLEAN_TRAIN_SRC},)
echo -n "* ${SRC}-${TRG}: ${TRAINSET}, " >> ${dir ${LOCAL_TRAIN_SRC}}README.md
zcat ${CLEAN_TRAIN_SRC} | wc -l >> ${dir ${LOCAL_TRAIN_SRC}}README.md
ifeq (${USE_BACKTRANS},1)
echo -n "* ${SRC}-${TRG} (synthetic): ${basename ${patsubst %.${SRC}.gz,%,${notdir ${BACKTRANS_SRC}}}}, " \
>> ${dir ${LOCAL_TRAIN_SRC}}README.md
zcat ${BACKTRANS_SRC} | wc -l >> ${dir ${LOCAL_TRAIN_SRC}}README.md
endif
echo -n "* ${SRC}-${TRG}: " >> ${dir ${LOCAL_TRAIN_SRC}}README.md
for d in ${CLEAN_TRAIN_SRC}; do \
l=`zcat $$d | wc -l`; \
if [ $$l -gt 0 ]; then \
echo "$$d" | xargs basename | \
sed -e 's#.${SRC}.gz$$##' \
-e 's#.clean$$##'\
-e 's#.${LANGPAIR}$$##' | tr "\n" ' ' >> ${dir ${LOCAL_TRAIN_SRC}}README.md; \
echo -n "($$l) " >> ${dir ${LOCAL_TRAIN_SRC}}README.md; \
fi \
done
echo "" >> ${dir ${LOCAL_TRAIN_SRC}}README.md
echo -n "* ${SRC}-${TRG}: total size = " >> ${dir ${LOCAL_TRAIN_SRC}}README.md
zcat ${CLEAN_TRAIN_SRC} | wc -l >> ${dir ${LOCAL_TRAIN_SRC}}README.md
ifneq (${words ${TRGLANGS}},1)
echo "more than one target language";
zcat ${CLEAN_TRAIN_SRC} |\
@@ -381,17 +388,41 @@ endif
# echo "* ${SRC}-${TRG}: ${TRAINSET}" >> ${dir ${LOCAL_TRAIN_SRC}}README.md
# echo -n "* ${SRC}-${TRG}: total size = " >> ${dir ${LOCAL_TRAIN_SRC}}README.md
# zcat ${CLEAN_TRAIN_SRC} | wc -l >> ${dir ${LOCAL_TRAIN_SRC}}README.md
# for d in ${CLEAN_TRAIN_SRC}; do \
# l=`zcat $$d | wc -l`; \
# if [ $$l -gt 0 ]; then \
# echo -n "* ${SRC}-${TRG}: " >> ${dir ${LOCAL_TRAIN_SRC}}README.md; \
# echo -n "$$d" | xargs basename | \
# sed -e 's#.${SRC}.gz$$##' \
# -e 's#.clean$$##'\
# -e 's#.${LANGPAIR}$$##' | tr "\n" ' ' >> ${dir ${LOCAL_TRAIN_SRC}}README.md; \
# echo " = $$l lines" >> ${dir ${LOCAL_TRAIN_SRC}}README.md; \
# fi \
# done
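A hedged illustration of what the per-dataset loop added above in add-to-local-train-data appends to README.md, assuming hypothetical values SRC=br, TRG=en, two corpora in CLEAN_TRAIN_SRC and made-up line counts:

#   * br-en: OpenSubtitles-v2018 (412302) Tatoeba-v20190709 (1529)
#   * br-en: total size = 413831
# each corpus name is the file name with the .br.gz, .clean and .br-en
# suffixes stripped, followed by its own line count; the last line is the
# sum over all corpora (zcat ${CLEAN_TRAIN_SRC} | wc -l)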
## extract training data but keep some heldout data for each dataset
add-to-local-train-and-heldout-data: ${CLEAN_TRAIN_SRC} ${CLEAN_TRAIN_TRG}
ifneq (${CLEAN_TRAIN_SRC},)
echo -n "* ${LANGPAIR}: ${TRAINSET}, " >> ${dir ${LOCAL_TRAIN_SRC}}README.md
zcat ${CLEAN_TRAIN_SRC} | wc -l >> ${dir ${LOCAL_TRAIN_SRC}}README.md
ifeq (${USE_BACKTRANS},1)
echo -n "* ${LANGPAIR} backtranslations: ${basename ${basename ${BACKTRANS_SRC}}}, " \
>> ${dir ${LOCAL_TRAIN_SRC}}README.md
zcat ${BACKTRANS_SRC} | wc -l >> ${dir ${LOCAL_TRAIN_SRC}}README.md
endif
echo -n "* ${SRC}-${TRG}: " >> ${dir ${LOCAL_TRAIN_SRC}}README.md
for d in ${CLEAN_TRAIN_SRC}; do \
l=`zcat $$d | wc -l`; \
if [ $$l -gt 0 ]; then \
echo "$$d" | xargs basename | \
sed -e 's#.${SRC}.gz$$##' \
-e 's#.clean$$##'\
-e 's#.${LANGPAIR}$$##' | tr "\n" ' ' >> ${dir ${LOCAL_TRAIN_SRC}}README.md; \
echo -n "($$l) " >> ${dir ${LOCAL_TRAIN_SRC}}README.md; \
fi \
done
echo "" >> ${dir ${LOCAL_TRAIN_SRC}}README.md
echo -n "* ${SRC}-${TRG}: total size = " >> ${dir ${LOCAL_TRAIN_SRC}}README.md
zcat ${CLEAN_TRAIN_SRC} | wc -l >> ${dir ${LOCAL_TRAIN_SRC}}README.md
mkdir -p ${HELDOUT_DIR}/${SRC}-${TRG}
ifneq (${words ${TRGLANGS}},1)
echo "more than one target language";


@@ -3,6 +3,15 @@
# some helper functions
#------------------------------------------------------------------------
ALL_DATA_SETS = ${patsubst %.${SRCEXT}.gz,%,${CLEAN_TRAIN_SRC}}
check-bitext-length:
for d in ${ALL_DATA_SETS}; do \
if [ `zcat $$d.${SRCEXT}.gz | wc -l` != `zcat $$d.${TRGEXT}.gz | wc -l` ]; then \
echo "not the same number of lines in $$d"; \
fi \
done
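A hedged walk-through of the length check above, assuming SRCEXT=br, TRGEXT=en and a single hypothetical corpus work/data/Tatoeba.br-en.clean.br.gz in CLEAN_TRAIN_SRC:

# ALL_DATA_SETS strips the .br.gz suffix, leaving work/data/Tatoeba.br-en.clean,
# so the loop compares
#   zcat work/data/Tatoeba.br-en.clean.br.gz | wc -l
#   zcat work/data/Tatoeba.br-en.clean.en.gz | wc -l
# and prints a warning if the source and target side differ in line count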
## check whether a model is converged or not
finished:


@@ -3,12 +3,24 @@
# models for Celtic languages
#-------------------------------------------------------------------
# examples:
#
# make train-celtic-english
# make train-bt-celtic-english
# make train-pivot-bt-celtic-english
#
# make HPC_CORES=2 HPC_MEM=8g all-job-pivot-bt-english-celtic.submitcpu
# make HPC_CORES=2 HPC_MEM=8g CELTIC_BPESIZE=12000 all-job-pivot-bt-celtic-english.submitcpu
## only OPUS data
## reduce vocabulary
# CELTIC_BPESIZE = 12000
CELTIC_BPESIZE = 4000
## only OPUS data
%-celtic-english-opus:
${MAKE} HELDOUTSIZE=0 BPESIZE=${CELTIC_BPESIZE} SRCLANGS="ga cy br gd kw gv" TRGLANGS=en ${@:-celtic-english-opus=}
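A hedged usage example: the substitution reference strips the -celtic-english-opus suffix from the matched target name, so

#   make train-celtic-english-opus
# expands (roughly) to
#   make HELDOUTSIZE=0 BPESIZE=4000 SRCLANGS="ga cy br gd kw gv" TRGLANGS=en train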


@@ -2,6 +2,9 @@
# pivoting - translate training data via pivot models
#
#
# TODO: exclude certain corpora like GNOME, Ubuntu, (bible-uedin) ...
# ---> this will basically be the same for all languages
#
PWD := $(shell pwd)
@@ -12,6 +15,15 @@ TRG = se
## pivot language
PIVOT = nb
## set EXCLUDE_SELECTED to 1 if you want to exclude only selected corpora
## otherwise: exclude all corpora that also include the target language pair
EXCLUDE_SELECTED = 0
EXCLUDE = bible-uedin DGT GlobalVoices GNOME infopankki KDE4 KDEdoc Tanzil Ubuntu
## always include those data sets even if they have the same target language pair
INCLUDE = Tatoeba OpenSubtitles
## langpair (sorted lang id's) of the original data
## to be translated from PIVOT to SRC
ORIGINAL_LANGPAIR = ${firstword ${sort ${PIVOT} ${TRG}}}-${lastword ${sort ${PIVOT} ${TRG}}}
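A hedged illustration with the defaults above (PIVOT=nb, TRG=se):

#   ${sort ${PIVOT} ${TRG}}  ->  nb se
#   ORIGINAL_LANGPAIR        ->  nb-se
# sorting the two language ids ensures the original bitext files are found
# no matter which of the two languages comes first in the pair name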
@@ -21,9 +33,25 @@ include ../lib/env.mk
include ../lib/config.mk
include ../lib/slurm.mk
ORIGINAL_DATADIR ?= ${PWD}/../work/data
ORIGINAL_DATASETS_SRC = ${wildcard ${ORIGINAL_DATADIR}/${PRE}/*.${ORIGINAL_LANGPAIR}.clean.${PIVOT}.gz}
ORIGINAL_DATASETS_TRG = ${patsubst %.${PIVOT}.gz,%.${TRG}.gz,${ORIGINAL_DATASRC}}
ifeq (${EXCLUDE_SELECTED},1)
EXCLUDE_PATTERN = ${patsubst %,${ORIGINAL_DATADIR}/${PRE}/%.${LANGPAIR}.clean.${TRG}.gz,${EXCLUDE}}
EXCLUDE_DATASETS = ${filter-out \
${patsubst %,${ORIGINAL_DATADIR}/${PRE}/%.${ORIGINAL_LANGPAIR}.clean.${TRG}.gz,${INCLUDE}},\
${patsubst %.${LANGPAIR}.clean.${TRG}.gz,%.${ORIGINAL_LANGPAIR}.clean.${TRG}.gz,\
${filter ${EXCLUDE_PATTERN}, \
${wildcard ${ORIGINAL_DATADIR}/${PRE}/*.${LANGPAIR}.clean.${TRG}.gz}}}}
else
EXCLUDE_DATASETS = ${filter-out \
${patsubst %,${ORIGINAL_DATADIR}/${PRE}/%.${ORIGINAL_LANGPAIR}.clean.${TRG}.gz,${INCLUDE}},\
${patsubst %.${LANGPAIR}.clean.${TRG}.gz,%.${ORIGINAL_LANGPAIR}.clean.${TRG}.gz,\
${wildcard ${ORIGINAL_DATADIR}/${PRE}/*.${LANGPAIR}.clean.${TRG}.gz}}}
endif
ORIGINAL_DATASETS_TRG = ${filter-out ${EXCLUDE_DATASETS},\
${wildcard ${ORIGINAL_DATADIR}/${PRE}/*.${ORIGINAL_LANGPAIR}.clean.${TRG}.gz}}
ORIGINAL_DATASETS_SRC = ${patsubst %.${TRG}.gz,%.${PIVOT}.gz,${ORIGINAL_DATASETS_TRG}}
ORIGINAL_DATASRC ?= ${firstword ${ORIGINAL_DATASETS_SRC}}
ORIGINAL_DATATRG ?= ${firstword ${ORIGINAL_DATASETS_TRG}}
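A hedged walk-through of the exclusion logic above, assuming SRC=fi purely for illustration (so LANGPAIR would be fi-se while ORIGINAL_LANGPAIR is nb-se):

# - the wildcard picks up every corpus that already exists for the real
#   language pair, e.g. ${ORIGINAL_DATADIR}/${PRE}/GNOME.fi-se.clean.se.gz
# - with EXCLUDE_SELECTED=1 only corpora named in EXCLUDE are considered;
#   otherwise every corpus that already covers fi-se is a candidate
# - the names are then mapped to the corresponding pivot-pair files
#   (GNOME.nb-se.clean.se.gz), which are the files that would otherwise
#   be translated
# - corpora listed in INCLUDE (Tatoeba, OpenSubtitles) are filtered out of
#   the exclusion set again, so they are always translated
# - ORIGINAL_DATASETS_TRG is every remaining nb-se target-side file, and
#   ORIGINAL_DATASETS_SRC the matching nb files to be translated from the
#   pivot language into fi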
@@ -166,6 +194,10 @@ print-all-data:
@echo "${ALL_TRANSLATED_LATEST_SRC}"
@echo "${ALL_TRANSLATED_LATEST_TRG}"
print-excludes:
@echo ${patsubst %,%.${ORIGINAL_LANGPAIR}.clean.${TRG}.gz,${INCLUDE}}
@echo "${EXCLUDE_PATTERN}"
@echo "${EXCLUDE_DATASETS}"
## fetch the latest model