mirror of
https://github.com/Helsinki-NLP/OPUS-MT-train.git
synced 2024-10-26 21:19:02 +03:00
fixes with pivoting
This commit is contained in:
parent
b01b4f22c3
commit
04d72ff8ed
@ -164,6 +164,13 @@ all-wikilangs: index.html
|
||||
${MAKE} LANGID=$$l extract-text; \
|
||||
done
|
||||
|
||||
all-wikis-all-langs: index.html
|
||||
for l in ${WIKILANGS}; do \
|
||||
for w in ${WIKISOURCES}; do \
|
||||
${MAKE} WIKISOURCE=$$w LANGID=$$l extract-text; \
|
||||
done \
|
||||
done
|
||||
|
||||
|
||||
## aux function to print the selected modelname
|
||||
.PHONY: print-modelname
|
||||
@ -799,3 +806,5 @@ check-length:
|
||||
# enwiktionary-20191209-cirrussearch-content.json..> 10-Dec-2019 17:23 918503424
|
||||
# enwiktionary-20191209-cirrussearch-general.json.gz 10-Dec-2019 17:42 848846623
|
||||
# enwiktionary-20191209-cirrussearch-general.json..> 10-Dec-2019 17:40 661585920
|
||||
|
||||
|
||||
|
59
lib/data.mk
59
lib/data.mk
@ -360,13 +360,20 @@ add-to-local-train-data: ${CLEAN_TRAIN_SRC} ${CLEAN_TRAIN_TRG}
|
||||
echo ${CLEAN_TRAIN_TRG}; \
|
||||
fi
|
||||
ifneq (${CLEAN_TRAIN_SRC},)
|
||||
echo -n "* ${SRC}-${TRG}: ${TRAINSET}, " >> ${dir ${LOCAL_TRAIN_SRC}}README.md
|
||||
zcat ${CLEAN_TRAIN_SRC} | wc -l >> ${dir ${LOCAL_TRAIN_SRC}}README.md
|
||||
ifeq (${USE_BACKTRANS},1)
|
||||
echo -n "* ${SRC}-${TRG} (synthetic): ${basename ${patsubst %.${SRC}.gz,%,${notdir ${BACKTRANS_SRC}}}}, " \
|
||||
>> ${dir ${LOCAL_TRAIN_SRC}}README.md
|
||||
zcat ${BACKTRANS_SRC} | wc -l >> ${dir ${LOCAL_TRAIN_SRC}}README.md
|
||||
endif
|
||||
echo -n "* ${SRC}-${TRG}: " >> ${dir ${LOCAL_TRAIN_SRC}}README.md
|
||||
for d in ${CLEAN_TRAIN_SRC}; do \
|
||||
l=`zcat $$d | wc -l`; \
|
||||
if [ $$l -gt 0 ]; then \
|
||||
echo "$$d" | xargs basename | \
|
||||
sed -e 's#.${SRC}.gz$$##' \
|
||||
-e 's#.clean$$##'\
|
||||
-e 's#.${LANGPAIR}$$##' | tr "\n" ' ' >> ${dir ${LOCAL_TRAIN_SRC}}README.md; \
|
||||
echo -n "($$l) " >> ${dir ${LOCAL_TRAIN_SRC}}README.md; \
|
||||
fi \
|
||||
done
|
||||
echo "" >> ${dir ${LOCAL_TRAIN_SRC}}README.md
|
||||
echo -n "* ${SRC}-${TRG}: total size = " >> ${dir ${LOCAL_TRAIN_SRC}}README.md
|
||||
zcat ${CLEAN_TRAIN_SRC} | wc -l >> ${dir ${LOCAL_TRAIN_SRC}}README.md
|
||||
ifneq (${words ${TRGLANGS}},1)
|
||||
echo "more than one target language";
|
||||
zcat ${CLEAN_TRAIN_SRC} |\
|
||||
@ -381,17 +388,41 @@ endif
|
||||
|
||||
|
||||
|
||||
# echo "* ${SRC}-${TRG}: ${TRAINSET}" >> ${dir ${LOCAL_TRAIN_SRC}}README.md
|
||||
# echo -n "* ${SRC}-${TRG}: total size = " >> ${dir ${LOCAL_TRAIN_SRC}}README.md
|
||||
# zcat ${CLEAN_TRAIN_SRC} | wc -l >> ${dir ${LOCAL_TRAIN_SRC}}README.md
|
||||
# for d in ${CLEAN_TRAIN_SRC}; do \
|
||||
# l=`zcat $$d | wc -l`; \
|
||||
# if [ $$l -gt 0 ]; then \
|
||||
# echo -n "* ${SRC}-${TRG}: " >> ${dir ${LOCAL_TRAIN_SRC}}README.md; \
|
||||
# echo -n "$$d" | xargs basename | \
|
||||
# sed -e 's#.${SRC}.gz$$##' \
|
||||
# -e 's#.clean$$##'\
|
||||
# -e 's#.${LANGPAIR}$$##' | tr "\n" ' ' >> ${dir ${LOCAL_TRAIN_SRC}}README.md; \
|
||||
# echo " = $$l lines" >> ${dir ${LOCAL_TRAIN_SRC}}README.md; \
|
||||
# fi \
|
||||
# done
|
||||
|
||||
|
||||
|
||||
|
||||
## extract training data but keep some heldout data for each dataset
|
||||
add-to-local-train-and-heldout-data: ${CLEAN_TRAIN_SRC} ${CLEAN_TRAIN_TRG}
|
||||
ifneq (${CLEAN_TRAIN_SRC},)
|
||||
echo -n "* ${LANGPAIR}: ${TRAINSET}, " >> ${dir ${LOCAL_TRAIN_SRC}}README.md
|
||||
zcat ${CLEAN_TRAIN_SRC} | wc -l >> ${dir ${LOCAL_TRAIN_SRC}}README.md
|
||||
ifeq (${USE_BACKTRANS},1)
|
||||
echo -n "* ${LANGPAIR} backtranslations: ${basename ${basename ${BACKTRANS_SRC}}}, " \
|
||||
>> ${dir ${LOCAL_TRAIN_SRC}}README.md
|
||||
zcat ${BACKTRANS_SRC} | wc -l >> ${dir ${LOCAL_TRAIN_SRC}}README.md
|
||||
endif
|
||||
echo -n "* ${SRC}-${TRG}: " >> ${dir ${LOCAL_TRAIN_SRC}}README.md
|
||||
for d in ${CLEAN_TRAIN_SRC}; do \
|
||||
l=`zcat $$d | wc -l`; \
|
||||
if [ $$l -gt 0 ]; then \
|
||||
echo "$$d" | xargs basename | \
|
||||
sed -e 's#.${SRC}.gz$$##' \
|
||||
-e 's#.clean$$##'\
|
||||
-e 's#.${LANGPAIR}$$##' | tr "\n" ' ' >> ${dir ${LOCAL_TRAIN_SRC}}README.md; \
|
||||
echo -n "($$l) " >> ${dir ${LOCAL_TRAIN_SRC}}README.md; \
|
||||
fi \
|
||||
done
|
||||
echo "" >> ${dir ${LOCAL_TRAIN_SRC}}README.md
|
||||
echo -n "* ${SRC}-${TRG}: total size = " >> ${dir ${LOCAL_TRAIN_SRC}}README.md
|
||||
zcat ${CLEAN_TRAIN_SRC} | wc -l >> ${dir ${LOCAL_TRAIN_SRC}}README.md
|
||||
mkdir -p ${HELDOUT_DIR}/${SRC}-${TRG}
|
||||
ifneq (${words ${TRGLANGS}},1)
|
||||
echo "more than one target language";
|
||||
|
@ -3,6 +3,15 @@
|
||||
# some helper functions
|
||||
#------------------------------------------------------------------------
|
||||
|
||||
ALL_DATA_SETS = ${patsubst %.${SRCEXT}.gz,%,${CLEAN_TRAIN_SRC}}
|
||||
|
||||
check-bitext-length:
|
||||
for d in ${ALL_DATA_SETS}; do \
|
||||
if [ `zcat $$d.${SRCEXT}.gz | wc -l` != `zcat $$d.${TRGEXT}.gz | wc -l` ]; then \
|
||||
echo "not the same number of lines in $$d"; \
|
||||
fi \
|
||||
done
|
||||
|
||||
|
||||
## check whether a model is converged or not
|
||||
finished:
|
||||
|
@ -3,12 +3,24 @@
|
||||
# models for Celtic languages
|
||||
#-------------------------------------------------------------------
|
||||
|
||||
# examples:
|
||||
#
|
||||
# make train-celtic-english
|
||||
# make train-bt-celtic-english
|
||||
# make train-pivot-bt-celtic-english
|
||||
#
|
||||
# make HPC_CORES=2 HPC_MEM=8g all-job-pivot-bt-english-celtic.submitcpu
|
||||
# make HPC_CORES=2 HPC_MEM=8g CELTIC_BPESIZE=12000 all-job-pivot-bt-celtic-english.submitcpu
|
||||
|
||||
## only OPUS data
|
||||
|
||||
## reduce vocabulary
|
||||
|
||||
# CELTIC_BPESIZE = 12000
|
||||
CELTIC_BPESIZE = 4000
|
||||
|
||||
|
||||
## only OPUS data
|
||||
|
||||
%-celtic-english-opus:
|
||||
${MAKE} HELDOUTSIZE=0 BPESIZE=${CELTIC_BPESIZE} SRCLANGS="ga cy br gd kw gv" TRGLANGS=en ${@:-celtic-english-opus=}
|
||||
|
||||
|
@ -2,6 +2,9 @@
|
||||
# pivoting - translate training data via pivot models
|
||||
#
|
||||
#
|
||||
# TODO: exclude certain corpora like GNOME, Ubuntu, (bible-uedin) ...
|
||||
# ---> this will basically be the same for all languages
|
||||
#
|
||||
|
||||
PWD := $(shell pwd)
|
||||
|
||||
@ -12,6 +15,15 @@ TRG = se
|
||||
## pivot language
|
||||
PIVOT = nb
|
||||
|
||||
## set EXCLUDE_SELECTED to 1 if you want to exclude only selected corpora
|
||||
## otherwise: exclude all corpora that also include the target language pair
|
||||
EXCLUDE_SELECTED = 0
|
||||
EXCLUDE = bible-uedin DGT GlobalVoices GNOME infopankki KDE4 KDEdoc Tanzil Ubuntu
|
||||
|
||||
## always include those data sets even if they have the same target language pair
|
||||
INCLUDE = Tatoeba OpenSubtitles
|
||||
|
||||
|
||||
## langpair (sorted language IDs) of the original data
|
||||
## to be translated from PIVOT to SRC
|
||||
ORIGINAL_LANGPAIR = ${firstword ${sort ${PIVOT} ${TRG}}}-${lastword ${sort ${PIVOT} ${TRG}}}
|
||||
@ -21,9 +33,25 @@ include ../lib/env.mk
|
||||
include ../lib/config.mk
|
||||
include ../lib/slurm.mk
|
||||
|
||||
|
||||
ORIGINAL_DATADIR ?= ${PWD}/../work/data
|
||||
ORIGINAL_DATASETS_SRC = ${wildcard ${ORIGINAL_DATADIR}/${PRE}/*.${ORIGINAL_LANGPAIR}.clean.${PIVOT}.gz}
|
||||
ORIGINAL_DATASETS_TRG = ${patsubst %.${PIVOT}.gz,%.${TRG}.gz,${ORIGINAL_DATASRC}}
|
||||
|
||||
ifeq (${EXCLUDE_SELECTED},1)
|
||||
EXCLUDE_PATTERN = ${patsubst %,${ORIGINAL_DATADIR}/${PRE}/%.${LANGPAIR}.clean.${TRG}.gz,${EXCLUDE}}
|
||||
EXCLUDE_DATASETS = ${filter-out \
|
||||
${patsubst %,${ORIGINAL_DATADIR}/${PRE}/%.${ORIGINAL_LANGPAIR}.clean.${TRG}.gz,${INCLUDE}},\
|
||||
${patsubst %.${LANGPAIR}.clean.${TRG}.gz,%.${ORIGINAL_LANGPAIR}.clean.${TRG}.gz,\
|
||||
${filter ${EXCLUDE_PATTERN}, \
|
||||
${wildcard ${ORIGINAL_DATADIR}/${PRE}/*.${LANGPAIR}.clean.${TRG}.gz}}}}
|
||||
else
|
||||
EXCLUDE_DATASETS = ${filter-out \
|
||||
${patsubst %,${ORIGINAL_DATADIR}/${PRE}/%.${ORIGINAL_LANGPAIR}.clean.${TRG}.gz,${INCLUDE}},\
|
||||
${patsubst %.${LANGPAIR}.clean.${TRG}.gz,%.${ORIGINAL_LANGPAIR}.clean.${TRG}.gz,\
|
||||
${wildcard ${ORIGINAL_DATADIR}/${PRE}/*.${LANGPAIR}.clean.${TRG}.gz}}}
|
||||
endif
|
||||
ORIGINAL_DATASETS_TRG = ${filter-out ${EXCLUDE_DATASETS},\
|
||||
${wildcard ${ORIGINAL_DATADIR}/${PRE}/*.${ORIGINAL_LANGPAIR}.clean.${TRG}.gz}}
|
||||
ORIGINAL_DATASETS_SRC = ${patsubst %.${TRG}.gz,%.${PIVOT}.gz,${ORIGINAL_DATASETS_TRG}}
|
||||
|
||||
ORIGINAL_DATASRC ?= ${firstword ${ORIGINAL_DATASETS_SRC}}
|
||||
ORIGINAL_DATATRG ?= ${firstword ${ORIGINAL_DATASETS_TRG}}
|
||||
@ -166,6 +194,10 @@ print-all-data:
|
||||
@echo "${ALL_TRANSLATED_LATEST_SRC}"
|
||||
@echo "${ALL_TRANSLATED_LATEST_TRG}"
|
||||
|
||||
print-excludes:
|
||||
@echo ${patsubst %,%.${ORIGINAL_LANGPAIR}.clean.${TRG}.gz,${INCLUDE}}
|
||||
@echo "${EXCLUDE_PATTERN}"
|
||||
@echo "${EXCLUDE_DATASETS}"
|
||||
|
||||
|
||||
## fetch the latest model
|
||||
|
Loading…
Reference in New Issue
Block a user