added some size info to train data README

Joerg Tiedemann 2020-05-17 01:21:57 +03:00
parent 198c779e91
commit 1246bcd271
6 changed files with 125 additions and 11 deletions


@@ -236,6 +236,26 @@ all-and-backtranslate-allwikis: ${WORKDIR}/config.mk
done
done
.PHONY: all-and-backtranslate-allwikiparts
all-and-backtranslate-allwikiparts: ${WORKDIR}/config.mk
${MAKE} data
${MAKE} train
${MAKE} eval
${MAKE} compare
${MAKE} local-dist
-for t in ${TRGLANGS}; do \
for s in ${SRCLANGS}; do \
if [ "$$s" != "$$t" ]; then \
${MAKE} -C backtranslate SRC=$$s TRG=$$t all-wikitext; \
${MAKE} -C backtranslate \
SRC=$$s TRG=$$t \
MAX_SENTENCES=${shell zcat ${TRAIN_SRC}.clean.${PRE_SRC}.gz | head -1000000 | wc -l} \
MODELHOME=${MODELDIR} \
translate-all-wikiparts; \
fi \
done
done
## train a model with backtranslations of wikipedia data
## (1) train a model in the opposite direction and backtranslate wikipedia data
## (2) train a model with backtranslated data
@@ -250,6 +270,12 @@ all-with-bt-all:
${MAKE} SRCLANGS="${TRGLANGS}" TRGLANGS="${SRCLANGS}" all-and-backtranslate-allwikis
${MAKE} all-bt
## and now with all parts of all wikis
.PHONY: all-with-bt-allparts
all-with-bt-allparts:
${MAKE} SRCLANGS="${TRGLANGS}" TRGLANGS="${SRCLANGS}" all-and-backtranslate-allwikiparts
${MAKE} all-bt
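## illustrative only (not part of this commit; language codes are just an example):
## the new target could be invoked as
##
##   make SRCLANGS=fi TRGLANGS=sq all-with-bt-allparts
##
## which first runs all-and-backtranslate-allwikiparts with source and target
## languages swapped and then retrains with the backtranslated data via all-bt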


@@ -101,13 +101,24 @@ UDPIPE_MODEL = ${notdir $(shell ${LOAD_MODULES} find ${UDPIPE_MODELS}/ -name "${
all: index.html
${MAKE} ${WIKI_LATEST_SRC} ${WIKI_LATEST_TRG}
${MAKE} ${WIKI_LATEST_TRG}
${MAKE} ${WIKI_LATEST_SRC}
# WIKISOURCES = wiki wikibooks wikinews wikiquote wikisource wiktionary
WIKISOURCES = wiki wikibooks wikinews wikiquote wikisource
translate-all-wikis: ${LANGPAIR}/${MODELNAME}/decoder.yml
for w in ${WIKISOURCES}; do \
${MAKE} WIKISOURCE=$$w extract-text; \
echo "find ${WIKI_DIR} -name '$$w.${LANGID}.${PART}.gz'"; \
if [ `find ${WIKI_DIR} -name "$$w.${LANGID}.${PART}.gz" | wc -l` -gt 0 ]; then \
echo "${MAKE} WIKISOURCE=$$w translate"; \
${MAKE} WIKISOURCE=$$w translate; \
fi \
done
translate-all-wikiparts: ${LANGPAIR}/${MODELNAME}/decoder.yml
for w in ${WIKISOURCES}; do \
${MAKE} WIKISOURCE=$$w extract-text; \
echo "find ${WIKI_DIR} -name '$$w.${LANGID}.${PART}.gz'"; \
@@ -117,7 +128,18 @@ translate-all-wikis: ${LANGPAIR}/${MODELNAME}/decoder.yml
fi \
done
translate-all-wikis-jobs: ${LANGPAIR}/${MODELNAME}/decoder.yml
for w in ${WIKISOURCES}; do \
${MAKE} WIKISOURCE=$$w extract-text; \
echo "find ${WIKI_DIR} -name '$$w.${LANGID}.${PART}.gz'"; \
if [ `find ${WIKI_DIR} -name "$$w.${LANGID}.${PART}.gz" | wc -l` -gt 0 ]; then \
echo "${MAKE} WIKISOURCE=$$w translate"; \
${MAKE} WIKISOURCE=$$w HPC_MEM=4g HPC_CORES=1 WALLTIME=72 translate.submit; \
fi \
done
translate-all-wikiparts-jobs: ${LANGPAIR}/${MODELNAME}/decoder.yml
for w in ${WIKISOURCES}; do \
${MAKE} WIKISOURCE=$$w extract-text; \
echo "find ${WIKI_DIR} -name '$$w.${LANGID}.${PART}.gz'"; \
@@ -418,7 +440,8 @@ extract-text: ${WIKI_TXT}
extract-doc: ${WIKI_DOC}
prepare-model: ${LANGPAIR}/${MODELNAME}/decoder.yml
prepare-data: ${WIKI_PRE}
translate: ${WIKI_LATEST_SRC} ${WIKI_LATEST_TRG}
translate: ${WIKI_LATEST_TRG}
${MAKE} ${WIKI_LATEST_SRC}
## translate all parts
translate-all-parts: ${LANGPAIR}/${MODELNAME}/decoder.yml


@@ -363,7 +363,7 @@ ${WORKDIR}/config.mk:
if [ $$s -gt 10000000 ]; then \
echo "# ${LANGPAIRSTR} training data bigger than 10 million" > $@; \
echo "GPUJOB_HPC_MEM = 8g" >> $@; \
echo "GPUJOB_SUBMIT = -multipu" >> $@; \
echo "GPUJOB_SUBMIT = -multigpu" >> $@; \
elif [ $$s -gt 1000000 ]; then \
echo "# ${LANGPAIRSTR} training data bigger than 1 million" > $@; \
echo "GPUJOB_HPC_MEM = 8g" >> $@; \


@@ -15,6 +15,23 @@ THREADS ?= ${HPC_CORES}
SKIP_LANGPAIRS ?= "nothing"
## training data size (generates count if not in README.md)
TRAINDATA_SIZE = ${shell \
if [ -e ${WORKDIR}/train/README.md ]; then \
if [ `grep 'total size (${DATASET}):' ${WORKDIR}/train/README.md | wc -l` -gt 0 ]; then \
grep 'total size (${DATASET}):' ${WORKDIR}/train/README.md | cut -f2 -d':' ; \
elif [ -e ${TRAIN_SRC}.clean.${PRE_SRC}.gz ]; then \
echo -n '* total size (${DATASET}): ' >> ${WORKDIR}/train/README.md; \
zcat ${TRAIN_SRC}.clean.${PRE_SRC}.gz | wc -l >> ${WORKDIR}/train/README.md; \
grep 'total size (${DATASET}):' ${WORKDIR}/train/README.md | cut -f2 -d':' ; \
fi \
elif [ -e ${TRAIN_SRC}.clean.${PRE_SRC}.gz ]; then \
echo '\# ${DATASET}' >> ${WORKDIR}/train/README.md; \
echo '' >> ${WORKDIR}/train/README.md; \
echo -n '* total size (${DATASET}): ' >> ${WORKDIR}/train/README.md; \
zcat ${TRAIN_SRC}.clean.${PRE_SRC}.gz | wc -l >> ${WORKDIR}/train/README.md; \
grep 'total size (${DATASET}):' ${WORKDIR}/train/README.md | cut -f2 -d':' ; \
fi }
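## for illustration (dataset name and count are hypothetical): the entry written
## to ${WORKDIR}/train/README.md, and parsed back by the snippet above, looks like
##
##   * total size (opus): 1034567
##
## so TRAINDATA_SIZE expands to the number after the colon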
## look for cleanup scripts and put them into a pipe
## they should be executable and should basically read STDIN and print to STDOUT
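## a minimal sketch of such a pipe (variable and path names are hypothetical;
## the actual lookup code sits outside this hunk):
##
##   CLEANUP_SCRIPTS = ${sort ${wildcard scripts/cleanup/*}}
##   CLEANUP_PIPE    = ${foreach s,${CLEANUP_SCRIPTS},| $s}
##
## so that raw data could be filtered with: zcat $< ${CLEANUP_PIPE} | gzip -c > $@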
@@ -39,6 +56,10 @@ else
BACKTRANS_DIR = backtranslate/${TRG}-${SRC}
endif
## TODO: make it possible to select only parts of the BT data
## ---> use TRAINDATA_SIZE to take max the same amount of all shuffled BT data
BACKTRANS_SRC = ${sort ${wildcard ${BACKTRANS_DIR}/*.${SRCEXT}.gz}}
BACKTRANS_TRG = ${patsubst %.${SRCEXT}.gz,%.${TRGEXT}.gz,${BACKTRANS_SRC}}
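## a minimal sketch of that TODO (rule and file names are hypothetical, not part
## of this commit), assuming TRAINDATA_SIZE is non-empty: join source and target,
## shuffle, and keep at most TRAINDATA_SIZE backtranslated sentence pairs
${BACKTRANS_DIR}/capped.tsv.gz: ${BACKTRANS_SRC} ${BACKTRANS_TRG}
	zcat ${BACKTRANS_SRC} > $@.src
	zcat ${BACKTRANS_TRG} > $@.trg
	paste $@.src $@.trg | ${SHUFFLE} | head -n ${TRAINDATA_SIZE} | gzip -c > $@
	rm -f $@.src $@.trg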
@@ -322,10 +343,12 @@ add-to-local-train-data: ${CLEAN_TRAIN_SRC} ${CLEAN_TRAIN_TRG}
echo ${CLEAN_TRAIN_TRG}; \
fi
ifneq (${CLEAN_TRAIN_SRC},)
echo "* ${LANGPAIR}: ${TRAINSET}" >> ${dir ${LOCAL_TRAIN_SRC}}README.md
echo -n "* ${LANGPAIR}: ${TRAINSET}, " >> ${dir ${LOCAL_TRAIN_SRC}}README.md
zcat ${CLEAN_TRAIN_SRC} | wc -l >> ${dir ${LOCAL_TRAIN_SRC}}README.md
ifeq (${USE_BACKTRANS},1)
echo "* ${LANGPAIR} backtranslations: ${basename ${basename ${dir ${BACKTRANS_SRC}}}}" \
echo -n "* ${LANGPAIR} backtranslations: ${basename ${basename ${dir ${BACKTRANS_SRC}}}}, " \
>> ${dir ${LOCAL_TRAIN_SRC}}README.md
zcat ${BACKTRANS_SRC} | wc -l >> ${dir ${LOCAL_TRAIN_SRC}}README.md
endif
ifneq (${words ${TRGLANGS}},1)
echo "more than one target language";
@@ -345,10 +368,12 @@ endif
## extract training data but keep some heldout data for each dataset
add-to-local-train-and-heldout-data: ${CLEAN_TRAIN_SRC} ${CLEAN_TRAIN_TRG}
ifneq (${CLEAN_TRAIN_SRC},)
echo "* ${LANGPAIR}: ${TRAINSET}" >> ${dir ${LOCAL_TRAIN_SRC}}README.md
echo -n "* ${LANGPAIR}: ${TRAINSET}, " >> ${dir ${LOCAL_TRAIN_SRC}}README.md
zcat ${CLEAN_TRAIN_SRC} | wc -l >> ${dir ${LOCAL_TRAIN_SRC}}README.md
ifeq (${USE_BACKTRANS},1)
echo "* ${LANGPAIR} backtranslations: ${basename ${basename ${BACKTRANS_SRC}}}" \
echo -n "* ${LANGPAIR} backtranslations: ${basename ${basename ${BACKTRANS_SRC}}}, " \
>> ${dir ${LOCAL_TRAIN_SRC}}README.md
zcat ${BACKTRANS_SRC} | wc -l >> ${dir ${LOCAL_TRAIN_SRC}}README.md
endif
mkdir -p ${HELDOUT_DIR}/${SRC}-${TRG}
ifneq (${words ${TRGLANGS}},1)
@@ -421,8 +446,8 @@ ${DEV_SRC}.shuffled.gz:
done \
done
paste ${DEV_SRC} ${DEV_TRG} | ${SHUFFLE} | gzip -c > $@
# paste ${DEV_SRC} ${DEV_TRG} | shuf | gzip -c > $@
echo -n "* total size of shuffled dev data: " >> ${dir ${DEV_SRC}}README.md
zcat $@ | wc -l >> ${dir ${DEV_SRC}}README.md
@@ -486,7 +511,8 @@ ${DEV_TRG}: ${DEV_SRC}
add-to-dev-data: ${CLEAN_DEV_SRC} ${CLEAN_DEV_TRG}
ifneq (${CLEAN_DEV_SRC},)
echo "* ${LANGPAIR}: ${DEVSET}" >> ${dir ${DEV_SRC}}README.md
echo -n "* ${LANGPAIR}: ${DEVSET}, " >> ${dir ${DEV_SRC}}README.md
zcat ${CLEAN_DEV_SRC} | wc -l >> ${dir ${DEV_SRC}}README.md
ifneq (${words ${TRGLANGS}},1)
echo "more than one target language";
zcat ${CLEAN_DEV_SRC} |\
@@ -802,6 +828,9 @@ MAX_NR_TOKENS = 250
rm -f $<.${SRCEXT} $<.${TRGEXT}
mv $@.${SRCEXT} $@
mv $@.${TRGEXT} $(@:.src.clean.${PRE_SRC}=.trg.clean.${PRE_TRG})
echo -n "* total size (${DATASET}): " >> ${dir $@}README.md
cat $@ | wc -l >> ${dir $@}README.md
%.trg.clean.${PRE_TRG}: %.src.clean.${PRE_SRC}
@echo "done!"
@@ -1072,7 +1101,7 @@ endif
## python-based char-counter (seems to be the fastest version)
%.charfreq: %
head -10000000 $< > $<.10m
python -c "import collections, pprint; pprint.pprint(dict(collections.Counter(open('$<.10m', 'r').read())))" > $@
-python -c "import collections, pprint; pprint.pprint(dict(collections.Counter(open('$<.10m', 'r').read())))" > $@
rm -f $<.10m
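## for illustration (file name and counts are made up): "make corpus.fi.charfreq"
## would leave a pretty-printed character-frequency dictionary such as
## {' ': 1203944, 'a': 534210, 'ä': 48211, ...} in corpus.fi.charfreq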
## slow version


@@ -13,3 +13,21 @@
|-----------------------|-------|-------|
| JW300.fi.sq | 32.0 | 0.535 |
# opus-2020-05-16.zip
* dataset: opus
* model: transformer-align
* source language(s): fi
* target language(s): sq
* pre-processing: normalization + SentencePiece (spm32k,spm32k)
* download: [opus-2020-05-16.zip](https://object.pouta.csc.fi/OPUS-MT-models/fi-sq/opus-2020-05-16.zip)
* test set translations: [opus-2020-05-16.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/fi-sq/opus-2020-05-16.test.txt)
* test set scores: [opus-2020-05-16.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/fi-sq/opus-2020-05-16.eval.txt)
## Benchmarks
| testset | BLEU | chr-F |
|-----------------------|-------|-------|
| JW300.fi.sq | 30.7 | 0.533 |


@@ -28,3 +28,21 @@
|-----------------------|-------|-------|
| Tatoeba.fi.tr | 31.6 | 0.619 |
# opus+bt-2020-05-16.zip
* dataset: opus+bt
* model: transformer-align
* source language(s): fi
* target language(s): tr
* pre-processing: normalization + SentencePiece (spm32k,spm32k)
* download: [opus+bt-2020-05-16.zip](https://object.pouta.csc.fi/OPUS-MT-models/fi-tr/opus+bt-2020-05-16.zip)
* test set translations: [opus+bt-2020-05-16.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/fi-tr/opus+bt-2020-05-16.test.txt)
* test set scores: [opus+bt-2020-05-16.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/fi-tr/opus+bt-2020-05-16.eval.txt)
## Benchmarks
| testset | BLEU | chr-F |
|-----------------------|-------|-------|
| Tatoeba.fi.tr | 32.1 | 0.619 |