mirror of
https://github.com/Helsinki-NLP/OPUS-MT-train.git
synced 2024-09-11 20:27:19 +03:00
fixed tatoeba model scripts
This commit is contained in:
parent
e07eb14984
commit
035cca7c1a
@ -265,7 +265,7 @@ ${TRAIN_ALG}: ${TRAIN_SRC}.clean.${PRE_SRC}${TRAINSIZE}.gz \
|
||||
|
||||
## TODO: does this cause the same data to be redone over and over again?
|
||||
##
|
||||
# .INTERMEDIATE: ${LOCAL_TRAIN_SRC} ${LOCAL_TRAIN_TRG} ${LOCAL_TRAIN_SRC}.charfreq ${LOCAL_TRAIN_TRG}.charfreq
|
||||
.INTERMEDIATE: ${LOCAL_TRAIN_SRC} ${LOCAL_TRAIN_TRG} ${LOCAL_TRAIN_SRC}.charfreq ${LOCAL_TRAIN_TRG}.charfreq
|
||||
|
||||
ifeq (${USE_REST_DEVDATA},1)
|
||||
LOCAL_TRAINDATA_DEPENDENCIES = ${DEV_SRC} ${DEV_TRG}
|
||||
|
@ -71,7 +71,8 @@ tatoeba-train:
|
||||
tatoeba-eval:
|
||||
${MAKE} compare-tatoeba
|
||||
|
||||
tatoeba-step0: ${PWD}/work-tatoeba/data/${PRE}/Tatoeba-train.${LANGPAIR}.clean.${SRCEXT}.gz
|
||||
tatoeba-step0: ${PWD}/work-tatoeba/data/${PRE}/Tatoeba-train.${LANGPAIRSTR}.clean.${SRCEXT}.labels
|
||||
tatoeba-step1: ${PWD}/work-tatoeba/data/${PRE}/Tatoeba-train.${LANGPAIR}.clean.${SRCEXT}.gz
|
||||
|
||||
|
||||
## run all language pairs for a given subset
|
||||
@ -129,8 +130,8 @@ tatoeba-%.md:
|
||||
TESTSIZE=10000 \
|
||||
DEVMINSIZE=200 \
|
||||
WORKHOME=${TATOEBA_WORK} \
|
||||
SRCLANGS="${shell cat $<}" \
|
||||
TRGLANGS="${shell cat $(<:.${SRCEXT}.labels=.${TRGEXT}.labels)}" \
|
||||
SRCLANGS="${shell cat $< | sed 's/ *$$//'}" \
|
||||
TRGLANGS="${shell cat $(<:.${SRCEXT}.labels=.${TRGEXT}.labels) | sed 's/ *$$//'}" \
|
||||
LANGPAIRSTR=${LANGPAIRSTR} \
|
||||
EMAIL= \
|
||||
${@:-tatoeba=}
|
||||
@ -141,32 +142,34 @@ ${PWD}/work-tatoeba/data/${PRE}/Tatoeba-train.${LANGPAIRSTR}.clean.${SRCEXT}.lab
|
||||
for t in ${TRGLANGS}; do \
|
||||
if [ "$$s" \< "$$t" ]; then \
|
||||
${MAKE} SRCLANGS=$$s TRGLANGS=$$t \
|
||||
${PWD}/work-tatoeba/data/${PRE}/Tatoeba-train.${LANGPAIR}.clean.${SRCEXT}.gz; \
|
||||
${PWD}/work-tatoeba/data/${PRE}/Tatoeba-train.$$s-$$t.clean.$$s.gz; \
|
||||
fi \
|
||||
done \
|
||||
done
|
||||
if [ ! -e $@ ]; then \
|
||||
for s in ${SRCLANGS}; do \
|
||||
for s in ${SRCLANGS}; do \
|
||||
for t in ${TRGLANGS}; do \
|
||||
if [ -e ${PWD}/work-tatoeba/data/${PRE}/Tatoeba-train.${LANGPAIR}.clean.${SRCEXT}.labels ]; then \
|
||||
cat ${PWD}/work-tatoeba/data/${PRE}/Tatoeba-train.${LANGPAIR}.clean.${SRCEXT}.labels \
|
||||
if [ -e ${PWD}/work-tatoeba/data/${PRE}/Tatoeba-train.$$s-$$t.clean.$$s.labels ]; then \
|
||||
cat ${PWD}/work-tatoeba/data/${PRE}/Tatoeba-train.$$s-$$t.clean.$$s.labels \
|
||||
>> $@.src; \
|
||||
elif [ -e ${PWD}/work-tatoeba/data/${PRE}/Tatoeba-train.$$t-$$s.clean.$$s.labels ]; then \
|
||||
cat ${PWD}/work-tatoeba/data/${PRE}/Tatoeba-train.$$t-$$s.clean.$$s.labels \
|
||||
>> $@.src; \
|
||||
fi \
|
||||
done \
|
||||
done \
|
||||
fi
|
||||
if [ ! -e $(@:.${SRCEXT}.labels=.${TRGEXT}.labels) ]; then \
|
||||
for s in ${SRCLANGS}; do \
|
||||
done
|
||||
for s in ${SRCLANGS}; do \
|
||||
for t in ${TRGLANGS}; do \
|
||||
if [ -e ${PWD}/work-tatoeba/data/${PRE}/Tatoeba-train.${LANGPAIR}.clean.${TRGEXT}.labels ]; then \
|
||||
cat ${PWD}/work-tatoeba/data/${PRE}/Tatoeba-train.${LANGPAIR}.clean.${TRGEXT}.labels \
|
||||
if [ -e ${PWD}/work-tatoeba/data/${PRE}/Tatoeba-train.$$s-$$t.clean.$$t.labels ]; then \
|
||||
cat ${PWD}/work-tatoeba/data/${PRE}/Tatoeba-train.$$s-$$t.clean.$$t.labels \
|
||||
>> $@.trg; \
|
||||
elif [ -e ${PWD}/work-tatoeba/data/${PRE}/Tatoeba-train.$$t-$$s.clean.$$t.labels ]; then \
|
||||
cat ${PWD}/work-tatoeba/data/${PRE}/Tatoeba-train.$$t-$$s.clean.$$t.labels \
|
||||
>> $@.trg; \
|
||||
fi \
|
||||
done \
|
||||
done \
|
||||
fi
|
||||
cat $@.src | tr ' ' "\n" | sort -u | tr "\n" ' ' > $@
|
||||
cat $@.trg | tr ' ' "\n" | sort -u | tr "\n" ' ' > $(@:.${SRCEXT}.labels=.${TRGEXT}.labels)
|
||||
done
|
||||
cat $@.src | tr ' ' "\n" | sort -u | tr "\n" ' ' | sed 's/ *$$//' > $@
|
||||
cat $@.trg | tr ' ' "\n" | sort -u | tr "\n" ' ' | sed 's/ *$$//' > $(@:.${SRCEXT}.labels=.${TRGEXT}.labels)
|
||||
rm -f $@.src $@.trg
|
||||
|
||||
|
||||
@ -242,6 +245,13 @@ FIXLANGIDS = | sed 's/zho\(\)_HK/yue\1/;s/zho\(\)_CN/cmn\1/;s/zho\(\)_TW/cmn\1/
|
||||
#######################################
|
||||
# make data sets for individual
|
||||
# language pairs from the Tatoeba data
|
||||
# TODO: now we only grep for langpairs
|
||||
# available in test data
|
||||
# --> should we also include other
|
||||
# training data with a dummy label?
|
||||
# --> how do we efficiently grep for
|
||||
# everything that is not one of the langpairs?
|
||||
# grep -v and a big list of alternative lang-pairs ...
|
||||
#######################################
|
||||
for s in `cat $(@:.${SRCEXT}.gz=.${SRCEXT}.labels)`; do \
|
||||
for t in `cat $(@:.${SRCEXT}.gz=.${TRGEXT}.labels)`; do \
|
||||
|
@ -197,15 +197,16 @@ endif
|
||||
# awk '!/^$$/{a[$$0]++}END{for (i in a)print i,a[i];}' > $@
|
||||
|
||||
## python-based char-counter (seems to be the fastest version)
|
||||
## restrict to 1 million lines
|
||||
%.charfreq: %
|
||||
head -10000000 $< > $<.10m
|
||||
-python -c "import collections, pprint; pprint.pprint(dict(collections.Counter(open('$<.10m', 'r').read())))" > $@
|
||||
rm -f $<.10m
|
||||
head -1000000 $< > $<.1m
|
||||
-python -c "import collections, pprint; pprint.pprint(dict(collections.Counter(open('$<.1m', 'r').read())))" > $@
|
||||
rm -f $<.1m
|
||||
|
||||
%.charfreq: %.gz
|
||||
${GZIP} -cd < $< | head -10000000 > $<.10m
|
||||
-python -c "import collections, pprint; pprint.pprint(dict(collections.Counter(open('$<.10m', 'r').read())))" > $@
|
||||
rm -f $<.10m
|
||||
${GZIP} -cd < $< | head -1000000 > $<.1m
|
||||
-python -c "import collections, pprint; pprint.pprint(dict(collections.Counter(open('$<.1m', 'r').read())))" > $@
|
||||
rm -f $<.1m
|
||||
|
||||
|
||||
## slow version
|
||||
|
Loading…
Reference in New Issue
Block a user