tatoeba challenge model scripts updated

This commit is contained in:
Joerg Tiedemann 2020-06-06 20:49:54 +03:00
parent edaf361803
commit 6cb9959e82
8 changed files with 241 additions and 67 deletions

View File

@ -15,7 +15,8 @@ make SRCLANGS=afr TRGLANGS=epo tatoeba-train
make SRCLANGS=afr TRGLANGS=epo tatoeba-eval
```
## Start job for a single language pair in both directions
## Start job for a single language pair
For example, for Afrikaans-Esperanto:
@ -23,6 +24,13 @@ For example, for Afrikaans-Esperanto:
make SRCLANGS=afr TRGLANGS=epo tatoeba-job
```
You can also initiate jobs for translation models in both directions:
```
make SRCLANGS=afr TRGLANGS=epo tatoeba-bidirectional-job
```
## Start jobs for all pairs in an entire subset

View File

@ -32,12 +32,12 @@ BPETRGMODEL = ${WORKDIR}/train/${BPEMODELNAME}.trg.bpe${TRGBPESIZE:000=}k-model
${BPESRCMODEL}:
${MAKE} ${LOCAL_TRAIN_SRC}
mkdir -p ${dir $@}
ifeq ($(TRGLANGS),${firstword ${TRGLANGS}})
python3 ${SNMTPATH}/learn_bpe.py -s $(SRCBPESIZE) < ${LOCAL_TRAIN_SRC} > $@
else
ifeq (${USE_TARGET_LABELS},1)
cut -f2- -d ' ' ${LOCAL_TRAIN_SRC} > ${LOCAL_TRAIN_SRC}.text
python3 ${SNMTPATH}/learn_bpe.py -s $(SRCBPESIZE) < ${LOCAL_TRAIN_SRC}.text > $@
rm -f ${LOCAL_TRAIN_SRC}.text
else
python3 ${SNMTPATH}/learn_bpe.py -s $(SRCBPESIZE) < ${LOCAL_TRAIN_SRC} > $@
endif
@ -50,15 +50,16 @@ ${BPETRGMODEL}:
python3 ${SNMTPATH}/learn_bpe.py -s $(TRGBPESIZE) < ${LOCAL_TRAIN_TRG} > $@
#
%.src.bpe${SRCBPESIZE:000=}k: %.src ${BPESRCMODEL}
ifeq ($(TRGLANGS),${firstword ${TRGLANGS}})
python3 ${SNMTPATH}/apply_bpe.py -c $(word 2,$^) < $< > $@
else
ifeq (${USE_TARGET_LABELS},1)
cut -f1 -d ' ' $< > $<.labels
cut -f2- -d ' ' $< > $<.txt
python3 ${SNMTPATH}/apply_bpe.py -c $(word 2,$^) < $<.txt > $@.txt
paste -d ' ' $<.labels $@.txt > $@
rm -f $<.labels $<.txt $@.txt
else
python3 ${SNMTPATH}/apply_bpe.py -c $(word 2,$^) < $< > $@
endif
%.trg.bpe${TRGBPESIZE:000=}k: %.trg ${BPETRGMODEL}

View File

@ -29,6 +29,19 @@ TRGLANGS ?= fi
SRC ?= ${firstword ${SRCLANGS}}
TRG ?= ${lastword ${TRGLANGS}}
## set SHUFFLE_DATA if you want to shuffle data for
## each language pair to be added to the training data
## --> especially useful in connection with FIT_DATA_SIZE
##
# SHUFFLE_DATA = 1
## set FIT_DATA_SIZE to a specific value to fit the training data
## to a certain number of lines for each language pair in the collection
## --> especially useful for multilingual models for balancing
##     the size for each language pair
## the script does both, over- and undersampling
##
# FIT_DATA_SIZE = 100000
# sorted languages and langpair used to match resources in OPUS
@ -56,6 +69,12 @@ else
TRGEXT = ${TRG}
endif
## set a flag to use target language labels
## in multi-target models
ifneq (${words ${TRGLANGS}},1)
USE_TARGET_LABELS = 1
endif
## set additional argument options for opus_read (if it is used)
## e.g. OPUSREAD_ARGS = -a certainty -tr 0.3
@ -374,10 +393,17 @@ LARGEST_TRAINSIZE = 10000000
${WORKDIR}/config.mk:
mkdir -p ${dir $@}
if [ -e ${TRAIN_SRC}.clean.${PRE_SRC}${TRAINSIZE}.gz ]; then \
${MAKE} ${TRAIN_SRC}.clean.${PRE_SRC}${TRAINSIZE}.charfreq \
${TRAIN_TRG}.clean.${PRE_TRG}${TRAINSIZE}.charfreq; \
s=`zcat ${TRAIN_SRC}.clean.${PRE_SRC}${TRAINSIZE}.gz | head -10000001 | wc -l`; \
S=`cat ${TRAIN_SRC}.clean.${PRE_SRC}${TRAINSIZE}.charfreq | wc -l`; \
T=`cat ${TRAIN_TRG}.clean.${PRE_TRG}${TRAINSIZE}.charfreq | wc -l`; \
else \
${MAKE} ${LOCAL_TRAIN_SRC}; \
${MAKE} ${LOCAL_TRAIN_SRC}.charfreq ${LOCAL_TRAIN_TRG}.charfreq; \
s=`head -10000001 ${LOCAL_TRAIN_SRC} | wc -l`; \
S=`cat ${LOCAL_TRAIN_SRC}.charfreq | wc -l`; \
T=`cat ${LOCAL_TRAIN_TRG}.charfreq | wc -l`; \
rm -f ${LOCAL_TRAIN_SRC} ${LOCAL_TRAIN_TRG}; \
fi; \
if [ $$s -gt ${LARGEST_TRAINSIZE} ]; then \
@ -421,5 +447,14 @@ ${WORKDIR}/config.mk:
echo "DEVMINSIZE = 100" >> $@; \
else \
echo "${LANGPAIRSTR} too small"; \
fi; \
if [ -e $@ ]; then \
if [ $$S -gt 1000 ]; then \
echo "SRCBPESIZE = 32000" >> $@; \
fi; \
if [ $$T -gt 1000 ]; then \
echo "TRGBPESIZE = 32000" >> $@; \
fi; \
fi

View File

@ -90,7 +90,6 @@ DATA_SRC := ${sort ${CLEAN_TRAIN_SRC} ${CLEAN_DEV_SRC} ${CLEAN_TEST_SRC}}
DATA_TRG := ${sort ${CLEAN_TRAIN_TRG} ${CLEAN_DEV_TRG} ${CLEAN_TEST_TRG}}
##-------------------------------------------------------------
## make data in reverse direction without re-doing word alignment etc ...
## ---> this is dangerous when things run in parallel
@ -334,11 +333,10 @@ ifneq (${wildcard ${CLEAN_TRAIN_SRC}},)
echo -n "* ${SRC}-${TRG}: total size = " >> ${dir ${LOCAL_TRAIN_SRC}}README.md
${GZIP} -cd < ${wildcard ${CLEAN_TRAIN_SRC}} | wc -l >> ${dir ${LOCAL_TRAIN_SRC}}README.md
######################################
# multiple target languages?
# --> add language labels
# do we need to add target language labels?
######################################
ifneq (${words ${TRGLANGS}},1)
echo "more than one target language";
ifeq (${USE_TARGET_LABELS},1)
echo "set target language labels";
${GZIP} -cd < ${wildcard ${CLEAN_TRAIN_SRC}} |\
sed "s/^/>>${TRG}<< /" > ${LOCAL_TRAIN_SRC}.src
else
@ -346,17 +344,23 @@ else
${GZIP} -cd < ${wildcard ${CLEAN_TRAIN_SRC}} > ${LOCAL_TRAIN_SRC}.src
endif
${GZIP} -cd < ${wildcard ${CLEAN_TRAIN_TRG}} > ${LOCAL_TRAIN_TRG}.trg
endif
######################################
# FIT_DATA_SIZE is set?
# --> shuffle data and fit the
# data sets to a specific size
# SHUFFLE_DATA is set?
# --> shuffle data for each langpair
# --> do this when FIT_DATA_SIZE is set!
######################################
ifdef FIT_DATA_SIZE
ifdef SHUFFLE_DATA
paste ${LOCAL_TRAIN_SRC}.src ${LOCAL_TRAIN_TRG}.trg | ${SHUFFLE} > ${LOCAL_TRAIN_SRC}.shuffled
cut -f1 ${LOCAL_TRAIN_SRC}.shuffled > ${LOCAL_TRAIN_SRC}.src
cut -f2 ${LOCAL_TRAIN_SRC}.shuffled > ${LOCAL_TRAIN_TRG}.trg
rm -f ${LOCAL_TRAIN_SRC}.shuffled
endif
######################################
# FIT_DATA_SIZE is set?
# --> fit data to specific size
# --> under/over sampling!
######################################
ifdef FIT_DATA_SIZE
scripts/fit-data-size.pl ${FIT_DATA_SIZE} ${LOCAL_TRAIN_SRC}.src >> ${LOCAL_TRAIN_SRC}
scripts/fit-data-size.pl ${FIT_DATA_SIZE} ${LOCAL_TRAIN_TRG}.trg >> ${LOCAL_TRAIN_TRG}
else
@ -364,7 +368,7 @@ else
cat ${LOCAL_TRAIN_TRG}.trg >> ${LOCAL_TRAIN_TRG}
endif
rm -f ${LOCAL_TRAIN_SRC}.src ${LOCAL_TRAIN_TRG}.trg
endif
@ -467,7 +471,7 @@ add-to-dev-data: ${CLEAN_DEV_SRC} ${CLEAN_DEV_TRG}
ifneq (${wildcard ${CLEAN_DEV_SRC}},)
echo -n "* ${LANGPAIR}: ${DEVSET}, " >> ${dir ${DEV_SRC}}README.md
${GZIP} -cd < ${CLEAN_DEV_SRC} | wc -l >> ${dir ${DEV_SRC}}README.md
ifneq (${words ${TRGLANGS}},1)
ifeq (${USE_TARGET_LABELS},1)
echo "more than one target language";
${GZIP} -cd < ${CLEAN_DEV_SRC} |\
sed "s/^/>>${TRG}<< /" >> ${DEV_SRC}
@ -538,7 +542,7 @@ ${TEST_TRG}: ${TEST_SRC}
add-to-test-data: ${CLEAN_TEST_SRC}
ifneq (${wildcard ${CLEAN_TEST_SRC}},)
echo "* ${LANGPAIR}: ${TESTSET}" >> ${dir ${TEST_SRC}}README.md
ifneq (${words ${TRGLANGS}},1)
ifeq (${USE_TARGET_LABELS},1)
echo "more than one target language";
${GZIP} -cd < ${CLEAN_TEST_SRC} |\
sed "s/^/>>${TRG}<< /" >> ${TEST_SRC}

View File

@ -133,3 +133,4 @@ ifeq (${SHUFFLE},)
endif
GZIP := ${shell which pigz 2>/dev/null}
GZIP ?= gzip
ZCAT = ${GZIP} -cd <

View File

@ -12,9 +12,11 @@
# make SRCLANGS=afr TRGLANGS=epo tatoeba-eval
#
#
# start job for a single language pair in both directions, for example:
# start job for a single language pair in one direction or
# in both directions, for example:
#
# make SRCLANGS=afr TRGLANGS=epo tatoeba-job
# make SRCLANGS=afr TRGLANGS=epo tatoeba-bidirectional-job
#
#
# start jobs for all pairs in an entire subset:
@ -49,12 +51,18 @@ print-langs:
tatoeba-job:
${MAKE} tatoeba-prepare
${MAKE} all-job-tatoeba
tatoeba-bidirectional-job:
${MAKE} tatoeba-prepare
${MAKE} all-job-tatoeba
ifneq (${SRCLANGS},${TRGLANGS})
${MAKE} reverse-data-tatoeba
${MAKE} SRCLANGS="${TRGLANGS}" TRGLANGS="${SRCLANGS}" tatoeba-prepare
${MAKE} SRCLANGS="${TRGLANGS}" TRGLANGS="${SRCLANGS}" all-job-tatoeba
endif
tatoeba-prepare:
${MAKE} clean-data-tatoeba
${MAKE} local-config-tatoeba
${MAKE} data-tatoeba
@ -70,7 +78,7 @@ tatoeba-subset-%: tatoeba-%.md
for l in `grep '\[' $< | cut -f2 -d '[' | cut -f1 -d ']'`; do \
s=`echo $$l | cut -f1 -d '-'`; \
t=`echo $$l | cut -f2 -d '-'`; \
${MAKE} SRCLANGS=$$s TRGLANGS=$$t tatoeba-job; \
${MAKE} SRCLANGS=$$s TRGLANGS=$$t tatoeba-bidirectional-job; \
done
## set FIT_DATA_SIZE for under/over-sampling of data!
@ -103,11 +111,16 @@ tatoeba-%.md:
tttt:
echo ${PWD}/work-tatoeba/data/${PRE}/Tatoeba-train.${LANGPAIRSTR}.clean.${SRCEXT}.labels
echo ${PWD}/work-tatoeba/data/${PRE}/Tatoeba-train.${LANGPAIRSTR}.clean.${TRGEXT}.labels
## generic target for tatoeba challenge jobs
%-tatoeba: ${PWD}/work-tatoeba/data/${PRE}/Tatoeba-train.${LANGPAIR}.clean.${SRCEXT}.gz
echo $<
# %-tatoeba: ${PWD}/work-tatoeba/data/${PRE}/Tatoeba-train.${LANGPAIR}.clean.${SRCEXT}.gz
%-tatoeba: ${PWD}/work-tatoeba/data/${PRE}/Tatoeba-train.${LANGPAIRSTR}.clean.${SRCEXT}.labels \
${PWD}/work-tatoeba/data/${PRE}/Tatoeba-train.${LANGPAIRSTR}.clean.${TRGEXT}.labels
${MAKE} TRAINSET=Tatoeba-train \
DEVSET=Tatoeba-dev \
TESTSET=Tatoeba-test \
@ -119,10 +132,49 @@ tatoeba-%.md:
TESTSIZE=10000 \
DEVMINSIZE=200 \
WORKHOME=${TATOEBA_WORK} \
SRCLANGS="${shell cat $(word 1,$^)}" \
TRGLANGS="${shell cat $(word 2,$^)}" \
LANGPAIRSTR=${LANGPAIRSTR} \
EMAIL= \
${@:-tatoeba=}
${PWD}/work-tatoeba/data/${PRE}/Tatoeba-train.${LANGPAIRSTR}.clean.${SRCEXT}.labels:
for s in ${SRCLANGS}; do \
for t in ${TRGLANGS}; do \
if [ "$$s" \< "$$t" ]; then \
${MAKE} SRCLANGS=$$s TRGLANGS=$$t \
${PWD}/work-tatoeba/data/${PRE}/Tatoeba-train.${LANGPAIR}.clean.${SRCEXT}.gz; \
fi \
done \
done
if [ ! -e $@ ]; then \
for s in ${SRCLANGS}; do \
for t in ${TRGLANGS}; do \
if [ -e ${PWD}/work-tatoeba/data/${PRE}/Tatoeba-train.${LANGPAIR}.clean.${SRCEXT}.labels ]; then \
cat ${PWD}/work-tatoeba/data/${PRE}/Tatoeba-train.${LANGPAIR}.clean.${SRCEXT}.labels \
>> $@; \
fi \
done \
done \
fi
if [ ! -e $(@:.${SRCEXT}.labels=.${TRGEXT}.labels) ]; then \
for s in ${SRCLANGS}; do \
for t in ${TRGLANGS}; do \
if [ -e ${PWD}/work-tatoeba/data/${PRE}/Tatoeba-train.${LANGPAIR}.clean.${TRGEXT}.labels ]; then \
cat ${PWD}/work-tatoeba/data/${PRE}/Tatoeba-train.${LANGPAIR}.clean.${TRGEXT}.labels \
>> $(@:.${SRCEXT}.labels=.${TRGEXT}.labels); \
fi \
done \
done \
fi
%.${LANGPAIRSTR}.clean.${SRCEXT}.labels: %.${LANGPAIRSTR}.clean.${SRCEXT}.labels
echo "done"
## don't delete those files
.SECONDARY: ${TATOEBA_WORK}/data/${PRE}/Tatoeba-train.${LANGPAIR}.clean.${SRCEXT}.gz \
@ -133,46 +185,101 @@ tatoeba-%.md:
${TATOEBA_WORK}/data/${PRE}/Tatoeba-test.${LANGPAIR}.clean.${TRGEXT}.gz
BASIC_FILTERS = | perl -CS -pe 'tr[\x{9}\x{A}\x{D}\x{20}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}][]cd;' \
| perl -CS -pe 's/\&\s*\#\s*160\s*\;/ /g' \
| $(TOKENIZER)/remove-non-printing-char.perl \
| $(TOKENIZER)/deescape-special-chars.perl
## TODO: should we add $(TOKENIZER)/replace-unicode-punctuation.perl ?
## TODO: add this? sed 's/_/ /g'
## this sed line from https://github.com/aboSamoor/polyglot/issues/71 does not seem to work
# | sed 's/[\00\01\02\03\04\05\06\07\08\0b\0e\0f\10\11\12\13\14\15\16\17\18\19\1a\1b\1c\1d\1e\1f\7f\80\81\82\83\84\85\86\87\88\89\8a\8b\8c\8d\8e\8f\90\91\92\93\94\95\96\97\98\99\9a\9b\9c\9d\9e\9f]//g'
## modify language IDs in training data to adjust them to test sets
## --> fix codes for chinese
## --> take away regional codes
## --> take away script extension that may come with some codes
FIXLANGIDS = | sed 's/zho\(\)_HK/yue\1/;s/zho\(\)_CN/cmn\1/;s/zho\(\)_TW/cmn\1/;' \
| sed 's/\_[A-Z][A-Z]//' \
| sed 's/\-[a-z]*//'
## convert Tatoeba Challenge data into the format we need
## - move the data into the right location with the suitable name
## - create devset if not given (part of training data)
## - divide into individual language pairs
## (if there is more than one language pair in the collection)
##
## TODO: should we do some filtering like bitext-match, OPUS-filter ...
%/Tatoeba-train.${LANGPAIR}.clean.${SRCEXT}.gz:
mkdir -p $@.d
wget -q -O $@.d/train.tar ${TATOEBA_DATA}/${LANGPAIR}.tar
tar -C $@.d -xf $@.d/train.tar
${GZIP} -c < $@.d/data/${LANGPAIR}/test.src > ${dir $@}Tatoeba-test.${LANGPAIR}.clean.${SRCEXT}.gz
${GZIP} -c < $@.d/data/${LANGPAIR}/test.trg > ${dir $@}Tatoeba-test.${LANGPAIR}.clean.${TRGEXT}.gz
${GZIP} -cd < $@.d/data/${LANGPAIR}/train.src.gz ${BASIC_FILTERS} > $@.1
${GZIP} -cd < $@.d/data/${LANGPAIR}/train.trg.gz ${BASIC_FILTERS} > $@.2
paste $@.1 $@.2 | scripts/filter/bitext-match-lang.py -s ${SRC} -t ${TRG} > $@.bitext
rm -f $@.1 $@.2
mv $@.d/data/${LANGPAIR}/test.src ${dir $@}Tatoeba-test.${LANGPAIR}.clean.${SRCEXT}
mv $@.d/data/${LANGPAIR}/test.trg ${dir $@}Tatoeba-test.${LANGPAIR}.clean.${TRGEXT}
mv $@.d/data/${LANGPAIR}/test.id ${dir $@}Tatoeba-test.${LANGPAIR}.clean.id
if [ -e $@.d/data/${LANGPAIR}/dev.src ]; then \
${GZIP} -c < $@.d/data/${LANGPAIR}/dev.src > ${dir $@}Tatoeba-dev.${LANGPAIR}.clean.${SRCEXT}.gz; \
${GZIP} -c < $@.d/data/${LANGPAIR}/dev.trg > ${dir $@}Tatoeba-dev.${LANGPAIR}.clean.${TRGEXT}.gz; \
cut -f1 $@.bitext | ${GZIP} -c > ${dir $@}Tatoeba-train.${LANGPAIR}.clean.${SRCEXT}.gz; \
cut -f2 $@.bitext | ${GZIP} -c > ${dir $@}Tatoeba-train.${LANGPAIR}.clean.${TRGEXT}.gz; \
mv $@.d/data/${LANGPAIR}/dev.src > ${dir $@}Tatoeba-dev.${LANGPAIR}.clean.${SRCEXT}; \
mv $@.d/data/${LANGPAIR}/dev.trg > ${dir $@}Tatoeba-dev.${LANGPAIR}.clean.${TRGEXT}; \
mv $@.d/data/${LANGPAIR}/dev.id > ${dir $@}Tatoeba-dev.${LANGPAIR}.clean.id; \
${ZCAT} $@.d/data/${LANGPAIR}/train.src.gz > ${dir $@}Tatoeba-train.${LANGPAIR}.clean.${SRCEXT}; \
${ZCAT} $@.d/data/${LANGPAIR}/train.trg.gz > ${dir $@}Tatoeba-train.${LANGPAIR}.clean.${TRGEXT}; \
${ZCAT} $@.d/data/${LANGPAIR}/train.id.gz | cut -f2,3 $(FIXLANGIDS) > ${dir $@}Tatoeba-train.${LANGPAIR}.clean.id; \
else \
echo "no devdata available - get top 1000 from training data!"; \
cut -f1 $@.bitext | head -1000 | ${GZIP} -c > ${dir $@}Tatoeba-dev.${LANGPAIR}.clean.${SRCEXT}.gz; \
cut -f2 $@.bitext | head -1000 | ${GZIP} -c > ${dir $@}Tatoeba-dev.${LANGPAIR}.clean.${TRGEXT}.gz; \
cut -f1 $@.bitext | tail -n +1001 | ${GZIP} -c > ${dir $@}Tatoeba-train.${LANGPAIR}.clean.${SRCEXT}.gz; \
cut -f2 $@.bitext | tail -n +1001 | ${GZIP} -c > ${dir $@}Tatoeba-train.${LANGPAIR}.clean.${TRGEXT}.gz; \
${ZCAT} $@.d/data/${LANGPAIR}/train.src.gz | head -1000 > ${dir $@}Tatoeba-dev.${LANGPAIR}.clean.${SRCEXT}; \
${ZCAT} $@.d/data/${LANGPAIR}/train.trg.gz | head -1000 > ${dir $@}Tatoeba-dev.${LANGPAIR}.clean.${TRGEXT}; \
${ZCAT} $@.d/data/${LANGPAIR}/train.id.gz | head -1000 | cut -f2,3 $(FIXLANGIDS) > ${dir $@}Tatoeba-dev.${LANGPAIR}.clean.id; \
${ZCAT} $@.d/data/${LANGPAIR}/train.src.gz | tail -n +1001 > ${dir $@}Tatoeba-train.${LANGPAIR}.clean.${SRCEXT}; \
${ZCAT} $@.d/data/${LANGPAIR}/train.trg.gz | tail -n +1001 > ${dir $@}Tatoeba-train.${LANGPAIR}.clean.${TRGEXT}; \
${ZCAT} $@.d/data/${LANGPAIR}/train.id.gz | tail -n +1001 | cut -f2,3 $(FIXLANGIDS) > ${dir $@}Tatoeba-train.${LANGPAIR}.clean.id; \
fi
rm -f $@.bitext
cut -f1 ${dir $@}Tatoeba-*.${LANGPAIR}.clean.id | sort -u | tr "\n" ' ' > $(@:.${SRCEXT}.gz=.${SRCEXT}.labels)
cut -f2 ${dir $@}Tatoeba-*.${LANGPAIR}.clean.id | sort -u | tr "\n" ' ' > $(@:.${SRCEXT}.gz=.${TRGEXT}.labels)
rm -f $@.d/data/${LANGPAIR}/*
rmdir $@.d/data/${LANGPAIR}
rmdir $@.d/data
rm -f $@.d/train.tar
rmdir $@.d
#######################################
# make data sets for individual
# language pairs from the Tatoeba data
#######################################
for s in `cat $(@:.${SRCEXT}.gz=.${SRCEXT}.labels)`; do \
for t in `cat $(@:.${SRCEXT}.gz=.${TRGEXT}.labels)`; do \
if [ "$$s" \< "$$t" ]; then \
echo "extract $$s-$$t data"; \
for d in dev test train; do \
paste ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.id \
${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${SRCEXT} \
${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${TRGEXT} |\
grep -P "$$s\t$$t\t" > ${dir $@}Tatoeba-$$d.$$s-$$t; \
if [ -s ${dir $@}Tatoeba-$$d.$$s-$$t ]; then \
cut -f3 ${dir $@}Tatoeba-$$d.$$s-$$t | ${GZIP} -c > ${dir $@}Tatoeba-$$d.$$s-$$t.clean.$$s.gz; \
cut -f4 ${dir $@}Tatoeba-$$d.$$s-$$t | ${GZIP} -c > ${dir $@}Tatoeba-$$d.$$s-$$t.clean.$$t.gz; \
fi; \
rm -f ${dir $@}Tatoeba-$$d.$$s-$$t; \
done \
else \
echo "extract $$t-$$s data"; \
for d in dev test train; do \
paste ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.id \
${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${TRGEXT} \
${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${SRCEXT} |\
grep -P "$$s\t$$t\t" > ${dir $@}Tatoeba-$$d.$$t-$$s; \
if [ -s ${dir $@}Tatoeba-$$d.$$t-$$s ]; then \
cut -f3 ${dir $@}Tatoeba-$$d.$$t-$$s | ${GZIP} -c > ${dir $@}Tatoeba-$$d.$$t-$$s.clean.$$t.gz; \
cut -f4 ${dir $@}Tatoeba-$$d.$$t-$$s | ${GZIP} -c > ${dir $@}Tatoeba-$$d.$$t-$$s.clean.$$s.gz; \
fi; \
rm -f ${dir $@}Tatoeba-$$d.$$t-$$s; \
done \
fi \
done \
done
#######################################
# finally, compress the big datafiles
# and cleanup
#######################################
for d in dev test train; do \
if [ ! -e ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${SRCEXT}.gz ]; then \
${GZIP} -f ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${SRCEXT}; \
${GZIP} -f ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${TRGEXT}; \
else \
rm -f ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${SRCEXT}; \
rm -f ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${TRGEXT}; \
fi; \
rm -f ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.id; \
done
%/Tatoeba-train.${LANGPAIR}.clean.${TRGEXT}.gz: %/Tatoeba-train.${LANGPAIR}.clean.${SRCEXT}.gz
echo "done!"

View File

@ -27,10 +27,10 @@ GENERATE_SPM_VOC = 0
${SPMSRCMODEL}:
${MAKE} ${LOCAL_TRAIN_SRC}
mkdir -p ${dir $@}
ifeq ($(TRGLANGS),${firstword ${TRGLANGS}})
grep . ${LOCAL_TRAIN_SRC} | ${SHUFFLE} > ${LOCAL_TRAIN_SRC}.text
else
ifeq (${USE_TARGET_LABELS},1)
cut -f2- -d ' ' ${LOCAL_TRAIN_SRC} | grep . | ${SHUFFLE} > ${LOCAL_TRAIN_SRC}.text
else
grep . ${LOCAL_TRAIN_SRC} | ${SHUFFLE} > ${LOCAL_TRAIN_SRC}.text
endif
${MAKE} ${LOCAL_TRAIN_SRC}.charfreq
if [ `cat ${LOCAL_TRAIN_SRC}.charfreq | wc -l` -gt 1000 ]; then \
@ -175,6 +175,12 @@ endif
-python -c "import collections, pprint; pprint.pprint(dict(collections.Counter(open('$<.10m', 'r').read())))" > $@
rm -f $<.10m
%.charfreq: %.gz
${GZIP} -cd < $< | head -10000000 > $<.10m
-python -c "import collections, pprint; pprint.pprint(dict(collections.Counter(open('$<.10m', 'r').read())))" > $@
rm -f $<.10m
## slow version
%.charfreq2: %
head -10000000 $< |\
@ -189,14 +195,14 @@ endif
## see https://github.com/google/sentencepiece#c-from-source
%.src.spm${SRCBPESIZE:000=}k: %.src ${SPMSRCMODEL}
ifeq ($(TRGLANGS),${firstword ${TRGLANGS}})
${SPM_HOME}/spm_encode --model $(word 2,$^) < $< > $@
else
ifeq (${USE_TARGET_LABELS},1)
cut -f1 -d ' ' $< > $<.labels
cut -f2- -d ' ' $< > $<.txt
${SPM_HOME}/spm_encode --model $(word 2,$^) < $<.txt > $@.txt
paste -d ' ' $<.labels $@.txt > $@
rm -f $<.labels $<.txt $@.txt
else
${SPM_HOME}/spm_encode --model $(word 2,$^) < $< > $@
endif
%.trg.spm${TRGBPESIZE:000=}k: %.trg ${SPMTRGMODEL}

View File

@ -14,6 +14,8 @@ parser.add_argument('-t','--trglang','--target-language', type=str, default='de'
help='accepted language')
parser.add_argument('-l','--supported','--supported-languages', action='store_true',
help='list all supported languages')
parser.add_argument('-f','--print-flag','--print-accept-flag', action='store_true',
help='print only a flag about acceptance')
parser.add_argument('-c','--checklang','--check-language-support', action='store_true',
help='show whether languages are supported')
parser.add_argument('-v','--verbose', action='store_true',
@ -68,15 +70,21 @@ if args.checklang:
if not supported_language(args.srclang):
if len(args.srclang) == 3:
try:
langid = languages.get(part3=args.srclang).part1
if langid:
except:
print("language code not found: " + args.srclang, file=sys.stderr, flush=True)
else:
args.srclang = langid
print("set srclang to " + args.srclang, file=sys.stderr, flush=True)
if not supported_language(args.trglang):
if len(args.trglang) == 3:
try:
langid = languages.get(part3=args.trglang).part1
if langid:
except:
print("language code not found: " + args.trglang, file=sys.stderr, flush=True)
else:
args.trglang = langid
print("set trglang to " + args.trglang, file=sys.stderr, flush=True)
@ -102,9 +110,13 @@ else:
for line in sys.stdin:
# line = ''.join(x for x in line if x.isprintable())
text = line.rstrip().split("\t")
accept = '0'
if len(text) > 1:
if text[0] and text[1]:
if is_accepted(text[0],srcaccept,srcreject):
if is_accepted(text[1],trgaccept,trgreject):
accept = '1'
if not args.print_flag:
print(text[0] + "\t" + text[1])
if args.print_flag:
print(accept)