mirror of
https://github.com/Helsinki-NLP/OPUS-MT-train.git
synced 2024-11-30 12:32:24 +03:00
tatoeba challenge model scripts updated
This commit is contained in:
parent
edaf361803
commit
6cb9959e82
@ -15,7 +15,8 @@ make SRCLANGS=afr TRGLANGS=epo tatoeba-train
|
||||
make SRCLANGS=afr TRGLANGS=epo tatoeba-eval
|
||||
```
|
||||
|
||||
## Start job for a single language pair in both directions
|
||||
|
||||
## Start job for a single language pair
|
||||
|
||||
For example, for Afrikaans-Esperanto:
|
||||
|
||||
@ -23,6 +24,13 @@ For example, for Afrikaans-Esperanto:
|
||||
make SRCLANGS=afr TRGLANGS=epo tatoeba-job
|
||||
```
|
||||
|
||||
You can also initiate jobs for translation models in both directions:
|
||||
|
||||
```
|
||||
make SRCLANGS=afr TRGLANGS=epo tatoeba-bidirectional-job
|
||||
```
|
||||
|
||||
|
||||
|
||||
## Start jobs for all pairs in an entire subset
|
||||
|
||||
|
13
lib/bpe.mk
13
lib/bpe.mk
@ -32,12 +32,12 @@ BPETRGMODEL = ${WORKDIR}/train/${BPEMODELNAME}.trg.bpe${TRGBPESIZE:000=}k-model
|
||||
${BPESRCMODEL}:
|
||||
${MAKE} ${LOCAL_TRAIN_SRC}
|
||||
mkdir -p ${dir $@}
|
||||
ifeq ($(TRGLANGS),${firstword ${TRGLANGS}})
|
||||
python3 ${SNMTPATH}/learn_bpe.py -s $(SRCBPESIZE) < ${LOCAL_TRAIN_SRC} > $@
|
||||
else
|
||||
ifeq (${USE_TARGET_LABELS},1)
|
||||
cut -f2- -d ' ' ${LOCAL_TRAIN_SRC} > ${LOCAL_TRAIN_SRC}.text
|
||||
python3 ${SNMTPATH}/learn_bpe.py -s $(SRCBPESIZE) < ${LOCAL_TRAIN_SRC}.text > $@
|
||||
rm -f ${LOCAL_TRAIN_SRC}.text
|
||||
else
|
||||
python3 ${SNMTPATH}/learn_bpe.py -s $(SRCBPESIZE) < ${LOCAL_TRAIN_SRC} > $@
|
||||
endif
|
||||
|
||||
|
||||
@ -50,15 +50,16 @@ ${BPETRGMODEL}:
|
||||
python3 ${SNMTPATH}/learn_bpe.py -s $(TRGBPESIZE) < ${LOCAL_TRAIN_TRG} > $@
|
||||
|
||||
|
||||
#
|
||||
%.src.bpe${SRCBPESIZE:000=}k: %.src ${BPESRCMODEL}
|
||||
ifeq ($(TRGLANGS),${firstword ${TRGLANGS}})
|
||||
python3 ${SNMTPATH}/apply_bpe.py -c $(word 2,$^) < $< > $@
|
||||
else
|
||||
ifeq (${USE_TARGET_LABELS},1)
|
||||
cut -f1 -d ' ' $< > $<.labels
|
||||
cut -f2- -d ' ' $< > $<.txt
|
||||
python3 ${SNMTPATH}/apply_bpe.py -c $(word 2,$^) < $<.txt > $@.txt
|
||||
paste -d ' ' $<.labels $@.txt > $@
|
||||
rm -f $<.labels $<.txt $@.txt
|
||||
else
|
||||
python3 ${SNMTPATH}/apply_bpe.py -c $(word 2,$^) < $< > $@
|
||||
endif
|
||||
|
||||
%.trg.bpe${TRGBPESIZE:000=}k: %.trg ${BPETRGMODEL}
|
||||
|
@ -29,6 +29,19 @@ TRGLANGS ?= fi
|
||||
SRC ?= ${firstword ${SRCLANGS}}
|
||||
TRG ?= ${lastword ${TRGLANGS}}
|
||||
|
||||
## set SHUFFLE_DATA if you want to shuffle data for
|
||||
## each language pair to be added to the training data
|
||||
## --> especially useful in connection with FIT_DATA_SIZE
|
||||
##
|
||||
# SHUFFLE_DATA = 1
|
||||
|
||||
## set FIT_DATA_SIZE to a specific value to fit the training data
|
||||
## to a certain number of lines for each language pair in the collection
|
||||
## --> especially useful for multilingual models for balancing the
|
||||
## size for each language pair
|
||||
## the script does both, over- and undersampling
|
||||
##
|
||||
# FIT_DATA_SIZE = 100000
|
||||
|
||||
|
||||
# sorted languages and langpair used to match resources in OPUS
|
||||
@ -56,6 +69,12 @@ else
|
||||
TRGEXT = ${TRG}
|
||||
endif
|
||||
|
||||
## set a flag to use target language labels
|
||||
## in multi-target models
|
||||
ifneq (${words ${TRGLANGS}},1)
|
||||
USE_TARGET_LABELS = 1
|
||||
endif
|
||||
|
||||
|
||||
## set additional argument options for opus_read (if it is used)
|
||||
## e.g. OPUSREAD_ARGS = -a certainty -tr 0.3
|
||||
@ -374,10 +393,17 @@ LARGEST_TRAINSIZE = 10000000
|
||||
${WORKDIR}/config.mk:
|
||||
mkdir -p ${dir $@}
|
||||
if [ -e ${TRAIN_SRC}.clean.${PRE_SRC}${TRAINSIZE}.gz ]; then \
|
||||
${MAKE} ${TRAIN_SRC}.clean.${PRE_SRC}${TRAINSIZE}.charfreq \
|
||||
${TRAIN_TRG}.clean.${PRE_TRG}${TRAINSIZE}.charfreq; \
|
||||
s=`zcat ${TRAIN_SRC}.clean.${PRE_SRC}${TRAINSIZE}.gz | head -10000001 | wc -l`; \
|
||||
S=`cat ${TRAIN_SRC}.clean.${PRE_SRC}${TRAINSIZE}.charfreq | wc -l`; \
|
||||
T=`cat ${TRAIN_TRG}.clean.${PRE_TRG}${TRAINSIZE}.charfreq | wc -l`; \
|
||||
else \
|
||||
${MAKE} ${LOCAL_TRAIN_SRC}; \
|
||||
${MAKE} ${LOCAL_TRAIN_SRC}.charfreq ${LOCAL_TRAIN_TRG}.charfreq; \
|
||||
s=`head -10000001 ${LOCAL_TRAIN_SRC} | wc -l`; \
|
||||
S=`cat ${LOCAL_TRAIN_SRC}.charfreq | wc -l`; \
|
||||
T=`cat ${LOCAL_TRAIN_TRG}.charfreq | wc -l`; \
|
||||
rm -f ${LOCAL_TRAIN_SRC} ${LOCAL_TRAIN_TRG}; \
|
||||
fi; \
|
||||
if [ $$s -gt ${LARGEST_TRAINSIZE} ]; then \
|
||||
@ -421,5 +447,14 @@ ${WORKDIR}/config.mk:
|
||||
echo "DEVMINSIZE = 100" >> $@; \
|
||||
else \
|
||||
echo "${LANGPAIRSTR} too small"; \
|
||||
fi; \
|
||||
if [ -e $@ ]; then \
|
||||
if [ $$S -gt 1000 ]; then \
|
||||
echo "SRCBPESIZE = 32000" >> $@; \
|
||||
fi; \
|
||||
if [ $$T -gt 1000 ]; then \
|
||||
echo "TRGBPESIZE = 32000" >> $@; \
|
||||
fi; \
|
||||
fi
|
||||
|
||||
|
||||
|
42
lib/data.mk
42
lib/data.mk
@ -76,21 +76,20 @@ endif
|
||||
## data sets (train/dev/test)
|
||||
##-------------------------------------------------------------
|
||||
|
||||
CLEAN_TRAIN_SRC = ${patsubst %,${DATADIR}/${PRE}/%.${LANGPAIR}.clean.${SRCEXT}.gz,${TRAINSET}} \
|
||||
CLEAN_TRAIN_SRC = ${patsubst %,${DATADIR}/${PRE}/%.${LANGPAIR}.clean.${SRCEXT}.gz,${TRAINSET}} \
|
||||
${BACKTRANS_SRC} ${PIVOTING_SRC}
|
||||
CLEAN_TRAIN_TRG = ${patsubst %.${SRCEXT}.gz,%.${TRGEXT}.gz,${CLEAN_TRAIN_SRC}}
|
||||
CLEAN_TRAIN_TRG = ${patsubst %.${SRCEXT}.gz,%.${TRGEXT}.gz,${CLEAN_TRAIN_SRC}}
|
||||
|
||||
CLEAN_DEV_SRC = ${patsubst %,${DATADIR}/${PRE}/%.${LANGPAIR}.clean.${SRCEXT}.gz,${DEVSET}}
|
||||
CLEAN_DEV_TRG = ${patsubst %.${SRCEXT}.gz,%.${TRGEXT}.gz,${CLEAN_DEV_SRC}}
|
||||
CLEAN_DEV_SRC = ${patsubst %,${DATADIR}/${PRE}/%.${LANGPAIR}.clean.${SRCEXT}.gz,${DEVSET}}
|
||||
CLEAN_DEV_TRG = ${patsubst %.${SRCEXT}.gz,%.${TRGEXT}.gz,${CLEAN_DEV_SRC}}
|
||||
|
||||
CLEAN_TEST_SRC = ${patsubst %,${DATADIR}/${PRE}/%.${LANGPAIR}.clean.${SRCEXT}.gz,${TESTSET}}
|
||||
CLEAN_TEST_TRG = ${patsubst %.${SRCEXT}.gz,%.${TRGEXT}.gz,${CLEAN_TEST_SRC}}
|
||||
CLEAN_TEST_SRC = ${patsubst %,${DATADIR}/${PRE}/%.${LANGPAIR}.clean.${SRCEXT}.gz,${TESTSET}}
|
||||
CLEAN_TEST_TRG = ${patsubst %.${SRCEXT}.gz,%.${TRGEXT}.gz,${CLEAN_TEST_SRC}}
|
||||
|
||||
DATA_SRC := ${sort ${CLEAN_TRAIN_SRC} ${CLEAN_DEV_SRC} ${CLEAN_TEST_SRC}}
|
||||
DATA_TRG := ${sort ${CLEAN_TRAIN_TRG} ${CLEAN_DEV_TRG} ${CLEAN_TEST_TRG}}
|
||||
|
||||
|
||||
|
||||
##-------------------------------------------------------------
|
||||
## make data in reverse direction without re-doing word alignment etc ...
|
||||
## ---> this is dangerous when things run in parallel
|
||||
@ -334,11 +333,10 @@ ifneq (${wildcard ${CLEAN_TRAIN_SRC}},)
|
||||
echo -n "* ${SRC}-${TRG}: total size = " >> ${dir ${LOCAL_TRAIN_SRC}}README.md
|
||||
${GZIP} -cd < ${wildcard ${CLEAN_TRAIN_SRC}} | wc -l >> ${dir ${LOCAL_TRAIN_SRC}}README.md
|
||||
######################################
|
||||
# multiple target languages?
|
||||
# --> add language labels
|
||||
# do we need to add target language labels?
|
||||
######################################
|
||||
ifneq (${words ${TRGLANGS}},1)
|
||||
echo "more than one target language";
|
||||
ifeq (${USE_TARGET_LABELS},1)
|
||||
echo "set target language labels";
|
||||
${GZIP} -cd < ${wildcard ${CLEAN_TRAIN_SRC}} |\
|
||||
sed "s/^/>>${TRG}<< /" > ${LOCAL_TRAIN_SRC}.src
|
||||
else
|
||||
@ -346,17 +344,23 @@ else
|
||||
${GZIP} -cd < ${wildcard ${CLEAN_TRAIN_SRC}} > ${LOCAL_TRAIN_SRC}.src
|
||||
endif
|
||||
${GZIP} -cd < ${wildcard ${CLEAN_TRAIN_TRG}} > ${LOCAL_TRAIN_TRG}.trg
|
||||
endif
|
||||
######################################
|
||||
# FIT_DATA_SIZE is set?
|
||||
# --> shuffle data and fit the
|
||||
# data sets to a specific size
|
||||
# SHUFFLE_DATA is set?
|
||||
# --> shuffle data for each langpair
|
||||
# --> do this when FIT_DATA_SIZE is set!
|
||||
######################################
|
||||
ifdef FIT_DATA_SIZE
|
||||
ifdef SHUFFLE_DATA
|
||||
paste ${LOCAL_TRAIN_SRC}.src ${LOCAL_TRAIN_TRG}.trg | ${SHUFFLE} > ${LOCAL_TRAIN_SRC}.shuffled
|
||||
cut -f1 ${LOCAL_TRAIN_SRC}.shuffled > ${LOCAL_TRAIN_SRC}.src
|
||||
cut -f2 ${LOCAL_TRAIN_SRC}.shuffled > ${LOCAL_TRAIN_TRG}.trg
|
||||
rm -f ${LOCAL_TRAIN_SRC}.shuffled
|
||||
endif
|
||||
######################################
|
||||
# FIT_DATA_SIZE is set?
|
||||
# --> fit data to specific size
|
||||
# --> under/over sampling!
|
||||
######################################
|
||||
ifdef FIT_DATA_SIZE
|
||||
scripts/fit-data-size.pl ${FIT_DATA_SIZE} ${LOCAL_TRAIN_SRC}.src >> ${LOCAL_TRAIN_SRC}
|
||||
scripts/fit-data-size.pl ${FIT_DATA_SIZE} ${LOCAL_TRAIN_TRG}.trg >> ${LOCAL_TRAIN_TRG}
|
||||
else
|
||||
@ -364,7 +368,7 @@ else
|
||||
cat ${LOCAL_TRAIN_TRG}.trg >> ${LOCAL_TRAIN_TRG}
|
||||
endif
|
||||
rm -f ${LOCAL_TRAIN_SRC}.src ${LOCAL_TRAIN_TRG}.trg
|
||||
|
||||
endif
|
||||
|
||||
|
||||
|
||||
@ -467,7 +471,7 @@ add-to-dev-data: ${CLEAN_DEV_SRC} ${CLEAN_DEV_TRG}
|
||||
ifneq (${wildcard ${CLEAN_DEV_SRC}},)
|
||||
echo -n "* ${LANGPAIR}: ${DEVSET}, " >> ${dir ${DEV_SRC}}README.md
|
||||
${GZIP} -cd < ${CLEAN_DEV_SRC} | wc -l >> ${dir ${DEV_SRC}}README.md
|
||||
ifneq (${words ${TRGLANGS}},1)
|
||||
ifeq (${USE_TARGET_LABELS},1)
|
||||
echo "more than one target language";
|
||||
${GZIP} -cd < ${CLEAN_DEV_SRC} |\
|
||||
sed "s/^/>>${TRG}<< /" >> ${DEV_SRC}
|
||||
@ -538,7 +542,7 @@ ${TEST_TRG}: ${TEST_SRC}
|
||||
add-to-test-data: ${CLEAN_TEST_SRC}
|
||||
ifneq (${wildcard ${CLEAN_TEST_SRC}},)
|
||||
echo "* ${LANGPAIR}: ${TESTSET}" >> ${dir ${TEST_SRC}}README.md
|
||||
ifneq (${words ${TRGLANGS}},1)
|
||||
ifeq (${USE_TARGET_LABELS},1)
|
||||
echo "more than one target language";
|
||||
${GZIP} -cd < ${CLEAN_TEST_SRC} |\
|
||||
sed "s/^/>>${TRG}<< /" >> ${TEST_SRC}
|
||||
|
@ -133,3 +133,4 @@ ifeq (${SHUFFLE},)
|
||||
endif
|
||||
GZIP := ${shell which pigz 2>/dev/null}
|
||||
GZIP ?= gzip
|
||||
ZCAT = ${GZIP} -cd <
|
||||
|
@ -12,9 +12,11 @@
|
||||
# make SRCLANGS=afr TRGLANGS=epo tatoeba-eval
|
||||
#
|
||||
#
|
||||
# start job for a single language pair in both directions, for example:
|
||||
# start job for a single language pair in one direction or
|
||||
# in both directions, for example:
|
||||
#
|
||||
# make SRCLANGS=afr TRGLANGS=epo tatoeba-job
|
||||
# make SRCLANGS=afr TRGLANGS=epo tatoeba-bidirectional-job
|
||||
#
|
||||
#
|
||||
# start jobs for all pairs in an entire subset:
|
||||
@ -49,12 +51,18 @@ print-langs:
|
||||
tatoeba-job:
|
||||
${MAKE} tatoeba-prepare
|
||||
${MAKE} all-job-tatoeba
|
||||
|
||||
tatoeba-bidirectional-job:
|
||||
${MAKE} tatoeba-prepare
|
||||
${MAKE} all-job-tatoeba
|
||||
ifneq (${SRCLANGS},${TRGLANGS})
|
||||
${MAKE} reverse-data-tatoeba
|
||||
${MAKE} SRCLANGS="${TRGLANGS}" TRGLANGS="${SRCLANGS}" tatoeba-prepare
|
||||
${MAKE} SRCLANGS="${TRGLANGS}" TRGLANGS="${SRCLANGS}" all-job-tatoeba
|
||||
endif
|
||||
|
||||
tatoeba-prepare:
|
||||
${MAKE} clean-data-tatoeba
|
||||
${MAKE} local-config-tatoeba
|
||||
${MAKE} data-tatoeba
|
||||
|
||||
@ -70,7 +78,7 @@ tatoeba-subset-%: tatoeba-%.md
|
||||
for l in `grep '\[' $< | cut -f2 -d '[' | cut -f1 -d ']'`; do \
|
||||
s=`echo $$l | cut -f1 -d '-'`; \
|
||||
t=`echo $$l | cut -f2 -d '-'`; \
|
||||
${MAKE} SRCLANGS=$$s TRGLANGS=$$t tatoeba-job; \
|
||||
${MAKE} SRCLANGS=$$s TRGLANGS=$$t tatoeba-bidirectional-job; \
|
||||
done
|
||||
|
||||
## set FIT_DATA_SIZE for under/over-sampling of data!
|
||||
@ -103,11 +111,16 @@ tatoeba-%.md:
|
||||
|
||||
|
||||
|
||||
tttt:
|
||||
echo ${PWD}/work-tatoeba/data/${PRE}/Tatoeba-train.${LANGPAIRSTR}.clean.${SRCEXT}.labels
|
||||
echo ${PWD}/work-tatoeba/data/${PRE}/Tatoeba-train.${LANGPAIRSTR}.clean.${TRGEXT}.labels
|
||||
|
||||
|
||||
|
||||
## generic target for tatoeba challenge jobs
|
||||
%-tatoeba: ${PWD}/work-tatoeba/data/${PRE}/Tatoeba-train.${LANGPAIR}.clean.${SRCEXT}.gz
|
||||
echo $<
|
||||
# %-tatoeba: ${PWD}/work-tatoeba/data/${PRE}/Tatoeba-train.${LANGPAIR}.clean.${SRCEXT}.gz
|
||||
%-tatoeba: ${PWD}/work-tatoeba/data/${PRE}/Tatoeba-train.${LANGPAIRSTR}.clean.${SRCEXT}.labels \
|
||||
${PWD}/work-tatoeba/data/${PRE}/Tatoeba-train.${LANGPAIRSTR}.clean.${TRGEXT}.labels
|
||||
${MAKE} TRAINSET=Tatoeba-train \
|
||||
DEVSET=Tatoeba-dev \
|
||||
TESTSET=Tatoeba-test \
|
||||
@ -119,10 +132,49 @@ tatoeba-%.md:
|
||||
TESTSIZE=10000 \
|
||||
DEVMINSIZE=200 \
|
||||
WORKHOME=${TATOEBA_WORK} \
|
||||
SRCLANGS="${shell cat $(word 1,$^)}" \
|
||||
TRGLANGS="${shell cat $(word 2,$^)}" \
|
||||
LANGPAIRSTR=${LANGPAIRSTR} \
|
||||
EMAIL= \
|
||||
${@:-tatoeba=}
|
||||
|
||||
|
||||
${PWD}/work-tatoeba/data/${PRE}/Tatoeba-train.${LANGPAIRSTR}.clean.${SRCEXT}.labels:
|
||||
for s in ${SRCLANGS}; do \
|
||||
for t in ${TRGLANGS}; do \
|
||||
if [ "$$s" \< "$$t" ]; then \
|
||||
${MAKE} SRCLANGS=$$s TRGLANGS=$$t \
|
||||
${PWD}/work-tatoeba/data/${PRE}/Tatoeba-train.${LANGPAIR}.clean.${SRCEXT}.gz; \
|
||||
fi \
|
||||
done \
|
||||
done
|
||||
if [ ! -e $@ ]; then \
|
||||
for s in ${SRCLANGS}; do \
|
||||
for t in ${TRGLANGS}; do \
|
||||
if [ -e ${PWD}/work-tatoeba/data/${PRE}/Tatoeba-train.${LANGPAIR}.clean.${SRCEXT}.labels ]; then \
|
||||
cat ${PWD}/work-tatoeba/data/${PRE}/Tatoeba-train.${LANGPAIR}.clean.${SRCEXT}.labels \
|
||||
>> $@; \
|
||||
fi \
|
||||
done \
|
||||
done \
|
||||
fi
|
||||
if [ ! -e $(@:.${SRCEXT}.labels=.${TRGEXT}.labels) ]; then \
|
||||
for s in ${SRCLANGS}; do \
|
||||
for t in ${TRGLANGS}; do \
|
||||
if [ -e ${PWD}/work-tatoeba/data/${PRE}/Tatoeba-train.${LANGPAIR}.clean.${TRGEXT}.labels ]; then \
|
||||
cat ${PWD}/work-tatoeba/data/${PRE}/Tatoeba-train.${LANGPAIR}.clean.${TRGEXT}.labels \
|
||||
>> $(@:.${SRCEXT}.labels=.${TRGEXT}.labels); \
|
||||
fi \
|
||||
done \
|
||||
done \
|
||||
fi
|
||||
|
||||
%.${LANGPAIRSTR}.clean.${SRCEXT}.labels: %.${LANGPAIRSTR}.clean.${SRCEXT}.labels
|
||||
echo "done"
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
## don't delete those files
|
||||
.SECONDARY: ${TATOEBA_WORK}/data/${PRE}/Tatoeba-train.${LANGPAIR}.clean.${SRCEXT}.gz \
|
||||
@ -133,46 +185,101 @@ tatoeba-%.md:
|
||||
${TATOEBA_WORK}/data/${PRE}/Tatoeba-test.${LANGPAIR}.clean.${TRGEXT}.gz
|
||||
|
||||
|
||||
BASIC_FILTERS = | perl -CS -pe 'tr[\x{9}\x{A}\x{D}\x{20}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}][]cd;' \
|
||||
| perl -CS -pe 's/\&\s*\#\s*160\s*\;/ /g' \
|
||||
| $(TOKENIZER)/remove-non-printing-char.perl \
|
||||
| $(TOKENIZER)/deescape-special-chars.perl
|
||||
|
||||
## TODO: should we add $(TOKENIZER)/replace-unicode-punctuation.perl ?
|
||||
## TODO: add this? sed 's/_/ /g'
|
||||
## this sed line from https://github.com/aboSamoor/polyglot/issues/71 does not seem to work
|
||||
# | sed 's/[\00\01\02\03\04\05\06\07\08\0b\0e\0f\10\11\12\13\14\15\16\17\18\19\1a\1b\1c\1d\1e\1f\7f\80\81\82\83\84\85\86\87\88\89\8a\8b\8c\8d\8e\8f\90\91\92\93\94\95\96\97\98\99\9a\9b\9c\9d\9e\9f]//g'
|
||||
|
||||
## modify language IDs in training data to adjust them to test sets
|
||||
## --> fix codes for chinese
|
||||
## --> take away regional codes
|
||||
## --> take away script extension that may come with some codes
|
||||
FIXLANGIDS = | sed 's/zho\(\)_HK/yue\1/;s/zho\(\)_CN/cmn\1/;s/zho\(\)_TW/cmn\1/;' \
|
||||
| sed 's/\_[A-Z][A-Z]//' \
|
||||
| sed 's/\-[a-z]*//'
|
||||
|
||||
## convert Tatoeba Challenge data into the format we need
|
||||
## - move the data into the right location with the suitable name
|
||||
## - create devset if not given (part of training data)
|
||||
## - divide into individual language pairs
|
||||
## (if there is more than one language pair in the collection)
|
||||
##
|
||||
## TODO: should we do some filtering like bitext-match, OPUS-filter ...
|
||||
%/Tatoeba-train.${LANGPAIR}.clean.${SRCEXT}.gz:
|
||||
mkdir -p $@.d
|
||||
wget -q -O $@.d/train.tar ${TATOEBA_DATA}/${LANGPAIR}.tar
|
||||
tar -C $@.d -xf $@.d/train.tar
|
||||
${GZIP} -c < $@.d/data/${LANGPAIR}/test.src > ${dir $@}Tatoeba-test.${LANGPAIR}.clean.${SRCEXT}.gz
|
||||
${GZIP} -c < $@.d/data/${LANGPAIR}/test.trg > ${dir $@}Tatoeba-test.${LANGPAIR}.clean.${TRGEXT}.gz
|
||||
${GZIP} -cd < $@.d/data/${LANGPAIR}/train.src.gz ${BASIC_FILTERS} > $@.1
|
||||
${GZIP} -cd < $@.d/data/${LANGPAIR}/train.trg.gz ${BASIC_FILTERS} > $@.2
|
||||
paste $@.1 $@.2 | scripts/filter/bitext-match-lang.py -s ${SRC} -t ${TRG} > $@.bitext
|
||||
rm -f $@.1 $@.2
|
||||
mv $@.d/data/${LANGPAIR}/test.src ${dir $@}Tatoeba-test.${LANGPAIR}.clean.${SRCEXT}
|
||||
mv $@.d/data/${LANGPAIR}/test.trg ${dir $@}Tatoeba-test.${LANGPAIR}.clean.${TRGEXT}
|
||||
mv $@.d/data/${LANGPAIR}/test.id ${dir $@}Tatoeba-test.${LANGPAIR}.clean.id
|
||||
if [ -e $@.d/data/${LANGPAIR}/dev.src ]; then \
|
||||
${GZIP} -c < $@.d/data/${LANGPAIR}/dev.src > ${dir $@}Tatoeba-dev.${LANGPAIR}.clean.${SRCEXT}.gz; \
|
||||
${GZIP} -c < $@.d/data/${LANGPAIR}/dev.trg > ${dir $@}Tatoeba-dev.${LANGPAIR}.clean.${TRGEXT}.gz; \
|
||||
cut -f1 $@.bitext | ${GZIP} -c > ${dir $@}Tatoeba-train.${LANGPAIR}.clean.${SRCEXT}.gz; \
|
||||
cut -f2 $@.bitext | ${GZIP} -c > ${dir $@}Tatoeba-train.${LANGPAIR}.clean.${TRGEXT}.gz; \
|
||||
mv $@.d/data/${LANGPAIR}/dev.src > ${dir $@}Tatoeba-dev.${LANGPAIR}.clean.${SRCEXT}; \
|
||||
mv $@.d/data/${LANGPAIR}/dev.trg > ${dir $@}Tatoeba-dev.${LANGPAIR}.clean.${TRGEXT}; \
|
||||
mv $@.d/data/${LANGPAIR}/dev.id > ${dir $@}Tatoeba-dev.${LANGPAIR}.clean.id; \
|
||||
${ZCAT} $@.d/data/${LANGPAIR}/train.src.gz > ${dir $@}Tatoeba-train.${LANGPAIR}.clean.${SRCEXT}; \
|
||||
${ZCAT} $@.d/data/${LANGPAIR}/train.trg.gz > ${dir $@}Tatoeba-train.${LANGPAIR}.clean.${TRGEXT}; \
|
||||
${ZCAT} $@.d/data/${LANGPAIR}/train.id.gz | cut -f2,3 $(FIXLANGIDS) > ${dir $@}Tatoeba-train.${LANGPAIR}.clean.id; \
|
||||
else \
|
||||
echo "no devdata available - get top 1000 from training data!"; \
|
||||
cut -f1 $@.bitext | head -1000 | ${GZIP} -c > ${dir $@}Tatoeba-dev.${LANGPAIR}.clean.${SRCEXT}.gz; \
|
||||
cut -f2 $@.bitext | head -1000 | ${GZIP} -c > ${dir $@}Tatoeba-dev.${LANGPAIR}.clean.${TRGEXT}.gz; \
|
||||
cut -f1 $@.bitext | tail -n +1001 | ${GZIP} -c > ${dir $@}Tatoeba-train.${LANGPAIR}.clean.${SRCEXT}.gz; \
|
||||
cut -f2 $@.bitext | tail -n +1001 | ${GZIP} -c > ${dir $@}Tatoeba-train.${LANGPAIR}.clean.${TRGEXT}.gz; \
|
||||
${ZCAT} $@.d/data/${LANGPAIR}/train.src.gz | head -1000 > ${dir $@}Tatoeba-dev.${LANGPAIR}.clean.${SRCEXT}; \
|
||||
${ZCAT} $@.d/data/${LANGPAIR}/train.trg.gz | head -1000 > ${dir $@}Tatoeba-dev.${LANGPAIR}.clean.${TRGEXT}; \
|
||||
${ZCAT} $@.d/data/${LANGPAIR}/train.id.gz | head -1000 | cut -f2,3 $(FIXLANGIDS) > ${dir $@}Tatoeba-dev.${LANGPAIR}.clean.id; \
|
||||
${ZCAT} $@.d/data/${LANGPAIR}/train.src.gz | tail -n +1001 > ${dir $@}Tatoeba-train.${LANGPAIR}.clean.${SRCEXT}; \
|
||||
${ZCAT} $@.d/data/${LANGPAIR}/train.trg.gz | tail -n +1001 > ${dir $@}Tatoeba-train.${LANGPAIR}.clean.${TRGEXT}; \
|
||||
${ZCAT} $@.d/data/${LANGPAIR}/train.id.gz | tail -n +1001 | cut -f2,3 $(FIXLANGIDS) > ${dir $@}Tatoeba-train.${LANGPAIR}.clean.id; \
|
||||
fi
|
||||
rm -f $@.bitext
|
||||
cut -f1 ${dir $@}Tatoeba-*.${LANGPAIR}.clean.id | sort -u | tr "\n" ' ' > $(@:.${SRCEXT}.gz=.${SRCEXT}.labels)
|
||||
cut -f2 ${dir $@}Tatoeba-*.${LANGPAIR}.clean.id | sort -u | tr "\n" ' ' > $(@:.${SRCEXT}.gz=.${TRGEXT}.labels)
|
||||
rm -f $@.d/data/${LANGPAIR}/*
|
||||
rmdir $@.d/data/${LANGPAIR}
|
||||
rmdir $@.d/data
|
||||
rm -f $@.d/train.tar
|
||||
rmdir $@.d
|
||||
#######################################
|
||||
# make data sets for individual
|
||||
# language pairs from the Tatoeba data
|
||||
#######################################
|
||||
for s in `cat $(@:.${SRCEXT}.gz=.${SRCEXT}.labels)`; do \
|
||||
for t in `cat $(@:.${SRCEXT}.gz=.${TRGEXT}.labels)`; do \
|
||||
if [ "$$s" \< "$$t" ]; then \
|
||||
echo "extract $$s-$$t data"; \
|
||||
for d in dev test train; do \
|
||||
paste ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.id \
|
||||
${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${SRCEXT} \
|
||||
${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${TRGEXT} |\
|
||||
grep -P "$$s\t$$t\t" > ${dir $@}Tatoeba-$$d.$$s-$$t; \
|
||||
if [ -s ${dir $@}Tatoeba-$$d.$$s-$$t ]; then \
|
||||
cut -f3 ${dir $@}Tatoeba-$$d.$$s-$$t | ${GZIP} -c > ${dir $@}Tatoeba-$$d.$$s-$$t.clean.$$s.gz; \
|
||||
cut -f4 ${dir $@}Tatoeba-$$d.$$s-$$t | ${GZIP} -c > ${dir $@}Tatoeba-$$d.$$s-$$t.clean.$$t.gz; \
|
||||
fi; \
|
||||
rm -f ${dir $@}Tatoeba-$$d.$$s-$$t; \
|
||||
done \
|
||||
else \
|
||||
echo "extract $$t-$$s data"; \
|
||||
for d in dev test train; do \
|
||||
paste ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.id \
|
||||
${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${TRGEXT} \
|
||||
${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${SRCEXT} |\
|
||||
grep -P "$$s\t$$t\t" > ${dir $@}Tatoeba-$$d.$$t-$$s; \
|
||||
if [ -s ${dir $@}Tatoeba-$$d.$$t-$$s ]; then \
|
||||
cut -f3 ${dir $@}Tatoeba-$$d.$$t-$$s | ${GZIP} -c > ${dir $@}Tatoeba-$$d.$$t-$$s.clean.$$t.gz; \
|
||||
cut -f4 ${dir $@}Tatoeba-$$d.$$t-$$s | ${GZIP} -c > ${dir $@}Tatoeba-$$d.$$t-$$s.clean.$$s.gz; \
|
||||
fi; \
|
||||
rm -f ${dir $@}Tatoeba-$$d.$$t-$$s; \
|
||||
done \
|
||||
fi \
|
||||
done \
|
||||
done
|
||||
#######################################
|
||||
# finally, compress the big datafiles
|
||||
# and cleanup
|
||||
#######################################
|
||||
for d in dev test train; do \
|
||||
if [ ! -e ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${SRCEXT}.gz ]; then \
|
||||
${GZIP} -f ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${SRCEXT}; \
|
||||
${GZIP} -f ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${TRGEXT}; \
|
||||
else \
|
||||
rm -f ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${SRCEXT}; \
|
||||
rm -f ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${TRGEXT}; \
|
||||
fi; \
|
||||
rm -f ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.id; \
|
||||
done
|
||||
|
||||
|
||||
%/Tatoeba-train.${LANGPAIR}.clean.${TRGEXT}.gz: %/Tatoeba-train.${LANGPAIR}.clean.${SRCEXT}.gz
|
||||
echo "done!"
|
||||
|
@ -27,10 +27,10 @@ GENERATE_SPM_VOC = 0
|
||||
${SPMSRCMODEL}:
|
||||
${MAKE} ${LOCAL_TRAIN_SRC}
|
||||
mkdir -p ${dir $@}
|
||||
ifeq ($(TRGLANGS),${firstword ${TRGLANGS}})
|
||||
grep . ${LOCAL_TRAIN_SRC} | ${SHUFFLE} > ${LOCAL_TRAIN_SRC}.text
|
||||
else
|
||||
ifeq (${USE_TARGET_LABELS},1)
|
||||
cut -f2- -d ' ' ${LOCAL_TRAIN_SRC} | grep . | ${SHUFFLE} > ${LOCAL_TRAIN_SRC}.text
|
||||
else
|
||||
grep . ${LOCAL_TRAIN_SRC} | ${SHUFFLE} > ${LOCAL_TRAIN_SRC}.text
|
||||
endif
|
||||
${MAKE} ${LOCAL_TRAIN_SRC}.charfreq
|
||||
if [ `cat ${LOCAL_TRAIN_SRC}.charfreq | wc -l` -gt 1000 ]; then \
|
||||
@ -175,6 +175,12 @@ endif
|
||||
-python -c "import collections, pprint; pprint.pprint(dict(collections.Counter(open('$<.10m', 'r').read())))" > $@
|
||||
rm -f $<.10m
|
||||
|
||||
%.charfreq: %.gz
|
||||
${GZIP} -cd < $< | head -10000000 > $<.10m
|
||||
-python -c "import collections, pprint; pprint.pprint(dict(collections.Counter(open('$<.10m', 'r').read())))" > $@
|
||||
rm -f $<.10m
|
||||
|
||||
|
||||
## slow version
|
||||
%.charfreq2: %
|
||||
head -10000000 $< |\
|
||||
@ -189,14 +195,14 @@ endif
|
||||
## see https://github.com/google/sentencepiece#c-from-source
|
||||
|
||||
%.src.spm${SRCBPESIZE:000=}k: %.src ${SPMSRCMODEL}
|
||||
ifeq ($(TRGLANGS),${firstword ${TRGLANGS}})
|
||||
${SPM_HOME}/spm_encode --model $(word 2,$^) < $< > $@
|
||||
else
|
||||
ifeq (${USE_TARGET_LABELS},1)
|
||||
cut -f1 -d ' ' $< > $<.labels
|
||||
cut -f2- -d ' ' $< > $<.txt
|
||||
${SPM_HOME}/spm_encode --model $(word 2,$^) < $<.txt > $@.txt
|
||||
paste -d ' ' $<.labels $@.txt > $@
|
||||
rm -f $<.labels $<.txt $@.txt
|
||||
else
|
||||
${SPM_HOME}/spm_encode --model $(word 2,$^) < $< > $@
|
||||
endif
|
||||
|
||||
%.trg.spm${TRGBPESIZE:000=}k: %.trg ${SPMTRGMODEL}
|
||||
|
@ -14,6 +14,8 @@ parser.add_argument('-t','--trglang','--target-language', type=str, default='de'
|
||||
help='accepted language')
|
||||
parser.add_argument('-l','--supported','--supported-languages', action='store_true',
|
||||
help='list all supported languages')
|
||||
parser.add_argument('-f','--print-flag','--print-accept-flag', action='store_true',
|
||||
help='print only a flag about acceptance')
|
||||
parser.add_argument('-c','--checklang','--check-language-support', action='store_true',
|
||||
help='show whether languages are supported')
|
||||
parser.add_argument('-v','--verbose', action='store_true',
|
||||
@ -68,15 +70,21 @@ if args.checklang:
|
||||
|
||||
if not supported_language(args.srclang):
|
||||
if len(args.srclang) == 3:
|
||||
langid = languages.get(part3=args.srclang).part1
|
||||
if langid:
|
||||
try:
|
||||
langid = languages.get(part3=args.srclang).part1
|
||||
except:
|
||||
print("language code not found: " + args.srclang, file=sys.stderr, flush=True)
|
||||
else:
|
||||
args.srclang = langid
|
||||
print("set srclang to " + args.srclang, file=sys.stderr, flush=True)
|
||||
|
||||
if not supported_language(args.trglang):
|
||||
if len(args.trglang) == 3:
|
||||
langid = languages.get(part3=args.trglang).part1
|
||||
if langid:
|
||||
try:
|
||||
langid = languages.get(part3=args.trglang).part1
|
||||
except:
|
||||
print("language code not found: " + args.trglang, file=sys.stderr, flush=True)
|
||||
else:
|
||||
args.trglang = langid
|
||||
print("set trglang to " + args.trglang, file=sys.stderr, flush=True)
|
||||
|
||||
@ -102,9 +110,13 @@ else:
|
||||
for line in sys.stdin:
|
||||
# line = ''.join(x for x in line if x.isprintable())
|
||||
text = line.rstrip().split("\t")
|
||||
accept = '0'
|
||||
if len(text) > 1:
|
||||
if text[0] and text[1]:
|
||||
if is_accepted(text[0],srcaccept,srcreject):
|
||||
if is_accepted(text[1],trgaccept,trgreject):
|
||||
print(text[0] + "\t" + text[1])
|
||||
|
||||
accept = '1'
|
||||
if not args.print_flag:
|
||||
print(text[0] + "\t" + text[1])
|
||||
if args.print_flag:
|
||||
print(accept)
|
||||
|
Loading…
Reference in New Issue
Block a user