tatoeba challenge model scripts updated

2024-11-30 12:32:24 +03:00 · 2020-06-06 20:49:54 +03:00 · 2020-06-06 20:49:54 +03:00 · 6cb9959e82
commit 6cb9959e82
parent edaf361803
8 changed files with 241 additions and 67 deletions
--- a/doc/TatoebaChallenge.md
+++ b/doc/TatoebaChallenge.md
@ -15,7 +15,8 @@ make SRCLANGS=afr TRGLANGS=epo tatoeba-train
 make SRCLANGS=afr TRGLANGS=epo tatoeba-eval
 ```

-## Start job for a single language pair in both directions
+
+## Start job for a single language pair

 For example, for Afrikaans-Esperanto:

@ -23,6 +24,13 @@ For example, for Afrikaans-Esperanto:
 make SRCLANGS=afr TRGLANGS=epo tatoeba-job
 ```

+You can also initiate jobs for translation models in both directions:
+
+```
+make SRCLANGS=afr TRGLANGS=epo tatoeba-bidirectional-job
+```
+
+

 ## Start jobs for all pairs in an entire subset

--- a/lib/bpe.mk
+++ b/lib/bpe.mk
@ -32,12 +32,12 @@ BPETRGMODEL = ${WORKDIR}/train/${BPEMODELNAME}.trg.bpe${TRGBPESIZE:000=}k-model
 ${BPESRCMODEL}: 
 	${MAKE} ${LOCAL_TRAIN_SRC}
 	mkdir -p ${dir $@}
-ifeq ($(TRGLANGS),${firstword ${TRGLANGS}})
-	python3 ${SNMTPATH}/learn_bpe.py -s $(SRCBPESIZE) < ${LOCAL_TRAIN_SRC} > $@
-else
+ifeq (${USE_TARGET_LABELS},1)
 	cut -f2- -d ' ' ${LOCAL_TRAIN_SRC} > ${LOCAL_TRAIN_SRC}.text
 	python3 ${SNMTPATH}/learn_bpe.py -s $(SRCBPESIZE) < ${LOCAL_TRAIN_SRC}.text > $@
 	rm -f ${LOCAL_TRAIN_SRC}.text
+else
+	python3 ${SNMTPATH}/learn_bpe.py -s $(SRCBPESIZE) < ${LOCAL_TRAIN_SRC} > $@
 endif


@ -50,15 +50,16 @@ ${BPETRGMODEL}:
 	python3 ${SNMTPATH}/learn_bpe.py -s $(TRGBPESIZE) < ${LOCAL_TRAIN_TRG} > $@


+# 
 %.src.bpe${SRCBPESIZE:000=}k: %.src ${BPESRCMODEL}
-ifeq ($(TRGLANGS),${firstword ${TRGLANGS}})
-	python3 ${SNMTPATH}/apply_bpe.py -c $(word 2,$^) < $< > $@
-else
+ifeq (${USE_TARGET_LABELS},1)
 	cut -f1 -d ' ' $< > $<.labels
 	cut -f2- -d ' ' $< > $<.txt
 	python3 ${SNMTPATH}/apply_bpe.py -c $(word 2,$^) < $<.txt > $@.txt
 	paste -d ' ' $<.labels $@.txt > $@
 	rm -f $<.labels $<.txt $@.txt
+else
+	python3 ${SNMTPATH}/apply_bpe.py -c $(word 2,$^) < $< > $@
 endif

 %.trg.bpe${TRGBPESIZE:000=}k: %.trg ${BPETRGMODEL}
--- a/lib/config.mk
+++ b/lib/config.mk
@ -29,6 +29,19 @@ TRGLANGS ?= fi
 SRC ?= ${firstword ${SRCLANGS}}
 TRG ?= ${lastword ${TRGLANGS}}

+## set SHUFFLE_DATA if you want to shuffle data for 
+## each language pair to be added to the training data
+## --> especially useful in connection with FIT_DATA_SIZE
+##  
+# SHUFFLE_DATA = 1
+
+## set FIT_DATA_SIZE to a specific value to fit the training data
+## to a certain number of lines for each language pair in the collection
+## --> especially useful for multilingual models for balancing
+##     the size for each language pair
+## the script does both, over- and undersampling
+##
+# FIT_DATA_SIZE = 100000


 # sorted languages and langpair used to match resources in OPUS
@ -56,6 +69,12 @@ else
  TRGEXT = ${TRG}
 endif

+## set a flag to use target language labels
+## in multi-target models
+ifneq (${words ${TRGLANGS}},1)
+  USE_TARGET_LABELS = 1
+endif
+

 ## set additional argument options for opus_read (if it is used)
 ## e.g. OPUSREAD_ARGS = -a certainty -tr 0.3
@ -374,10 +393,17 @@ LARGEST_TRAINSIZE  = 10000000
 ${WORKDIR}/config.mk:
 	mkdir -p ${dir $@}
 	if [ -e ${TRAIN_SRC}.clean.${PRE_SRC}${TRAINSIZE}.gz ]; then \
+	  ${MAKE} ${TRAIN_SRC}.clean.${PRE_SRC}${TRAINSIZE}.charfreq \
+		  ${TRAIN_TRG}.clean.${PRE_TRG}${TRAINSIZE}.charfreq; \
 	  s=`zcat ${TRAIN_SRC}.clean.${PRE_SRC}${TRAINSIZE}.gz | head -10000001 | wc -l`; \
+	  S=`cat ${TRAIN_SRC}.clean.${PRE_SRC}${TRAINSIZE}.charfreq | wc -l`; \
+	  T=`cat ${TRAIN_TRG}.clean.${PRE_TRG}${TRAINSIZE}.charfreq | wc -l`; \
 	else \
 	  ${MAKE} ${LOCAL_TRAIN_SRC}; \
+	  ${MAKE} ${LOCAL_TRAIN_SRC}.charfreq ${LOCAL_TRAIN_TRG}.charfreq; \
 	  s=`head -10000001 ${LOCAL_TRAIN_SRC} | wc -l`; \
+	  S=`cat ${LOCAL_TRAIN_SRC}.charfreq | wc -l`; \
+	  T=`cat ${LOCAL_TRAIN_TRG}.charfreq | wc -l`; \
 	  rm -f ${LOCAL_TRAIN_SRC} ${LOCAL_TRAIN_TRG}; \
 	fi; \
 	if [ $$s -gt ${LARGEST_TRAINSIZE} ]; then \
@ -421,5 +447,14 @@ ${WORKDIR}/config.mk:
 	  echo "DEVMINSIZE  = 100"         >> $@; \
 	else \
 	    echo "${LANGPAIRSTR} too small"; \
+	fi; \
+	if [ -e $@ ]; then \
+	  if [ $$S -gt 1000 ]; then \
+	    echo "SRCBPESIZE  = 32000"     >> $@; \
+	  fi; \
+	  if [ $$T -gt 1000 ]; then \
+	    echo "TRGBPESIZE  = 32000"     >> $@; \
+	  fi; \
 	fi

+
--- a/lib/data.mk
+++ b/lib/data.mk
@ -90,7 +90,6 @@ DATA_SRC := ${sort ${CLEAN_TRAIN_SRC} ${CLEAN_DEV_SRC} ${CLEAN_TEST_SRC}}
 DATA_TRG := ${sort ${CLEAN_TRAIN_TRG} ${CLEAN_DEV_TRG} ${CLEAN_TEST_TRG}}


-
 ##-------------------------------------------------------------
 ## make data in reverse direction without re-doing word alignment etc ...
 ## ---> this is dangerous when things run in parallel
@ -334,11 +333,10 @@ ifneq (${wildcard ${CLEAN_TRAIN_SRC}},)
 	echo -n "* ${SRC}-${TRG}: total size = "              >> ${dir ${LOCAL_TRAIN_SRC}}README.md
 	${GZIP} -cd < ${wildcard ${CLEAN_TRAIN_SRC}} | wc -l  >> ${dir ${LOCAL_TRAIN_SRC}}README.md
 ######################################
-# multiple target languages?
-#    --> add language labels
+# do we need to add target language labels?
 ######################################
-ifneq (${words ${TRGLANGS}},1)
-	echo "more than one target language";
+ifeq (${USE_TARGET_LABELS},1)
+	echo "set target language labels";
 	${GZIP} -cd < ${wildcard ${CLEAN_TRAIN_SRC}} |\
 	sed "s/^/>>${TRG}<< /" > ${LOCAL_TRAIN_SRC}.src
 else
@ -346,17 +344,23 @@ else
 	${GZIP} -cd < ${wildcard ${CLEAN_TRAIN_SRC}} > ${LOCAL_TRAIN_SRC}.src
 endif
 	${GZIP} -cd < ${wildcard ${CLEAN_TRAIN_TRG}} > ${LOCAL_TRAIN_TRG}.trg
-endif
 ######################################
-#  FIT_DATA_SIZE is set?
-#    --> shuffle data and fit the
-#        data sets to a specific size
+#  SHUFFLE_DATA is set?
+#    --> shuffle data for each langpair
+#    --> do this when FIT_DATA_SIZE is set!
 ######################################
-ifdef FIT_DATA_SIZE
+ifdef SHUFFLE_DATA
 	paste ${LOCAL_TRAIN_SRC}.src ${LOCAL_TRAIN_TRG}.trg | ${SHUFFLE} > ${LOCAL_TRAIN_SRC}.shuffled
 	cut -f1 ${LOCAL_TRAIN_SRC}.shuffled > ${LOCAL_TRAIN_SRC}.src
 	cut -f2 ${LOCAL_TRAIN_SRC}.shuffled > ${LOCAL_TRAIN_TRG}.trg
 	rm -f ${LOCAL_TRAIN_SRC}.shuffled
+endif
+######################################
+#  FIT_DATA_SIZE is set?
+#    --> fit data to specific size
+#    --> under/over sampling!
+######################################
+ifdef FIT_DATA_SIZE
 	scripts/fit-data-size.pl ${FIT_DATA_SIZE} ${LOCAL_TRAIN_SRC}.src >> ${LOCAL_TRAIN_SRC}
 	scripts/fit-data-size.pl ${FIT_DATA_SIZE} ${LOCAL_TRAIN_TRG}.trg >> ${LOCAL_TRAIN_TRG}
 else
@ -364,7 +368,7 @@ else
 	cat ${LOCAL_TRAIN_TRG}.trg >> ${LOCAL_TRAIN_TRG}
 endif
 	rm -f ${LOCAL_TRAIN_SRC}.src ${LOCAL_TRAIN_TRG}.trg
-
+endif



@ -467,7 +471,7 @@ add-to-dev-data: ${CLEAN_DEV_SRC} ${CLEAN_DEV_TRG}
 ifneq (${wildcard ${CLEAN_DEV_SRC}},)
 	echo -n "* ${LANGPAIR}: ${DEVSET}, " >> ${dir ${DEV_SRC}}README.md
 	${GZIP} -cd < ${CLEAN_DEV_SRC} | wc -l        >> ${dir ${DEV_SRC}}README.md
-ifneq (${words ${TRGLANGS}},1)
+ifeq (${USE_TARGET_LABELS},1)
 	echo "more than one target language";
 	${GZIP} -cd < ${CLEAN_DEV_SRC} |\
 	sed "s/^/>>${TRG}<< /" >> ${DEV_SRC}
@ -538,7 +542,7 @@ ${TEST_TRG}: ${TEST_SRC}
 add-to-test-data: ${CLEAN_TEST_SRC}
 ifneq (${wildcard ${CLEAN_TEST_SRC}},)
 	echo "* ${LANGPAIR}: ${TESTSET}" >> ${dir ${TEST_SRC}}README.md
-ifneq (${words ${TRGLANGS}},1)
+ifeq (${USE_TARGET_LABELS},1)
 	echo "more than one target language";
 	${GZIP} -cd < ${CLEAN_TEST_SRC} |\
 	sed "s/^/>>${TRG}<< /" >> ${TEST_SRC}
--- a/lib/env.mk
+++ b/lib/env.mk
@ -133,3 +133,4 @@ ifeq (${SHUFFLE},)
 endif
 GZIP := ${shell which pigz 2>/dev/null}
 GZIP ?= gzip
+ZCAT = ${GZIP} -cd <
--- a/lib/models/tatoeba.mk
+++ b/lib/models/tatoeba.mk
@ -12,9 +12,11 @@
 #   make SRCLANGS=afr TRGLANGS=epo tatoeba-eval
 #
 #
-# start job for a single language pair in both directions, for example:
+# start job for a single language pair in one direction or
+# in both directions, for example:
 #
 #   make SRCLANGS=afr TRGLANGS=epo tatoeba-job
+#   make SRCLANGS=afr TRGLANGS=epo tatoeba-bidirectional-job
 #
 #
 # start jobs for all pairs in an entire subset:
@ -49,12 +51,18 @@ print-langs:
 tatoeba-job:
 	${MAKE} tatoeba-prepare
 	${MAKE} all-job-tatoeba
+
+tatoeba-bidirectional-job:
+	${MAKE} tatoeba-prepare
+	${MAKE} all-job-tatoeba
 ifneq (${SRCLANGS},${TRGLANGS})
 	${MAKE} reverse-data-tatoeba
+	${MAKE} SRCLANGS="${TRGLANGS}" TRGLANGS="${SRCLANGS}" tatoeba-prepare
 	${MAKE} SRCLANGS="${TRGLANGS}" TRGLANGS="${SRCLANGS}" all-job-tatoeba
 endif

 tatoeba-prepare:
+	${MAKE} clean-data-tatoeba
 	${MAKE} local-config-tatoeba
 	${MAKE} data-tatoeba

@ -70,7 +78,7 @@ tatoeba-subset-%: tatoeba-%.md
 	for l in `grep '\[' $< | cut -f2 -d '[' | cut -f1 -d ']'`; do \
 	  s=`echo $$l | cut -f1 -d '-'`; \
 	  t=`echo $$l | cut -f2 -d '-'`; \
-	  ${MAKE} SRCLANGS=$$s TRGLANGS=$$t tatoeba-job; \
+	  ${MAKE} SRCLANGS=$$s TRGLANGS=$$t tatoeba-bidirectional-job; \
 	done

 ## set FIT_DATA_SIZE for under/over-sampling of data!
@ -103,11 +111,16 @@ tatoeba-%.md:



+tttt:
+	echo ${PWD}/work-tatoeba/data/${PRE}/Tatoeba-train.${LANGPAIRSTR}.clean.${SRCEXT}.labels
+	echo ${PWD}/work-tatoeba/data/${PRE}/Tatoeba-train.${LANGPAIRSTR}.clean.${TRGEXT}.labels
+


 ## generic target for tatoeba challenge jobs
-%-tatoeba: ${PWD}/work-tatoeba/data/${PRE}/Tatoeba-train.${LANGPAIR}.clean.${SRCEXT}.gz
-	echo $<
+# %-tatoeba: ${PWD}/work-tatoeba/data/${PRE}/Tatoeba-train.${LANGPAIR}.clean.${SRCEXT}.gz
+%-tatoeba: 	${PWD}/work-tatoeba/data/${PRE}/Tatoeba-train.${LANGPAIRSTR}.clean.${SRCEXT}.labels \
+		${PWD}/work-tatoeba/data/${PRE}/Tatoeba-train.${LANGPAIRSTR}.clean.${TRGEXT}.labels
 	${MAKE} TRAINSET=Tatoeba-train \
 		DEVSET=Tatoeba-dev \
 		TESTSET=Tatoeba-test \
@ -119,10 +132,49 @@ tatoeba-%.md:
 		TESTSIZE=10000 \
 		DEVMINSIZE=200 \
 		WORKHOME=${TATOEBA_WORK} \
+		SRCLANGS="${shell cat $(word 1,$^)}" \
+		TRGLANGS="${shell cat $(word 2,$^)}" \
+		LANGPAIRSTR=${LANGPAIRSTR} \
 		EMAIL= \
 	${@:-tatoeba=}


+## Build the label files (space-separated language IDs) for the data set
+## named by ${LANGPAIRSTR}:
+##  1. for every s/t combination where s sorts before t, request the
+##     Tatoeba-train .gz target in a sub-make with SRCLANGS=s TRGLANGS=t
+##  2. if $@ still does not exist, append existing source-side label files
+##     to it
+##  3. do the same for the target-side label file (the name of $@ with
+##     ${TRGEXT} in place of ${SRCEXT})
+## NOTE(review): ${LANGPAIR}, ${SRCEXT} and ${TRGEXT} inside the loops are
+## expanded by this make before the shell runs, so every iteration refers
+## to the same file names regardless of the loop variables s and t --
+## confirm whether per-pair file names were intended here.
+${PWD}/work-tatoeba/data/${PRE}/Tatoeba-train.${LANGPAIRSTR}.clean.${SRCEXT}.labels:
+	for s in ${SRCLANGS}; do \
+	  for t in ${TRGLANGS}; do \
+	    if [ "$$s" \< "$$t" ]; then \
+	      ${MAKE} SRCLANGS=$$s TRGLANGS=$$t \
+		${PWD}/work-tatoeba/data/${PRE}/Tatoeba-train.${LANGPAIR}.clean.${SRCEXT}.gz; \
+	    fi \
+	  done \
+	done
+	if [ ! -e $@ ]; then \
+	  for s in ${SRCLANGS}; do \
+	    for t in ${TRGLANGS}; do \
+	      if [ -e ${PWD}/work-tatoeba/data/${PRE}/Tatoeba-train.${LANGPAIR}.clean.${SRCEXT}.labels ]; then \
+		cat ${PWD}/work-tatoeba/data/${PRE}/Tatoeba-train.${LANGPAIR}.clean.${SRCEXT}.labels \
+		>> $@; \
+	      fi \
+	    done \
+	  done \
+	fi
+	if [ ! -e $(@:.${SRCEXT}.labels=.${TRGEXT}.labels) ]; then \
+	  for s in ${SRCLANGS}; do \
+	    for t in ${TRGLANGS}; do \
+	      if [ -e ${PWD}/work-tatoeba/data/${PRE}/Tatoeba-train.${LANGPAIR}.clean.${TRGEXT}.labels ]; then \
+		cat ${PWD}/work-tatoeba/data/${PRE}/Tatoeba-train.${LANGPAIR}.clean.${TRGEXT}.labels \
+		>> $(@:.${SRCEXT}.labels=.${TRGEXT}.labels); \
+	      fi \
+	    done \
+	  done \
+	fi
+
+## The target-side label file is written as a side effect of the recipe that
+## builds the source-side label file, so it only has to depend on that file.
+## (A pattern rule whose target and prerequisite are identical is circular
+## and can never be applied by make.)
+%.${LANGPAIRSTR}.clean.${TRGEXT}.labels: %.${LANGPAIRSTR}.clean.${SRCEXT}.labels
+	echo "done"
+
+
+
+

 ## don't delete those files
 .SECONDARY: ${TATOEBA_WORK}/data/${PRE}/Tatoeba-train.${LANGPAIR}.clean.${SRCEXT}.gz \
@ -133,46 +185,101 @@ tatoeba-%.md:
 	${TATOEBA_WORK}/data/${PRE}/Tatoeba-test.${LANGPAIR}.clean.${TRGEXT}.gz


-BASIC_FILTERS = | perl -CS -pe 'tr[\x{9}\x{A}\x{D}\x{20}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}][]cd;' \
-		| perl -CS -pe 's/\&\s*\#\s*160\s*\;/ /g' \
-		| $(TOKENIZER)/remove-non-printing-char.perl \
-		| $(TOKENIZER)/deescape-special-chars.perl
-
-## TODO: should we add $(TOKENIZER)/replace-unicode-punctuation.perl ?
-## TODO: add this? sed 's/_/ /g'
-## this sed line from https://github.com/aboSamoor/polyglot/issues/71 does not seem to work
-#		| sed 's/[\00\01\02\03\04\05\06\07\08\0b\0e\0f\10\11\12\13\14\15\16\17\18\19\1a\1b\1c\1d\1e\1f\7f\80\81\82\83\84\85\86\87\88\89\8a\8b\8c\8d\8e\8f\90\91\92\93\94\95\96\97\98\99\9a\9b\9c\9d\9e\9f]//g'
-
+## modify language IDs in training data to adjust them to test sets
+## --> fix codes for chinese
+## --> take away regional codes
+## --> take away script extension that may come with some codes
+FIXLANGIDS = 	| sed 's/zho\(\)_HK/yue\1/;s/zho\(\)_CN/cmn\1/;s/zho\(\)_TW/cmn\1/;' \
+		| sed 's/\_[A-Z][A-Z]//' \
+		| sed 's/\-[a-z]*//'

+## convert Tatoeba Challenge data into the format we need
+## - move the data into the right location with the suitable name
+## - create devset if not given (part of training data)
+## - divide into individual language pairs 
+##   (if there is more than one language pair in the collection)
+## 
 ## TODO: should we do some filtering like bitext-match, OPUS-filter ...
 %/Tatoeba-train.${LANGPAIR}.clean.${SRCEXT}.gz:
 	mkdir -p $@.d
 	wget -q -O $@.d/train.tar ${TATOEBA_DATA}/${LANGPAIR}.tar
 	tar -C $@.d -xf $@.d/train.tar
-	${GZIP} -c < $@.d/data/${LANGPAIR}/test.src > ${dir $@}Tatoeba-test.${LANGPAIR}.clean.${SRCEXT}.gz
-	${GZIP} -c < $@.d/data/${LANGPAIR}/test.trg > ${dir $@}Tatoeba-test.${LANGPAIR}.clean.${TRGEXT}.gz
-	${GZIP} -cd < $@.d/data/${LANGPAIR}/train.src.gz ${BASIC_FILTERS} > $@.1
-	${GZIP} -cd < $@.d/data/${LANGPAIR}/train.trg.gz ${BASIC_FILTERS} > $@.2
-	paste $@.1 $@.2 | scripts/filter/bitext-match-lang.py -s ${SRC} -t ${TRG} > $@.bitext
-	rm -f $@.1 $@.2
+	mv $@.d/data/${LANGPAIR}/test.src ${dir $@}Tatoeba-test.${LANGPAIR}.clean.${SRCEXT}
+	mv $@.d/data/${LANGPAIR}/test.trg ${dir $@}Tatoeba-test.${LANGPAIR}.clean.${TRGEXT}
+	mv $@.d/data/${LANGPAIR}/test.id  ${dir $@}Tatoeba-test.${LANGPAIR}.clean.id
 	if [ -e $@.d/data/${LANGPAIR}/dev.src ]; then \
-	  ${GZIP} -c < $@.d/data/${LANGPAIR}/dev.src > ${dir $@}Tatoeba-dev.${LANGPAIR}.clean.${SRCEXT}.gz; \
-	  ${GZIP} -c < $@.d/data/${LANGPAIR}/dev.trg > ${dir $@}Tatoeba-dev.${LANGPAIR}.clean.${TRGEXT}.gz; \
-	  cut -f1 $@.bitext | ${GZIP} -c > ${dir $@}Tatoeba-train.${LANGPAIR}.clean.${SRCEXT}.gz; \
-	  cut -f2 $@.bitext | ${GZIP} -c > ${dir $@}Tatoeba-train.${LANGPAIR}.clean.${TRGEXT}.gz; \
+	  mv $@.d/data/${LANGPAIR}/dev.src > ${dir $@}Tatoeba-dev.${LANGPAIR}.clean.${SRCEXT}; \
+	  mv $@.d/data/${LANGPAIR}/dev.trg > ${dir $@}Tatoeba-dev.${LANGPAIR}.clean.${TRGEXT}; \
+	  mv $@.d/data/${LANGPAIR}/dev.id > ${dir $@}Tatoeba-dev.${LANGPAIR}.clean.id; \
+	  ${ZCAT} $@.d/data/${LANGPAIR}/train.src.gz > ${dir $@}Tatoeba-train.${LANGPAIR}.clean.${SRCEXT}; \
+	  ${ZCAT} $@.d/data/${LANGPAIR}/train.trg.gz > ${dir $@}Tatoeba-train.${LANGPAIR}.clean.${TRGEXT}; \
+	  ${ZCAT} $@.d/data/${LANGPAIR}/train.id.gz | cut -f2,3 $(FIXLANGIDS) > ${dir $@}Tatoeba-train.${LANGPAIR}.clean.id; \
 	else \
 	  echo "no devdata available - get top 1000 from training data!"; \
-	  cut -f1 $@.bitext | head -1000 | ${GZIP} -c > ${dir $@}Tatoeba-dev.${LANGPAIR}.clean.${SRCEXT}.gz; \
-	  cut -f2 $@.bitext | head -1000 | ${GZIP} -c > ${dir $@}Tatoeba-dev.${LANGPAIR}.clean.${TRGEXT}.gz; \
-	  cut -f1 $@.bitext | tail -n +1001 | ${GZIP} -c > ${dir $@}Tatoeba-train.${LANGPAIR}.clean.${SRCEXT}.gz; \
-	  cut -f2 $@.bitext | tail -n +1001 | ${GZIP} -c > ${dir $@}Tatoeba-train.${LANGPAIR}.clean.${TRGEXT}.gz; \
+	  ${ZCAT} $@.d/data/${LANGPAIR}/train.src.gz | head -1000 > ${dir $@}Tatoeba-dev.${LANGPAIR}.clean.${SRCEXT}; \
+	  ${ZCAT} $@.d/data/${LANGPAIR}/train.trg.gz | head -1000 > ${dir $@}Tatoeba-dev.${LANGPAIR}.clean.${TRGEXT}; \
+	  ${ZCAT} $@.d/data/${LANGPAIR}/train.id.gz  | head -1000 | cut -f2,3 $(FIXLANGIDS) > ${dir $@}Tatoeba-dev.${LANGPAIR}.clean.id; \
+	  ${ZCAT} $@.d/data/${LANGPAIR}/train.src.gz | tail -n +1001 > ${dir $@}Tatoeba-train.${LANGPAIR}.clean.${SRCEXT}; \
+	  ${ZCAT} $@.d/data/${LANGPAIR}/train.trg.gz | tail -n +1001 > ${dir $@}Tatoeba-train.${LANGPAIR}.clean.${TRGEXT}; \
+	  ${ZCAT} $@.d/data/${LANGPAIR}/train.id.gz  | tail -n +1001 | cut -f2,3 $(FIXLANGIDS) > ${dir $@}Tatoeba-train.${LANGPAIR}.clean.id; \
 	fi
-	rm -f $@.bitext
+	cut -f1 ${dir $@}Tatoeba-*.${LANGPAIR}.clean.id | sort -u | tr "\n" ' ' > $(@:.${SRCEXT}.gz=.${SRCEXT}.labels)
+	cut -f2 ${dir $@}Tatoeba-*.${LANGPAIR}.clean.id | sort -u | tr "\n" ' ' > $(@:.${SRCEXT}.gz=.${TRGEXT}.labels)
 	rm -f $@.d/data/${LANGPAIR}/*
 	rmdir $@.d/data/${LANGPAIR}
 	rmdir $@.d/data
 	rm -f $@.d/train.tar
 	rmdir $@.d
+#######################################
+# make data sets for individual 
+# language pairs from the Tatoeba data
+#######################################
+	for s in `cat $(@:.${SRCEXT}.gz=.${SRCEXT}.labels)`; do \
+	  for t in `cat $(@:.${SRCEXT}.gz=.${TRGEXT}.labels)`; do \
+	    if [ "$$s" \< "$$t" ]; then \
+	      echo "extract $$s-$$t data"; \
+	      for d in dev test train; do \
+	        paste ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.id \
+		      ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${SRCEXT} \
+		      ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${TRGEXT} |\
+	        grep -P "$$s\t$$t\t" > ${dir $@}Tatoeba-$$d.$$s-$$t; \
+	        if [ -s ${dir $@}Tatoeba-$$d.$$s-$$t ]; then \
+	          cut -f3 ${dir $@}Tatoeba-$$d.$$s-$$t | ${GZIP} -c > ${dir $@}Tatoeba-$$d.$$s-$$t.clean.$$s.gz; \
+	          cut -f4 ${dir $@}Tatoeba-$$d.$$s-$$t | ${GZIP} -c > ${dir $@}Tatoeba-$$d.$$s-$$t.clean.$$t.gz; \
+	        fi; \
+	        rm -f ${dir $@}Tatoeba-$$d.$$s-$$t; \
+	      done \
+	    else \
+	      echo "extract $$t-$$s data"; \
+	      for d in dev test train; do \
+	        paste ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.id \
+		      ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${TRGEXT} \
+		      ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${SRCEXT} |\
+	        grep -P "$$s\t$$t\t" > ${dir $@}Tatoeba-$$d.$$t-$$s; \
+	        if [ -s ${dir $@}Tatoeba-$$d.$$t-$$s ]; then \
+	          cut -f3 ${dir $@}Tatoeba-$$d.$$t-$$s | ${GZIP} -c > ${dir $@}Tatoeba-$$d.$$t-$$s.clean.$$t.gz; \
+	          cut -f4 ${dir $@}Tatoeba-$$d.$$t-$$s | ${GZIP} -c > ${dir $@}Tatoeba-$$d.$$t-$$s.clean.$$s.gz; \
+	        fi; \
+	        rm -f ${dir $@}Tatoeba-$$d.$$t-$$s; \
+	      done \
+	    fi \
+	  done \
+	done
+#######################################
+# finally, compress the big datafiles
+# and cleanup
+#######################################
+	for d in dev test train; do \
+	  if [ ! -e ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${SRCEXT}.gz ]; then \
+	    ${GZIP} -f ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${SRCEXT}; \
+	    ${GZIP} -f ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${TRGEXT}; \
+	  else \
+	    rm -f ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${SRCEXT}; \
+	    rm -f ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${TRGEXT}; \
+	  fi; \
+	  rm -f ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.id; \
+	done
+

 %/Tatoeba-train.${LANGPAIR}.clean.${TRGEXT}.gz: %/Tatoeba-train.${LANGPAIR}.clean.${SRCEXT}.gz
 	echo "done!"
--- a/lib/sentencepiece.mk
+++ b/lib/sentencepiece.mk
@ -27,10 +27,10 @@ GENERATE_SPM_VOC = 0
 ${SPMSRCMODEL}: 
 	${MAKE} ${LOCAL_TRAIN_SRC}
 	mkdir -p ${dir $@}
-ifeq ($(TRGLANGS),${firstword ${TRGLANGS}})
-	grep . ${LOCAL_TRAIN_SRC} | ${SHUFFLE} > ${LOCAL_TRAIN_SRC}.text
-else
+ifeq (${USE_TARGET_LABELS},1)
 	cut -f2- -d ' ' ${LOCAL_TRAIN_SRC} | grep . | ${SHUFFLE} > ${LOCAL_TRAIN_SRC}.text
+else
+	grep . ${LOCAL_TRAIN_SRC} | ${SHUFFLE} > ${LOCAL_TRAIN_SRC}.text
 endif
 	${MAKE} ${LOCAL_TRAIN_SRC}.charfreq
 	if [ `cat ${LOCAL_TRAIN_SRC}.charfreq | wc -l` -gt 1000 ]; then \
@ -175,6 +175,12 @@ endif
 	-python -c "import collections, pprint; pprint.pprint(dict(collections.Counter(open('$<.10m', 'r').read())))" > $@
 	rm -f $<.10m

+%.charfreq: %.gz
+	${GZIP} -cd < $< | head -10000000 > $<.10m
+	-python -c "import collections, pprint; pprint.pprint(dict(collections.Counter(open('$<.10m', 'r').read())))" > $@
+	rm -f $<.10m
+
+
 ## slow version
 %.charfreq2: %
 	head -10000000 $< |\
@ -189,14 +195,14 @@ endif
 ## see https://github.com/google/sentencepiece#c-from-source

 %.src.spm${SRCBPESIZE:000=}k: %.src ${SPMSRCMODEL}
-ifeq ($(TRGLANGS),${firstword ${TRGLANGS}})
-	${SPM_HOME}/spm_encode --model $(word 2,$^) < $< > $@
-else
+ifeq (${USE_TARGET_LABELS},1)
 	cut -f1 -d ' ' $< > $<.labels
 	cut -f2- -d ' ' $< > $<.txt
 	${SPM_HOME}/spm_encode --model $(word 2,$^) < $<.txt > $@.txt
 	paste -d ' ' $<.labels $@.txt > $@
 	rm -f $<.labels $<.txt $@.txt
+else
+	${SPM_HOME}/spm_encode --model $(word 2,$^) < $< > $@
 endif

 %.trg.spm${TRGBPESIZE:000=}k: %.trg ${SPMTRGMODEL}
--- a/scripts/filter/bitext-match-lang.py
+++ b/scripts/filter/bitext-match-lang.py
@ -14,6 +14,8 @@ parser.add_argument('-t','--trglang','--target-language', type=str, default='de'
                   help='accepted language')
 parser.add_argument('-l','--supported','--supported-languages', action='store_true',
                   help='list all supported languages')
+parser.add_argument('-f','--print-flag','--print-accept-flag', action='store_true',
+                   help='print only a flag about acceptance')
 parser.add_argument('-c','--checklang','--check-language-support', action='store_true',
                   help='show whether languages are supported')
 parser.add_argument('-v','--verbose', action='store_true',
@ -68,15 +70,21 @@ if args.checklang:

 if not supported_language(args.srclang):
    if len(args.srclang) == 3:
+        try:
            langid = languages.get(part3=args.srclang).part1
-        if langid:
+        except:
+            print("language code not found: " + args.srclang, file=sys.stderr, flush=True)
+        else:
            args.srclang = langid
            print("set srclang to " + args.srclang, file=sys.stderr, flush=True)

 if not supported_language(args.trglang):
    if len(args.trglang) == 3:
+        try:
            langid = languages.get(part3=args.trglang).part1
-        if langid:
+        except:
+            print("language code not found: " + args.trglang, file=sys.stderr, flush=True)
+        else:
            args.trglang = langid
            print("set trglang to " + args.trglang, file=sys.stderr, flush=True)

@ -102,9 +110,13 @@ else:
 for line in sys.stdin:
    # line = ''.join(x for x in line if x.isprintable())
    text = line.rstrip().split("\t")
+    accept = '0'
    if len(text) > 1:
        if text[0] and text[1]:
            if is_accepted(text[0],srcaccept,srcreject):
                if is_accepted(text[1],trgaccept,trgreject):
+                    accept = '1'
+                    if not args.print_flag:
                        print(text[0] + "\t" + text[1])
-
+    if args.print_flag:
+        print(accept)