fixed tatoeba model scripts

Joerg Tiedemann 2020-06-08 17:24:39 +03:00
parent e07eb14984
commit 035cca7c1a
3 changed files with 36 additions and 25 deletions

@@ -265,7 +265,7 @@ ${TRAIN_ALG}: ${TRAIN_SRC}.clean.${PRE_SRC}${TRAINSIZE}.gz \
## TODO: this causes to frequently redo the same data over and over again, does it?
##
# .INTERMEDIATE: ${LOCAL_TRAIN_SRC} ${LOCAL_TRAIN_TRG} ${LOCAL_TRAIN_SRC}.charfreq ${LOCAL_TRAIN_TRG}.charfreq
.INTERMEDIATE: ${LOCAL_TRAIN_SRC} ${LOCAL_TRAIN_TRG} ${LOCAL_TRAIN_SRC}.charfreq ${LOCAL_TRAIN_TRG}.charfreq
ifeq (${USE_REST_DEVDATA},1)
LOCAL_TRAINDATA_DEPENDENCIES = ${DEV_SRC} ${DEV_TRG}

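The TODO in the hunk above asks whether declaring the local training files as .INTERMEDIATE makes the same data get regenerated over and over. As a reminder of the GNU Make semantics involved, here is a minimal sketch with made-up file names (not the repository's actual targets): intermediate files created during a run are deleted when make finishes, but their absence alone does not force a rebuild on the next run.

# hypothetical example only -- not part of this repository's Makefile
.INTERMEDIATE: corpus.tmp

# counts.txt keeps its timestamp, so a deleted corpus.tmp by itself
# does not trigger a rebuild of counts.txt on the next run
counts.txt: corpus.tmp
	wc -l < $< > $@

# corpus.tmp is recreated (and deleted again at the end of the run)
# only when corpus.raw is newer than counts.txt
corpus.tmp: corpus.raw
	tr '[:upper:]' '[:lower:]' < $< > $@
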
@@ -71,7 +71,8 @@ tatoeba-train:
tatoeba-eval:
${MAKE} compare-tatoeba
tatoeba-step0: ${PWD}/work-tatoeba/data/${PRE}/Tatoeba-train.${LANGPAIR}.clean.${SRCEXT}.gz
tatoeba-step0: ${PWD}/work-tatoeba/data/${PRE}/Tatoeba-train.${LANGPAIRSTR}.clean.${SRCEXT}.labels
tatoeba-step1: ${PWD}/work-tatoeba/data/${PRE}/Tatoeba-train.${LANGPAIR}.clean.${SRCEXT}.gz
## run all language pairs for a given subset
@@ -129,8 +130,8 @@ tatoeba-%.md:
TESTSIZE=10000 \
DEVMINSIZE=200 \
WORKHOME=${TATOEBA_WORK} \
SRCLANGS="${shell cat $<}" \
TRGLANGS="${shell cat $(<:.${SRCEXT}.labels=.${TRGEXT}.labels)}" \
SRCLANGS="${shell cat $< | sed 's/ *$$//'}" \
TRGLANGS="${shell cat $(<:.${SRCEXT}.labels=.${TRGEXT}.labels) | sed 's/ *$$//'}" \
LANGPAIRSTR=${LANGPAIRSTR} \
EMAIL= \
${@:-tatoeba=}
@@ -141,32 +142,34 @@ ${PWD}/work-tatoeba/data/${PRE}/Tatoeba-train.${LANGPAIRSTR}.clean.${SRCEXT}.lab
for t in ${TRGLANGS}; do \
if [ "$$s" \< "$$t" ]; then \
${MAKE} SRCLANGS=$$s TRGLANGS=$$t \
${PWD}/work-tatoeba/data/${PRE}/Tatoeba-train.${LANGPAIR}.clean.${SRCEXT}.gz; \
${PWD}/work-tatoeba/data/${PRE}/Tatoeba-train.$$s-$$t.clean.$$s.gz; \
fi \
done \
done
if [ ! -e $@ ]; then \
for s in ${SRCLANGS}; do \
for s in ${SRCLANGS}; do \
for t in ${TRGLANGS}; do \
if [ -e ${PWD}/work-tatoeba/data/${PRE}/Tatoeba-train.${LANGPAIR}.clean.${SRCEXT}.labels ]; then \
cat ${PWD}/work-tatoeba/data/${PRE}/Tatoeba-train.${LANGPAIR}.clean.${SRCEXT}.labels \
if [ -e ${PWD}/work-tatoeba/data/${PRE}/Tatoeba-train.$$s-$$t.clean.$$s.labels ]; then \
cat ${PWD}/work-tatoeba/data/${PRE}/Tatoeba-train.$$s-$$t.clean.$$s.labels \
>> $@.src; \
elif [ -e ${PWD}/work-tatoeba/data/${PRE}/Tatoeba-train.$$t-$$s.clean.$$s.labels ]; then \
cat ${PWD}/work-tatoeba/data/${PRE}/Tatoeba-train.$$t-$$s.clean.$$s.labels \
>> $@.src; \
fi \
done \
done \
fi
if [ ! -e $(@:.${SRCEXT}.labels=.${TRGEXT}.labels) ]; then \
for s in ${SRCLANGS}; do \
done
for s in ${SRCLANGS}; do \
for t in ${TRGLANGS}; do \
if [ -e ${PWD}/work-tatoeba/data/${PRE}/Tatoeba-train.${LANGPAIR}.clean.${TRGEXT}.labels ]; then \
cat ${PWD}/work-tatoeba/data/${PRE}/Tatoeba-train.${LANGPAIR}.clean.${TRGEXT}.labels \
if [ -e ${PWD}/work-tatoeba/data/${PRE}/Tatoeba-train.$$s-$$t.clean.$$t.labels ]; then \
cat ${PWD}/work-tatoeba/data/${PRE}/Tatoeba-train.$$s-$$t.clean.$$t.labels \
>> $@.trg; \
elif [ -e ${PWD}/work-tatoeba/data/${PRE}/Tatoeba-train.$$t-$$s.clean.$$t.labels ]; then \
cat ${PWD}/work-tatoeba/data/${PRE}/Tatoeba-train.$$t-$$s.clean.$$t.labels \
>> $@.trg; \
fi \
done \
done \
fi
cat $@.src | tr ' ' "\n" | sort -u | tr "\n" ' ' > $@
cat $@.trg | tr ' ' "\n" | sort -u | tr "\n" ' ' > $(@:.${SRCEXT}.labels=.${TRGEXT}.labels)
done
cat $@.src | tr ' ' "\n" | sort -u | tr "\n" ' ' | sed 's/ *$$//' > $@
cat $@.trg | tr ' ' "\n" | sort -u | tr "\n" ' ' | sed 's/ *$$//' > $(@:.${SRCEXT}.labels=.${TRGEXT}.labels)
rm -f $@.src $@.trg
@@ -242,6 +245,13 @@ FIXLANGIDS = | sed 's/zho\(\)_HK/yue\1/;s/zho\(\)_CN/cmn\1/;s/zho\(\)_TW/cmn\1/
#######################################
# make data sets for individual
# language pairs from the Tatoeba data
# TODO: now we only grep for langpairs
# available in test data
# --> should we also include other
# training data with a dummy label?
# --> how do we efficiently grep for
# everything that is not one of the langpairs?
# grep -v and a big list of alternative lang-pairs ...
#######################################
for s in `cat $(@:.${SRCEXT}.gz=.${SRCEXT}.labels)`; do \
for t in `cat $(@:.${SRCEXT}.gz=.${TRGEXT}.labels)`; do \

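The TODO in the hunk above already hints at one answer to its own question: collect the known language pairs into one big grep alternation and invert the match. A hedged sketch of that idea follows; the variable names, the rest.ids target and the Tatoeba-train.ids.gz layout are invented for this illustration and are not taken from the repository.

EMPTY :=
SPACE := ${EMPTY} ${EMPTY}

# hypothetical list of language pairs that are covered by test data
KNOWN_PAIRS  := deu-eng deu-fra eng-fra
PAIR_PATTERN := ${subst ${SPACE},|,${KNOWN_PAIRS}}

# keep every line whose pair label is NOT one of the known pairs
rest.ids: Tatoeba-train.ids.gz
	gzip -cd < $< | grep -v -E "^(${PAIR_PATTERN})\b" > $@

With grep -v the pattern only grows linearly with the number of known pairs, which stays manageable even for a few hundred language pairs.
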
@@ -197,15 +197,16 @@ endif
# awk '!/^$$/{a[$$0]++}END{for (i in a)print i,a[i];}' > $@
## python-based char-counter (seems to be the fastest version)
## restrict to 1 million lines
%.charfreq: %
head -10000000 $< > $<.10m
-python -c "import collections, pprint; pprint.pprint(dict(collections.Counter(open('$<.10m', 'r').read())))" > $@
rm -f $<.10m
head -1000000 $< > $<.1m
-python -c "import collections, pprint; pprint.pprint(dict(collections.Counter(open('$<.1m', 'r').read())))" > $@
rm -f $<.1m
%.charfreq: %.gz
${GZIP} -cd < $< | head -10000000 > $<.10m
-python -c "import collections, pprint; pprint.pprint(dict(collections.Counter(open('$<.10m', 'r').read())))" > $@
rm -f $<.10m
${GZIP} -cd < $< | head -1000000 > $<.1m
-python -c "import collections, pprint; pprint.pprint(dict(collections.Counter(open('$<.1m', 'r').read())))" > $@
rm -f $<.1m
## slow version