fit-data-size fixed

This commit is contained in:
Joerg Tiedemann 2020-06-08 14:14:55 +03:00
parent 6cb9959e82
commit e07eb14984
7 changed files with 152 additions and 67 deletions

View File

@@ -2,6 +2,12 @@
# Things to do
## Bugs
* something is wrong with multi-threaded data preparation
* balancing data for multilingual models does not work well when one language pair is tiny
## General settings
* better hyperparameters for low-resource settings (lower batch sizes, smaller vocabularies ...)

View File

@@ -27,10 +27,17 @@ BPETRGMODEL = ${WORKDIR}/train/${BPEMODELNAME}.trg.bpe${TRGBPESIZE:000=}k-model
.PRECIOUS: ${BPESRCMODEL} ${BPETRGMODEL}
# ${BPESRCMODEL}: ${WORKDIR}/%.bpe${SRCBPESIZE:000=}k-model: ${TMPDIR}/${LANGPAIRSTR}/%
# ${BPESRCMODEL}: ${LOCAL_TRAIN_SRC}
${BPESRCMODEL}:
${MAKE} ${LOCAL_TRAIN_SRC}
## we keep the dependency on LOCAL_TRAIN_SRC
## to make multi-threaded make calls behave properly
## --> otherwise there can be multiple threads writing to the same file!
${BPESRCMODEL}: ${LOCAL_TRAIN_SRC}
ifneq (${wildcard $@},)
@echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
@echo "!!!!!!!! $@ already exists!"
@echo "!!!!!!!! re-use the old one even if there is new training data"
@echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
else
mkdir -p ${dir $@}
ifeq (${USE_TARGET_LABELS},1)
cut -f2- -d ' ' ${LOCAL_TRAIN_SRC} > ${LOCAL_TRAIN_SRC}.text
@@ -39,18 +46,22 @@ ifeq (${USE_TARGET_LABELS},1)
else
python3 ${SNMTPATH}/learn_bpe.py -s $(SRCBPESIZE) < ${LOCAL_TRAIN_SRC} > $@
endif
endif
## no labels on the target language side
# ${BPETRGMODEL}: ${WORKDIR}/%.bpe${TRGBPESIZE:000=}k-model: ${TMPDIR}/${LANGPAIRSTR}/%
# ${BPETRGMODEL}: ${LOCAL_TRAIN_TRG}
${BPETRGMODEL}:
${MAKE} ${LOCAL_TRAIN_TRG}
${BPETRGMODEL}: ${LOCAL_TRAIN_TRG}
ifneq (${wildcard $@},)
@echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
@echo "!!!!!!!! $@ already exists!"
@echo "!!!!!!!! re-use the old one even if there is new training data"
@echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
else
mkdir -p ${dir $@}
python3 ${SNMTPATH}/learn_bpe.py -s $(TRGBPESIZE) < ${LOCAL_TRAIN_TRG} > $@
endif
#
%.src.bpe${SRCBPESIZE:000=}k: %.src ${BPESRCMODEL}
ifeq (${USE_TARGET_LABELS},1)
cut -f1 -d ' ' $< > $<.labels

View File

@@ -29,6 +29,17 @@ TRGLANGS ?= fi
SRC ?= ${firstword ${SRCLANGS}}
TRG ?= ${lastword ${TRGLANGS}}
## SKIP_LANGPAIRS can be used to skip certain language pairs
## in data preparation for multilingual models
## ---> this can be useful to skip BIG language pairs
## that would otherwise dominate all the data
## must be a pattern that can be matched by egrep
## e.g. en-de|en-fr
SKIP_LANGPAIRS ?= "nothing"
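## for illustration (a sketch with hypothetical values): to keep the two
## biggest pairs of a multilingual model out of the training data one could set
##
##   SKIP_LANGPAIRS = en-de|en-fr
##
## the pattern is matched with egrep against both directions (src-trg and trg-src)
## in the data-collection loop further down in this commit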
## set SHUFFLE_DATA if you want to shuffle data for
## each language pair to be added to the training data
## --> especially useful in connection with FIT_DATA_SIZE
@@ -43,6 +54,10 @@ TRG ?= ${lastword ${TRGLANGS}}
##
# FIT_DATA_SIZE = 100000
## maximum number of times the same data set
## is repeated when oversampling
MAX_OVER_SAMPLING ?= 50
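##
## a sketch of how these settings combine for balancing multilingual data
## (hypothetical values; see the comments on SHUFFLE_DATA and FIT_DATA_SIZE above):
##
##   FIT_DATA_SIZE     = 100000   ## sample each language pair up/down to ~100k lines
##   MAX_OVER_SAMPLING = 10       ## but never repeat a tiny corpus more than 10 times
##   SHUFFLE_DATA      = 1        ## shuffle each pair before cutting down larger corpora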
# sorted languages and langpair used to match resources in OPUS
SORTLANGS = $(sort ${SRC} ${TRG})

View File

@@ -1,15 +1,18 @@
# -*-makefile-*-
#
# create data files for training, validation and testing
#
# - combine all bitexts in TRAINSET
# - add backtranslation and pivoted data if necessary
# - add language labels if necessary (multi-target models)
# - over/under-sampling of training data if necessary (multilingual models)
# - shuffle dev/test data and divide into two disjoint sets
# - reverse data sets for the other translation direction (bilingual models only)
# - run word alignment if necessary (models with guided alignment = transformer-align)
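#
# a typical invocation could look like the sketch below (the top-level target
# name and the variable values are illustrative assumptions, not part of this
# commit):
#
#   make SRCLANGS="de en" TRGLANGS="fi" FIT_DATA_SIZE=100000 SHUFFLE_DATA=1 data
#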
## SKIP_LANGPAIRS can be used to skip certain language pairs
## in data preparation for multilingual models
## ---> this can be useful to skip BIG language pairs
## that would otherwise dominate all the data
## must be a pattern that can be matched by egrep
## e.g. en-de|en-fr
SKIP_LANGPAIRS ?= "nothing"
## training data size (the count is generated if it is not in README.md)
TRAINDATA_SIZE = ${shell \
@@ -255,27 +258,27 @@ ${TRAIN_ALG}: ${TRAIN_SRC}.clean.${PRE_SRC}${TRAINSIZE}.gz \
echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"; \
fi
## TODO: do we need this?
##
# else \
# touch $@; \
# touch ${@:.${SRCEXT}.raw=.${TRGEXT}.raw}; \
%.${TRGEXT}.raw: %.${SRCEXT}.raw
@echo "done!"
.INTERMEDIATE: ${LOCAL_TRAIN_SRC} ${LOCAL_TRAIN_TRG} ${LOCAL_TRAIN_SRC}.charfreq ${LOCAL_TRAIN_TRG}.charfreq
## TODO: this causes the same data to be redone over and over again, doesn't it?
##
# .INTERMEDIATE: ${LOCAL_TRAIN_SRC} ${LOCAL_TRAIN_TRG} ${LOCAL_TRAIN_SRC}.charfreq ${LOCAL_TRAIN_TRG}.charfreq
ifeq (${USE_REST_DEVDATA},1)
LOCAL_TRAINDATA_DEPENDENCIES = ${DEV_SRC} ${DEV_TRG}
endif
## add training data for each language combination
## and put it together in local space
${LOCAL_TRAIN_SRC}: ${DEV_SRC} ${DEV_TRG}
${LOCAL_TRAIN_SRC}: ${LOCAL_TRAINDATA_DEPENDENCIES}
mkdir -p ${dir $@}
rm -f ${LOCAL_TRAIN_SRC} ${LOCAL_TRAIN_TRG}
echo "" > ${dir $@}README.md
echo "# ${notdir ${TRAIN_BASE}}" >> ${dir $@}README.md
echo "" >> ${dir $@}README.md
rm -f ${LOCAL_TRAIN_SRC} ${LOCAL_TRAIN_TRG}
-for s in ${SRCLANGS}; do \
for t in ${TRGLANGS}; do \
if [ ! `echo "$$s-$$t $$t-$$s" | egrep '${SKIP_LANGPAIRS}' | wc -l` -gt 0 ]; then \
@@ -305,8 +308,10 @@ ${LOCAL_TRAIN_TRG}: ${LOCAL_TRAIN_SRC}
## in multilingual data sets
## TODO: introduce under and over-sampling for multilingual data sets ...
add-to-local-train-data:
ifneq (${wildcard ${CLEAN_TRAIN_SRC}},)
ifneq (${CLEAN_TRAIN_SRC},)
${MAKE} ${CLEAN_TRAIN_SRC} ${CLEAN_TRAIN_TRG}
endif
ifneq (${wildcard ${CLEAN_TRAIN_SRC}},)
@if [ `${GZIP} -cd < ${wildcard ${CLEAN_TRAIN_SRC}} | wc -l` != `${GZIP} -cd < ${wildcard ${CLEAN_TRAIN_TRG}} | wc -l` ]; then \
echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"; \
echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"; \
@@ -338,21 +343,22 @@ ifneq (${wildcard ${CLEAN_TRAIN_SRC}},)
ifeq (${USE_TARGET_LABELS},1)
echo "set target language labels";
${GZIP} -cd < ${wildcard ${CLEAN_TRAIN_SRC}} |\
sed "s/^/>>${TRG}<< /" > ${LOCAL_TRAIN_SRC}.src
sed "s/^/>>${TRG}<< /" > ${LOCAL_TRAIN_SRC}.${LANGPAIR}.src
else
echo "only one target language"
${GZIP} -cd < ${wildcard ${CLEAN_TRAIN_SRC}} > ${LOCAL_TRAIN_SRC}.src
${GZIP} -cd < ${wildcard ${CLEAN_TRAIN_SRC}} > ${LOCAL_TRAIN_SRC}.${LANGPAIR}.src
endif
${GZIP} -cd < ${wildcard ${CLEAN_TRAIN_TRG}} > ${LOCAL_TRAIN_TRG}.trg
${GZIP} -cd < ${wildcard ${CLEAN_TRAIN_TRG}} > ${LOCAL_TRAIN_TRG}.${LANGPAIR}.trg
######################################
# SHUFFLE_DATA is set?
# --> shuffle data for each langpair
# --> do this when FIT_DATA_SIZE is set!
######################################
ifdef SHUFFLE_DATA
paste ${LOCAL_TRAIN_SRC}.src ${LOCAL_TRAIN_TRG}.trg | ${SHUFFLE} > ${LOCAL_TRAIN_SRC}.shuffled
cut -f1 ${LOCAL_TRAIN_SRC}.shuffled > ${LOCAL_TRAIN_SRC}.src
cut -f2 ${LOCAL_TRAIN_SRC}.shuffled > ${LOCAL_TRAIN_TRG}.trg
paste ${LOCAL_TRAIN_SRC}.${LANGPAIR}.src ${LOCAL_TRAIN_TRG}.${LANGPAIR}.trg |\
${SHUFFLE} > ${LOCAL_TRAIN_SRC}.shuffled
cut -f1 ${LOCAL_TRAIN_SRC}.shuffled > ${LOCAL_TRAIN_SRC}.${LANGPAIR}.src
cut -f2 ${LOCAL_TRAIN_SRC}.shuffled > ${LOCAL_TRAIN_TRG}.${LANGPAIR}.trg
rm -f ${LOCAL_TRAIN_SRC}.shuffled
endif
######################################
@@ -361,13 +367,15 @@ endif
# --> under/over sampling!
######################################
ifdef FIT_DATA_SIZE
scripts/fit-data-size.pl ${FIT_DATA_SIZE} ${LOCAL_TRAIN_SRC}.src >> ${LOCAL_TRAIN_SRC}
scripts/fit-data-size.pl ${FIT_DATA_SIZE} ${LOCAL_TRAIN_TRG}.trg >> ${LOCAL_TRAIN_TRG}
scripts/fit-data-size.pl -m ${MAX_OVER_SAMPLING} ${FIT_DATA_SIZE} \
${LOCAL_TRAIN_SRC}.${LANGPAIR}.src >> ${LOCAL_TRAIN_SRC}
scripts/fit-data-size.pl -m ${MAX_OVER_SAMPLING} ${FIT_DATA_SIZE} \
${LOCAL_TRAIN_TRG}.${LANGPAIR}.trg >> ${LOCAL_TRAIN_TRG}
else
cat ${LOCAL_TRAIN_SRC}.src >> ${LOCAL_TRAIN_SRC}
cat ${LOCAL_TRAIN_TRG}.trg >> ${LOCAL_TRAIN_TRG}
cat ${LOCAL_TRAIN_SRC}.${LANGPAIR}.src >> ${LOCAL_TRAIN_SRC}
cat ${LOCAL_TRAIN_TRG}.${LANGPAIR}.trg >> ${LOCAL_TRAIN_TRG}
endif
rm -f ${LOCAL_TRAIN_SRC}.src ${LOCAL_TRAIN_TRG}.trg
rm -f ${LOCAL_TRAIN_SRC}.${LANGPAIR}.src ${LOCAL_TRAIN_TRG}.${LANGPAIR}.trg
endif
@@ -469,8 +477,8 @@ ${DEV_TRG}: ${DEV_SRC}
add-to-dev-data: ${CLEAN_DEV_SRC} ${CLEAN_DEV_TRG}
mkdir -p ${dir ${DEV_SRC}}
ifneq (${wildcard ${CLEAN_DEV_SRC}},)
echo -n "* ${LANGPAIR}: ${DEVSET}, " >> ${dir ${DEV_SRC}}README.md
${GZIP} -cd < ${CLEAN_DEV_SRC} | wc -l >> ${dir ${DEV_SRC}}README.md
echo -n "* ${LANGPAIR}: ${DEVSET}, " >> ${dir ${DEV_SRC}}README.md
${GZIP} -cd < ${CLEAN_DEV_SRC} | wc -l >> ${dir ${DEV_SRC}}README.md
ifeq (${USE_TARGET_LABELS},1)
echo "more than one target language";
${GZIP} -cd < ${CLEAN_DEV_SRC} |\

View File

@@ -61,8 +61,7 @@ ifneq (${SRCLANGS},${TRGLANGS})
${MAKE} SRCLANGS="${TRGLANGS}" TRGLANGS="${SRCLANGS}" all-job-tatoeba
endif
tatoeba-prepare:
${MAKE} clean-data-tatoeba
tatoeba-prepare: ${PWD}/work-tatoeba/data/${PRE}/Tatoeba-train.${LANGPAIR}.clean.${SRCEXT}.gz
${MAKE} local-config-tatoeba
${MAKE} data-tatoeba
@@ -72,6 +71,8 @@ tatoeba-train:
tatoeba-eval:
${MAKE} compare-tatoeba
tatoeba-step0: ${PWD}/work-tatoeba/data/${PRE}/Tatoeba-train.${LANGPAIR}.clean.${SRCEXT}.gz
## run all language pairs for a given subset
tatoeba-subset-%: tatoeba-%.md
@@ -111,16 +112,12 @@ tatoeba-%.md:
tttt:
echo ${PWD}/work-tatoeba/data/${PRE}/Tatoeba-train.${LANGPAIRSTR}.clean.${SRCEXT}.labels
echo ${PWD}/work-tatoeba/data/${PRE}/Tatoeba-train.${LANGPAIRSTR}.clean.${TRGEXT}.labels
## generic target for tatoeba challenge jobs
# %-tatoeba: ${PWD}/work-tatoeba/data/${PRE}/Tatoeba-train.${LANGPAIR}.clean.${SRCEXT}.gz
%-tatoeba: ${PWD}/work-tatoeba/data/${PRE}/Tatoeba-train.${LANGPAIRSTR}.clean.${SRCEXT}.labels \
${PWD}/work-tatoeba/data/${PRE}/Tatoeba-train.${LANGPAIRSTR}.clean.${TRGEXT}.labels
# %-tatoeba: ${PWD}/work-tatoeba/data/${PRE}/Tatoeba-train.${LANGPAIRSTR}.clean.${SRCEXT}.labels \
# ${PWD}/work-tatoeba/data/${PRE}/Tatoeba-train.${LANGPAIRSTR}.clean.${TRGEXT}.labels
# %-tatoeba: ${PWD}/work-tatoeba/data/${PRE}/Tatoeba-train.${LANGPAIR}.clean.${SRCEXT}.gz
%-tatoeba: ${PWD}/work-tatoeba/data/${PRE}/Tatoeba-train.${LANGPAIRSTR}.clean.${SRCEXT}.labels
${MAKE} TRAINSET=Tatoeba-train \
DEVSET=Tatoeba-dev \
TESTSET=Tatoeba-test \
@@ -132,8 +129,8 @@ tttt:
TESTSIZE=10000 \
DEVMINSIZE=200 \
WORKHOME=${TATOEBA_WORK} \
SRCLANGS="${shell cat $(word 1,$^)}" \
TRGLANGS="${shell cat $(word 2,$^)}" \
SRCLANGS="${shell cat $<}" \
TRGLANGS="${shell cat $(<:.${SRCEXT}.labels=.${TRGEXT}.labels)}" \
LANGPAIRSTR=${LANGPAIRSTR} \
EMAIL= \
${@:-tatoeba=}
@@ -153,7 +150,7 @@ ${PWD}/work-tatoeba/data/${PRE}/Tatoeba-train.${LANGPAIRSTR}.clean.${SRCEXT}.lab
for t in ${TRGLANGS}; do \
if [ -e ${PWD}/work-tatoeba/data/${PRE}/Tatoeba-train.${LANGPAIR}.clean.${SRCEXT}.labels ]; then \
cat ${PWD}/work-tatoeba/data/${PRE}/Tatoeba-train.${LANGPAIR}.clean.${SRCEXT}.labels \
>> $@; \
>> $@.src; \
fi \
done \
done \
@@ -163,11 +160,16 @@ ${PWD}/work-tatoeba/data/${PRE}/Tatoeba-train.${LANGPAIRSTR}.clean.${SRCEXT}.lab
for t in ${TRGLANGS}; do \
if [ -e ${PWD}/work-tatoeba/data/${PRE}/Tatoeba-train.${LANGPAIR}.clean.${TRGEXT}.labels ]; then \
cat ${PWD}/work-tatoeba/data/${PRE}/Tatoeba-train.${LANGPAIR}.clean.${TRGEXT}.labels \
>> $(@:.${SRCEXT}.labels=.${TRGEXT}.labels); \
>> $@.trg; \
fi \
done \
done \
fi
cat $@.src | tr ' ' "\n" | sort -u | tr "\n" ' ' > $@
cat $@.trg | tr ' ' "\n" | sort -u | tr "\n" ' ' > $(@:.${SRCEXT}.labels=.${TRGEXT}.labels)
rm -f $@.src $@.trg
%.${LANGPAIRSTR}.clean.${TRGEXT}.labels: %.${LANGPAIRSTR}.clean.${SRCEXT}.labels
echo "done"
@@ -208,9 +210,9 @@ FIXLANGIDS = | sed 's/zho\(\)_HK/yue\1/;s/zho\(\)_CN/cmn\1/;s/zho\(\)_TW/cmn\1/
mv $@.d/data/${LANGPAIR}/test.trg ${dir $@}Tatoeba-test.${LANGPAIR}.clean.${TRGEXT}
mv $@.d/data/${LANGPAIR}/test.id ${dir $@}Tatoeba-test.${LANGPAIR}.clean.id
if [ -e $@.d/data/${LANGPAIR}/dev.src ]; then \
mv $@.d/data/${LANGPAIR}/dev.src > ${dir $@}Tatoeba-dev.${LANGPAIR}.clean.${SRCEXT}; \
mv $@.d/data/${LANGPAIR}/dev.trg > ${dir $@}Tatoeba-dev.${LANGPAIR}.clean.${TRGEXT}; \
mv $@.d/data/${LANGPAIR}/dev.id > ${dir $@}Tatoeba-dev.${LANGPAIR}.clean.id; \
mv $@.d/data/${LANGPAIR}/dev.src ${dir $@}Tatoeba-dev.${LANGPAIR}.clean.${SRCEXT}; \
mv $@.d/data/${LANGPAIR}/dev.trg ${dir $@}Tatoeba-dev.${LANGPAIR}.clean.${TRGEXT}; \
mv $@.d/data/${LANGPAIR}/dev.id ${dir $@}Tatoeba-dev.${LANGPAIR}.clean.id; \
${ZCAT} $@.d/data/${LANGPAIR}/train.src.gz > ${dir $@}Tatoeba-train.${LANGPAIR}.clean.${SRCEXT}; \
${ZCAT} $@.d/data/${LANGPAIR}/train.trg.gz > ${dir $@}Tatoeba-train.${LANGPAIR}.clean.${TRGEXT}; \
${ZCAT} $@.d/data/${LANGPAIR}/train.id.gz | cut -f2,3 $(FIXLANGIDS) > ${dir $@}Tatoeba-train.${LANGPAIR}.clean.id; \
@@ -223,8 +225,15 @@ FIXLANGIDS = | sed 's/zho\(\)_HK/yue\1/;s/zho\(\)_CN/cmn\1/;s/zho\(\)_TW/cmn\1/
${ZCAT} $@.d/data/${LANGPAIR}/train.trg.gz | tail -n +1001 > ${dir $@}Tatoeba-train.${LANGPAIR}.clean.${TRGEXT}; \
${ZCAT} $@.d/data/${LANGPAIR}/train.id.gz | tail -n +1001 | cut -f2,3 $(FIXLANGIDS) > ${dir $@}Tatoeba-train.${LANGPAIR}.clean.id; \
fi
cut -f1 ${dir $@}Tatoeba-*.${LANGPAIR}.clean.id | sort -u | tr "\n" ' ' > $(@:.${SRCEXT}.gz=.${SRCEXT}.labels)
cut -f2 ${dir $@}Tatoeba-*.${LANGPAIR}.clean.id | sort -u | tr "\n" ' ' > $(@:.${SRCEXT}.gz=.${TRGEXT}.labels)
#######################################
# labels in the data
# TODO: should we take the labels from all data sets?
# NOW: only look at the ones in the test data
#######################################
# cut -f1 ${dir $@}Tatoeba-*.${LANGPAIR}.clean.id | sort -u | tr "\n" ' ' > $(@:.${SRCEXT}.gz=.${SRCEXT}.labels)
# cut -f2 ${dir $@}Tatoeba-*.${LANGPAIR}.clean.id | sort -u | tr "\n" ' ' > $(@:.${SRCEXT}.gz=.${TRGEXT}.labels)
cut -f1 ${dir $@}Tatoeba-test.${LANGPAIR}.clean.id | sort -u | tr "\n" ' ' > $(@:.${SRCEXT}.gz=.${SRCEXT}.labels)
cut -f2 ${dir $@}Tatoeba-test.${LANGPAIR}.clean.id | sort -u | tr "\n" ' ' > $(@:.${SRCEXT}.gz=.${TRGEXT}.labels)
rm -f $@.d/data/${LANGPAIR}/*
rmdir $@.d/data/${LANGPAIR}
rmdir $@.d/data

View File

@@ -1,5 +1,17 @@
# -*-makefile-*-
#
# create sentence piece models
#
# - create models from each part of a bitext
# - individual models for each language in each language pair
# - do not create new models even if the data changes
# ---> models need to use the same segmentation/vocab
#
# TODO: should we do this for monolingual files instead
# of creating them from the bilingual data only?
# ---> could use more data
# ---> don't need to re-create models for each language pair
#
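# for orientation, a minimal spm_train call looks roughly like the sketch
# below (the flag values are illustrative assumptions, not the exact recipe
# used further down):
#
#   ${SPM_HOME}/spm_train --input=${LOCAL_TRAIN_SRC}.text \
#	--model_prefix=spm --vocab_size=${SRCBPESIZE} \
#	--character_coverage=1.0 --hard_vocab_limit=false
#
# (spm_train writes spm.model and spm.vocab next to the given model_prefix)
#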
##----------------------------------------------
@@ -21,11 +33,21 @@ SPMEXTRA =
.PRECIOUS: ${SPMSRCMODEL} ${SPMTRGMODEL}
## set to 1 if you want to generate SPM vocab file
GENERATE_SPM_VOC = 0
# ${SPMSRCMODEL}: ${WORKDIR}/%.spm${SRCBPESIZE:000=}k-model: ${TMPDIR}/${LANGPAIRSTR}/%
${SPMSRCMODEL}:
${MAKE} ${LOCAL_TRAIN_SRC}
## we keep the dependency on LOCAL_TRAIN_SRC
## to make multi-threaded make calls behave properly
## --> otherwise there can be multiple threads writing to the same file!
${SPMSRCMODEL}: ${LOCAL_TRAIN_SRC}
ifneq (${wildcard $@},)
@echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
@echo "!!!!!!!! $@ already exists!"
@echo "!!!!!!!! re-use the old one even if there is new training data"
@echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
else
mkdir -p ${dir $@}
ifeq (${USE_TARGET_LABELS},1)
cut -f2- -d ' ' ${LOCAL_TRAIN_SRC} | grep . | ${SHUFFLE} > ${LOCAL_TRAIN_SRC}.text
@@ -47,12 +69,17 @@ ifeq (${GENERATE_SPM_VOC},1)
${SPM_HOME}/spm_encode --model=$@ --generate_vocabulary < ${LOCAL_TRAIN_SRC}.text > $@.voc
endif
rm -f ${LOCAL_TRAIN_SRC}.text
endif
## no labels on the target language side
# ${SPMTRGMODEL}: ${WORKDIR}/%.spm${TRGBPESIZE:000=}k-model: ${TMPDIR}/${LANGPAIRSTR}/%
${SPMTRGMODEL}:
${MAKE} ${LOCAL_TRAIN_TRG}
${SPMTRGMODEL}: ${LOCAL_TRAIN_TRG}
ifneq (${wildcard $@},)
@echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
@echo "!!!!!!!! $@ already exists!"
@echo "!!!!!!!! re-use the old one even if there is new training data"
@echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
else
mkdir -p ${dir $@}
grep . ${LOCAL_TRAIN_TRG} | ${SHUFFLE} > ${LOCAL_TRAIN_TRG}.text
${MAKE} ${LOCAL_TRAIN_TRG}.charfreq
@@ -70,7 +97,7 @@ ifeq (${GENERATE_SPM_VOC},1)
${SPM_HOME}/spm_encode --model=$@ --generate_vocabulary < ${LOCAL_TRAIN_TRG}.text > $@.voc
endif
rm -f ${LOCAL_TRAIN_TRG}.text
endif

View File

@@ -4,11 +4,17 @@
# in number of lines
use strict;
use Getopt::Std;
use vars qw/$opt_m/;
getopts('m:');
my $size = shift(@ARGV);
my $file = shift(@ARGV);
my $count=0;
my $repeated=0;
while ($count < $size){
open(F, "<", $file) or die "cannot read from $file!\n";
while (<F>){
@@ -16,4 +22,7 @@ while ($count < $size){
print;
last if ($count >= $size);
}
close F;
$repeated++;
last if ($opt_m && $repeated > $opt_m);
}
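# Example invocation (a sketch; the file names are invented, but -m 50 and the
# target size match the Makefile defaults and examples in this commit):
#
#   scripts/fit-data-size.pl -m 50 100000 train.de-en.src >> local-train.src
#
# This keeps printing train.de-en.src until 100000 lines have been written,
# repeating the file if it is smaller than that, but giving up after at most
# 50 passes over the file.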