make compatible with mac osx and include submodules for required tools

This commit is contained in:
Tiedemann 2020-09-02 15:52:34 +03:00
parent cde8e65a5b
commit 2332732577
23 changed files with 199 additions and 121 deletions

24
.gitmodules vendored Normal file
View File

@ -0,0 +1,24 @@
# External tools tracked as git submodules.
# Each section checks the repository at `url` out into the directory `path`;
# all of them live under tools/.
[submodule "tools/pigz"]
path = tools/pigz
url = https://github.com/madler/pigz.git
[submodule "tools/terashuf"]
path = tools/terashuf
url = https://github.com/alexandres/terashuf.git
[submodule "tools/marian-dev"]
path = tools/marian-dev
url = https://github.com/marian-nmt/marian-dev.git
[submodule "tools/OpusTools-perl"]
path = tools/OpusTools-perl
url = https://github.com/Helsinki-NLP/OpusTools-perl.git
[submodule "tools/LanguageCodes"]
path = tools/LanguageCodes
url = https://github.com/Helsinki-NLP/LanguageCodes.git
[submodule "tools/fast_align"]
path = tools/fast_align
url = https://github.com/clab/fast_align.git
[submodule "tools/eflomal"]
path = tools/eflomal
url = https://github.com/robertostling/eflomal
[submodule "tools/moses-scripts"]
path = tools/moses-scripts
url = https://github.com/marian-nmt/moses-scripts.git

View File

@ -574,7 +574,7 @@ ${OUTPUT_DIR}/%.${PART}_${MODELNAME}.${LANGPAIR}.${SRC}.spm.gz: ${WIKI_DIR}/%.${
ifneq (${MODELZIP},)
mkdir -p ${dir $@}
${MAKE} ${LANGPAIR}/${MODELNAME}/decoder.yml
zcat $< |\
${GZCAT} $< |\
grep -v '[<>{}]' |\
${LANGPAIR}/${MODELNAME}/preprocess.sh ${PREPROCESS_ARGS} |\
perl -e 'while (<>){next if (split(/\s+/)>${MAX_LENGTH});print;}' |\
@ -590,7 +590,7 @@ endif
${WIKI_SRC}: ${WIKI_PRE}
ifneq (${MODELZIP},)
mkdir -p ${dir $@}
zcat $< |\
${GZCAT} $< |\
sed 's/ //g;s/▁/ /g' | \
sed 's/^ *//;s/ *$$//' |\
sed 's/^>>[a-z]*<< //' |\
@ -606,7 +606,7 @@ ALL_LATEST = ${wildcard */latest/*.gz}
fix-prefix:
for d in ${ALL_LATEST}; do \
echo "fix $$d"; \
zcat $$d | sed 's/^>>[a-z]*<< //' > $$d.fixed; \
${ZCAT} $$d | sed 's/^>>[a-z]*<< //' > $$d.fixed; \
cat $$d.fixed | gzip -c > $$d; \
rm -f $$d.fixed; \
done
@ -688,7 +688,7 @@ SENTSPLITTER ?= sed 's/^ *$$/\# newpar/' | \
${WIKI_TXT}: ${WIKI_JSON}
mkdir -p ${dir $@}
${LOAD_MODULES} \
zcat $< | jq -r '.text' | \
${GZCAT} $< | jq -r '.text' | \
grep -v 'null' |\
grep -v '[<>{}]' |\
${SENTSPLITTER} |\
@ -708,7 +708,7 @@ ${WIKI_TXT}: ${WIKI_JSON}
${WIKI_DOC}: ${WIKI_JSON}
mkdir -p ${dir $@}
${LOAD_MODULES} \
zcat $< | jq -r '.text' | \
${GZCAT} $< | jq -r '.text' | \
sed 's/^ *null *$$//' |\
grep -v '[<>{}]' |\
${SENTSPLITTER} |\
@ -729,9 +729,9 @@ check-length:
for S in `ls $$d/*.$$s.gz`; do \
T=`echo $$S | sed 's/.$$s.gz/.$$t.gz/'`; \
echo "$$S -- $$T"; \
zcat $$S | wc -l; \
zcat $$T | wc -l; \
if [ `zcat $$S | wc -l` != `zcat $$T | wc -l` ]; then \
${GZCAT} $$S | wc -l; \
${GZCAT} $$T | wc -l; \
if [ `${GZCAT} $$S | wc -l` != `${GZCAT} $$T | wc -l` ]; then \
echo "$$S != $$T"; \
fi \
done \
@ -754,7 +754,7 @@ check-length:
#
# ${WIKI_TXT}: ${WIKI_JSON}
# ${LOAD_MODULES} \
# zcat $< | jq -r '.text' | \
# ${ZCAT} $< | jq -r '.text' | \
# grep -v 'null' |\
# ${SENTSPLITTER} |\
# $(TOKENIZER)/replace-unicode-punctuation.perl |\

View File

@ -78,7 +78,7 @@ endif
prepare-data: ${TEST_PRE}
${TEST_PRE}: ${TESTSET} ${WORKDIR}/model/decoder.yml
zcat $< | ${PREPROCESS} > $@
${ZCAT} $< | ${PREPROCESS} > $@
translate: ${TEST_TRANS}
@ -98,13 +98,13 @@ ${TEST_TRANS}: ${TEST_PRE} ${WORKDIR}/model/decoder.yml
eval: ${TEST_EVAL}
${TEST_EVAL}: ${TEST_TRANS}
zcat ${patsubst %.${SRC}.gz,%.${TRG}.gz,${TESTSET}} > $@.ref
${ZCAT} ${patsubst %.${SRC}.gz,%.${TRG}.gz,${TESTSET}} > $@.ref
cat $< | sacrebleu $@.ref > $@
cat $< | sacrebleu --metrics=chrf --width=3 $@.ref >> $@
rm -f $@.ref
${TEST_EVALNORM}: ${TEST_TRANS}
zcat ${patsubst %.${SRC}.gz,%.${TRG}.gz,${TESTSET}} |\
${ZCAT} ${patsubst %.${SRC}.gz,%.${TRG}.gz,${TESTSET}} |\
${TOKENIZER}/replace-unicode-punctuation.perl |\
${TOKENIZER}/remove-non-printing-char.perl |\
${TOKENIZER}/normalize-punctuation.perl -l ${TRG} |\

View File

@ -223,20 +223,20 @@ goethe2-defi:
## without reference normalisation
goethe-other:
zcat de-fi/goethe/test/goethe-institute-test1.fi.gz > $@.ref
${GZCAT} de-fi/goethe/test/goethe-institute-test1.fi.gz > $@.ref
for s in systran yandex google; do \
cat ${HOME}/research/GoetheInstitute/data/test_de_oaversetted_van_$$s.txt |\
gzip -c > de-fi/goethe/test/goethe-institute-test1.de.$$s.de.gz; \
zcat de-fi/goethe/test/goethe-institute-test1.de.$$s.de.gz | sacrebleu $@.ref \
${GZCAT} de-fi/goethe/test/goethe-institute-test1.de.$$s.de.gz | sacrebleu $@.ref \
> de-fi/goethe/test/goethe-institute-test1.de.$$s.de.eval; \
zcat de-fi/goethe/test/goethe-institute-test1.de.$$s.de.gz | sacrebleu --metrics=chrf --width=3 $@.ref \
${GZCAT} de-fi/goethe/test/goethe-institute-test1.de.$$s.de.gz | sacrebleu --metrics=chrf --width=3 $@.ref \
>> de-fi/goethe/test/goethe-institute-test1.de.$$s.de.eval; \
done
rm -f $@.ref
## with reference normalisation (should not do this)
goethe-other-norm:
zcat de-fi/goethe/test/goethe-institute-test1.fi.gz |\
${GZCAT} de-fi/goethe/test/goethe-institute-test1.fi.gz |\
${TOKENIZER}/replace-unicode-punctuation.perl |\
${TOKENIZER}/remove-non-printing-char.perl |\
${TOKENIZER}/normalize-punctuation.perl -l ${TRG} |\
@ -247,9 +247,9 @@ goethe-other-norm:
${TOKENIZER}/remove-non-printing-char.perl |\
${TOKENIZER}/normalize-punctuation.perl -l ${TRG} |\
sed 's/ */ /g;s/^ *//g;s/ *$$//g' | gzip -c > de-fi/goethe/test/goethe-institute-test1.de.$$s.de.gz; \
zcat de-fi/goethe/test/goethe-institute-test1.de.$$s.de.gz | sacrebleu $@.ref \
${GZCAT} de-fi/goethe/test/goethe-institute-test1.de.$$s.de.gz | sacrebleu $@.ref \
> de-fi/goethe/test/goethe-institute-test1.de.$$s.de.eval-norm; \
zcat de-fi/goethe/test/goethe-institute-test1.de.$$s.de.gz | sacrebleu --metrics=chrf --width=3 $@.ref \
${GZCAT} de-fi/goethe/test/goethe-institute-test1.de.$$s.de.gz | sacrebleu --metrics=chrf --width=3 $@.ref \
>> de-fi/goethe/test/goethe-institute-test1.de.$$s.de.eval-norm; \
done
rm -f $@.ref
@ -317,11 +317,11 @@ waen: wa-en/opus/train/opus.wa.gz \
wa-en/opus/train/opus.wa.gz: ../work/wa-en/train/opus.src.clean.spm32k.gz
mkdir -p ${dir $@}
zcat $< | sed 's/ //g;s/▁/ /g' | sed 's/^ *//;s/ *$$//' | gzip -c > $@
${GZCAT} $< | sed 's/ //g;s/▁/ /g' | sed 's/^ *//;s/ *$$//' | gzip -c > $@
wa-en/opus/train/opus.en.gz: ../work/wa-en/train/opus.trg.clean.spm32k.gz
mkdir -p ${dir $@}
zcat $< | sed 's/ //g;s/▁/ /g' | sed 's/^ *//;s/ *$$//' | gzip -c > $@
${GZCAT} $< | sed 's/ //g;s/▁/ /g' | sed 's/^ *//;s/ *$$//' | gzip -c > $@
wa-en/opus/dev/Tatoeba.wa.gz: ../work/wa-en/val/Tatoeba.src
mkdir -p ${dir $@}
@ -353,11 +353,11 @@ enwa: en-wa/opus/train/opus.wa.gz \
en-wa/opus/train/opus.en.gz: ../work/en-wa/train/opus.src.clean.spm32k.gz
mkdir -p ${dir $@}
zcat $< | sed 's/ //g;s/▁/ /g' | sed 's/^ *//;s/ *$$//' | gzip -c > $@
${GZCAT} $< | sed 's/ //g;s/▁/ /g' | sed 's/^ *//;s/ *$$//' | gzip -c > $@
en-wa/opus/train/opus.wa.gz: ../work/en-wa/train/opus.trg.clean.spm32k.gz
mkdir -p ${dir $@}
zcat $< | sed 's/ //g;s/▁/ /g' | sed 's/^ *//;s/ *$$//' | gzip -c > $@
${GZCAT} $< | sed 's/ //g;s/▁/ /g' | sed 's/^ *//;s/ *$$//' | gzip -c > $@
en-wa/opus/dev/Tatoeba.en.gz: ../work/en-wa/val/Tatoeba.src
mkdir -p ${dir $@}
@ -468,8 +468,8 @@ ifneq (${words ${NEWS_ALLSETS_SRC}},2)
cp ${NEWS_TESTSET_TRG} ${LANGPAIRSTR}/news/test/
cp ${NEWS_DEVSET_SRC} ${LANGPAIRSTR}/news/dev/
cp ${NEWS_DEVSET_TRG} ${LANGPAIRSTR}/news/dev/
zcat ${NEWS_TRAINSET_SRC} | gzip -c > ${LANGPAIRSTR}/news/train/news.${SRC}.gz
zcat ${NEWS_TRAINSET_TRG} | gzip -c > ${LANGPAIRSTR}/news/train/news.${TRG}.gz
${ZCAT} ${NEWS_TRAINSET_SRC} | gzip -c > ${LANGPAIRSTR}/news/train/news.${SRC}.gz
${ZCAT} ${NEWS_TRAINSET_TRG} | gzip -c > ${LANGPAIRSTR}/news/train/news.${TRG}.gz
endif
endif
endif
@ -498,13 +498,13 @@ TRGPRE_PARA = ${TRG} ${LANGPAIRSTR}/${BASEMODELNAME}/target.spm
.INTERMEDIATE: ${TRAIN_SRC}.pre.gz ${DEV_SRC}.pre.gz ${TEST_SRC}.pre.gz
${TRAIN_SRC}.pre.gz ${DEV_SRC}.pre.gz ${TEST_SRC}.pre.gz: %.pre.gz: %.gz ${LANGPAIRSTR}/${BASEMODELNAME}/decoder.yml
zcat $< |\
${GZCAT} $< |\
${LANGPAIRSTR}/${BASEMODELNAME}/preprocess.sh ${SRCPRE_PARA} |\
gzip -c > $@
.INTERMEDIATE: ${TRAIN_TRG}.pre.gz ${DEV_TRG}.pre.gz
${TRAIN_TRG}.pre.gz ${DEV_TRG}.pre.gz: %.pre.gz: %.gz ${LANGPAIRSTR}/${BASEMODELNAME}/decoder.yml
zcat $< |\
${GZCAT} $< |\
${LANGPAIRSTR}/${BASEMODELNAME}/preprocess.sh ${TRGPRE_PARA} |\
gzip -c > $@
@ -599,20 +599,20 @@ eval-baseline: ${TEST_SRC}.${BASEMODELNAME}.baseline.${TRG}.eval
## without reference normalisation
${TEST_SRC}.${BASEMODELNAME}.${TRG}.eval ${TEST_SRC}.${BASEMODELNAME}.baseline.${TRG}.eval: %.eval: %.gz ${TEST_TRG}.gz
zcat ${TEST_TRG}.gz > $@.ref
zcat $< | sacrebleu $@.ref > $@
zcat $< | sacrebleu --metrics=chrf --width=3 $@.ref >> $@
${ZCAT} ${TEST_TRG}.gz > $@.ref
${ZCAT} $< | sacrebleu $@.ref > $@
${ZCAT} $< | sacrebleu --metrics=chrf --width=3 $@.ref >> $@
rm -f $@.ref
## with reference normalisation (should not do this)
${TEST_SRC}.${BASEMODELNAME}.${TRG}.eval-norm ${TEST_SRC}.${BASEMODELNAME}.baseline.${TRG}.eval-norm: %.eval-norm: %.gz ${TEST_TRG}.gz
zcat ${TEST_TRG}.gz |\
${ZCAT} ${TEST_TRG}.gz |\
${TOKENIZER}/replace-unicode-punctuation.perl |\
${TOKENIZER}/remove-non-printing-char.perl |\
${TOKENIZER}/normalize-punctuation.perl -l ${TRG} |\
sed 's/ */ /g;s/^ *//g;s/ *$$//g' > $@.ref
zcat $< | sacrebleu $@.ref > $@
zcat $< | sacrebleu --metrics=chrf --width=3 $@.ref >> $@
${ZCAT} $< | sacrebleu $@.ref > $@
${ZCAT} $< | sacrebleu --metrics=chrf --width=3 $@.ref >> $@
rm -f $@.ref
@ -623,9 +623,9 @@ compare: ${TEST_SRC}.${BASEMODELNAME}.${TRG}.compare
compare-baseline: ${TEST_SRC}.${BASEMODELNAME}.baseline.${TRG}.compare
${TEST_SRC}.${BASEMODELNAME}.${TRG}.compare ${TEST_SRC}.${BASEMODELNAME}.baseline.${TRG}.compare: %.compare: %.eval
zcat ${TEST_SRC}.gz > $@.1
zcat ${TEST_TRG}.gz > $@.2
zcat ${<:.eval=.gz} > $@.3
${ZCAT} ${TEST_SRC}.gz > $@.1
${ZCAT} ${TEST_TRG}.gz > $@.2
${ZCAT} ${<:.eval=.gz} > $@.3
paste -d "\n" $@.1 $@.2 $@.3 |\
sed -e "s/&apos;/'/g" \
-e 's/&quot;/"/g' \

View File

@ -331,7 +331,8 @@ TEST_TRG ?= ${WORKDIR}/test/${TESTSET_NAME}.trg
MODEL_SUBDIR =
MODEL = ${MODEL_SUBDIR}${DATASET}${TRAINSIZE}.${PRE_SRC}-${PRE_TRG}
MODELTYPE = transformer-align
# MODELTYPE = transformer-align
MODELTYPE = transformer
NR = 1
MODEL_BASENAME = ${MODEL}.${MODELTYPE}.model${NR}
@ -393,15 +394,12 @@ endif
## check whether we have GPUs available
## if not: use CPU mode for decoding
NVIDIA_SMI := ${shell which nvidia-smi 2>/dev/null}
ifneq ($(wildcard ${NVIDIA_SMI}),)
ifeq (${shell nvidia-smi | grep failed | wc -l},1)
MARIAN = ${MARIANCPU}
MARIAN_DECODER_FLAGS = ${MARIAN_DECODER_CPU}
MARIAN_EXTRA = --cpu-threads ${HPC_CORES}
endif
else
MARIAN = ${MARIANCPU}
MARIAN_DECODER_FLAGS = ${MARIAN_DECODER_CPU}
MARIAN_EXTRA = --cpu-threads ${HPC_CORES}
endif
@ -447,7 +445,7 @@ ${WORKDIR}/config.mk:
if [ -e ${TRAIN_SRC}.clean.${PRE_SRC}${TRAINSIZE}.gz ]; then \
${MAKE} ${TRAIN_SRC}.clean.${PRE_SRC}${TRAINSIZE}.charfreq \
${TRAIN_TRG}.clean.${PRE_TRG}${TRAINSIZE}.charfreq; \
s=`zcat ${TRAIN_SRC}.clean.${PRE_SRC}${TRAINSIZE}.gz | head -10000001 | wc -l`; \
s=`${ZCAT} ${TRAIN_SRC}.clean.${PRE_SRC}${TRAINSIZE}.gz | head -10000001 | wc -l`; \
S=`cat ${TRAIN_SRC}.clean.${PRE_SRC}${TRAINSIZE}.charfreq | wc -l`; \
T=`cat ${TRAIN_TRG}.clean.${PRE_TRG}${TRAINSIZE}.charfreq | wc -l`; \
else \

View File

@ -324,7 +324,7 @@ endif
add-to-local-train-data: ${CLEAN_TRAIN_SRC} ${CLEAN_TRAIN_TRG}
ifdef CHECK_TRAINDATA_SIZE
@if [ `zcat ${wildcard ${CLEAN_TRAIN_SRC}} | wc -l` != `zcat ${wildcard ${CLEAN_TRAIN_TRG}} | wc -l` ]; then \
@if [ `${ZCAT} ${wildcard ${CLEAN_TRAIN_SRC}} | wc -l` != `${ZCAT} ${wildcard ${CLEAN_TRAIN_TRG}} | wc -l` ]; then \
echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"; \
echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"; \
echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"; \
@ -353,13 +353,13 @@ endif
######################################
ifeq (${USE_TARGET_LABELS},1)
echo "set target language labels";
zcat ${wildcard ${CLEAN_TRAIN_SRC}} ${CUT_DATA_SETS} |\
${ZCAT} ${wildcard ${CLEAN_TRAIN_SRC}} ${CUT_DATA_SETS} |\
sed "s/^/>>${TRG}<< /" > ${LOCAL_TRAIN_SRC}.${LANGPAIR}.src
else
echo "only one target language"
zcat ${wildcard ${CLEAN_TRAIN_SRC}} ${CUT_DATA_SETS} > ${LOCAL_TRAIN_SRC}.${LANGPAIR}.src
${ZCAT} ${wildcard ${CLEAN_TRAIN_SRC}} ${CUT_DATA_SETS} > ${LOCAL_TRAIN_SRC}.${LANGPAIR}.src
endif
zcat ${wildcard ${CLEAN_TRAIN_TRG}} ${CUT_DATA_SETS} > ${LOCAL_TRAIN_TRG}.${LANGPAIR}.trg
${ZCAT} ${wildcard ${CLEAN_TRAIN_TRG}} ${CUT_DATA_SETS} > ${LOCAL_TRAIN_TRG}.${LANGPAIR}.trg
######################################
# SHUFFLE_DATA is set?
# --> shuffle data for each langpair
@ -491,16 +491,16 @@ ${DEV_TRG}: ${DEV_SRC}
add-to-dev-data: ${CLEAN_DEV_SRC} ${CLEAN_DEV_TRG}
mkdir -p ${dir ${DEV_SRC}}
echo -n "* ${LANGPAIR}: ${DEVSET}, " >> ${dir ${DEV_SRC}}README.md
zcat ${CLEAN_DEV_SRC} | wc -l >> ${dir ${DEV_SRC}}README.md
${ZCAT} ${CLEAN_DEV_SRC} | wc -l >> ${dir ${DEV_SRC}}README.md
ifeq (${USE_TARGET_LABELS},1)
echo "more than one target language";
zcat ${CLEAN_DEV_SRC} |\
${ZCAT} ${CLEAN_DEV_SRC} |\
sed "s/^/>>${TRG}<< /" >> ${DEV_SRC}
else
echo "only one target language"
zcat ${CLEAN_DEV_SRC} >> ${DEV_SRC}
${ZCAT} ${CLEAN_DEV_SRC} >> ${DEV_SRC}
endif
zcat ${CLEAN_DEV_TRG} >> ${DEV_TRG}
${ZCAT} ${CLEAN_DEV_TRG} >> ${DEV_TRG}
####################
@ -563,13 +563,13 @@ add-to-test-data: ${CLEAN_TEST_SRC}
echo "* ${LANGPAIR}: ${TESTSET}" >> ${dir ${TEST_SRC}}README.md
ifeq (${USE_TARGET_LABELS},1)
echo "more than one target language";
zcat ${CLEAN_TEST_SRC} |\
${ZCAT} ${CLEAN_TEST_SRC} |\
sed "s/^/>>${TRG}<< /" >> ${TEST_SRC}
else
echo "only one target language"
zcat ${CLEAN_TEST_SRC} >> ${TEST_SRC}
${ZCAT} ${CLEAN_TEST_SRC} >> ${TEST_SRC}
endif
zcat ${CLEAN_TEST_TRG} >> ${TEST_TRG}
${ZCAT} ${CLEAN_TEST_TRG} >> ${TEST_TRG}

View File

@ -46,31 +46,35 @@ GPU = p100
DEVICE = cuda
LOADCPU = module load ${CPU_MODULES}
LOADGPU = module load ${GPU_MODULES}
LOADMODS = echo "nothing to load"
WORKHOME = ${PWD}/work
ifeq (${shell hostname},dx6-ibs-p2)
APPLHOME = /opt/tools
WORKHOME = ${shell realpath ${PWD}/work}
OPUSHOME = tiedeman@taito.csc.fi:/proj/nlpl/data/OPUS/
MOSESHOME = ${APPLHOME}/mosesdecoder
MOSESSCRIPTS = ${MOSESHOME}/scripts
MARIAN_HOME = ${APPLHOME}/marian/build/
MARIAN = ${APPLHOME}/marian/build
LOADMODS = echo "nothing to load"
else ifeq (${shell hostname},dx7-nkiel-4gpu)
APPLHOME = /opt/tools
WORKHOME = ${shell realpath ${PWD}/work}
OPUSHOME = tiedeman@taito.csc.fi:/proj/nlpl/data/OPUS/
MOSESHOME = ${APPLHOME}/mosesdecoder
MOSESSCRIPTS = ${MOSESHOME}/scripts
MARIAN_HOME = ${APPLHOME}/marian/build/
MARIAN = ${APPLHOME}/marian/build
LOADMODS = echo "nothing to load"
else ifneq ($(wildcard /wrk/tiedeman/research),)
APPLHOME = /proj/memad/tools
WORKHOME = /wrk/tiedeman/research/Opus-MT/work
OPUSHOME = /proj/nlpl/data/OPUS
MOSESHOME = /proj/nlpl/software/moses/4.0-65c75ff/moses
MOSESSCRIPTS = ${MOSESHOME}/scripts
MARIAN_HOME = ${HOME}/appl_taito/tools/marian/build-gpu/
MARIAN = ${HOME}/appl_taito/tools/marian/build-gpu
MARIANCPU = ${HOME}/appl_taito/tools/marian/build-cpu
LOADMODS = ${LOADGPU}
else ifeq (${shell hostname --domain 2>/dev/null},bullx)
CSCPROJECT = project_2002688
@ -78,26 +82,41 @@ else ifeq (${shell hostname --domain 2>/dev/null},bullx)
APPLHOME = /projappl/project_2001194
OPUSHOME = /projappl/nlpl/data/OPUS
MOSESHOME = ${APPLHOME}/mosesdecoder
MOSESSCRIPTS = ${MOSESHOME}/scripts
EFLOMAL_HOME = ${APPLHOME}/eflomal/
MARIAN_HOME = ${APPLHOME}/marian-dev/build/
MARIAN = ${APPLHOME}/marian-dev/build
MARIANCPU = ${APPLHOME}/marian-dev/build
SPM_HOME = ${MARIAN_HOME}
GPU = v100
GPU_MODULES = python-env
CPU_MODULES = python-env
LOADMODS = echo "nothing to load"
HPC_QUEUE = small
export PATH := ${APPLHOME}/bin:${PATH}
endif
ifdef LOCAL_SCRATCH
TMPDIR = ${LOCAL_SCRATCH}
endif
## tools and their locations
##
## All of these are `?=` defaults: a value set earlier in this file, on the
## command line or in the environment wins.
## For the looked-up tools the default is the binary found in PATH and,
## if there is none, the copy below tools/ in this directory.
## Diagnostics from `which` are discarded (2>/dev/null), as for the pip,
## cpanm and nvidia-smi lookups in this file: these are recursive variables,
## so an unsilenced lookup of a missing tool would print an error every time
## one of them is expanded.
SCRIPTDIR ?= ${PWD}/scripts
ISO639 ?= ${shell which iso639 2>/dev/null || echo 'perl ${PWD}/tools/LanguageCodes/ISO-639-3/bin/iso639'}
PIGZ ?= ${shell which pigz 2>/dev/null || echo ${PWD}/tools/pigz/pigz}
TERASHUF ?= ${shell which terashuf 2>/dev/null || echo ${PWD}/tools/terashuf/terashuf}
MARIAN ?= ${shell which marian 2>/dev/null || echo ${PWD}/tools/marian-dev/build/marian}
## directory part (with trailing slash) of the marian binary
MARIAN_HOME ?= $(dir ${MARIAN})
SPM_HOME ?= ${dir ${MARIAN}}
FASTALIGN ?= ${shell which fast_align 2>/dev/null || echo ${PWD}/tools/fast_align/build/fast_align}
FASTALIGN_HOME ?= ${dir ${FASTALIGN}}
## atools is expected next to the fast_align binary
ATOOLS ?= ${FASTALIGN_HOME}atools
WORDALIGN ?= ${EFLOMAL_HOME}align.py
MOSESSCRIPTS ?= ${PWD}/tools/moses-scripts/scripts
## marian-nmt binaries
MARIAN_TRAIN = ${MARIAN_HOME}marian
@ -105,13 +124,7 @@ MARIAN_DECODER = ${MARIAN_HOME}marian-decoder
MARIAN_VOCAB = ${MARIAN_HOME}marian-vocab
## other tools and their locations
SCRIPTDIR = ${PWD}/scripts
WORDALIGN = ${EFLOMAL_HOME}align.py
ATOOLS = ${FASTALIGN_HOME}atools
MOSESSCRIPTS = ${MOSESHOME}/scripts
TOKENIZER = ${MOSESSCRIPTS}/tokenizer
SNMTPATH = ${APPLHOME}/subword-nmt/subword_nmt
@ -120,14 +133,15 @@ SPM_TRAIN = ${SPM_HOME}spm_train
SPM_ENCODE = ${SPM_HOME}spm_encode
SORT = sort -T ${TMPDIR} --parallel=${THREADS}
SHUFFLE = ${shell which terashuf 2>/dev/null}
ifeq (${SHUFFLE},)
SHUFFLE = ${SORT} --random-sort
endif
GZIP := ${shell which pigz 2>/dev/null}
GZIP ?= gzip
ZCAT = ${GZIP} -cd <
## Helper commands for sorting, shuffling and (de)compression.
## All are simply-expanded, so the lookups below run once while the
## makefile is read.
##
## SHUFFLE: terashuf if ${TERASHUF} resolves to an executable,
##          otherwise `sort --random-sort` with the SORT options.
## GZIP:    pigz if ${PIGZ} resolves to an executable, otherwise gzip.
## GZCAT:   decompress to stdout with whatever GZIP was selected.
## ZCAT:    decompress to stdout with plain gzip, regardless of GZIP.
##
## `which` diagnostics are discarded so that a missing optional tool
## does not print an error at every make invocation.
SORT := sort -T ${TMPDIR} --parallel=${THREADS}
SHUFFLE := ${shell which ${TERASHUF} 2>/dev/null || echo "${SORT} --random-sort"}
GZIP := ${shell which ${PIGZ} 2>/dev/null || echo gzip}
GZCAT := ${GZIP} -cd
ZCAT := gzip -cd
# TODO: delete those?
@ -143,10 +157,50 @@ MULTEVALHOME = ${APPLHOME}/multeval
## * marian-nmt
PIP := ${shell which pip3 2>/dev/null}
PIP ?= pip
## Goals handed to a recursive make by the install-prerequisites target.
## The tool variables hold commands, so two adjustments are needed to turn
## them into usable target names:
##  * ISO639 may expand to "perl <script>"; only its last word is a file,
##    and a goal called "perl" has no rule.
##  * The fallback locations are absolute paths below PWD, whereas the build
##    rules for the bundled tools are written relative to this directory.
##    make matches target names literally, so the PWD prefix is stripped.
## Tools that were found elsewhere in PATH keep their absolute path.
PREREQ_TOOLS := ${patsubst ${PWD}/%,%,${lastword ${ISO639}} ${ATOOLS} ${PIGZ} ${TERASHUF} ${MARIAN}}

## installers: prefer pip3 / cpanm when they are in PATH
PIP := ${shell which pip3 2>/dev/null || echo pip}
CPAN := ${shell which cpanm 2>/dev/null || echo cpan}

## Build marian without CUDA when nvidia-smi is not available, or when
## its output contains exactly one line matching "failed".
NVIDIA_SMI := ${shell which nvidia-smi 2>/dev/null}
ifneq ($(wildcard ${NVIDIA_SMI}),)
ifeq (${shell nvidia-smi | grep failed | wc -l},1)
MARIAN_BUILD_OPTIONS=-DCOMPILE_CUDA=off
endif
else
MARIAN_BUILD_OPTIONS=-DCOMPILE_CUDA=off
endif
## Install what is needed to run the recipes in this repository:
##  1. the Python packages listed in requirements.txt (per-user install)
##  2. the external tools listed in PREREQ_TOOLS, via a recursive make
## The three target names are aliases. They are commands, not files, so
## they are declared with the special target .PHONY (with the leading dot;
## a plain "PHONY:" would only define an ordinary target of that name).
.PHONY: install-prerequisites install-prereq install-requirements
install-prerequisites install-prereq install-requirements:
	${PIP} install --user -r requirements.txt
	${MAKE} ${PREREQ_TOOLS}
## iso639 from the LanguageCodes submodule.
## Provided by (recursively) making the ISO-639-5 module below.
tools/LanguageCodes/ISO-639-3/bin/iso639:
	$(MAKE) tools/LanguageCodes/ISO-639-5/lib/ISO/639/5.pm

## Runs the `all` target of the makefile in tools/LanguageCodes.
tools/LanguageCodes/ISO-639-5/lib/ISO/639/5.pm:
	$(MAKE) -C tools/LanguageCodes all
## atools from the fast_align submodule:
## create the build directory, configure it with cmake from there,
## then run make inside it.
tools/fast_align/build/atools:
	mkdir -p $(dir $@) && cd $(dir $@) && cmake ..
	$(MAKE) -C $(dir $@)
## pigz and terashuf are both built by running make in the directory
## that holds the target (their submodule checkout).
## One header with two targets defines two independent rules that share
## this recipe; $@ is the target currently being built.
tools/pigz/pigz tools/terashuf/terashuf:
	$(MAKE) -C $(dir $@)
## marian from the marian-dev submodule, configured with SentencePiece
## support; MARIAN_BUILD_OPTIONS is passed on to cmake unchanged.
##
## For Mac users: install protobuf
##
## sudo port install protobuf3-cpp
##
## NOTE: `-j` without a number puts no limit on the number of parallel jobs.
tools/marian-dev/build/marian:
	mkdir -p $(dir $@) && cd $(dir $@) && cmake -DUSE_SENTENCEPIECE=on ${MARIAN_BUILD_OPTIONS} ..
	$(MAKE) -C $(dir $@) -j

View File

@ -75,7 +75,7 @@ all2pivot:
train-dynamic:
if [ ! -e "${WORKHOME}/${LANGPAIRSTR}/train.submit" ]; then \
${MAKE} data; \
s=`zcat ${TRAIN_SRC}.clean.${PRE_SRC}${TRAINSIZE}.gz | head -10000001 | wc -l`; \
s=`${ZCAT} ${TRAIN_SRC}.clean.${PRE_SRC}${TRAINSIZE}.gz | head -10000001 | wc -l`; \
if [ $$s -gt 10000000 ]; then \
echo "${LANGPAIRSTR} bigger than 10 million"; \
${MAKE} HPC_CORES=1 HPC_MEM=8g train.submit-multigpu; \
@ -300,8 +300,7 @@ endif
## run on CPUs (translate-cpu, eval-cpu, translate-ensemble-cpu, ...)
%-cpu:
${MAKE} MARIAN=${MARIANCPU} \
LOADMODS='${LOADCPU}' \
${MAKE} LOADMODS='${LOADCPU}' \
MARIAN_DECODER_FLAGS="${MARIAN_DECODER_CPU}" \
${@:-cpu=}

View File

@ -7,7 +7,7 @@ ALL_DATA_SETS = ${patsubst %.${SRCEXT}.gz,%,${CLEAN_TRAIN_SRC}}
check-bitext-length:
for d in ${ALL_DATA_SETS}; do \
if [ `zcat $$d.${SRCEXT}.gz | wc -l` != `zcat $$d.${TRGEXT}.gz | wc -l` ]; then \
if [ `${ZCAT} $$d.${SRCEXT}.gz | wc -l` != `${ZCAT} $$d.${TRGEXT}.gz | wc -l` ]; then \
echo "not the same number of lines in $$d"; \
fi \
done

View File

@ -118,7 +118,6 @@ TATOEBA_PARAMS := TRAINSET=Tatoeba-train \
ISO639 := iso639
GET_ISO_CODE := ${ISO639} -m
## taken from the Tatoeba-Challenge Makefile
@ -577,7 +576,7 @@ tatoeba-trainsize-%.txt: tatoeba-%.md
s=`echo $$l | cut -f1 -d '-'`; \
t=`echo $$l | cut -f2 -d '-'`; \
echo -n "$$l " >> $@; \
zcat ${TATOEBA_DATA}/Tatoeba-train.$$l.clean.$$s.gz | wc -l >> $@; \
${GZCAT} ${TATOEBA_DATA}/Tatoeba-train.$$l.clean.$$s.gz | wc -l >> $@; \
done
@ -806,7 +805,7 @@ ${TATOEBA_MONO}/%.labels:
wget -q -O $@.d/mono.tar ${TATOEBA_DATAURL}/$(patsubst %.labels,%,$(notdir $@)).tar
tar -C $@.d -xf $@.d/mono.tar
rm -f $@.d/mono.tar
find $@.d -name '*.id.gz' | xargs zcat | sort -u | tr "\n" ' ' | sed 's/ $$//' > $@
find $@.d -name '*.id.gz' | xargs ${ZCAT} | sort -u | tr "\n" ' ' | sed 's/ $$//' > $@
for c in `find $@.d -name '*.id.gz' | sed 's/\.id\.gz//'`; do \
echo "extract all data from $$c.txt.gz"; \
${GZIP} -d $$c.id.gz; \
@ -845,19 +844,19 @@ ${TATOEBA_MONO}/%.labels:
mv $@.d/data/${LANGPAIR}/dev.trg ${dir $@}Tatoeba-dev.${LANGPAIR}.clean.${TRGEXT}; \
cat $@.d/data/${LANGPAIR}/dev.id $(FIXLANGIDS) > ${dir $@}Tatoeba-dev.${LANGPAIR}.clean.id; \
if [ -e $@.d/data/${LANGPAIR}/train.src.gz ]; then \
${ZCAT} $@.d/data/${LANGPAIR}/train.src.gz > ${dir $@}Tatoeba-train.${LANGPAIR}.clean.${SRCEXT}; \
${ZCAT} $@.d/data/${LANGPAIR}/train.trg.gz > ${dir $@}Tatoeba-train.${LANGPAIR}.clean.${TRGEXT}; \
${ZCAT} $@.d/data/${LANGPAIR}/train.id.gz | cut -f2,3 $(FIXLANGIDS) > ${dir $@}Tatoeba-train.${LANGPAIR}.clean.id; \
${GZCAT} $@.d/data/${LANGPAIR}/train.src.gz > ${dir $@}Tatoeba-train.${LANGPAIR}.clean.${SRCEXT}; \
${GZCAT} $@.d/data/${LANGPAIR}/train.trg.gz > ${dir $@}Tatoeba-train.${LANGPAIR}.clean.${TRGEXT}; \
${GZCAT} $@.d/data/${LANGPAIR}/train.id.gz | cut -f2,3 $(FIXLANGIDS) > ${dir $@}Tatoeba-train.${LANGPAIR}.clean.id; \
fi; \
else \
if [ -e $@.d/data/${LANGPAIR}/train.src.gz ]; then \
echo "no devdata available - get top 1000 from training data!"; \
${ZCAT} $@.d/data/${LANGPAIR}/train.src.gz | head -1000 > ${dir $@}Tatoeba-dev.${LANGPAIR}.clean.${SRCEXT}; \
${ZCAT} $@.d/data/${LANGPAIR}/train.trg.gz | head -1000 > ${dir $@}Tatoeba-dev.${LANGPAIR}.clean.${TRGEXT}; \
${ZCAT} $@.d/data/${LANGPAIR}/train.id.gz | head -1000 | cut -f2,3 $(FIXLANGIDS) > ${dir $@}Tatoeba-dev.${LANGPAIR}.clean.id; \
${ZCAT} $@.d/data/${LANGPAIR}/train.src.gz | tail -n +1001 > ${dir $@}Tatoeba-train.${LANGPAIR}.clean.${SRCEXT}; \
${ZCAT} $@.d/data/${LANGPAIR}/train.trg.gz | tail -n +1001 > ${dir $@}Tatoeba-train.${LANGPAIR}.clean.${TRGEXT}; \
${ZCAT} $@.d/data/${LANGPAIR}/train.id.gz | tail -n +1001 | cut -f2,3 $(FIXLANGIDS) > ${dir $@}Tatoeba-train.${LANGPAIR}.clean.id; \
${GZCAT} $@.d/data/${LANGPAIR}/train.src.gz | head -1000 > ${dir $@}Tatoeba-dev.${LANGPAIR}.clean.${SRCEXT}; \
${GZCAT} $@.d/data/${LANGPAIR}/train.trg.gz | head -1000 > ${dir $@}Tatoeba-dev.${LANGPAIR}.clean.${TRGEXT}; \
${GZCAT} $@.d/data/${LANGPAIR}/train.id.gz | head -1000 | cut -f2,3 $(FIXLANGIDS) > ${dir $@}Tatoeba-dev.${LANGPAIR}.clean.id; \
${GZCAT} $@.d/data/${LANGPAIR}/train.src.gz | tail -n +1001 > ${dir $@}Tatoeba-train.${LANGPAIR}.clean.${SRCEXT}; \
${GZCAT} $@.d/data/${LANGPAIR}/train.trg.gz | tail -n +1001 > ${dir $@}Tatoeba-train.${LANGPAIR}.clean.${TRGEXT}; \
${GZCAT} $@.d/data/${LANGPAIR}/train.id.gz | tail -n +1001 | cut -f2,3 $(FIXLANGIDS) > ${dir $@}Tatoeba-train.${LANGPAIR}.clean.id; \
fi \
fi
## make sure that training data file exists even if it is empty

View File

@ -26,13 +26,13 @@ endif
${DATADIR}/${PRE}/WikiMatrix.${WIKIMATRIX_SCORE}.${LANGPAIR}.${SRCEXT}.raw:
mkdir -p ${dir $@}
zcat ${WIKIMATRIX_DATA} | \
${ZCAT} ${WIKIMATRIX_DATA} | \
awk '{if($$1>${WIKIMATRIX_SCORE})print}' | \
cut -f${WIKIMATRIX_SRCFIELD} > $@
${DATADIR}/${PRE}/WikiMatrix.${WIKIMATRIX_SCORE}.${LANGPAIR}.${TRGEXT}.raw:
mkdir -p ${dir $@}
zcat ${WIKIMATRIX_DATA} | \
${ZCAT} ${WIKIMATRIX_DATA} | \
awk '{if($$1>${WIKIMATRIX_SCORE})print}' | \
cut -f${WIKIMATRIX_TRGFIELD} > $@

View File

@ -25,8 +25,8 @@
cut -f1 $@.bitext | ${GZIP} -c > $@
cut -f2 $@.bitext | ${GZIP} -c > $(@:.clean.${SRCEXT}.gz=.clean.${TRGEXT}.gz)
rm -f $@.bitext $@.1 $@.2
if [ ! `zcat "$@" | head | wc -l` -gt 0 ]; then rm -f $@; fi
if [ ! `zcat "$(@:.clean.${SRCEXT}.gz=.clean.${TRGEXT}.gz)" | head | wc -l` -gt 0 ]; then \
if [ ! `${ZCAT} "$@" | head | wc -l` -gt 0 ]; then rm -f $@; fi
if [ ! `${ZCAT} "$(@:.clean.${SRCEXT}.gz=.clean.${TRGEXT}.gz)" | head | wc -l` -gt 0 ]; then \
rm -f $(@:.clean.${SRCEXT}.gz=.clean.${TRGEXT}.gz); \
fi
@ -46,21 +46,21 @@
$(TOKENIZER)/replace-unicode-punctuation.perl |\
$(TOKENIZER)/remove-non-printing-char.perl |\
$(TOKENIZER)/normalize-punctuation.perl |\
sed 's/ */ /g;s/^ *//g;s/ *$$//g' > $@
sed 's/^ *//;s/ */ /g;s/ *$$//g' > $@
%.zh_cn.tok: %.zh_cn.raw
$(LOAD_MOSES) cat $< |\
$(TOKENIZER)/replace-unicode-punctuation.perl |\
$(TOKENIZER)/remove-non-printing-char.perl |\
$(TOKENIZER)/normalize-punctuation.perl |\
sed 's/ */ /g;s/^ *//g;s/ *$$//g' > $@
sed 's/^ *//;s/ */ /g;s/ *$$//g' > $@
%.zh.tok: %.zh.raw
$(LOAD_MOSES) cat $< |\
$(TOKENIZER)/replace-unicode-punctuation.perl |\
$(TOKENIZER)/remove-non-printing-char.perl |\
$(TOKENIZER)/normalize-punctuation.perl |\
sed 's/ */ /g;s/^ *//g;s/ *$$//g' > $@
sed 's/^ *//;s/ */ /g;s/ *$$//g' > $@
## generic target for tokenization
%.tok: %.raw
@ -71,7 +71,7 @@
-l ${lastword ${subst 1,,${subst 2,,${subst ., ,$(<:.raw=)}}}} |\
$(TOKENIZER)/tokenizer.perl -a -threads $(THREADS) \
-l ${lastword ${subst 1,,${subst 2,,${subst ., ,$(<:.raw=)}}}} |\
sed 's/ */ /g;s/^ *//g;s/ *$$//g' > $@
sed 's/^ *//;s/ */ /g;s/ *$$//g' > $@
@ -84,28 +84,28 @@
$(TOKENIZER)/replace-unicode-punctuation.perl |\
$(TOKENIZER)/remove-non-printing-char.perl |\
$(TOKENIZER)/normalize-punctuation.perl |\
sed 's/ */ /g;s/^ *//g;s/ *$$//g' | ${GZIP} -c > $@
sed 's/^ *//;s/ */ /g;s/ *$$//g' | ${GZIP} -c > $@
%.norm: %.raw
$(LOAD_MOSES) cat $< |\
$(TOKENIZER)/replace-unicode-punctuation.perl |\
$(TOKENIZER)/remove-non-printing-char.perl |\
$(TOKENIZER)/normalize-punctuation.perl |\
sed 's/ */ /g;s/^ *//g;s/ *$$//g' > $@
sed 's/^ *//;s/ */ /g;s/ *$$//g' > $@
%.${SRCEXT}.norm: %.${SRCEXT}.raw
$(LOAD_MOSES) cat $< ${SRC_CLEANUP_SCRIPTS} |\
$(TOKENIZER)/replace-unicode-punctuation.perl |\
$(TOKENIZER)/remove-non-printing-char.perl |\
$(TOKENIZER)/normalize-punctuation.perl |\
sed 's/ */ /g;s/^ *//g;s/ *$$//g' > $@
sed 's/^ *//;s/ */ /g;s/ *$$//g' > $@
%.${TRGEXT}.norm: %.${TRGEXT}.raw
$(LOAD_MOSES) cat $< ${TRG_CLEANUP_SCRIPTS} |\
$(TOKENIZER)/replace-unicode-punctuation.perl |\
$(TOKENIZER)/remove-non-printing-char.perl |\
$(TOKENIZER)/normalize-punctuation.perl |\
sed 's/ */ /g;s/^ *//g;s/ *$$//g' > $@
sed 's/^ *//;s/ */ /g;s/ *$$//g' > $@
## minimal pre-processing
@ -114,28 +114,28 @@
$(TOKENIZER)/replace-unicode-punctuation.perl |\
$(TOKENIZER)/remove-non-printing-char.perl |\
$(TOKENIZER)/deescape-special-chars.perl |\
sed 's/ */ /g;s/^ *//g;s/ *$$//g' | ${GZIP} -c > $@
sed 's/^ *//;s/ */ /g;s/ *$$//g' | ${GZIP} -c > $@
%.simple: %.raw
$(LOAD_MOSES) cat $< |\
$(TOKENIZER)/replace-unicode-punctuation.perl |\
$(TOKENIZER)/remove-non-printing-char.perl |\
$(TOKENIZER)/deescape-special-chars.perl |\
sed 's/ */ /g;s/^ *//g;s/ *$$//g' > $@
sed 's/^ *//;s/ */ /g;s/ *$$//g' > $@
%.${SRCEXT}.simple: %.${SRCEXT}.raw
$(LOAD_MOSES) cat $< ${SRC_CLEANUP_SCRIPTS} |\
$(TOKENIZER)/replace-unicode-punctuation.perl |\
$(TOKENIZER)/remove-non-printing-char.perl |\
$(TOKENIZER)/deescape-special-chars.perl |\
sed 's/ */ /g;s/^ *//g;s/ *$$//g' > $@
sed 's/^ *//;s/ */ /g;s/ *$$//g' > $@
%.${TRGEXT}.simple: %.${TRGEXT}.raw
$(LOAD_MOSES) cat $< ${TRG_CLEANUP_SCRIPTS} |\
$(TOKENIZER)/replace-unicode-punctuation.perl |\
$(TOKENIZER)/remove-non-printing-char.perl |\
$(TOKENIZER)/deescape-special-chars.perl |\
sed 's/ */ /g;s/^ *//g;s/ *$$//g' > $@
sed 's/^ *//;s/ */ /g;s/ *$$//g' > $@
@ -145,7 +145,7 @@
$(TOKENIZER)/replace-unicode-punctuation.perl |\
$(TOKENIZER)/remove-non-printing-char.perl |\
$(TOKENIZER)/deescape-special-chars.perl |\
sed 's/ */ /g;s/^ *//g;s/ *$$//g' |\
sed 's/^ *//;s/ */ /g;s/ *$$//g' |\
sed 's/ /▁/g' > $@
@ -172,21 +172,21 @@
# $(TOKENIZER)/replace-unicode-punctuation.perl |\
# $(TOKENIZER)/remove-non-printing-char.perl |\
# $(TOKENIZER)/normalize-punctuation.perl |\
# sed 's/ */ /g;s/^ *//g;s/ *$$//g' | ${GZIP} -c > $@
# sed 's/^ *//;s/ */ /g;s/ *$$//g' | ${GZIP} -c > $@
# %.simple.gz: %.gz
# $(LOAD_MOSES) ${GZIP} -cd < $< |\
# $(TOKENIZER)/replace-unicode-punctuation.perl |\
# $(TOKENIZER)/remove-non-printing-char.perl |\
# $(TOKENIZER)/deescape-special-chars.perl |\
# sed 's/ */ /g;s/^ *//g;s/ *$$//g' | ${GZIP} -c > $@
# sed 's/^ *//;s/ */ /g;s/ *$$//g' | ${GZIP} -c > $@
# %.nospace.gz: %.gz
# $(LOAD_MOSES) ${GZIP} -cd < $< |\
# $(TOKENIZER)/replace-unicode-punctuation.perl |\
# $(TOKENIZER)/remove-non-printing-char.perl |\
# $(TOKENIZER)/deescape-special-chars.perl |\
# sed 's/ */ /g;s/^ *//g;s/ *$$//g' |\
# sed 's/^ *//;s/ */ /g;s/ *$$//g' |\
# sed 's/ /▁/g' |\
# ${GZIP} -c > $@

View File

@ -32,10 +32,9 @@ endif
echo '#SBATCH -n 1' >> $@
echo '#SBATCH -N 1' >> $@
echo '#SBATCH -p ${HPC_GPUQUEUE}' >> $@
ifeq (${shell hostname --domain},bullx)
ifeq (${shell hostname --domain 2>/dev/null},bullx)
echo '#SBATCH --account=${CSCPROJECT}' >> $@
echo '#SBATCH --gres=gpu:${GPU}:${NR_GPUS},nvme:${HPC_DISK}' >> $@
# echo '#SBATCH --exclude=r18g02' >> $@
else
echo '#SBATCH --gres=gpu:${GPU}:${NR_GPUS}' >> $@
endif
@ -70,12 +69,9 @@ ifdef EMAIL
echo '#SBATCH --mail-type=END' >> $@
echo '#SBATCH --mail-user=${EMAIL}' >> $@
endif
ifeq (${shell hostname --domain},bullx)
ifeq (${shell hostname --domain 2>/dev/null},bullx)
echo '#SBATCH --account=${CSCPROJECT}' >> $@
echo '#SBATCH --gres=nvme:${HPC_DISK}' >> $@
# echo '#SBATCH --exclude=r05c49' >> $@
# echo '#SBATCH --exclude=r07c51' >> $@
# echo '#SBATCH --exclude=r06c50' >> $@
endif
echo '#SBATCH -n ${HPC_CORES}' >> $@
echo '#SBATCH -N ${HPC_NODES}' >> $@

View File

@ -30,7 +30,7 @@ ifneq (${MODEL_LATEST_VOCAB},)
cp ${MODEL_LATEST_VOCAB} ${MODEL_VOCAB}
else
mkdir -p ${dir $@}
${LOADMODS} && zcat $^ | ${MARIAN_VOCAB} --max-size ${VOCABSIZE} > $@
${LOADMODS} && ${ZCAT} $^ | ${MARIAN_VOCAB} --max-size ${VOCABSIZE} > $@
endif
else
@echo "$@ already exists!"

View File

@ -229,7 +229,7 @@ ${TRANSLATED_PRE}: ${ORIGINAL_DATASRC}
ifneq (${MODELZIP},)
mkdir -p ${dir $@}
${MAKE} ${DECODER}
zcat $< |\
${GZCAT} $< |\
head -${MAX_PIVOT_SENTENCES} |\
${OUTPUT_DIR}/${MODELNAME}/preprocess.sh ${PREPROCESS_ARGS} |\
gzip -c > $@
@ -241,7 +241,7 @@ ${OUTPUT_DIR}/%.${MODELNAME}.${LANGPAIR}.${PIVOT}.spm.gz: ${ORIGINAL_DATADIR}/${
ifneq (${MODELZIP},)
mkdir -p ${dir $@}
${MAKE} ${DECODER}
zcat $< |\
${GZCAT} $< |\
${OUTPUT_DIR}/${MODELNAME}/preprocess.sh ${PREPROCESS_ARGS} |\
gzip -c > $@
endif
@ -299,9 +299,9 @@ check-length:
for S in `ls $$d/*.$$s.gz`; do \
T=`echo $$S | sed 's/.$$s.gz/.$$t.gz/'`; \
echo "$$S -- $$T"; \
zcat $$S | wc -l; \
zcat $$T | wc -l; \
if [ `zcat $$S | wc -l` != `zcat $$T | wc -l` ]; then \
${GZCAT} $$S | wc -l; \
${GZCAT} $$T | wc -l; \
if [ `${GZCAT} $$S | wc -l` != `${GZCAT} $$T | wc -l` ]; then \
echo "$$S != $$T"; \
fi \
done \

1
tools/LanguageCodes Submodule

@ -0,0 +1 @@
Subproject commit d19d3ff363c611a23a52db62aecc0d502d2c058d

1
tools/OpusTools-perl Submodule

@ -0,0 +1 @@
Subproject commit 156b0c5119b28b81232a8276e4fa3df04afbc7d0

1
tools/eflomal Submodule

@ -0,0 +1 @@
Subproject commit 7b97f19187c8b1bc1f21aefd77fc1b87575d1c00

1
tools/fast_align Submodule

@ -0,0 +1 @@
Subproject commit cab1e9aac8d3bb02ff5ae58218d8d225a039fa11

1
tools/marian-dev Submodule

@ -0,0 +1 @@
Subproject commit 4d9d15649e83766fd2ee2a79db79e0d8a2fed3c4

1
tools/moses-scripts Submodule

@ -0,0 +1 @@
Subproject commit 958dd5a6b026197de988c0264d45112215bc37bd

1
tools/pigz Submodule

@ -0,0 +1 @@
Subproject commit fe822cb435622c43f491013da77b127e9fe851a9

1
tools/terashuf Submodule

@ -0,0 +1 @@
Subproject commit 6b0a8b0c2614c9af687d2bdb1851db89fa1cbf38