mirror of
https://github.com/Helsinki-NLP/OPUS-MT-train.git
synced 2024-10-05 16:47:21 +03:00
make compatible with mac osx and include submodules for required tools
This commit is contained in:
parent
cde8e65a5b
commit
2332732577
24
.gitmodules
vendored
Normal file
24
.gitmodules
vendored
Normal file
@ -0,0 +1,24 @@
|
||||
[submodule "tools/pigz"]
|
||||
path = tools/pigz
|
||||
url = https://github.com/madler/pigz.git
|
||||
[submodule "tools/terashuf"]
|
||||
path = tools/terashuf
|
||||
url = https://github.com/alexandres/terashuf.git
|
||||
[submodule "tools/marian-dev"]
|
||||
path = tools/marian-dev
|
||||
url = https://github.com/marian-nmt/marian-dev.git
|
||||
[submodule "tools/OpusTools-perl"]
|
||||
path = tools/OpusTools-perl
|
||||
url = https://github.com/Helsinki-NLP/OpusTools-perl.git
|
||||
[submodule "tools/LanguageCodes"]
|
||||
path = tools/LanguageCodes
|
||||
url = https://github.com/Helsinki-NLP/LanguageCodes.git
|
||||
[submodule "tools/fast_align"]
|
||||
path = tools/fast_align
|
||||
url = https://github.com/clab/fast_align.git
|
||||
[submodule "tools/eflomal"]
|
||||
path = tools/eflomal
|
||||
url = https://github.com/robertostling/eflomal
|
||||
[submodule "tools/moses-scripts"]
|
||||
path = tools/moses-scripts
|
||||
url = https://github.com/marian-nmt/moses-scripts.git
|
@ -574,7 +574,7 @@ ${OUTPUT_DIR}/%.${PART}_${MODELNAME}.${LANGPAIR}.${SRC}.spm.gz: ${WIKI_DIR}/%.${
|
||||
ifneq (${MODELZIP},)
|
||||
mkdir -p ${dir $@}
|
||||
${MAKE} ${LANGPAIR}/${MODELNAME}/decoder.yml
|
||||
zcat $< |\
|
||||
${GZCAT} $< |\
|
||||
grep -v '[<>{}]' |\
|
||||
${LANGPAIR}/${MODELNAME}/preprocess.sh ${PREPROCESS_ARGS} |\
|
||||
perl -e 'while (<>){next if (split(/\s+/)>${MAX_LENGTH});print;}' |\
|
||||
@ -590,7 +590,7 @@ endif
|
||||
${WIKI_SRC}: ${WIKI_PRE}
|
||||
ifneq (${MODELZIP},)
|
||||
mkdir -p ${dir $@}
|
||||
zcat $< |\
|
||||
${GZCAT} $< |\
|
||||
sed 's/ //g;s/▁/ /g' | \
|
||||
sed 's/^ *//;s/ *$$//' |\
|
||||
sed 's/^>>[a-z]*<< //' |\
|
||||
@ -606,7 +606,7 @@ ALL_LATEST = ${wildcard */latest/*.gz}
|
||||
fix-prefix:
|
||||
for d in ${ALL_LATEST}; do \
|
||||
echo "fix $$d"; \
|
||||
zcat $$d | sed 's/^>>[a-z]*<< //' > $$d.fixed; \
|
||||
${ZCAT} $$d | sed 's/^>>[a-z]*<< //' > $$d.fixed; \
|
||||
cat $$d.fixed | gzip -c > $$d; \
|
||||
rm -f $$d.fixed; \
|
||||
done
|
||||
@ -688,7 +688,7 @@ SENTSPLITTER ?= sed 's/^ *$$/\# newpar/' | \
|
||||
${WIKI_TXT}: ${WIKI_JSON}
|
||||
mkdir -p ${dir $@}
|
||||
${LOAD_MODULES} \
|
||||
zcat $< | jq -r '.text' | \
|
||||
${GZCAT} $< | jq -r '.text' | \
|
||||
grep -v 'null' |\
|
||||
grep -v '[<>{}]' |\
|
||||
${SENTSPLITTER} |\
|
||||
@ -708,7 +708,7 @@ ${WIKI_TXT}: ${WIKI_JSON}
|
||||
${WIKI_DOC}: ${WIKI_JSON}
|
||||
mkdir -p ${dir $@}
|
||||
${LOAD_MODULES} \
|
||||
zcat $< | jq -r '.text' | \
|
||||
${GZCAT} $< | jq -r '.text' | \
|
||||
sed 's/^ *null *$$//' |\
|
||||
grep -v '[<>{}]' |\
|
||||
${SENTSPLITTER} |\
|
||||
@ -729,9 +729,9 @@ check-length:
|
||||
for S in `ls $$d/*.$$s.gz`; do \
|
||||
T=`echo $$S | sed "s/\.$$s\.gz/.$$t.gz/"`; \
|
||||
echo "$$S -- $$T"; \
|
||||
zcat $$S | wc -l; \
|
||||
zcat $$T | wc -l; \
|
||||
if [ `zcat $$S | wc -l` != `zcat $$T | wc -l` ]; then \
|
||||
${GZCAT} $$S | wc -l; \
|
||||
${GZCAT} $$T | wc -l; \
|
||||
if [ `${GZCAT} $$S | wc -l` != `${GZCAT} $$T | wc -l` ]; then \
|
||||
echo "$$S != $$T"; \
|
||||
fi \
|
||||
done \
|
||||
@ -754,7 +754,7 @@ check-length:
|
||||
#
|
||||
# ${WIKI_TXT}: ${WIKI_JSON}
|
||||
# ${LOAD_MODULES} \
|
||||
# zcat $< | jq -r '.text' | \
|
||||
# ${ZCAT} $< | jq -r '.text' | \
|
||||
# grep -v 'null' |\
|
||||
# ${SENTSPLITTER} |\
|
||||
# $(TOKENIZER)/replace-unicode-punctuation.perl |\
|
||||
|
@ -78,7 +78,7 @@ endif
|
||||
|
||||
prepare-data: ${TEST_PRE}
|
||||
${TEST_PRE}: ${TESTSET} ${WORKDIR}/model/decoder.yml
|
||||
zcat $< | ${PREPROCESS} > $@
|
||||
${ZCAT} $< | ${PREPROCESS} > $@
|
||||
|
||||
|
||||
translate: ${TEST_TRANS}
|
||||
@ -98,13 +98,13 @@ ${TEST_TRANS}: ${TEST_PRE} ${WORKDIR}/model/decoder.yml
|
||||
eval: ${TEST_EVAL}
|
||||
|
||||
${TEST_EVAL}: ${TEST_TRANS}
|
||||
zcat ${patsubst %.${SRC}.gz,%.${TRG}.gz,${TESTSET}} > $@.ref
|
||||
${ZCAT} ${patsubst %.${SRC}.gz,%.${TRG}.gz,${TESTSET}} > $@.ref
|
||||
cat $< | sacrebleu $@.ref > $@
|
||||
cat $< | sacrebleu --metrics=chrf --width=3 $@.ref >> $@
|
||||
rm -f $@.ref
|
||||
|
||||
${TEST_EVALNORM}: ${TEST_TRANS}
|
||||
zcat ${patsubst %.${SRC}.gz,%.${TRG}.gz,${TESTSET}} |\
|
||||
${ZCAT} ${patsubst %.${SRC}.gz,%.${TRG}.gz,${TESTSET}} |\
|
||||
${TOKENIZER}/replace-unicode-punctuation.perl |\
|
||||
${TOKENIZER}/remove-non-printing-char.perl |\
|
||||
${TOKENIZER}/normalize-punctuation.perl -l ${TRG} |\
|
||||
|
@ -223,20 +223,20 @@ goethe2-defi:
|
||||
|
||||
## without reference normalisation
|
||||
goethe-other:
|
||||
zcat de-fi/goethe/test/goethe-institute-test1.fi.gz > $@.ref
|
||||
${GZCAT} de-fi/goethe/test/goethe-institute-test1.fi.gz > $@.ref
|
||||
for s in systran yandex google; do \
|
||||
cat ${HOME}/research/GoetheInstitute/data/test_de_oaversetted_van_$$s.txt |\
|
||||
gzip -c > de-fi/goethe/test/goethe-institute-test1.de.$$s.de.gz; \
|
||||
zcat de-fi/goethe/test/goethe-institute-test1.de.$$s.de.gz | sacrebleu $@.ref \
|
||||
${GZCAT} de-fi/goethe/test/goethe-institute-test1.de.$$s.de.gz | sacrebleu $@.ref \
|
||||
> de-fi/goethe/test/goethe-institute-test1.de.$$s.de.eval; \
|
||||
zcat de-fi/goethe/test/goethe-institute-test1.de.$$s.de.gz | sacrebleu --metrics=chrf --width=3 $@.ref \
|
||||
${GZCAT} de-fi/goethe/test/goethe-institute-test1.de.$$s.de.gz | sacrebleu --metrics=chrf --width=3 $@.ref \
|
||||
>> de-fi/goethe/test/goethe-institute-test1.de.$$s.de.eval; \
|
||||
done
|
||||
rm -f $@.ref
|
||||
|
||||
## with reference normalisation (should not do this)
|
||||
goethe-other-norm:
|
||||
zcat de-fi/goethe/test/goethe-institute-test1.fi.gz |\
|
||||
${GZCAT} de-fi/goethe/test/goethe-institute-test1.fi.gz |\
|
||||
${TOKENIZER}/replace-unicode-punctuation.perl |\
|
||||
${TOKENIZER}/remove-non-printing-char.perl |\
|
||||
${TOKENIZER}/normalize-punctuation.perl -l ${TRG} |\
|
||||
@ -247,9 +247,9 @@ goethe-other-norm:
|
||||
${TOKENIZER}/remove-non-printing-char.perl |\
|
||||
${TOKENIZER}/normalize-punctuation.perl -l ${TRG} |\
|
||||
sed 's/ */ /g;s/^ *//g;s/ *$$//g' | gzip -c > de-fi/goethe/test/goethe-institute-test1.de.$$s.de.gz; \
|
||||
zcat de-fi/goethe/test/goethe-institute-test1.de.$$s.de.gz | sacrebleu $@.ref \
|
||||
${GZCAT} de-fi/goethe/test/goethe-institute-test1.de.$$s.de.gz | sacrebleu $@.ref \
|
||||
> de-fi/goethe/test/goethe-institute-test1.de.$$s.de.eval-norm; \
|
||||
zcat de-fi/goethe/test/goethe-institute-test1.de.$$s.de.gz | sacrebleu --metrics=chrf --width=3 $@.ref \
|
||||
${GZCAT} de-fi/goethe/test/goethe-institute-test1.de.$$s.de.gz | sacrebleu --metrics=chrf --width=3 $@.ref \
|
||||
>> de-fi/goethe/test/goethe-institute-test1.de.$$s.de.eval-norm; \
|
||||
done
|
||||
rm -f $@.ref
|
||||
@ -317,11 +317,11 @@ waen: wa-en/opus/train/opus.wa.gz \
|
||||
|
||||
wa-en/opus/train/opus.wa.gz: ../work/wa-en/train/opus.src.clean.spm32k.gz
|
||||
mkdir -p ${dir $@}
|
||||
zcat $< | sed 's/ //g;s/▁/ /g' | sed 's/^ *//;s/ *$$//' | gzip -c > $@
|
||||
${GZCAT} $< | sed 's/ //g;s/▁/ /g' | sed 's/^ *//;s/ *$$//' | gzip -c > $@
|
||||
|
||||
wa-en/opus/train/opus.en.gz: ../work/wa-en/train/opus.trg.clean.spm32k.gz
|
||||
mkdir -p ${dir $@}
|
||||
zcat $< | sed 's/ //g;s/▁/ /g' | sed 's/^ *//;s/ *$$//' | gzip -c > $@
|
||||
${GZCAT} $< | sed 's/ //g;s/▁/ /g' | sed 's/^ *//;s/ *$$//' | gzip -c > $@
|
||||
|
||||
wa-en/opus/dev/Tatoeba.wa.gz: ../work/wa-en/val/Tatoeba.src
|
||||
mkdir -p ${dir $@}
|
||||
@ -353,11 +353,11 @@ enwa: en-wa/opus/train/opus.wa.gz \
|
||||
|
||||
en-wa/opus/train/opus.en.gz: ../work/en-wa/train/opus.src.clean.spm32k.gz
|
||||
mkdir -p ${dir $@}
|
||||
zcat $< | sed 's/ //g;s/▁/ /g' | sed 's/^ *//;s/ *$$//' | gzip -c > $@
|
||||
${GZCAT} $< | sed 's/ //g;s/▁/ /g' | sed 's/^ *//;s/ *$$//' | gzip -c > $@
|
||||
|
||||
en-wa/opus/train/opus.wa.gz: ../work/en-wa/train/opus.trg.clean.spm32k.gz
|
||||
mkdir -p ${dir $@}
|
||||
zcat $< | sed 's/ //g;s/▁/ /g' | sed 's/^ *//;s/ *$$//' | gzip -c > $@
|
||||
${GZCAT} $< | sed 's/ //g;s/▁/ /g' | sed 's/^ *//;s/ *$$//' | gzip -c > $@
|
||||
|
||||
en-wa/opus/dev/Tatoeba.en.gz: ../work/en-wa/val/Tatoeba.src
|
||||
mkdir -p ${dir $@}
|
||||
@ -468,8 +468,8 @@ ifneq (${words ${NEWS_ALLSETS_SRC}},2)
|
||||
cp ${NEWS_TESTSET_TRG} ${LANGPAIRSTR}/news/test/
|
||||
cp ${NEWS_DEVSET_SRC} ${LANGPAIRSTR}/news/dev/
|
||||
cp ${NEWS_DEVSET_TRG} ${LANGPAIRSTR}/news/dev/
|
||||
zcat ${NEWS_TRAINSET_SRC} | gzip -c > ${LANGPAIRSTR}/news/train/news.${SRC}.gz
|
||||
zcat ${NEWS_TRAINSET_TRG} | gzip -c > ${LANGPAIRSTR}/news/train/news.${TRG}.gz
|
||||
${ZCAT} ${NEWS_TRAINSET_SRC} | gzip -c > ${LANGPAIRSTR}/news/train/news.${SRC}.gz
|
||||
${ZCAT} ${NEWS_TRAINSET_TRG} | gzip -c > ${LANGPAIRSTR}/news/train/news.${TRG}.gz
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
@ -498,13 +498,13 @@ TRGPRE_PARA = ${TRG} ${LANGPAIRSTR}/${BASEMODELNAME}/target.spm
|
||||
|
||||
.INTERMEDIATE: ${TRAIN_SRC}.pre.gz ${DEV_SRC}.pre.gz ${TEST_SRC}.pre.gz
|
||||
${TRAIN_SRC}.pre.gz ${DEV_SRC}.pre.gz ${TEST_SRC}.pre.gz: %.pre.gz: %.gz ${LANGPAIRSTR}/${BASEMODELNAME}/decoder.yml
|
||||
zcat $< |\
|
||||
${GZCAT} $< |\
|
||||
${LANGPAIRSTR}/${BASEMODELNAME}/preprocess.sh ${SRCPRE_PARA} |\
|
||||
gzip -c > $@
|
||||
|
||||
.INTERMEDIATE: ${TRAIN_TRG}.pre.gz ${DEV_TRG}.pre.gz
|
||||
${TRAIN_TRG}.pre.gz ${DEV_TRG}.pre.gz: %.pre.gz: %.gz ${LANGPAIRSTR}/${BASEMODELNAME}/decoder.yml
|
||||
zcat $< |\
|
||||
${GZCAT} $< |\
|
||||
${LANGPAIRSTR}/${BASEMODELNAME}/preprocess.sh ${TRGPRE_PARA} |\
|
||||
gzip -c > $@
|
||||
|
||||
@ -599,20 +599,20 @@ eval-baseline: ${TEST_SRC}.${BASEMODELNAME}.baseline.${TRG}.eval
|
||||
|
||||
## without reference normalisation
|
||||
${TEST_SRC}.${BASEMODELNAME}.${TRG}.eval ${TEST_SRC}.${BASEMODELNAME}.baseline.${TRG}.eval: %.eval: %.gz ${TEST_TRG}.gz
|
||||
zcat ${TEST_TRG}.gz > $@.ref
|
||||
zcat $< | sacrebleu $@.ref > $@
|
||||
zcat $< | sacrebleu --metrics=chrf --width=3 $@.ref >> $@
|
||||
${ZCAT} ${TEST_TRG}.gz > $@.ref
|
||||
${ZCAT} $< | sacrebleu $@.ref > $@
|
||||
${ZCAT} $< | sacrebleu --metrics=chrf --width=3 $@.ref >> $@
|
||||
rm -f $@.ref
|
||||
|
||||
## with reference normalisation (should not do this)
|
||||
${TEST_SRC}.${BASEMODELNAME}.${TRG}.eval-norm ${TEST_SRC}.${BASEMODELNAME}.baseline.${TRG}.eval-norm: %.eval-norm: %.gz ${TEST_TRG}.gz
|
||||
zcat ${TEST_TRG}.gz |\
|
||||
${ZCAT} ${TEST_TRG}.gz |\
|
||||
${TOKENIZER}/replace-unicode-punctuation.perl |\
|
||||
${TOKENIZER}/remove-non-printing-char.perl |\
|
||||
${TOKENIZER}/normalize-punctuation.perl -l ${TRG} |\
|
||||
sed 's/ */ /g;s/^ *//g;s/ *$$//g' > $@.ref
|
||||
zcat $< | sacrebleu $@.ref > $@
|
||||
zcat $< | sacrebleu --metrics=chrf --width=3 $@.ref >> $@
|
||||
${ZCAT} $< | sacrebleu $@.ref > $@
|
||||
${ZCAT} $< | sacrebleu --metrics=chrf --width=3 $@.ref >> $@
|
||||
rm -f $@.ref
|
||||
|
||||
|
||||
@ -623,9 +623,9 @@ compare: ${TEST_SRC}.${BASEMODELNAME}.${TRG}.compare
|
||||
compare-baseline: ${TEST_SRC}.${BASEMODELNAME}.baseline.${TRG}.compare
|
||||
|
||||
${TEST_SRC}.${BASEMODELNAME}.${TRG}.compare ${TEST_SRC}.${BASEMODELNAME}.baseline.${TRG}.compare: %.compare: %.eval
|
||||
zcat ${TEST_SRC}.gz > $@.1
|
||||
zcat ${TEST_TRG}.gz > $@.2
|
||||
zcat ${<:.eval=.gz} > $@.3
|
||||
${ZCAT} ${TEST_SRC}.gz > $@.1
|
||||
${ZCAT} ${TEST_TRG}.gz > $@.2
|
||||
${ZCAT} ${<:.eval=.gz} > $@.3
|
||||
paste -d "\n" $@.1 $@.2 $@.3 |\
|
||||
sed -e "s/'/'/g" \
|
||||
-e 's/"/"/g' \
|
||||
|
@ -331,7 +331,8 @@ TEST_TRG ?= ${WORKDIR}/test/${TESTSET_NAME}.trg
|
||||
|
||||
MODEL_SUBDIR =
|
||||
MODEL = ${MODEL_SUBDIR}${DATASET}${TRAINSIZE}.${PRE_SRC}-${PRE_TRG}
|
||||
MODELTYPE = transformer-align
|
||||
# MODELTYPE = transformer-align
|
||||
MODELTYPE = transformer
|
||||
NR = 1
|
||||
|
||||
MODEL_BASENAME = ${MODEL}.${MODELTYPE}.model${NR}
|
||||
@ -393,15 +394,12 @@ endif
|
||||
|
||||
## check whether we have GPUs available
|
||||
## if not: use CPU mode for decoding
|
||||
NVIDIA_SMI := ${shell which nvidia-smi 2>/dev/null}
|
||||
ifneq ($(wildcard ${NVIDIA_SMI}),)
|
||||
ifeq (${shell nvidia-smi | grep failed | wc -l},1)
|
||||
MARIAN = ${MARIANCPU}
|
||||
MARIAN_DECODER_FLAGS = ${MARIAN_DECODER_CPU}
|
||||
MARIAN_EXTRA = --cpu-threads ${HPC_CORES}
|
||||
endif
|
||||
else
|
||||
MARIAN = ${MARIANCPU}
|
||||
MARIAN_DECODER_FLAGS = ${MARIAN_DECODER_CPU}
|
||||
MARIAN_EXTRA = --cpu-threads ${HPC_CORES}
|
||||
endif
|
||||
@ -447,7 +445,7 @@ ${WORKDIR}/config.mk:
|
||||
if [ -e ${TRAIN_SRC}.clean.${PRE_SRC}${TRAINSIZE}.gz ]; then \
|
||||
${MAKE} ${TRAIN_SRC}.clean.${PRE_SRC}${TRAINSIZE}.charfreq \
|
||||
${TRAIN_TRG}.clean.${PRE_TRG}${TRAINSIZE}.charfreq; \
|
||||
s=`zcat ${TRAIN_SRC}.clean.${PRE_SRC}${TRAINSIZE}.gz | head -10000001 | wc -l`; \
|
||||
s=`${ZCAT} ${TRAIN_SRC}.clean.${PRE_SRC}${TRAINSIZE}.gz | head -10000001 | wc -l`; \
|
||||
S=`cat ${TRAIN_SRC}.clean.${PRE_SRC}${TRAINSIZE}.charfreq | wc -l`; \
|
||||
T=`cat ${TRAIN_TRG}.clean.${PRE_TRG}${TRAINSIZE}.charfreq | wc -l`; \
|
||||
else \
|
||||
|
22
lib/data.mk
22
lib/data.mk
@ -324,7 +324,7 @@ endif
|
||||
|
||||
add-to-local-train-data: ${CLEAN_TRAIN_SRC} ${CLEAN_TRAIN_TRG}
|
||||
ifdef CHECK_TRAINDATA_SIZE
|
||||
@if [ `zcat ${wildcard ${CLEAN_TRAIN_SRC}} | wc -l` != `zcat ${wildcard ${CLEAN_TRAIN_TRG}} | wc -l` ]; then \
|
||||
@if [ `${ZCAT} ${wildcard ${CLEAN_TRAIN_SRC}} | wc -l` != `${ZCAT} ${wildcard ${CLEAN_TRAIN_TRG}} | wc -l` ]; then \
|
||||
echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"; \
|
||||
echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"; \
|
||||
echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"; \
|
||||
@ -353,13 +353,13 @@ endif
|
||||
######################################
|
||||
ifeq (${USE_TARGET_LABELS},1)
|
||||
echo "set target language labels";
|
||||
zcat ${wildcard ${CLEAN_TRAIN_SRC}} ${CUT_DATA_SETS} |\
|
||||
${ZCAT} ${wildcard ${CLEAN_TRAIN_SRC}} ${CUT_DATA_SETS} |\
|
||||
sed "s/^/>>${TRG}<< /" > ${LOCAL_TRAIN_SRC}.${LANGPAIR}.src
|
||||
else
|
||||
echo "only one target language"
|
||||
zcat ${wildcard ${CLEAN_TRAIN_SRC}} ${CUT_DATA_SETS} > ${LOCAL_TRAIN_SRC}.${LANGPAIR}.src
|
||||
${ZCAT} ${wildcard ${CLEAN_TRAIN_SRC}} ${CUT_DATA_SETS} > ${LOCAL_TRAIN_SRC}.${LANGPAIR}.src
|
||||
endif
|
||||
zcat ${wildcard ${CLEAN_TRAIN_TRG}} ${CUT_DATA_SETS} > ${LOCAL_TRAIN_TRG}.${LANGPAIR}.trg
|
||||
${ZCAT} ${wildcard ${CLEAN_TRAIN_TRG}} ${CUT_DATA_SETS} > ${LOCAL_TRAIN_TRG}.${LANGPAIR}.trg
|
||||
######################################
|
||||
# SHUFFLE_DATA is set?
|
||||
# --> shuffle data for each langpair
|
||||
@ -491,16 +491,16 @@ ${DEV_TRG}: ${DEV_SRC}
|
||||
add-to-dev-data: ${CLEAN_DEV_SRC} ${CLEAN_DEV_TRG}
|
||||
mkdir -p ${dir ${DEV_SRC}}
|
||||
echo -n "* ${LANGPAIR}: ${DEVSET}, " >> ${dir ${DEV_SRC}}README.md
|
||||
zcat ${CLEAN_DEV_SRC} | wc -l >> ${dir ${DEV_SRC}}README.md
|
||||
${ZCAT} ${CLEAN_DEV_SRC} | wc -l >> ${dir ${DEV_SRC}}README.md
|
||||
ifeq (${USE_TARGET_LABELS},1)
|
||||
echo "more than one target language";
|
||||
zcat ${CLEAN_DEV_SRC} |\
|
||||
${ZCAT} ${CLEAN_DEV_SRC} |\
|
||||
sed "s/^/>>${TRG}<< /" >> ${DEV_SRC}
|
||||
else
|
||||
echo "only one target language"
|
||||
zcat ${CLEAN_DEV_SRC} >> ${DEV_SRC}
|
||||
${ZCAT} ${CLEAN_DEV_SRC} >> ${DEV_SRC}
|
||||
endif
|
||||
zcat ${CLEAN_DEV_TRG} >> ${DEV_TRG}
|
||||
${ZCAT} ${CLEAN_DEV_TRG} >> ${DEV_TRG}
|
||||
|
||||
|
||||
####################
|
||||
@ -563,13 +563,13 @@ add-to-test-data: ${CLEAN_TEST_SRC}
|
||||
echo "* ${LANGPAIR}: ${TESTSET}" >> ${dir ${TEST_SRC}}README.md
|
||||
ifeq (${USE_TARGET_LABELS},1)
|
||||
echo "more than one target language";
|
||||
zcat ${CLEAN_TEST_SRC} |\
|
||||
${ZCAT} ${CLEAN_TEST_SRC} |\
|
||||
sed "s/^/>>${TRG}<< /" >> ${TEST_SRC}
|
||||
else
|
||||
echo "only one target language"
|
||||
zcat ${CLEAN_TEST_SRC} >> ${TEST_SRC}
|
||||
${ZCAT} ${CLEAN_TEST_SRC} >> ${TEST_SRC}
|
||||
endif
|
||||
zcat ${CLEAN_TEST_TRG} >> ${TEST_TRG}
|
||||
${ZCAT} ${CLEAN_TEST_TRG} >> ${TEST_TRG}
|
||||
|
||||
|
||||
|
||||
|
98
lib/env.mk
98
lib/env.mk
@ -46,31 +46,35 @@ GPU = p100
|
||||
DEVICE = cuda
|
||||
LOADCPU = module load ${CPU_MODULES}
|
||||
LOADGPU = module load ${GPU_MODULES}
|
||||
LOADMODS = echo "nothing to load"
|
||||
|
||||
WORKHOME = ${PWD}/work
|
||||
|
||||
|
||||
ifeq (${shell hostname},dx6-ibs-p2)
|
||||
APPLHOME = /opt/tools
|
||||
WORKHOME = ${shell realpath ${PWD}/work}
|
||||
OPUSHOME = tiedeman@taito.csc.fi:/proj/nlpl/data/OPUS/
|
||||
MOSESHOME = ${APPLHOME}/mosesdecoder
|
||||
MOSESSCRIPTS = ${MOSESHOME}/scripts
|
||||
MARIAN_HOME = ${APPLHOME}/marian/build/
|
||||
MARIAN = ${APPLHOME}/marian/build
|
||||
LOADMODS = echo "nothing to load"
|
||||
else ifeq (${shell hostname},dx7-nkiel-4gpu)
|
||||
APPLHOME = /opt/tools
|
||||
WORKHOME = ${shell realpath ${PWD}/work}
|
||||
OPUSHOME = tiedeman@taito.csc.fi:/proj/nlpl/data/OPUS/
|
||||
MOSESHOME = ${APPLHOME}/mosesdecoder
|
||||
MOSESSCRIPTS = ${MOSESHOME}/scripts
|
||||
MARIAN_HOME = ${APPLHOME}/marian/build/
|
||||
MARIAN = ${APPLHOME}/marian/build
|
||||
LOADMODS = echo "nothing to load"
|
||||
else ifneq ($(wildcard /wrk/tiedeman/research),)
|
||||
APPLHOME = /proj/memad/tools
|
||||
WORKHOME = /wrk/tiedeman/research/Opus-MT/work
|
||||
OPUSHOME = /proj/nlpl/data/OPUS
|
||||
MOSESHOME = /proj/nlpl/software/moses/4.0-65c75ff/moses
|
||||
MOSESSCRIPTS = ${MOSESHOME}/scripts
|
||||
MARIAN_HOME = ${HOME}/appl_taito/tools/marian/build-gpu/
|
||||
MARIAN = ${HOME}/appl_taito/tools/marian/build-gpu
|
||||
MARIANCPU = ${HOME}/appl_taito/tools/marian/build-cpu
|
||||
LOADMODS = ${LOADGPU}
|
||||
else ifeq (${shell hostname --domain 2>/dev/null},bullx)
|
||||
CSCPROJECT = project_2002688
|
||||
@ -78,26 +82,41 @@ else ifeq (${shell hostname --domain 2>/dev/null},bullx)
|
||||
APPLHOME = /projappl/project_2001194
|
||||
OPUSHOME = /projappl/nlpl/data/OPUS
|
||||
MOSESHOME = ${APPLHOME}/mosesdecoder
|
||||
MOSESSCRIPTS = ${MOSESHOME}/scripts
|
||||
EFLOMAL_HOME = ${APPLHOME}/eflomal/
|
||||
MARIAN_HOME = ${APPLHOME}/marian-dev/build/
|
||||
MARIAN = ${APPLHOME}/marian-dev/build
|
||||
MARIANCPU = ${APPLHOME}/marian-dev/build
|
||||
SPM_HOME = ${MARIAN_HOME}
|
||||
GPU = v100
|
||||
GPU_MODULES = python-env
|
||||
CPU_MODULES = python-env
|
||||
LOADMODS = echo "nothing to load"
|
||||
HPC_QUEUE = small
|
||||
export PATH := ${APPLHOME}/bin:${PATH}
|
||||
endif
|
||||
|
||||
|
||||
|
||||
ifdef LOCAL_SCRATCH
|
||||
TMPDIR = ${LOCAL_SCRATCH}
|
||||
endif
|
||||
|
||||
|
||||
## tools and their locations
|
||||
|
||||
SCRIPTDIR ?= ${PWD}/scripts
|
||||
|
||||
## Default tool locations: prefer a binary found on PATH, otherwise fall back
## to the copy under tools/. The stderr of `which` is discarded so that a
## missing tool does not print diagnostics when these variables are expanded.
ISO639 ?= ${shell which iso639 2>/dev/null || echo 'perl ${PWD}/tools/LanguageCodes/ISO-639-3/bin/iso639'}
|
||||
PIGZ ?= ${shell which pigz 2>/dev/null || echo ${PWD}/tools/pigz/pigz}
|
||||
TERASHUF ?= ${shell which terashuf 2>/dev/null || echo ${PWD}/tools/terashuf/terashuf}
|
||||
MARIAN ?= ${shell which marian 2>/dev/null || echo ${PWD}/tools/marian-dev/build/marian}
|
||||
MARIAN_HOME ?= $(dir ${MARIAN})
|
||||
SPM_HOME ?= ${dir ${MARIAN}}
|
||||
FASTALIGN ?= ${shell which fast_align 2>/dev/null || echo ${PWD}/tools/fast_align/build/fast_align}
|
||||
FASTALIGN_HOME ?= ${dir ${FASTALIGN}}
|
||||
ATOOLS ?= ${FASTALIGN_HOME}atools
|
||||
WORDALIGN ?= ${EFLOMAL_HOME}align.py
|
||||
MOSESSCRIPTS ?= ${PWD}/tools/moses-scripts/scripts
|
||||
|
||||
|
||||
## marian-nmt binaries
|
||||
|
||||
MARIAN_TRAIN = ${MARIAN_HOME}marian
|
||||
@ -105,13 +124,7 @@ MARIAN_DECODER = ${MARIAN_HOME}marian-decoder
|
||||
MARIAN_VOCAB = ${MARIAN_HOME}marian-vocab
|
||||
|
||||
|
||||
## other tools and their locations
|
||||
|
||||
SCRIPTDIR = ${PWD}/scripts
|
||||
WORDALIGN = ${EFLOMAL_HOME}align.py
|
||||
ATOOLS = ${FASTALIGN_HOME}atools
|
||||
|
||||
MOSESSCRIPTS = ${MOSESHOME}/scripts
|
||||
TOKENIZER = ${MOSESSCRIPTS}/tokenizer
|
||||
SNMTPATH = ${APPLHOME}/subword-nmt/subword_nmt
|
||||
|
||||
@ -120,14 +133,15 @@ SPM_TRAIN = ${SPM_HOME}spm_train
|
||||
SPM_ENCODE = ${SPM_HOME}spm_encode
|
||||
|
||||
|
||||
SORT = sort -T ${TMPDIR} --parallel=${THREADS}
|
||||
SHUFFLE = ${shell which terashuf 2>/dev/null}
|
||||
ifeq (${SHUFFLE},)
|
||||
SHUFFLE = ${SORT} --random-sort
|
||||
endif
|
||||
GZIP := ${shell which pigz 2>/dev/null}
|
||||
GZIP ?= gzip
|
||||
ZCAT = ${GZIP} -cd <
|
||||
## Sorting, shuffling and (de)compression helpers.
## SHUFFLE uses ${TERASHUF} if `which` can resolve it, otherwise a random sort.
## GZIP uses ${PIGZ} if `which` can resolve it, otherwise plain gzip.
## GZCAT decompresses with whatever GZIP resolved to; ZCAT always uses gzip.
## The stderr of `which` is discarded so an unresolved tool stays silent.
SORT := sort -T ${TMPDIR} --parallel=${THREADS}
|
||||
SHUFFLE := ${shell which ${TERASHUF} 2>/dev/null || echo "${SORT} --random-sort"}
|
||||
GZIP := ${shell which ${PIGZ} 2>/dev/null || echo gzip}
|
||||
GZCAT := ${GZIP} -cd
|
||||
ZCAT := gzip -cd
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# TODO: delete those?
|
||||
@ -143,10 +157,50 @@ MULTEVALHOME = ${APPLHOME}/multeval
|
||||
## * marian-nmt
|
||||
|
||||
|
||||
PIP := ${shell which pip3 2>/dev/null}
|
||||
PIP ?= pip
|
||||
PREREQ_TOOLS := ${ISO639} ${ATOOLS} ${PIGZ} ${TERASHUF} ${MARIAN}
|
||||
|
||||
PIP := ${shell which pip3 2>/dev/null || echo pip}
|
||||
CPAN := ${shell which cpanm 2>/dev/null || echo cpan}
|
||||
|
||||
NVIDIA_SMI := ${shell which nvidia-smi 2>/dev/null}
|
||||
ifneq ($(wildcard ${NVIDIA_SMI}),)
|
||||
ifeq (${shell nvidia-smi | grep failed | wc -l},1)
|
||||
MARIAN_BUILD_OPTIONS=-DCOMPILE_CUDA=off
|
||||
endif
|
||||
else
|
||||
MARIAN_BUILD_OPTIONS=-DCOMPILE_CUDA=off
|
||||
endif
|
||||
|
||||
|
||||
## the install aliases are commands, not files: declare them phony
.PHONY: install-prerequisites install-prereq install-requirements
|
||||
install-prerequisites install-prereq install-requirements:
|
||||
${PIP} install --user -r requirements.txt
|
||||
${MAKE} ${PREREQ_TOOLS}
|
||||
|
||||
|
||||
tools/LanguageCodes/ISO-639-3/bin/iso639:
|
||||
${MAKE} tools/LanguageCodes/ISO-639-5/lib/ISO/639/5.pm
|
||||
|
||||
tools/LanguageCodes/ISO-639-5/lib/ISO/639/5.pm:
|
||||
${MAKE} -C tools/LanguageCodes all
|
||||
|
||||
tools/fast_align/build/atools:
|
||||
mkdir -p ${dir $@}
|
||||
cd ${dir $@} && cmake ..
|
||||
${MAKE} -C ${dir $@}
|
||||
|
||||
tools/pigz/pigz:
|
||||
${MAKE} -C ${dir $@}
|
||||
|
||||
tools/terashuf/terashuf:
|
||||
${MAKE} -C ${dir $@}
|
||||
|
||||
|
||||
## For Mac users: install protobuf
|
||||
##
|
||||
## sudo port install protobuf3-cpp
|
||||
|
||||
tools/marian-dev/build/marian:
|
||||
mkdir -p ${dir $@}
|
||||
cd ${dir $@} && cmake -DUSE_SENTENCEPIECE=on ${MARIAN_BUILD_OPTIONS} ..
|
||||
${MAKE} -C ${dir $@} -j
|
||||
|
@ -75,7 +75,7 @@ all2pivot:
|
||||
train-dynamic:
|
||||
if [ ! -e "${WORKHOME}/${LANGPAIRSTR}/train.submit" ]; then \
|
||||
${MAKE} data; \
|
||||
s=`zcat ${TRAIN_SRC}.clean.${PRE_SRC}${TRAINSIZE}.gz | head -10000001 | wc -l`; \
|
||||
s=`${ZCAT} ${TRAIN_SRC}.clean.${PRE_SRC}${TRAINSIZE}.gz | head -10000001 | wc -l`; \
|
||||
if [ $$s -gt 10000000 ]; then \
|
||||
echo "${LANGPAIRSTR} bigger than 10 million"; \
|
||||
${MAKE} HPC_CORES=1 HPC_MEM=8g train.submit-multigpu; \
|
||||
@ -300,8 +300,7 @@ endif
|
||||
|
||||
## run on CPUs (translate-cpu, eval-cpu, translate-ensemble-cpu, ...)
|
||||
%-cpu:
|
||||
${MAKE} MARIAN=${MARIANCPU} \
|
||||
LOADMODS='${LOADCPU}' \
|
||||
${MAKE} LOADMODS='${LOADCPU}' \
|
||||
MARIAN_DECODER_FLAGS="${MARIAN_DECODER_CPU}" \
|
||||
${@:-cpu=}
|
||||
|
||||
|
@ -7,7 +7,7 @@ ALL_DATA_SETS = ${patsubst %.${SRCEXT}.gz,%,${CLEAN_TRAIN_SRC}}
|
||||
|
||||
check-bitext-length:
|
||||
for d in ${ALL_DATA_SETS}; do \
|
||||
if [ `zcat $$d.${SRCEXT}.gz | wc -l` != `zcat $$d.${TRGEXT}.gz | wc -l` ]; then \
|
||||
if [ `${ZCAT} $$d.${SRCEXT}.gz | wc -l` != `${ZCAT} $$d.${TRGEXT}.gz | wc -l` ]; then \
|
||||
echo "not the same number of lines in $$d"; \
|
||||
fi \
|
||||
done
|
||||
|
@ -118,7 +118,6 @@ TATOEBA_PARAMS := TRAINSET=Tatoeba-train \
|
||||
|
||||
|
||||
|
||||
ISO639 := iso639
|
||||
GET_ISO_CODE := ${ISO639} -m
|
||||
|
||||
## taken from the Tatoeba-Challenge Makefile
|
||||
@ -577,7 +576,7 @@ tatoeba-trainsize-%.txt: tatoeba-%.md
|
||||
s=`echo $$l | cut -f1 -d '-'`; \
|
||||
t=`echo $$l | cut -f2 -d '-'`; \
|
||||
echo -n "$$l " >> $@; \
|
||||
zcat ${TATOEBA_DATA}/Tatoeba-train.$$l.clean.$$s.gz | wc -l >> $@; \
|
||||
${GZCAT} ${TATOEBA_DATA}/Tatoeba-train.$$l.clean.$$s.gz | wc -l >> $@; \
|
||||
done
|
||||
|
||||
|
||||
@ -806,7 +805,7 @@ ${TATOEBA_MONO}/%.labels:
|
||||
wget -q -O $@.d/mono.tar ${TATOEBA_DATAURL}/$(patsubst %.labels,%,$(notdir $@)).tar
|
||||
tar -C $@.d -xf $@.d/mono.tar
|
||||
rm -f $@.d/mono.tar
|
||||
find $@.d -name '*.id.gz' | xargs zcat | sort -u | tr "\n" ' ' | sed 's/ $$//' > $@
|
||||
find $@.d -name '*.id.gz' | xargs ${ZCAT} | sort -u | tr "\n" ' ' | sed 's/ $$//' > $@
|
||||
for c in `find $@.d -name '*.id.gz' | sed 's/\.id\.gz//'`; do \
|
||||
echo "extract all data from $$c.txt.gz"; \
|
||||
${GZIP} -d $$c.id.gz; \
|
||||
@ -845,19 +844,19 @@ ${TATOEBA_MONO}/%.labels:
|
||||
mv $@.d/data/${LANGPAIR}/dev.trg ${dir $@}Tatoeba-dev.${LANGPAIR}.clean.${TRGEXT}; \
|
||||
cat $@.d/data/${LANGPAIR}/dev.id $(FIXLANGIDS) > ${dir $@}Tatoeba-dev.${LANGPAIR}.clean.id; \
|
||||
if [ -e $@.d/data/${LANGPAIR}/train.src.gz ]; then \
|
||||
${ZCAT} $@.d/data/${LANGPAIR}/train.src.gz > ${dir $@}Tatoeba-train.${LANGPAIR}.clean.${SRCEXT}; \
|
||||
${ZCAT} $@.d/data/${LANGPAIR}/train.trg.gz > ${dir $@}Tatoeba-train.${LANGPAIR}.clean.${TRGEXT}; \
|
||||
${ZCAT} $@.d/data/${LANGPAIR}/train.id.gz | cut -f2,3 $(FIXLANGIDS) > ${dir $@}Tatoeba-train.${LANGPAIR}.clean.id; \
|
||||
${GZCAT} $@.d/data/${LANGPAIR}/train.src.gz > ${dir $@}Tatoeba-train.${LANGPAIR}.clean.${SRCEXT}; \
|
||||
${GZCAT} $@.d/data/${LANGPAIR}/train.trg.gz > ${dir $@}Tatoeba-train.${LANGPAIR}.clean.${TRGEXT}; \
|
||||
${GZCAT} $@.d/data/${LANGPAIR}/train.id.gz | cut -f2,3 $(FIXLANGIDS) > ${dir $@}Tatoeba-train.${LANGPAIR}.clean.id; \
|
||||
fi; \
|
||||
else \
|
||||
if [ -e $@.d/data/${LANGPAIR}/train.src.gz ]; then \
|
||||
echo "no devdata available - get top 1000 from training data!"; \
|
||||
${ZCAT} $@.d/data/${LANGPAIR}/train.src.gz | head -1000 > ${dir $@}Tatoeba-dev.${LANGPAIR}.clean.${SRCEXT}; \
|
||||
${ZCAT} $@.d/data/${LANGPAIR}/train.trg.gz | head -1000 > ${dir $@}Tatoeba-dev.${LANGPAIR}.clean.${TRGEXT}; \
|
||||
${ZCAT} $@.d/data/${LANGPAIR}/train.id.gz | head -1000 | cut -f2,3 $(FIXLANGIDS) > ${dir $@}Tatoeba-dev.${LANGPAIR}.clean.id; \
|
||||
${ZCAT} $@.d/data/${LANGPAIR}/train.src.gz | tail -n +1001 > ${dir $@}Tatoeba-train.${LANGPAIR}.clean.${SRCEXT}; \
|
||||
${ZCAT} $@.d/data/${LANGPAIR}/train.trg.gz | tail -n +1001 > ${dir $@}Tatoeba-train.${LANGPAIR}.clean.${TRGEXT}; \
|
||||
${ZCAT} $@.d/data/${LANGPAIR}/train.id.gz | tail -n +1001 | cut -f2,3 $(FIXLANGIDS) > ${dir $@}Tatoeba-train.${LANGPAIR}.clean.id; \
|
||||
${GZCAT} $@.d/data/${LANGPAIR}/train.src.gz | head -1000 > ${dir $@}Tatoeba-dev.${LANGPAIR}.clean.${SRCEXT}; \
|
||||
${GZCAT} $@.d/data/${LANGPAIR}/train.trg.gz | head -1000 > ${dir $@}Tatoeba-dev.${LANGPAIR}.clean.${TRGEXT}; \
|
||||
${GZCAT} $@.d/data/${LANGPAIR}/train.id.gz | head -1000 | cut -f2,3 $(FIXLANGIDS) > ${dir $@}Tatoeba-dev.${LANGPAIR}.clean.id; \
|
||||
${GZCAT} $@.d/data/${LANGPAIR}/train.src.gz | tail -n +1001 > ${dir $@}Tatoeba-train.${LANGPAIR}.clean.${SRCEXT}; \
|
||||
${GZCAT} $@.d/data/${LANGPAIR}/train.trg.gz | tail -n +1001 > ${dir $@}Tatoeba-train.${LANGPAIR}.clean.${TRGEXT}; \
|
||||
${GZCAT} $@.d/data/${LANGPAIR}/train.id.gz | tail -n +1001 | cut -f2,3 $(FIXLANGIDS) > ${dir $@}Tatoeba-train.${LANGPAIR}.clean.id; \
|
||||
fi \
|
||||
fi
|
||||
## make sure that training data file exists even if it is empty
|
||||
|
@ -26,13 +26,13 @@ endif
|
||||
|
||||
${DATADIR}/${PRE}/WikiMatrix.${WIKIMATRIX_SCORE}.${LANGPAIR}.${SRCEXT}.raw:
|
||||
mkdir -p ${dir $@}
|
||||
zcat ${WIKIMATRIX_DATA} | \
|
||||
${ZCAT} ${WIKIMATRIX_DATA} | \
|
||||
awk '{if($$1>${WIKIMATRIX_SCORE})print}' | \
|
||||
cut -f${WIKIMATRIX_SRCFIELD} > $@
|
||||
|
||||
${DATADIR}/${PRE}/WikiMatrix.${WIKIMATRIX_SCORE}.${LANGPAIR}.${TRGEXT}.raw:
|
||||
mkdir -p ${dir $@}
|
||||
zcat ${WIKIMATRIX_DATA} | \
|
||||
${ZCAT} ${WIKIMATRIX_DATA} | \
|
||||
awk '{if($$1>${WIKIMATRIX_SCORE})print}' | \
|
||||
cut -f${WIKIMATRIX_TRGFIELD} > $@
|
||||
|
||||
|
@ -25,8 +25,8 @@
|
||||
cut -f1 $@.bitext | ${GZIP} -c > $@
|
||||
cut -f2 $@.bitext | ${GZIP} -c > $(@:.clean.${SRCEXT}.gz=.clean.${TRGEXT}.gz)
|
||||
rm -f $@.bitext $@.1 $@.2
|
||||
if [ ! `zcat "$@" | head | wc -l` -gt 0 ]; then rm -f $@; fi
|
||||
if [ ! `zcat "$(@:.clean.${SRCEXT}.gz=.clean.${TRGEXT}.gz)" | head | wc -l` -gt 0 ]; then \
|
||||
if [ ! `${ZCAT} "$@" | head | wc -l` -gt 0 ]; then rm -f $@; fi
|
||||
if [ ! `${ZCAT} "$(@:.clean.${SRCEXT}.gz=.clean.${TRGEXT}.gz)" | head | wc -l` -gt 0 ]; then \
|
||||
rm -f $(@:.clean.${SRCEXT}.gz=.clean.${TRGEXT}.gz); \
|
||||
fi
|
||||
|
||||
@ -46,21 +46,21 @@
|
||||
$(TOKENIZER)/replace-unicode-punctuation.perl |\
|
||||
$(TOKENIZER)/remove-non-printing-char.perl |\
|
||||
$(TOKENIZER)/normalize-punctuation.perl |\
|
||||
sed 's/ */ /g;s/^ *//g;s/ *$$//g' > $@
|
||||
sed 's/^ *//;s/ */ /g;s/ *$$//g' > $@
|
||||
|
||||
%.zh_cn.tok: %.zh_cn.raw
|
||||
$(LOAD_MOSES) cat $< |\
|
||||
$(TOKENIZER)/replace-unicode-punctuation.perl |\
|
||||
$(TOKENIZER)/remove-non-printing-char.perl |\
|
||||
$(TOKENIZER)/normalize-punctuation.perl |\
|
||||
sed 's/ */ /g;s/^ *//g;s/ *$$//g' > $@
|
||||
sed 's/^ *//;s/ */ /g;s/ *$$//g' > $@
|
||||
|
||||
%.zh.tok: %.zh.raw
|
||||
$(LOAD_MOSES) cat $< |\
|
||||
$(TOKENIZER)/replace-unicode-punctuation.perl |\
|
||||
$(TOKENIZER)/remove-non-printing-char.perl |\
|
||||
$(TOKENIZER)/normalize-punctuation.perl |\
|
||||
sed 's/ */ /g;s/^ *//g;s/ *$$//g' > $@
|
||||
sed 's/^ *//;s/ */ /g;s/ *$$//g' > $@
|
||||
|
||||
## generic target for tokenization
|
||||
%.tok: %.raw
|
||||
@ -71,7 +71,7 @@
|
||||
-l ${lastword ${subst 1,,${subst 2,,${subst ., ,$(<:.raw=)}}}} |\
|
||||
$(TOKENIZER)/tokenizer.perl -a -threads $(THREADS) \
|
||||
-l ${lastword ${subst 1,,${subst 2,,${subst ., ,$(<:.raw=)}}}} |\
|
||||
sed 's/ */ /g;s/^ *//g;s/ *$$//g' > $@
|
||||
sed 's/^ *//;s/ */ /g;s/ *$$//g' > $@
|
||||
|
||||
|
||||
|
||||
@ -84,28 +84,28 @@
|
||||
$(TOKENIZER)/replace-unicode-punctuation.perl |\
|
||||
$(TOKENIZER)/remove-non-printing-char.perl |\
|
||||
$(TOKENIZER)/normalize-punctuation.perl |\
|
||||
sed 's/ */ /g;s/^ *//g;s/ *$$//g' | ${GZIP} -c > $@
|
||||
sed 's/^ *//;s/ */ /g;s/ *$$//g' | ${GZIP} -c > $@
|
||||
|
||||
%.norm: %.raw
|
||||
$(LOAD_MOSES) cat $< |\
|
||||
$(TOKENIZER)/replace-unicode-punctuation.perl |\
|
||||
$(TOKENIZER)/remove-non-printing-char.perl |\
|
||||
$(TOKENIZER)/normalize-punctuation.perl |\
|
||||
sed 's/ */ /g;s/^ *//g;s/ *$$//g' > $@
|
||||
sed 's/^ *//;s/ */ /g;s/ *$$//g' > $@
|
||||
|
||||
%.${SRCEXT}.norm: %.${SRCEXT}.raw
|
||||
$(LOAD_MOSES) cat $< ${SRC_CLEANUP_SCRIPTS} |\
|
||||
$(TOKENIZER)/replace-unicode-punctuation.perl |\
|
||||
$(TOKENIZER)/remove-non-printing-char.perl |\
|
||||
$(TOKENIZER)/normalize-punctuation.perl |\
|
||||
sed 's/ */ /g;s/^ *//g;s/ *$$//g' > $@
|
||||
sed 's/^ *//;s/ */ /g;s/ *$$//g' > $@
|
||||
|
||||
%.${TRGEXT}.norm: %.${TRGEXT}.raw
|
||||
$(LOAD_MOSES) cat $< ${TRG_CLEANUP_SCRIPTS} |\
|
||||
$(TOKENIZER)/replace-unicode-punctuation.perl |\
|
||||
$(TOKENIZER)/remove-non-printing-char.perl |\
|
||||
$(TOKENIZER)/normalize-punctuation.perl |\
|
||||
sed 's/ */ /g;s/^ *//g;s/ *$$//g' > $@
|
||||
sed 's/^ *//;s/ */ /g;s/ *$$//g' > $@
|
||||
|
||||
|
||||
## minimal pre-processing
|
||||
@ -114,28 +114,28 @@
|
||||
$(TOKENIZER)/replace-unicode-punctuation.perl |\
|
||||
$(TOKENIZER)/remove-non-printing-char.perl |\
|
||||
$(TOKENIZER)/deescape-special-chars.perl |\
|
||||
sed 's/ */ /g;s/^ *//g;s/ *$$//g' | ${GZIP} -c > $@
|
||||
sed 's/^ *//;s/ */ /g;s/ *$$//g' | ${GZIP} -c > $@
|
||||
|
||||
%.simple: %.raw
|
||||
$(LOAD_MOSES) cat $< |\
|
||||
$(TOKENIZER)/replace-unicode-punctuation.perl |\
|
||||
$(TOKENIZER)/remove-non-printing-char.perl |\
|
||||
$(TOKENIZER)/deescape-special-chars.perl |\
|
||||
sed 's/ */ /g;s/^ *//g;s/ *$$//g' > $@
|
||||
sed 's/^ *//;s/ */ /g;s/ *$$//g' > $@
|
||||
|
||||
%.${SRCEXT}.simple: %.${SRCEXT}.raw
|
||||
$(LOAD_MOSES) cat $< ${SRC_CLEANUP_SCRIPTS} |\
|
||||
$(TOKENIZER)/replace-unicode-punctuation.perl |\
|
||||
$(TOKENIZER)/remove-non-printing-char.perl |\
|
||||
$(TOKENIZER)/deescape-special-chars.perl |\
|
||||
sed 's/ */ /g;s/^ *//g;s/ *$$//g' > $@
|
||||
sed 's/^ *//;s/ */ /g;s/ *$$//g' > $@
|
||||
|
||||
%.${TRGEXT}.simple: %.${TRGEXT}.raw
|
||||
$(LOAD_MOSES) cat $< ${TRG_CLEANUP_SCRIPTS} |\
|
||||
$(TOKENIZER)/replace-unicode-punctuation.perl |\
|
||||
$(TOKENIZER)/remove-non-printing-char.perl |\
|
||||
$(TOKENIZER)/deescape-special-chars.perl |\
|
||||
sed 's/ */ /g;s/^ *//g;s/ *$$//g' > $@
|
||||
sed 's/^ *//;s/ */ /g;s/ *$$//g' > $@
|
||||
|
||||
|
||||
|
||||
@ -145,7 +145,7 @@
|
||||
$(TOKENIZER)/replace-unicode-punctuation.perl |\
|
||||
$(TOKENIZER)/remove-non-printing-char.perl |\
|
||||
$(TOKENIZER)/deescape-special-chars.perl |\
|
||||
sed 's/ */ /g;s/^ *//g;s/ *$$//g' |\
|
||||
sed 's/^ *//;s/ */ /g;s/ *$$//g' |\
|
||||
sed 's/ /▁/g' > $@
|
||||
|
||||
|
||||
@ -172,21 +172,21 @@
|
||||
# $(TOKENIZER)/replace-unicode-punctuation.perl |\
|
||||
# $(TOKENIZER)/remove-non-printing-char.perl |\
|
||||
# $(TOKENIZER)/normalize-punctuation.perl |\
|
||||
# sed 's/ */ /g;s/^ *//g;s/ *$$//g' | ${GZIP} -c > $@
|
||||
# sed 's/^ *//;s/ */ /g;s/ *$$//g' | ${GZIP} -c > $@
|
||||
|
||||
# %.simple.gz: %.gz
|
||||
# $(LOAD_MOSES) ${GZIP} -cd < $< |\
|
||||
# $(TOKENIZER)/replace-unicode-punctuation.perl |\
|
||||
# $(TOKENIZER)/remove-non-printing-char.perl |\
|
||||
# $(TOKENIZER)/deescape-special-chars.perl |\
|
||||
# sed 's/ */ /g;s/^ *//g;s/ *$$//g' | ${GZIP} -c > $@
|
||||
# sed 's/^ *//;s/ */ /g;s/ *$$//g' | ${GZIP} -c > $@
|
||||
|
||||
# %.nospace.gz: %.gz
|
||||
# $(LOAD_MOSES) ${GZIP} -cd < $< |\
|
||||
# $(TOKENIZER)/replace-unicode-punctuation.perl |\
|
||||
# $(TOKENIZER)/remove-non-printing-char.perl |\
|
||||
# $(TOKENIZER)/deescape-special-chars.perl |\
|
||||
# sed 's/ */ /g;s/^ *//g;s/ *$$//g' |\
|
||||
# sed 's/^ *//;s/ */ /g;s/ *$$//g' |\
|
||||
# sed 's/ /▁/g' |\
|
||||
# ${GZIP} -c > $@
|
||||
|
||||
|
@ -32,10 +32,9 @@ endif
|
||||
echo '#SBATCH -n 1' >> $@
|
||||
echo '#SBATCH -N 1' >> $@
|
||||
echo '#SBATCH -p ${HPC_GPUQUEUE}' >> $@
|
||||
ifeq (${shell hostname --domain},bullx)
|
||||
ifeq (${shell hostname --domain 2>/dev/null},bullx)
|
||||
echo '#SBATCH --account=${CSCPROJECT}' >> $@
|
||||
echo '#SBATCH --gres=gpu:${GPU}:${NR_GPUS},nvme:${HPC_DISK}' >> $@
|
||||
# echo '#SBATCH --exclude=r18g02' >> $@
|
||||
else
|
||||
echo '#SBATCH --gres=gpu:${GPU}:${NR_GPUS}' >> $@
|
||||
endif
|
||||
@ -70,12 +69,9 @@ ifdef EMAIL
|
||||
echo '#SBATCH --mail-type=END' >> $@
|
||||
echo '#SBATCH --mail-user=${EMAIL}' >> $@
|
||||
endif
|
||||
ifeq (${shell hostname --domain},bullx)
|
||||
ifeq (${shell hostname --domain 2>/dev/null},bullx)
|
||||
echo '#SBATCH --account=${CSCPROJECT}' >> $@
|
||||
echo '#SBATCH --gres=nvme:${HPC_DISK}' >> $@
|
||||
# echo '#SBATCH --exclude=r05c49' >> $@
|
||||
# echo '#SBATCH --exclude=r07c51' >> $@
|
||||
# echo '#SBATCH --exclude=r06c50' >> $@
|
||||
endif
|
||||
echo '#SBATCH -n ${HPC_CORES}' >> $@
|
||||
echo '#SBATCH -N ${HPC_NODES}' >> $@
|
||||
|
@ -30,7 +30,7 @@ ifneq (${MODEL_LATEST_VOCAB},)
|
||||
cp ${MODEL_LATEST_VOCAB} ${MODEL_VOCAB}
|
||||
else
|
||||
mkdir -p ${dir $@}
|
||||
${LOADMODS} && zcat $^ | ${MARIAN_VOCAB} --max-size ${VOCABSIZE} > $@
|
||||
${LOADMODS} && ${ZCAT} $^ | ${MARIAN_VOCAB} --max-size ${VOCABSIZE} > $@
|
||||
endif
|
||||
else
|
||||
@echo "$@ already exists!"
|
||||
|
@ -229,7 +229,7 @@ ${TRANSLATED_PRE}: ${ORIGINAL_DATASRC}
|
||||
ifneq (${MODELZIP},)
|
||||
mkdir -p ${dir $@}
|
||||
${MAKE} ${DECODER}
|
||||
zcat $< |\
|
||||
${GZCAT} $< |\
|
||||
head -${MAX_PIVOT_SENTENCES} |\
|
||||
${OUTPUT_DIR}/${MODELNAME}/preprocess.sh ${PREPROCESS_ARGS} |\
|
||||
gzip -c > $@
|
||||
@ -241,7 +241,7 @@ ${OUTPUT_DIR}/%.${MODELNAME}.${LANGPAIR}.${PIVOT}.spm.gz: ${ORIGINAL_DATADIR}/${
|
||||
ifneq (${MODELZIP},)
|
||||
mkdir -p ${dir $@}
|
||||
${MAKE} ${DECODER}
|
||||
zcat $< |\
|
||||
${GZCAT} $< |\
|
||||
${OUTPUT_DIR}/${MODELNAME}/preprocess.sh ${PREPROCESS_ARGS} |\
|
||||
gzip -c > $@
|
||||
endif
|
||||
@ -299,9 +299,9 @@ check-length:
|
||||
for S in `ls $$d/*.$$s.gz`; do \
|
||||
T=`echo $$S | sed 's/.$$s.gz/.$$t.gz/'`; \
|
||||
echo "$$S -- $$T"; \
|
||||
zcat $$S | wc -l; \
|
||||
zcat $$T | wc -l; \
|
||||
if [ `zcat $$S | wc -l` != `zcat $$T | wc -l` ]; then \
|
||||
${GZCAT} $$S | wc -l; \
|
||||
${GZCAT} $$T | wc -l; \
|
||||
if [ `${GZCAT} $$S | wc -l` != `${GZCAT} $$T | wc -l` ]; then \
|
||||
echo "$$S != $$T"; \
|
||||
fi \
|
||||
done \
|
||||
|
1
tools/LanguageCodes
Submodule
1
tools/LanguageCodes
Submodule
@ -0,0 +1 @@
|
||||
Subproject commit d19d3ff363c611a23a52db62aecc0d502d2c058d
|
1
tools/OpusTools-perl
Submodule
1
tools/OpusTools-perl
Submodule
@ -0,0 +1 @@
|
||||
Subproject commit 156b0c5119b28b81232a8276e4fa3df04afbc7d0
|
1
tools/eflomal
Submodule
1
tools/eflomal
Submodule
@ -0,0 +1 @@
|
||||
Subproject commit 7b97f19187c8b1bc1f21aefd77fc1b87575d1c00
|
1
tools/fast_align
Submodule
1
tools/fast_align
Submodule
@ -0,0 +1 @@
|
||||
Subproject commit cab1e9aac8d3bb02ff5ae58218d8d225a039fa11
|
1
tools/marian-dev
Submodule
1
tools/marian-dev
Submodule
@ -0,0 +1 @@
|
||||
Subproject commit 4d9d15649e83766fd2ee2a79db79e0d8a2fed3c4
|
1
tools/moses-scripts
Submodule
1
tools/moses-scripts
Submodule
@ -0,0 +1 @@
|
||||
Subproject commit 958dd5a6b026197de988c0264d45112215bc37bd
|
1
tools/pigz
Submodule
1
tools/pigz
Submodule
@ -0,0 +1 @@
|
||||
Subproject commit fe822cb435622c43f491013da77b127e9fe851a9
|
1
tools/terashuf
Submodule
1
tools/terashuf
Submodule
@ -0,0 +1 @@
|
||||
Subproject commit 6b0a8b0c2614c9af687d2bdb1851db89fa1cbf38
|
Loading…
Reference in New Issue
Block a user