fix in reverse-data

This commit is contained in:
Joerg Tiedemann 2022-01-06 23:48:34 +02:00
parent 1c48468a15
commit ed1bde6ac5
7 changed files with 198 additions and 65 deletions

View File

@@ -593,12 +593,20 @@ MARIAN_MINI_BATCH = 768
MARIAN_MAXI_BATCH = 2048
ifeq ($(GPU_AVAILABLE),1)
MARIAN_SCORER_FLAGS = -n1 -d ${MARIAN_GPUS} \
--quiet-translation -w ${MARIAN_DECODER_WORKSPACE} \
--mini-batch ${MARIAN_MINI_BATCH} --maxi-batch ${MARIAN_MAXI_BATCH} --maxi-batch-sort src \
--max-length ${MARIAN_MAX_LENGTH} --max-length-crop
MARIAN_DECODER_FLAGS = -b ${MARIAN_BEAM_SIZE} -n1 -d ${MARIAN_GPUS} \
--quiet-translation -w ${MARIAN_DECODER_WORKSPACE} \
--mini-batch ${MARIAN_MINI_BATCH} --maxi-batch ${MARIAN_MAXI_BATCH} --maxi-batch-sort src \
--max-length ${MARIAN_MAX_LENGTH} --max-length-crop
# --fp16
else
MARIAN_SCORER_FLAGS = -n1 --cpu-threads ${HPC_CORES} \
--quiet-translation \
--mini-batch ${HPC_CORES} --maxi-batch 100 --maxi-batch-sort src \
--max-length ${MARIAN_MAX_LENGTH} --max-length-crop
MARIAN_DECODER_FLAGS = -b ${MARIAN_BEAM_SIZE} -n1 --cpu-threads ${HPC_CORES} \
--quiet-translation \
--mini-batch ${HPC_CORES} --maxi-batch 100 --maxi-batch-sort src \

View File

@@ -158,15 +158,13 @@ ifeq (${words ${TRGLANGS}},1)
ln -s ${TRAIN_TRG}.clean.${PRE_TRG}.gz ${REV_WORKDIR}/train/${notdir ${TRAIN_SRC}.clean.${PRE_SRC}.gz}; \
cp ${WORKDIR}/train/README.md ${REV_WORKDIR}/train/README.md; \
fi
-if [ -e ${SPMSRCMODEL} ]; then \
ln -s ${SPMSRCMODEL} ${REV_WORKDIR}/train/${notdir ${SPMTRGMODEL}}; \
ln -s ${SPMTRGMODEL} ${REV_WORKDIR}/train/${notdir ${SPMSRCMODEL}}; \
ln -s ${SPMSRCMODEL}.vocab ${REV_WORKDIR}/train/${notdir ${SPMTRGMODEL}}.vocab; \
ln -s ${SPMTRGMODEL}.vocab ${REV_WORKDIR}/train/${notdir ${SPMSRCMODEL}}.vocab; \
-if [ -e ${SUBWORD_SRC_MODEL} ]; then \
ln -s ${SUBWORD_SRC_MODEL} ${REV_WORKDIR}/train/${notdir ${SUBWORD_TRG_MODEL}}; \
ln -s ${SUBWORD_TRG_MODEL} ${REV_WORKDIR}/train/${notdir ${SUBWORD_SRC_MODEL}}; \
fi
if [ -e ${BPESRCMODEL} ]; then \
ln -s ${BPESRCMODEL} ${REV_WORKDIR}/train/${notdir ${BPETRGMODEL}}; \
ln -s ${BPETRGMODEL} ${REV_WORKDIR}/train/${notdir ${BPESRCMODEL}}; \
-if [ -e ${SUBWORD_SRC_MODEL}.vocab ]; then \
ln -s ${SUBWORD_SRC_MODEL}.vocab ${REV_WORKDIR}/train/${notdir ${SUBWORD_TRG_MODEL}}.vocab; \
ln -s ${SUBWORD_TRG_MODEL}.vocab ${REV_WORKDIR}/train/${notdir ${SUBWORD_SRC_MODEL}}.vocab; \
fi
-if [ -e ${TRAIN_ALG} ]; then \
if [ ! -e ${REV_WORKDIR}/train/${notdir ${TRAIN_ALG}} ]; then \
@@ -191,6 +189,10 @@ ifeq (${words ${TRGLANGS}},1)
ln -s ${TEST_TRG} ${REV_WORKDIR}/test/${notdir ${TEST_SRC}}; \
cp ${WORKDIR}/test/README.md ${REV_WORKDIR}/test/README.md; \
fi
-if [ -e ${MODEL_SRCVOCAB} ]; then \
ln -s ${MODEL_SRCVOCAB} ${REV_WORKDIR}/${notdir ${MODEL_TRGVOCAB}}; \
ln -s ${MODEL_TRGVOCAB} ${REV_WORKDIR}/${notdir ${MODEL_SRCVOCAB}}; \
fi
-if [ -e ${MODEL_VOCAB} ]; then \
ln -s ${MODEL_VOCAB} ${REV_WORKDIR}/${notdir ${MODEL_VOCAB}}; \
fi
@@ -198,11 +200,11 @@ ifeq (${words ${TRGLANGS}},1)
## this is a bit dangerous with some trick to
## swap parameters between SRC and TRG
##
-if [ -e ${WORKDIR}/config.mk ]; then \
if [ ! -e ${REV_WORKDIR}/config.mk ]; then \
cat ${WORKDIR}/config.mk |\
-if [ -e ${WORKDIR}/${MODELCONFIG} ]; then \
if [ ! -e ${REV_WORKDIR}/${MODELCONFIG} ]; then \
cat ${WORKDIR}/${MODELCONFIG} |\
sed -e 's/SRC/TTT/g;s/TRG/SRC/g;s/TTT/TRG/' |\
grep -v LANGPAIRSTR > ${REV_WORKDIR}/config.mk; \
grep -v LANGPAIRSTR > ${REV_WORKDIR}/$(notdir ${MODELCONFIG}); \
fi \
fi
endif
@@ -307,7 +309,7 @@ $(LOCAL_TRAIN_SRC).algtmp.d/%.alg: $(LOCAL_TRAIN_SRC).algtmp.d/% $(LOCAL_TRAIN_T
-r $(word 2,$^).rev
echo "merge and symmetrize part ${notdir $<}"
${ATOOLS} -c grow-diag-final -i $(word 1,$^).fwd -j $(word 2,$^).rev > $@
rm -f $(word 2,$^).fwd $(word 2,$^).rev
rm -f $(word 1,$^).fwd $(word 2,$^).rev
@@ -517,8 +519,8 @@ endif
show-devdata:
@echo "${CLEAN_DEV_SRC}"
@echo "${CLEAN_DEV_TRG}"
@echo ${SPMSRCMODEL}
@echo ${SPMTRGMODEL}
@echo ${SUBWORD_SRC_MODEL}
@echo ${SUBWORD_TRG_MODEL}
@echo "${DEV_SRC}.${PRE_SRC}"
@echo "${DEV_TRG}.${PRE_TRG}"

View File

@@ -57,26 +57,28 @@ else ifeq (${shell hostname --domain 2>/dev/null},bullx)
endif
## default settings for CPU cores
## default settings for CPU cores to be used
CPU_CORES ?= ${CORES}
THREADS ?= ${CPU_CORES}
JOBS ?= ${THREADS}
## set variables with HPC prefix
## (this is mostly for backwards compatibility)
HPC_TIME ?= ${WALLTIME}:00
HPC_CORES ?= ${CPU_CORES}
HPC_THREADS ?= ${HPC_CORES}
HPC_JOBS ?= ${HPC_THREADS}
HPC_MEM ?= ${MEM}
## number parallel jobs in make
## (for slurm jobs)
print_hpc:
@echo ${HPC_MEM}
@echo ${HPC_CORES}
@echo ${THREADS}
@echo ${HPC_JOBS}
ifdef JOBS
HPC_JOBS ?= ${JOBS}
else
JOBS ?= ${THREADS}
HPC_JOBS ?= ${HPC_THREADS}
endif
SUBMIT_PREFIX ?= submit
@@ -120,6 +122,7 @@ TMX2MOSES ?= ${shell which tmx2moses 2>/dev/null || echo ${TOOLSDIR}/OpusTo
MARIAN_TRAIN = ${MARIAN_HOME}marian
MARIAN_DECODER = ${MARIAN_HOME}marian-decoder
MARIAN_SCORER = ${MARIAN_HOME}marian-scorer
MARIAN_VOCAB = ${MARIAN_HOME}marian-vocab

View File

@@ -262,11 +262,12 @@ endif
endif
${MAKE} DATASET=${DATASET}+bt \
USE_BACKTRANS=1 \
MODELCONFIG=config-bt.mk \
CONTINUE_EXISTING=${BT_CONTINUE_EXISTING} \
MARIAN_EARLY_STOPPING=${BT_MARIAN_EARLY_STOPPING} \
${@:-bt=}
# MODELCONFIG=config-bt.mk \
## adding a pivot language to the model
## --> add pivot language to each side (source and target)
@@ -281,7 +282,6 @@ PIVOT_LANG ?= ${DEFAULT_PIVOT_LANG}
%-pivotlang:
if [ "$(sort ${SRCLANGS} ${TRGLANGS} ${PIVOT_LANG})" != "$(sort ${SRCLANGS} ${TRGLANGS})" ]; then \
${MAKE} DATASET=${DATASET}+${PIVOT_LANG} \
MODELCONFIG=${MODELCONFIG:.mk=+${PIVOT_LANG}.mk} \
SRCLANGS="$(sort ${SRCLANGS} ${PIVOT_LANG})" \
TRGLANGS="$(sort ${TRGLANGS} ${PIVOT_LANG})" \
SKIP_LANGPAIRS=${PIVOT_LANG}-${PIVOT_LANG} \
@@ -291,6 +291,8 @@ PIVOT_LANG ?= ${DEFAULT_PIVOT_LANG}
${@:-pivotlang=}; \
fi
# MODELCONFIG=${MODELCONFIG:.mk=+${PIVOT_LANG}.mk} \
## add forward translations
@@ -313,10 +315,12 @@ endif
${MAKE} DATASET=${DATASET}+ft \
USE_FORWARDTRANS=1 \
CONTINUE_EXISTING=1 \
MODELCONFIG=config-ft.mk \
MARIAN_EARLY_STOPPING=${FT_MARIAN_EARLY_STOPPING} \
${@:-ft=}
# MODELCONFIG=config-ft.mk \
## add forward translation of monolingual data
%-ftmono:
${MAKE} DATASET=${DATASET}+ftmono USE_FORWARDTRANSMONO=1 ${@:-ftmono=}
@@ -366,11 +370,11 @@ endif
${MAKE} DATASET=${DATASET}+ftonly \
USE_FORWARDTRANS=1 \
CONTINUE_EXISTING=1 \
MODELCONFIG=config-ft.mk \
MARIAN_EARLY_STOPPING=${FT_MARIAN_EARLY_STOPPING} \
TRAINSET= TATOEBA_TRAINSET= \
${@:-ftonly=}
# MODELCONFIG=config-ft.mk \
## NEW: don't continue from existing models when including pivot data

38
scripts/normalize-scores.py Executable file
View File

@@ -0,0 +1,38 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# taken from https://github.com/browsermt/students
from __future__ import print_function, unicode_literals, division
import sys
import argparse
import math
def main(args=None):
    """Read tab-separated lines ("score<TAB>...<TAB>target") from stdin and
    write each line back to stdout prefixed with a normalized score.

    The normalized score divides the raw model score by the target token
    count + 1 (unless --no-normalize) and optionally exponentiates it
    (--exp), following https://github.com/browsermt/students.

    args: optional pre-parsed argparse.Namespace with boolean attributes
          `no_normalize` and `exp`; parsed from sys.argv when None.
    """
    if args is None:
        args = parse_user_args()
    for line in sys.stdin:
        fields = line.strip().split("\t")
        # Skip blank lines instead of crashing on float("")
        if not fields[0]:
            continue
        trg = fields[-1]
        score = float(fields[0])
        if not args.no_normalize:
            # Length-normalize: +1 avoids division by zero on an empty target
            length = len(trg.split())
            score = score / float(length + 1)
        if args.exp:
            score = math.exp(score)
        # Keep the full original line (including its trailing newline)
        # after the new score column
        sys.stdout.write("{:.6f}\t{}".format(score, line))
def parse_user_args(argv=None):
    """Parse command-line options for the score-normalization filter.

    argv: optional list of argument strings; defaults to sys.argv[1:]
          (accepting an explicit list keeps this function unit-testable).

    Returns an argparse.Namespace with boolean flags:
      no_normalize -- keep the raw score (skip length normalization)
      exp          -- exponentiate the (possibly normalized) score
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-n", "--no-normalize", action="store_true")
    parser.add_argument("-e", "--exp", action="store_true")
    return parser.parse_args(argv)
# Entry point: run the stdin-to-stdout score filter when executed as a script.
if __name__ == "__main__":
    main()

View File

@@ -1844,38 +1844,38 @@ TATOEBADATA = data/release/${TATOEBA_VERSION}/${LANGPAIR}
## make dev data (extract additional examples from the training data if necessary)
%.gz.d/devdata.created: %.gz.d/data.fetched
@if [ -e ${dir $@}/${TATOEBADATA}/dev.src ]; then \
if [ `cat ${dir $@}/${TATOEBADATA}/dev.src | wc -l` -gt 50 ]; then \
@if [ -e ${dir $@}${TATOEBADATA}/dev.src ]; then \
if [ `cat ${dir $@}${TATOEBADATA}/dev.src | wc -l` -gt 50 ]; then \
touch $@; \
else \
mv ${dir $@}/${TATOEBADATA}/dev.src $@.dev.src; \
mv ${dir $@}/${TATOEBADATA}/dev.trg $@.dev.trg; \
mv ${dir $@}/${TATOEBADATA}/dev.id $@.dev.id; \
mv ${dir $@}${TATOEBADATA}/dev.src $@.dev.src; \
mv ${dir $@}${TATOEBADATA}/dev.trg $@.dev.trg; \
mv ${dir $@}${TATOEBADATA}/dev.id $@.dev.id; \
fi \
fi
@if [ ! -e $@ ]; then \
if [ -e ${dir $@}/${TATOEBADATA}/train.src.gz ]; then \
if [ -e ${dir $@}${TATOEBADATA}/train.src.gz ]; then \
echo "........ too little devdata available - get top 1000 from training data!"; \
${GZCAT} $@.d/${TATOEBADATA}/train.src.gz | head -1000 >> $@.dev.src; \
${GZCAT} $@.d/${TATOEBADATA}/train.trg.gz | head -1000 >> $@.dev.trg; \
${GZCAT} $@.d/${TATOEBADATA}/train.id.gz | head -1000 | cut -f2,3 >> $@.dev.id; \
${GZCAT} $@.d/${TATOEBADATA}/train.src.gz | tail -n +1001 | ${GZIP} -f > $@.src.gz; \
${GZCAT} $@.d/${TATOEBADATA}/train.trg.gz | tail -n +1001 | ${GZIP} -f > $@.trg.gz; \
${GZCAT} $@.d/${TATOEBADATA}/train.id.gz | tail -n +1001 | ${GZIP} -f > $@.id.gz; \
mv $@.src.gz $@.d/${TATOEBADATA}/train.src.gz; \
mv $@.trg.gz $@.d/${TATOEBADATA}/train.trg.gz; \
mv $@.id.gz $@.d/${TATOEBADATA}/train.id.gz; \
${GZCAT} ${dir $@}${TATOEBADATA}/train.src.gz | head -1000 >> $@.dev.src; \
${GZCAT} ${dir $@}${TATOEBADATA}/train.trg.gz | head -1000 >> $@.dev.trg; \
${GZCAT} ${dir $@}${TATOEBADATA}/train.id.gz | head -1000 | cut -f2,3 >> $@.dev.id; \
${GZCAT} ${dir $@}${TATOEBADATA}/train.src.gz | tail -n +1001 | ${GZIP} -f > $@.src.gz; \
${GZCAT} ${dir $@}${TATOEBADATA}/train.trg.gz | tail -n +1001 | ${GZIP} -f > $@.trg.gz; \
${GZCAT} ${dir $@}${TATOEBADATA}/train.id.gz | tail -n +1001 | ${GZIP} -f > $@.id.gz; \
mv $@.src.gz ${dir $@}${TATOEBADATA}/train.src.gz; \
mv $@.trg.gz ${dir $@}${TATOEBADATA}/train.trg.gz; \
mv $@.id.gz ${dir $@}${TATOEBADATA}/train.id.gz; \
fi; \
mv $@.dev.src ${dir $@}/${TATOEBADATA}/dev.src; \
mv $@.dev.trg ${dir $@}/${TATOEBADATA}/dev.trg; \
mv $@.dev.id ${dir $@}/${TATOEBADATA}/dev.id; \
mv $@.dev.src ${dir $@}${TATOEBADATA}/dev.src; \
mv $@.dev.trg ${dir $@}${TATOEBADATA}/dev.trg; \
mv $@.dev.id ${dir $@}${TATOEBADATA}/dev.id; \
touch $@; \
fi
## fix language IDs and make sure that dev/test/train exist
%.gz.d/data.fixed: %.gz.d/devdata.created
@echo ".... fix language codes"
@if [ -e ${dir $@}/${TATOEBADATA}/train.id.gz ]; then \
@if [ -e ${dir $@}${TATOEBADATA}/train.id.gz ]; then \
${GZCAT} ${dir $@}${TATOEBADATA}/train.id.gz | cut -f2,3 $(FIXLANGIDS) | ${GZIP} -c > ${dir $@}train.id.gz; \
${GZCAT} ${dir $@}${TATOEBADATA}/train.id.gz | cut -f1 | ${GZIP} -c > ${dir $@}train.domain.gz; \
mv ${dir $@}train.id.gz ${dir $@}train.domain.gz ${dir $@}${TATOEBADATA}/; \
@@ -1884,8 +1884,8 @@ TATOEBADATA = data/release/${TATOEBA_VERSION}/${LANGPAIR}
touch ${dir $@}${TATOEBADATA}/train.id ${dir $@}${TATOEBADATA}/train.domain; \
${GZIP} -cd ${dir $@}${TATOEBADATA}/train.*; \
fi
@touch ${dir $@}/${TATOEBADATA}/test.id ${dir $@}/${TATOEBADATA}/test.src ${dir $@}/${TATOEBADATA}/test.trg
@touch ${dir $@}/${TATOEBADATA}/dev.id ${dir $@}/${TATOEBADATA}/dev.src ${dir $@}/${TATOEBADATA}/dev.trg
@touch ${dir $@}${TATOEBADATA}/test.id ${dir $@}${TATOEBADATA}/test.src ${dir $@}${TATOEBADATA}/test.trg
@touch ${dir $@}${TATOEBADATA}/dev.id ${dir $@}${TATOEBADATA}/dev.src ${dir $@}${TATOEBADATA}/dev.trg
@cat ${dir $@}${TATOEBADATA}/dev.id $(FIXLANGIDS) > ${dir $@}dev.id
@cat ${dir $@}${TATOEBADATA}/test.id $(FIXLANGIDS) > ${dir $@}test.id
@mv ${dir $@}dev.id ${dir $@}test.id ${dir $@}${TATOEBADATA}/

View File

@@ -73,7 +73,7 @@ MODELNAME = ${patsubst %.zip,%,${notdir ${MODELZIP}}}
MULTI_TARGET_MODEL := ${shell wget -qq -O - ${MODELINFO} | grep 'use-target-labels' | wc -l}
ifneq (${MULTI_TARGET_MODEL},0)
TARGET_LANG_LABEL := ${shell wget -qq -O - ${MODELINFO} | grep -o '>>${TRG}.*<<'}
TARGET_LANG_LABEL := ${shell wget -qq -O - ${MODELINFO} | grep -o '>>${TRG}.*<<' | head -1}
endif
RELEASED_BITEXTS := $(patsubst %.tar,%,${shell wget -qq -O - ${TATOEBA_GITRAW}/Wiki.md | \
@@ -112,6 +112,9 @@ ALL_BITEXT_LATEST_TRG = ${patsubst %,${OUTPUT_DIR}/latest/Tatoeba-train.%.${LANG
.PHONY: all
all: translate
.PHONY: mtmodel
mtmodel: ${LANGPAIR}/${MODELNAME}/decoder.yml
.PHONY: prepare
prepare: ${LANGPAIR}/${MODELNAME}/decoder.yml ${BITEXT_PRE}
@@ -136,6 +139,75 @@ print-modelinfo:
@echo "multi-target model: ${MULTI_TARGET_MODEL}"
@echo "target language label: ${TARGET_LANG_LABEL}"
##-------------------------------------------
## translation model in reverse direction
## --> for scoring translations
##-------------------------------------------
REV_LANGPAIR = ${TRG}-${SRC}
REV_MODELZIP := ${shell wget -qq -O - ${TATOEBA_RELEASED} | grep '^${REV_LANGPAIR}' | head -1 | cut -f4}
REV_MODELINFO = ${patsubst ${TATOEBA_MODEL_STORAGE}/%.zip,${TATOEBA_GITRAW}/models/%.yml,${REV_MODELZIP}}
REV_MODELNAME = ${patsubst %.zip,%,${notdir ${REV_MODELZIP}}}
REV_MULTI_TARGET_MODEL := ${shell wget -qq -O - ${REV_MODELINFO} | grep 'use-target-labels' | wc -l}
ifeq (${REV_MULTI_TARGET_MODEL},1)
REV_PREPROCESS_ARGS = ${TRG} ${SRC} ${REV_LANGPAIR}/${REV_MODELNAME}/source.spm
else
REV_PREPROCESS_ARGS = ${TRG} ${REV_LANGPAIR}/${REV_MODELNAME}/source.spm
endif
## score translations with reverse translation model
## normalize scores (see https://github.com/browsermt/students)
SCOREFILES = $(patsubst %.${SRC}.gz,%.${SRC}.scores.gz,$(wildcard ${OUTPUT_DIR}/latest/*.${SRC}.gz))
.PHONY: score-translation score-translations
score-translation: ${BITEXT_LATEST_SRC:.gz=.scores.gz}
score-translations: ${SCOREFILES}
sort-scored-translations: ${OUTPUT_DIR}/latest/Tatoeba-train.sorted.gz
print-score-file:
echo ${BITEXT_LATEST_SRC:.gz=.scores.gz}
${OUTPUT_DIR}/latest/%.${SRC}.scores.gz: ${OUTPUT_DIR}/latest/%.${SRC}.gz
${MAKE} SRC=${TRG} TRG=${SRC} mtmodel
${GZCAT} ${<:.${SRC}.gz=.${TRG}.gz} |\
${REV_LANGPAIR}/${REV_MODELNAME}/preprocess.sh ${REV_PREPROCESS_ARGS} |\
${GZIP} -c > $@.src.gz
${GZCAT} $< |\
${SPM_ENCODE} --model=${REV_LANGPAIR}/${REV_MODELNAME}/target.spm |\
${GZIP} -c > $@.trg.gz
${LOAD_ENV} && cd ${LANGPAIR}/${MODELNAME} && ${MARIAN_SCORER} \
-m `grep -A1 models decoder.yml | tail -1 | sed 's/ *- //'` \
-v `grep -A2 vocabs decoder.yml | tail -2 | sed 's/ *- //' | tr "\n" ' '` \
-t ${PWD}/$@.src.gz ${PWD}/$@.trg.gz \
-d ${MARIAN_GPUS} \
${MARIAN_SCORER_FLAGS} |\
${GZIP} -c > ${PWD}/$(@:.scores.gz=.raw-scores.gz)
paste <(gzip -dc $(@:.scores.gz=.raw-scores.gz)) <(gzip -dc $@.trg.gz) | \
python3 ${SCRIPTDIR}/normalize-scores.py | cut -f1 | ${GZIP} -c > $@
rm -f $@.src.gz $@.trg.gz
${OUTPUT_DIR}/latest/Tatoeba-train.sorted.gz: ${SCOREFILES}
${GZCAT} ${OUTPUT_DIR}/latest/*.${SRC}.scores.gz | ${GZIP} -c > $@.scores.gz
${GZCAT} ${OUTPUT_DIR}/latest/*.${SRC}.gz | ${GZIP} -c > $@.src.gz
${GZCAT} ${OUTPUT_DIR}/latest/*.${TRG}.gz | ${GZIP} -c > $@.trg.gz
paste <(gzip -cd $@.scores.gz) <(gzip -cd $@.src.gz) <(gzip -cd $@.trg.gz) |\
LC_ALL=C sort -n -k1,1 -S 10G | uniq -f1 | ${GZIP} -c > $@
rm -f $@.src.gz $@.trg.gz
#
# (see https://github.com/browsermt/students)
#
#scored/%.best.gz: scored/%.sorted.gz
# $(eval STARTLINE := $(shell pigz -dc $< | wc -l | sed "s|$$|*$(REMOVE)|" | bc | cut -f1 -d.))
# @echo Removing $(REMOVE) removes $(STARTLINE) lines
# pigz -dc $< | tail -n +$(STARTLINE) | cut -f2,3 | pigz > $@
## fetch the latest model
${LANGPAIR}/${MODELNAME}/decoder.yml:
@@ -159,8 +231,10 @@ else
PREPROCESS_ARGS = ${SRC} ${LANGPAIR}/${MODELNAME}/source.spm
endif
ifeq (${BITEXT_SRCPRE},)
## OLD: check whether we have source files in the work directory
## NEW: make them from scratch from raw bitexts
##
# ifeq (${BITEXT_SRCPRE},)
${BITEXT_SRCRAW}:
${MAKE} -C .. SRCLANGS=${SRC} TRGLANGS=${TRG} clean-data-tatoeba
@@ -177,22 +251,26 @@ ifneq (${MODELZIP},)
${GZIP} -f ${patsubst %${PART}.gz,%,$@}??
endif
else
${BITEXT_PRE}: ${BITEXT_SRCPRE}
ifneq (${MODELZIP},)
mkdir -p ${dir $@}
${MAKE} ${LANGPAIR}/${MODELNAME}/decoder.yml
${GZCAT} $< |\
sed 's/ //g;s/▁/ /g' | sed 's/^ *//;s/ *$$//' | \
grep -v '[<>{}]' |\
${LANGPAIR}/${MODELNAME}/preprocess.sh ${PREPROCESS_ARGS} |\
perl -e 'while (<>){next if (split(/\s+/)>${MAX_LENGTH});print;}' |\
split -l ${SPLIT_SIZE} - ${patsubst %${PART}.gz,%,$@}
${GZIP} -f ${patsubst %${PART}.gz,%,$@}??
endif
## NEW: skip this option
##
# else
# ${BITEXT_PRE}: ${BITEXT_SRCPRE}
# ifneq (${MODELZIP},)
# mkdir -p ${dir $@}
# ${MAKE} ${LANGPAIR}/${MODELNAME}/decoder.yml
# ${GZCAT} $< |\
# sed 's/ //g;s/▁/ /g' | sed 's/^ *//;s/ *$$//' | \
# grep -v '[<>{}]' |\
# ${LANGPAIR}/${MODELNAME}/preprocess.sh ${PREPROCESS_ARGS} |\
# perl -e 'while (<>){next if (split(/\s+/)>${MAX_LENGTH});print;}' |\
# split -l ${SPLIT_SIZE} - ${patsubst %${PART}.gz,%,$@}
# ${GZIP} -f ${patsubst %${PART}.gz,%,$@}??
# endif
# endif
endif
## merge SentencePiece segments in the source text