Mirror of https://github.com/Helsinki-NLP/OPUS-MT-train.git (synced 2024-09-11 20:27:19 +03:00)

new mode: SentencePieceModels trained on monolingual data

commit 811815064b (parent ee8c27e3db)
@@ -18,11 +18,26 @@ ifndef TRG
endif



# sorted languages and langpair used to match resources in OPUS
SORTLANGS = $(sort ${SRC} ${TRG})
SPACE = $(empty) $(empty)
LANGPAIR = ${firstword ${SORTLANGS}}-${lastword ${SORTLANGS}}
LANGSTR = ${subst ${SPACE},+,$(SRCLANGS)}-${subst ${SPACE},+,$(TRGLANGS)}
LANGSRCSTR = ${subst ${SPACE},+,$(SRCLANGS)}
LANGTRGSTR = ${subst ${SPACE},+,$(TRGLANGS)}
LANGPAIRSTR = ${LANGSRCSTR}-${LANGTRGSTR}


## for monolingual things
ifndef LANGS
LANGS := ${SRCLANGS}
endif
ifndef LANGID
LANGID := ${firstword ${LANGS}}
endif
ifndef LANGSTR
LANGSTR = ${subst ${SPACE},+,$(LANGS)}
endif


## for same language pairs: add numeric extension
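For orientation, these variables encode a simple naming convention: bilingual work directories use the `+`-joined source and target language sets, while the new monolingual mode joins LANGS with `+`. A minimal Python sketch of that convention (the helper names are illustrative, not part of the Makefile):

# Illustrative sketch of the naming scheme behind LANGPAIR, LANGSTR and LANGPAIRSTR.
def langpair(src: str, trg: str) -> str:
    """Sorted pair, e.g. ('fi', 'en') -> 'en-fi' (matches OPUS resource names)."""
    a, b = sorted([src, trg])
    return f"{a}-{b}"

def langstr(langs: list[str]) -> str:
    """Monolingual language set, e.g. ['de', 'en'] -> 'de+en'."""
    return "+".join(langs)

def langpairstr(srclangs: list[str], trglangs: list[str]) -> str:
    """Bilingual work-directory name, e.g. (['de','en'], ['fi']) -> 'de+en-fi'."""
    return f"{langstr(srclangs)}-{langstr(trglangs)}"

print(langpair("fi", "en"))                # en-fi
print(langpairstr(["de", "en"], ["fi"]))   # de+en-fi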
@@ -43,6 +58,11 @@ OPUSCORPORA = ${patsubst %/latest/xml/${LANGPAIR}.xml.gz,%,\
		${patsubst ${OPUSHOME}/%,%,\
		${shell ls ${OPUSHOME}/*/latest/xml/${LANGPAIR}.xml.gz}}}

## monolingual data
OPUSMONOCORPORA = ${patsubst %/latest/mono/${LANGID}.txt.gz,%,\
		${patsubst ${OPUSHOME}/%,%,\
		${shell ls ${OPUSHOME}/*/latest/mono/${LANGID}.txt.gz}}}


ALL_LANG_PAIRS = ${shell ls ${WORKHOME} | grep -- '-' | grep -v old}
ALL_BILINGUAL_MODELS = ${shell echo '${ALL_LANG_PAIRS}' | tr ' ' "\n" | grep -v -- '\+'}
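OPUSMONOCORPORA mirrors OPUSCORPORA: it lists every corpus under ${OPUSHOME} that ships a monolingual file for ${LANGID}. A rough Python equivalent of that shell glob (paths and names are assumptions taken from the variables above):

# Hypothetical stand-in for the OPUSMONOCORPORA shell glob.
import os
from glob import glob

def opus_mono_corpora(opushome: str, langid: str) -> list[str]:
    """Return corpus names that provide <corpus>/latest/mono/<langid>.txt.gz."""
    pattern = os.path.join(opushome, "*", "latest", "mono", f"{langid}.txt.gz")
    corpora = []
    for path in glob(pattern):
        # strip "<opushome>/" prefix and keep only the corpus directory name
        corpora.append(os.path.relpath(path, opushome).split(os.sep)[0])
    return sorted(corpora)

# e.g. opus_mono_corpora("/projappl/nlpl/data/OPUS", "fi")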
@@ -119,7 +139,7 @@ EXTRA_TRAINSET =
TESTSET = ${DEVSET}
TRAINSET = $(filter-out WMT-News ${DEVSET} ${TESTSET},${OPUSCORPORA} ${EXTRA_TRAINSET})
TUNESET = OpenSubtitles

MONOSET = $(filter-out WMT-News ${DEVSET} ${TESTSET},${OPUSMONOCORPORA} ${EXTRA_TRAINSET})

## 1 = use remaining data from dev/test data for training
USE_REST_DEVDATA = 1
@@ -177,8 +197,8 @@ endif
## WORKDIR = directory used for training

DATADIR = ${WORKHOME}/data
WORKDIR = ${WORKHOME}/${LANGSTR}

WORKDIR = ${WORKHOME}/${LANGPAIRSTR}
SPMDIR = ${WORKHOME}/SentencePieceModels

## data sets
TRAIN_BASE = ${WORKDIR}/train/${DATASET}
@@ -187,8 +207,9 @@ TRAIN_TRG = ${TRAIN_BASE}.trg
TRAIN_ALG = ${TRAIN_BASE}${TRAINSIZE}.${PRE_SRC}-${PRE_TRG}.src-trg.alg.gz

## training data in local space
LOCAL_TRAIN_SRC = ${TMPDIR}/${LANGSTR}/train/${DATASET}.src
LOCAL_TRAIN_TRG = ${TMPDIR}/${LANGSTR}/train/${DATASET}.trg
LOCAL_TRAIN_SRC = ${TMPDIR}/${LANGPAIRSTR}/train/${DATASET}.src
LOCAL_TRAIN_TRG = ${TMPDIR}/${LANGPAIRSTR}/train/${DATASET}.trg
LOCAL_MONO_DATA = ${TMPDIR}/${LANGSTR}/train/${DATASET}.mono

TUNE_SRC = ${WORKDIR}/tune/${TUNESET}.src
TUNE_TRG = ${WORKDIR}/tune/${TUNESET}.trg
@@ -99,6 +99,9 @@ clean-data:

clean-data-source: ${DATA_SRC} ${DATA_TRG}

## monolingual data sets (for sentence piece models)
mono-data: ${LOCAL_MONO_DATA}.${PRE}


## word alignment used for guided alignment
@@ -569,8 +572,8 @@ endif


# %.clean.gz: %.gz
#	mkdir -p ${TMPDIR}/${LANGSTR}/cleanup
#	gzip -cd < $< > ${TMPDIR}/${LANGSTR}/cleanup/$(notdir $@).${SRCEXT}
#	mkdir -p ${TMPDIR}/${LANGPAIRSTR}/cleanup
#	gzip -cd < $< > ${TMPDIR}/${LANGPAIRSTR}/cleanup/$(notdir $@).${SRCEXT}


########################
@@ -605,6 +608,22 @@ endif



${LOCAL_MONO_DATA}.raw:
	mkdir -p ${dir $@}
	rm -f $@
	-for l in ${LANGS}; do \
	  ${MAKE} DATASET=${DATASET} LANGID:=$$l \
	  add-to-local-mono-data; \
	done

add-to-local-mono-data:
	for c in ${MONOSET}; do \
	  if [ -e ${OPUSHOME}/$$c/latest/mono/${LANGID}.txt.gz ]; then \
	    zcat ${OPUSHOME}/$$c/latest/mono/${LANGID}.txt.gz |\
	    python3 mono-match-lang.py -l ${LANGID} >> ${LOCAL_MONO_DATA}.raw; \
	  fi \
	done

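In prose: the new mono-data chain loops over all languages in LANGS and, for each one, appends every available OPUS monolingual file to ${LOCAL_MONO_DATA}.raw, piping each corpus through mono-match-lang.py so that only lines that actually look like the requested language are kept. A hedged Python sketch of that collection step (function and path names are illustrative; the real work is done by the recipes above):

# Illustrative sketch of the add-to-local-mono-data logic.
import gzip
import os

def collect_mono_data(opushome, corpora, langid, out_path, accept_line):
    """Append all OPUS mono data for one language to out_path,
    keeping only lines accepted by the language filter."""
    with open(out_path, "a", encoding="utf-8") as out:
        for corpus in corpora:
            path = os.path.join(opushome, corpus, "latest", "mono", f"{langid}.txt.gz")
            if not os.path.exists(path):
                continue
            with gzip.open(path, "rt", encoding="utf-8", errors="ignore") as f:
                for line in f:
                    if accept_line(line, langid):   # e.g. a cld2-based check
                        out.write(line)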
##----------------------------------------------
## tokenization
##----------------------------------------------
@@ -739,7 +758,7 @@ BPETRGMODEL = ${WORKDIR}/train/opus.trg.bpe${TRGBPESIZE:000=}k-model
.PRECIOUS: ${BPESRCMODEL} ${BPETRGMODEL}
.INTERMEDIATE: ${LOCAL_TRAIN_SRC} ${LOCAL_TRAIN_TRG}

# ${BPESRCMODEL}: ${WORKDIR}/%.bpe${SRCBPESIZE:000=}k-model: ${TMPDIR}/${LANGSTR}/%
# ${BPESRCMODEL}: ${WORKDIR}/%.bpe${SRCBPESIZE:000=}k-model: ${TMPDIR}/${LANGPAIRSTR}/%
${BPESRCMODEL}: ${LOCAL_TRAIN_SRC}
ifeq ($(wildcard ${BPESRCMODEL}),)
	mkdir -p ${dir $@}
@@ -757,7 +776,7 @@ else
endif

## no labels on the target language side
# ${BPETRGMODEL}: ${WORKDIR}/%.bpe${TRGBPESIZE:000=}k-model: ${TMPDIR}/${LANGSTR}/%
# ${BPETRGMODEL}: ${WORKDIR}/%.bpe${TRGBPESIZE:000=}k-model: ${TMPDIR}/${LANGPAIRSTR}/%
${BPETRGMODEL}: ${LOCAL_TRAIN_TRG}
ifeq ($(wildcard ${BPETRGMODEL}),)
	mkdir -p ${dir $@}
@@ -816,14 +835,19 @@ endif
## --> avoid overwriting validation/test data with new segmentation models
## if a new data set is used
SPMSRCMODEL = ${WORKDIR}/train/opus.src.spm${SRCBPESIZE:000=}k-model
SPMTRGMODEL = ${WORKDIR}/train/opus.trg.spm${SRCBPESIZE:000=}k-model
SPMTRGMODEL = ${WORKDIR}/train/opus.trg.spm${TRGBPESIZE:000=}k-model

## sentence piece model trained on monolingual data
SPMMODEL = ${SPMDIR}/${LANGSTR}/opus.spm${BPESIZE:000=}k-model
SPMSRCMONO = ${SPMDIR}/${LANGSRCSTR}/opus.spm${SRCBPESIZE:000=}k-model
SPMTRGMONO = ${SPMDIR}/${LANGTRGSTR}/opus.spm${TRGBPESIZE:000=}k-model


.PRECIOUS: ${SPMSRCMODEL} ${SPMTRGMODEL}
.PRECIOUS: ${SPMSRCMODEL} ${SPMTRGMODEL} ${SPMMODEL} ${SPMSRCMONO} ${SPMTRGMONO}



# ${SPMSRCMODEL}: ${WORKDIR}/%.spm${SRCBPESIZE:000=}k-model: ${TMPDIR}/${LANGSTR}/%
# ${SPMSRCMODEL}: ${WORKDIR}/%.spm${SRCBPESIZE:000=}k-model: ${TMPDIR}/${LANGPAIRSTR}/%
${SPMSRCMODEL}: ${LOCAL_TRAIN_SRC}
ifeq ($(wildcard ${SPMSRCMODEL}),)
	mkdir -p ${dir $@}
@@ -847,7 +871,7 @@ else
endif

## no labels on the target language side
# ${SPMTRGMODEL}: ${WORKDIR}/%.spm${TRGBPESIZE:000=}k-model: ${TMPDIR}/${LANGSTR}/%
# ${SPMTRGMODEL}: ${WORKDIR}/%.spm${TRGBPESIZE:000=}k-model: ${TMPDIR}/${LANGPAIRSTR}/%
${SPMTRGMODEL}: ${LOCAL_TRAIN_TRG}
ifeq ($(wildcard ${SPMTRGMODEL}),)
	mkdir -p ${dir $@}
@@ -865,6 +889,35 @@ endif




## sentence piece model trained on monolingual data

mono-spm-model: ${SPMMODEL}

${SPMSRCMONO}:
	${MAKE} LANGS=${SRCLANGS} BPESIZE=${SRCBPESIZE} mono-spm-model

${SPMTRGMONO}:
	${MAKE} LANGS=${TRGLANGS} BPESIZE=${TRGBPESIZE} mono-spm-model

${SPMMODEL}: ${LOCAL_MONO_DATA}.${PRE}
ifeq ($(wildcard ${SPMMODEL}),)
	mkdir -p ${dir $@}
	grep . $< > $<.text
	${SPM_HOME}/spm_train \
		--model_prefix=$@ --vocab_size=$(TRGBPESIZE) --input=$<.text \
		--character_coverage=1.0 --hard_vocab_limit=false
	mv $@.model $@
	rm -f $<.text
else
	@echo "$@ already exists!"
	@echo "WARNING! No new SPM model created!"
	@echo "WARNING! Delete the file if you want to start from scratch!"
endif




%.src.spm${SRCBPESIZE:000=}k: %.src ${SPMSRCMODEL}
ifeq ($(TRGLANGS),${firstword ${TRGLANGS}})
	${SPM_HOME}/spm_encode --model $(word 2,$^) < $< > $@
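This is the core of the new mode: one SentencePiece model per language (set) is trained on the collected monolingual data and then reused to segment the training data. A hedged Python sketch of the same two steps with the sentencepiece library (file names and the vocabulary size are placeholders; the Makefile drives the actual spm_train/spm_encode binaries):

# Sketch of training an SPM model on monolingual text and applying it,
# roughly mirroring the spm_train / spm_encode calls above.
import sentencepiece as spm

# 1) train on the non-empty monolingual lines
spm.SentencePieceTrainer.train(
    input="opus.mono.text",        # placeholder path for ${LOCAL_MONO_DATA}
    model_prefix="opus.spm32k",    # produces opus.spm32k.model / .vocab
    vocab_size=32000,              # corresponds to the BPESIZE setting
    character_coverage=1.0,
    hard_vocab_limit=False,
)

# 2) segment training data with the trained model
sp = spm.SentencePieceProcessor(model_file="opus.spm32k.model")
with open("train.src", encoding="utf-8") as fin, \
     open("train.src.spm32k", "w", encoding="utf-8") as fout:
    for line in fin:
        fout.write(" ".join(sp.encode(line.strip(), out_type=str)) + "\n")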
@@ -920,12 +973,12 @@ endif
##----------------------------------------------
## get data from local space and compress ...

${WORKDIR}/%.clean.${PRE_SRC}.gz: ${TMPDIR}/${LANGSTR}/%.clean.${PRE_SRC}
${WORKDIR}/%.clean.${PRE_SRC}.gz: ${TMPDIR}/${LANGPAIRSTR}/%.clean.${PRE_SRC}
	mkdir -p ${dir $@}
	gzip -c < $< > $@

ifneq (${PRE_SRC},${PRE_TRG})
${WORKDIR}/%.clean.${PRE_TRG}.gz: ${TMPDIR}/${LANGSTR}/%.clean.${PRE_TRG}
${WORKDIR}/%.clean.${PRE_TRG}.gz: ${TMPDIR}/${LANGPAIRSTR}/%.clean.${PRE_TRG}
	mkdir -p ${dir $@}
	gzip -c < $< > $@
endif
@@ -5,7 +5,7 @@
#

MODELSHOME = ${WORKHOME}/models
DIST_PACKAGE = ${MODELSHOME}/${LANGSTR}/${DATASET}.zip
DIST_PACKAGE = ${MODELSHOME}/${LANGPAIRSTR}/${DATASET}.zip


## minimum BLEU score for models to be accepted as distribution package
@@ -56,13 +56,13 @@ best_dist:
	@m=0;\
	s=''; \
	echo "------------------------------------------------"; \
	echo "search best model for ${LANGSTR}"; \
	echo "search best model for ${LANGPAIRSTR}"; \
	for d in ${ALT_MODEL_DIR}; do \
	  e=`ls work-$$d/${LANGSTR}/val/*.trg | xargs basename | sed 's/\.trg//'`; \
	  e=`ls work-$$d/${LANGPAIRSTR}/val/*.trg | xargs basename | sed 's/\.trg//'`; \
	  echo "evaldata = $$e"; \
	  if [ "$$e" != "GNOME" ]; then \
	    if ls work-$$d/${LANGSTR}/$$e*.eval 1> /dev/null 2>&1; then \
	      b=`grep 'BLEU+' work-$$d/${LANGSTR}/$$e*.eval | cut -f3 -d' '`; \
	    if ls work-$$d/${LANGPAIRSTR}/$$e*.eval 1> /dev/null 2>&1; then \
	      b=`grep 'BLEU+' work-$$d/${LANGPAIRSTR}/$$e*.eval | cut -f3 -d' '`; \
	    if (( $$(echo "$$m-$$b < 1" |bc -l) )); then \
	      echo "$$d ($$b) is better or not much worse than $$s ($$m)!"; \
	      m=$$b; \
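best_dist walks the alternative work directories, reads the BLEU score out of each *.eval file, and takes a candidate whenever its score is within 1 BLEU of the best seen so far. A minimal Python sketch of that selection rule (the directory layout and the eval-file format are assumptions based on the recipe above):

# Sketch of the best_dist selection logic: prefer candidates that are
# better, or not much worse (within 1.0 BLEU), than the current best.
import glob
import re

def best_model_dir(model_dirs, langpairstr):
    best_dir, best_bleu = None, 0.0
    for d in model_dirs:
        for eval_file in glob.glob(f"work-{d}/{langpairstr}/*.eval"):
            with open(eval_file, encoding="utf-8") as f:
                m = re.search(r"BLEU\+\S*\s*=\s*([0-9.]+)", f.read())
            if not m:
                continue
            bleu = float(m.group(1))
            if best_bleu - bleu < 1.0:   # better or not much worse
                best_dir, best_bleu = d, bleu
    return best_dir, best_bleu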
@@ -136,10 +136,10 @@ endif
	echo '* a sentence initial language token is required in the form of `>>id<<` (id = valid target language ID)' \
		>> ${WORKDIR}/README.md; \
	fi
	@echo "* download: [$(notdir ${@:.zip=})-${DATE}.zip](${MODELS_URL}/${LANGSTR}/$(notdir ${@:.zip=})-${DATE}.zip)" >> ${WORKDIR}/README.md
	@echo "* download: [$(notdir ${@:.zip=})-${DATE}.zip](${MODELS_URL}/${LANGPAIRSTR}/$(notdir ${@:.zip=})-${DATE}.zip)" >> ${WORKDIR}/README.md
	if [ -e $(TEST_EVALUATION) ]; then \
	  echo "* test set translations: [$(notdir ${@:.zip=})-${DATE}.test.txt](${MODELS_URL}/${LANGSTR}/$(notdir ${@:.zip=})-${DATE}.test.txt)" >> ${WORKDIR}/README.md; \
	  echo "* test set scores: [$(notdir ${@:.zip=})-${DATE}.eval.txt](${MODELS_URL}/${LANGSTR}/$(notdir ${@:.zip=})-${DATE}.eval.txt)" >> ${WORKDIR}/README.md; \
	  echo "* test set translations: [$(notdir ${@:.zip=})-${DATE}.test.txt](${MODELS_URL}/${LANGPAIRSTR}/$(notdir ${@:.zip=})-${DATE}.test.txt)" >> ${WORKDIR}/README.md; \
	  echo "* test set scores: [$(notdir ${@:.zip=})-${DATE}.eval.txt](${MODELS_URL}/${LANGPAIRSTR}/$(notdir ${@:.zip=})-${DATE}.eval.txt)" >> ${WORKDIR}/README.md; \
	  echo '' >> ${WORKDIR}/README.md; \
	  echo '## Benchmarks' >> ${WORKDIR}/README.md; \
	  echo '' >> ${WORKDIR}/README.md; \
@@ -55,14 +55,14 @@ LOADGPU = module load ${GPU_MODULES}

ifeq (${shell hostname},dx6-ibs-p2)
APPLHOME = /opt/tools
WORKHOME = ${shell realpath ${PWD}/work-filter}
WORKHOME = ${shell realpath ${PWD}/work-langid}
OPUSHOME = tiedeman@taito.csc.fi:/proj/nlpl/data/OPUS/
MOSESHOME = ${APPLHOME}/mosesdecoder
MARIAN = ${APPLHOME}/marian/build
LOADMODS = echo "nothing to load"
else ifeq (${shell hostname},dx7-nkiel-4gpu)
APPLHOME = /opt/tools
WORKHOME = ${shell realpath ${PWD}/work-filter}
WORKHOME = ${shell realpath ${PWD}/work-langid}
OPUSHOME = tiedeman@taito.csc.fi:/proj/nlpl/data/OPUS/
MOSESHOME = ${APPLHOME}/mosesdecoder
MARIAN = ${APPLHOME}/marian/build
@@ -71,7 +71,7 @@ else ifneq ($(wildcard /wrk/tiedeman/research),)
DATAHOME = /proj/OPUS/WMT19/data/${LANGPAIR}
# APPLHOME = ${USERAPPL}/tools
APPLHOME = /proj/memad/tools
WORKHOME = /wrk/tiedeman/research/Opus-MT/work-filter
WORKHOME = /wrk/tiedeman/research/Opus-MT/work-langid
OPUSHOME = /proj/nlpl/data/OPUS
MOSESHOME = /proj/nlpl/software/moses/4.0-65c75ff/moses
# MARIAN = /proj/nlpl/software/marian/1.2.0
@@ -83,7 +83,7 @@ else
CSCPROJECT = project_2001194
# CSCPROJECT = project_2000309
DATAHOME = ${HOME}/work/opentrans/data/${LANGPAIR}
WORKHOME = ${shell realpath ${PWD}/work-filter}
WORKHOME = ${shell realpath ${PWD}/work-langid}
APPLHOME = ${HOME}/projappl
# OPUSHOME = /scratch/project_2000661/nlpl/data/OPUS
OPUSHOME = /projappl/nlpl/data/OPUS
@@ -131,6 +131,16 @@ endif
	  PRE_TRG=spm${TRGBPESIZE:000=}k \
	${@:-spm=}

## with SPM models trained on monolingual data
%-monospm: ${SPMSRCMONO} ${SPMTRGMONO}
	${MAKE} WORKHOME=${shell realpath ${PWD}/work-monospm} \
	  SPMSRCMODEL=${SPMSRCMONO} \
	  SPMTRGMODEL=${SPMTRGMONO} \
	  PRE_SRC=spm${SRCBPESIZE:000=}k \
	  PRE_TRG=spm${TRGBPESIZE:000=}k \
	${@:-monospm=}


%-spm-noalign:
	${MAKE} WORKHOME=${shell realpath ${PWD}/work-spm-noalign} \
	  MODELTYPE=transformer \
@@ -141,12 +151,21 @@ endif


## sentence-piece models with langid-filtering (new default)
%-filter:
	${MAKE} WORKHOME=${shell realpath ${PWD}/work-filter} \
%-langid:
	${MAKE} WORKHOME=${shell realpath ${PWD}/work-langid} \
	  PRE=simple \
	  PRE_SRC=spm${SRCBPESIZE:000=}k \
	  PRE_TRG=spm${TRGBPESIZE:000=}k \
	${@:-spm=}
	${@:-langid=}

## sentence-piece models with langid-filtering (new default)
%-langid-noalign:
	${MAKE} WORKHOME=${shell realpath ${PWD}/work-langid} \
	  MODELTYPE=transformer \
	  PRE=simple \
	  PRE_SRC=spm${SRCBPESIZE:000=}k \
	  PRE_TRG=spm${TRGBPESIZE:000=}k \
	${@:-langid-noalign=}


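These pattern rules implement the different preprocessing modes: a goal such as `en-fi-monospm` re-invokes make on the base goal (`en-fi`) with the work directory and SentencePiece models swapped to the monolingual ones, which is what `${@:-monospm=}` does by stripping the suffix from the target name. A small Python sketch of that dispatch idea (the override values here are illustrative only):

# Sketch of the suffix-based mode dispatch used by the %-monospm / %-langid rules.
MODES = {
    # mode suffix -> make variable overrides (illustrative values)
    "-monospm": {"WORKHOME": "work-monospm",
                 "SPMSRCMODEL": "mono source SPM", "SPMTRGMODEL": "mono target SPM"},
    "-langid":  {"WORKHOME": "work-langid", "PRE": "simple"},
}

def dispatch(target: str):
    """Split 'en-fi-monospm' into the base goal 'en-fi' plus its overrides."""
    for suffix, overrides in MODES.items():
        if target.endswith(suffix):
            return target[:-len(suffix)], overrides   # like ${@:-monospm=}
    return target, {}

print(dispatch("en-fi-monospm"))
# ('en-fi', {'WORKHOME': 'work-monospm', ...})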
@@ -79,19 +79,19 @@ all2pivot:
## --> change WORKSPACE, MEM, nr of GPUs, validation frequency, stopping criterion

train-dynamic:
	if [ ! -e "${WORKHOME}/${LANGSTR}/train.submit" ]; then \
	if [ ! -e "${WORKHOME}/${LANGPAIRSTR}/train.submit" ]; then \
	  ${MAKE} data; \
	  s=`zcat ${TRAIN_SRC}.clean.${PRE_SRC}${TRAINSIZE}.gz | head -10000001 | wc -l`; \
	  if [ $$s -gt 10000000 ]; then \
	    echo "${LANGSTR} bigger than 10 million"; \
	    echo "${LANGPAIRSTR} bigger than 10 million"; \
	    ${MAKE} HPC_CORES=1 HPC_MEM=8g train.submit-multigpu; \
	  elif [ $$s -gt 1000000 ]; then \
	    echo "${LANGSTR} bigger than 1 million"; \
	    echo "${LANGPAIRSTR} bigger than 1 million"; \
	    ${MAKE} \
	      MARIAN_VALID_FREQ=2500 \
	      HPC_CORES=1 HPC_MEM=4g train.submit; \
	  elif [ $$s -gt 100000 ]; then \
	    echo "${LANGSTR} bigger than 100k"; \
	    echo "${LANGPAIRSTR} bigger than 100k"; \
	    ${MAKE} \
	      MARIAN_VALID_FREQ=1000 \
	      MARIAN_WORKSPACE=5000 \
@@ -99,7 +99,7 @@ train-dynamic:
	      MARIAN_EARLY_STOPPING=5 \
	      HPC_CORES=1 HPC_MEM=4g train.submit; \
	  elif [ $$s -gt 10000 ]; then \
	    echo "${LANGSTR} bigger than 10k"; \
	    echo "${LANGPAIRSTR} bigger than 10k"; \
	    ${MAKE} \
	      MARIAN_WORKSPACE=3500 \
	      MARIAN_VALID_MINI_BATCH=4 \
@@ -108,7 +108,7 @@ train-dynamic:
	      MARIAN_EARLY_STOPPING=5 \
	      HPC_CORES=1 HPC_MEM=4g train.submit; \
	  else \
	    echo "${LANGSTR} too small"; \
	    echo "${LANGPAIRSTR} too small"; \
	  fi \
	fi

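train-dynamic counts the cleaned source-side training lines (capping the count just above ten million) and scales the Marian and SLURM settings with corpus size. A condensed Python sketch of that policy (the thresholds come from the recipe above; the returned keys are illustrative):

# Sketch of the corpus-size-dependent settings chosen by train-dynamic.
def training_settings(num_lines: int) -> dict:
    if num_lines > 10_000_000:
        return {"target": "train.submit-multigpu", "HPC_MEM": "8g"}
    if num_lines > 1_000_000:
        return {"target": "train.submit", "MARIAN_VALID_FREQ": 2500, "HPC_MEM": "4g"}
    if num_lines > 100_000:
        return {"target": "train.submit", "MARIAN_VALID_FREQ": 1000,
                "MARIAN_WORKSPACE": 5000, "MARIAN_EARLY_STOPPING": 5, "HPC_MEM": "4g"}
    if num_lines > 10_000:
        return {"target": "train.submit", "MARIAN_WORKSPACE": 3500,
                "MARIAN_VALID_MINI_BATCH": 4, "MARIAN_EARLY_STOPPING": 5, "HPC_MEM": "4g"}
    return {}   # "too small": no training job is submitted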
@@ -120,18 +120,18 @@ bilingual-dynamic: train-dynamic
	fi

# bilingual-dynamic:
#	if [ ! -e "${WORKHOME}/${LANGSTR}/train.submit" ]; then \
#	if [ ! -e "${WORKHOME}/${LANGPAIRSTR}/train.submit" ]; then \
#	  ${MAKE} data; \
#	  s=`zcat ${WORKHOME}/${LANGSTR}/train/*.src.clean.${PRE_SRC}.gz | head -10000001 | wc -l`; \
#	  s=`zcat ${WORKHOME}/${LANGPAIRSTR}/train/*.src.clean.${PRE_SRC}.gz | head -10000001 | wc -l`; \
#	  if [ $$s -gt 10000000 ]; then \
#	    echo "${LANGSTR} bigger than 10 million"; \
#	    echo "${LANGPAIRSTR} bigger than 10 million"; \
#	    ${MAKE} HPC_CORES=1 HPC_MEM=8g train.submit-multigpu; \
#	    if [ "${SRCLANGS}" != "${TRGLANGS}" ]; then \
#	      ${MAKE} reverse-data-spm; \
#	      ${MAKE} TRGLANGS="${SRCLANGS}" SRCLANGS='${TRGLANGS}' HPC_CORES=1 HPC_MEM=8g train.submit-multigpu; \
#	    fi; \
#	  elif [ $$s -gt 1000000 ]; then \
#	    echo "${LANGSTR} bigger than 1 million"; \
#	    echo "${LANGPAIRSTR} bigger than 1 million"; \
#	    ${MAKE} \
#	      MARIAN_VALID_FREQ=2500 \
#	      HPC_CORES=1 HPC_MEM=4g train.submit; \
@@ -142,7 +142,7 @@ bilingual-dynamic: train-dynamic
#	      HPC_CORES=1 HPC_MEM=4g train.submit; \
#	    fi; \
#	  elif [ $$s -gt 100000 ]; then \
#	    echo "${LANGSTR} bigger than 100k"; \
#	    echo "${LANGPAIRSTR} bigger than 100k"; \
#	    ${MAKE} \
#	      MARIAN_VALID_FREQ=1000 \
#	      MARIAN_WORKSPACE=5000 \
@@ -159,7 +159,7 @@ bilingual-dynamic: train-dynamic
#	      HPC_CORES=1 HPC_MEM=4g train.submit; \
#	    fi; \
#	  elif [ $$s -gt 10000 ]; then \
#	    echo "${LANGSTR} bigger than 10k"; \
#	    echo "${LANGPAIRSTR} bigger than 10k"; \
#	    ${MAKE} \
#	      MARIAN_WORKSPACE=3500 \
#	      MARIAN_VALID_MINI_BATCH=4 \
@@ -178,7 +178,7 @@ bilingual-dynamic: train-dynamic
#	      HPC_CORES=1 HPC_MEM=4g train.submit; \
#	    fi; \
#	  else \
#	    echo "${LANGSTR} too small"; \
#	    echo "${LANGPAIRSTR} too small"; \
#	  fi \
#	fi

@@ -11,6 +11,10 @@ parser.add_argument('-s','--srclang','--source-language', type=str, default='en'
                    help='accepted language')
parser.add_argument('-t','--trglang','--target-language', type=str, default='de',
                    help='accepted language')
parser.add_argument('-l','--supported','--supported-languages', action='store_true',
                    help='list all supported languages')
parser.add_argument('-c','--checklang','--check-language-support', action='store_true',
                    help='show whether languages are supported')
args = parser.parse_args()


@@ -30,11 +34,28 @@ def is_accepted(line,accept,reject):
        if isReliable:
            return True
        else:
            if details[0][1] != 'un':
                if details[0][1] != reject:
                    return True


if args.supported:
    print(cld2.LANGUAGES)
    quit()


if args.checklang:
    if args.srclang:
        if supported_language(args.srclang):
            print(args.srclang + " is supported")
        else:
            print(args.srclang + " is not supported")
    if args.trglang:
        if supported_language(args.trglang):
            print(args.trglang + " is supported")
        else:
            print(args.trglang + " is not supported")
    quit()


if not supported_language(args.srclang):
    # print(args.srclang + " is not supported")
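The filter scripts rely on compact language detection (the cld2 bindings imported as `cld2`): a line is kept when detection is reliable and matches the accepted language, or when an unreliable best guess is neither 'un' (unknown) nor the rejected language. Since the diff only shows part of is_accepted, the function below is a hedged reconstruction of that acceptance test, not the exact original:

# Reconstruction of the cld2-based acceptance check sketched in the diff above.
import pycld2 as cld2

def is_accepted(line: str, accept: str, reject: str) -> bool:
    """Keep a line if cld2 plausibly identifies it as the accepted language."""
    isReliable, textBytesFound, details = cld2.detect(line)
    # details is a tuple of (languageName, languageCode, percent, score) entries
    lang = details[0][1]
    if lang == accept and isReliable:
        return True
    # unreliable guess: tolerate it unless it is unknown or the rejected language
    if not isReliable and lang != 'un' and lang != reject:
        return True
    return False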
@@ -10,6 +10,10 @@ import sys
parser = argparse.ArgumentParser(description='language filter')
parser.add_argument('-l','--lang','--language', type=str, default='en',
                    help='accepted language')
parser.add_argument('-s','--supported','--supported-languages', action='store_true',
                    help='list all supported languages')
parser.add_argument('-c','--checklang','--check-language-support', action='store_true',
                    help='show whether languages are supported')
args = parser.parse_args()

def supported_language(lang):
@@ -38,7 +42,6 @@ def is_accepted(line,accept,reject):
            # print(details, file=sys.stderr)
            # print(line, file=sys.stderr)
        else:
            if details[0][1] != 'un':
                if details[0][1] != reject:
                    # print("ACCEPT")
                    # print(details)
@@ -47,13 +50,23 @@ def is_accepted(line,accept,reject):
                    # print("REJECT", file=sys.stderr)
                    # print(details, file=sys.stderr)
                    # print(line, file=sys.stderr)
            # else:
            #     print("REJECT", file=sys.stderr)
            #     print(details, file=sys.stderr)
            #     print(line, file=sys.stderr)



if args.supported:
    print(cld2.LANGUAGES)
    quit()


if args.checklang:
    if args.lang:
        if supported_language(args.lang):
            print(args.lang + " is supported")
        else:
            print(args.lang + " is not supported")
    quit()


if not supported_language(args.lang):
    # print(args.lang + " is not supported")
    reject = 'en'
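mono-match-lang.py is used as a stdin/stdout filter in the mono-data recipe above (`zcat ... | python3 mono-match-lang.py -l ${LANGID}`). A hedged sketch of that filtering loop, reusing the acceptance idea from the previous snippet (the exact original main loop is not shown in this diff, and the default reject language is an assumption):

# Hypothetical main loop of a stdin-to-stdout language filter like mono-match-lang.py.
import sys
import pycld2 as cld2

def keep(line: str, accept: str, reject: str) -> bool:
    isReliable, _, details = cld2.detect(line)
    lang = details[0][1]
    if lang == accept and isReliable:
        return True
    return not isReliable and lang not in ('un', reject)

if __name__ == "__main__":
    accept, reject = "fi", "en"        # e.g. invoked as: mono-match-lang.py -l fi
    for line in sys.stdin:
        if keep(line.rstrip("\n"), accept, reject):
            sys.stdout.write(line)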