mirror of
https://github.com/Helsinki-NLP/OPUS-MT-train.git
synced 2024-09-11 20:27:19 +03:00
new mode: SentencePieceModels trained on monolingual data
This commit is contained in:
parent
ee8c27e3db
commit
811815064b
@ -18,11 +18,26 @@ ifndef TRG
|
|||||||
endif
|
endif
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# sorted languages and langpair used to match resources in OPUS
|
# sorted languages and langpair used to match resources in OPUS
|
||||||
SORTLANGS = $(sort ${SRC} ${TRG})
|
SORTLANGS = $(sort ${SRC} ${TRG})
|
||||||
SPACE = $(empty) $(empty)
|
SPACE = $(empty) $(empty)
|
||||||
LANGPAIR = ${firstword ${SORTLANGS}}-${lastword ${SORTLANGS}}
|
LANGPAIR = ${firstword ${SORTLANGS}}-${lastword ${SORTLANGS}}
|
||||||
LANGSTR = ${subst ${SPACE},+,$(SRCLANGS)}-${subst ${SPACE},+,$(TRGLANGS)}
|
LANGSRCSTR = ${subst ${SPACE},+,$(SRCLANGS)}
|
||||||
|
LANGTRGSTR = ${subst ${SPACE},+,$(TRGLANGS)}
|
||||||
|
LANGPAIRSTR = ${LANGSRCSTR}-${LANGTRGSTR}
|
||||||
|
|
||||||
|
|
||||||
|
## for monolingual things
|
||||||
|
ifndef LANGS
|
||||||
|
LANGS := ${SRCLANGS}
|
||||||
|
endif
|
||||||
|
ifndef LANGID
|
||||||
|
LANGID := ${firstword ${LANGS}}
|
||||||
|
endif
|
||||||
|
ifndef LANGSTR
|
||||||
|
LANGSTR = ${subst ${SPACE},+,$(LANGS)}
|
||||||
|
endif
|
||||||
|
|
||||||
|
|
||||||
## for same language pairs: add numeric extension
|
## for same language pairs: add numeric extension
|
||||||
@ -43,6 +58,11 @@ OPUSCORPORA = ${patsubst %/latest/xml/${LANGPAIR}.xml.gz,%,\
|
|||||||
${patsubst ${OPUSHOME}/%,%,\
|
${patsubst ${OPUSHOME}/%,%,\
|
||||||
${shell ls ${OPUSHOME}/*/latest/xml/${LANGPAIR}.xml.gz}}}
|
${shell ls ${OPUSHOME}/*/latest/xml/${LANGPAIR}.xml.gz}}}
|
||||||
|
|
||||||
|
## monolingual data
|
||||||
|
OPUSMONOCORPORA = ${patsubst %/latest/mono/${LANGID}.txt.gz,%,\
|
||||||
|
${patsubst ${OPUSHOME}/%,%,\
|
||||||
|
${shell ls ${OPUSHOME}/*/latest/mono/${LANGID}.txt.gz}}}
|
||||||
|
|
||||||
|
|
||||||
ALL_LANG_PAIRS = ${shell ls ${WORKHOME} | grep -- '-' | grep -v old}
|
ALL_LANG_PAIRS = ${shell ls ${WORKHOME} | grep -- '-' | grep -v old}
|
||||||
ALL_BILINGUAL_MODELS = ${shell echo '${ALL_LANG_PAIRS}' | tr ' ' "\n" | grep -v -- '\+'}
|
ALL_BILINGUAL_MODELS = ${shell echo '${ALL_LANG_PAIRS}' | tr ' ' "\n" | grep -v -- '\+'}
|
||||||
@ -119,7 +139,7 @@ EXTRA_TRAINSET =
|
|||||||
TESTSET = ${DEVSET}
|
TESTSET = ${DEVSET}
|
||||||
TRAINSET = $(filter-out WMT-News ${DEVSET} ${TESTSET},${OPUSCORPORA} ${EXTRA_TRAINSET})
|
TRAINSET = $(filter-out WMT-News ${DEVSET} ${TESTSET},${OPUSCORPORA} ${EXTRA_TRAINSET})
|
||||||
TUNESET = OpenSubtitles
|
TUNESET = OpenSubtitles
|
||||||
|
MONOSET = $(filter-out WMT-News ${DEVSET} ${TESTSET},${OPUSMONOCORPORA} ${EXTRA_TRAINSET})
|
||||||
|
|
||||||
## 1 = use remaining data from dev/test data for training
|
## 1 = use remaining data from dev/test data for training
|
||||||
USE_REST_DEVDATA = 1
|
USE_REST_DEVDATA = 1
|
||||||
@ -177,8 +197,8 @@ endif
|
|||||||
## WORKDIR = directory used for training
|
## WORKDIR = directory used for training
|
||||||
|
|
||||||
DATADIR = ${WORKHOME}/data
|
DATADIR = ${WORKHOME}/data
|
||||||
WORKDIR = ${WORKHOME}/${LANGSTR}
|
WORKDIR = ${WORKHOME}/${LANGPAIRSTR}
|
||||||
|
SPMDIR = ${WORKHOME}/SentencePieceModels
|
||||||
|
|
||||||
## data sets
|
## data sets
|
||||||
TRAIN_BASE = ${WORKDIR}/train/${DATASET}
|
TRAIN_BASE = ${WORKDIR}/train/${DATASET}
|
||||||
@ -187,8 +207,9 @@ TRAIN_TRG = ${TRAIN_BASE}.trg
|
|||||||
TRAIN_ALG = ${TRAIN_BASE}${TRAINSIZE}.${PRE_SRC}-${PRE_TRG}.src-trg.alg.gz
|
TRAIN_ALG = ${TRAIN_BASE}${TRAINSIZE}.${PRE_SRC}-${PRE_TRG}.src-trg.alg.gz
|
||||||
|
|
||||||
## training data in local space
|
## training data in local space
|
||||||
LOCAL_TRAIN_SRC = ${TMPDIR}/${LANGSTR}/train/${DATASET}.src
|
LOCAL_TRAIN_SRC = ${TMPDIR}/${LANGPAIRSTR}/train/${DATASET}.src
|
||||||
LOCAL_TRAIN_TRG = ${TMPDIR}/${LANGSTR}/train/${DATASET}.trg
|
LOCAL_TRAIN_TRG = ${TMPDIR}/${LANGPAIRSTR}/train/${DATASET}.trg
|
||||||
|
LOCAL_MONO_DATA = ${TMPDIR}/${LANGSTR}/train/${DATASET}.mono
|
||||||
|
|
||||||
TUNE_SRC = ${WORKDIR}/tune/${TUNESET}.src
|
TUNE_SRC = ${WORKDIR}/tune/${TUNESET}.src
|
||||||
TUNE_TRG = ${WORKDIR}/tune/${TUNESET}.trg
|
TUNE_TRG = ${WORKDIR}/tune/${TUNESET}.trg
|
||||||
|
@ -99,6 +99,9 @@ clean-data:
|
|||||||
|
|
||||||
clean-data-source: ${DATA_SRC} ${DATA_TRG}
|
clean-data-source: ${DATA_SRC} ${DATA_TRG}
|
||||||
|
|
||||||
|
## monolingual data sets (for sentence piece models)
|
||||||
|
mono-data: ${LOCAL_MONO_DATA}.${PRE}
|
||||||
|
|
||||||
|
|
||||||
## word alignment used for guided alignment
|
## word alignment used for guided alignment
|
||||||
|
|
||||||
@ -569,8 +572,8 @@ endif
|
|||||||
|
|
||||||
|
|
||||||
# %.clean.gz: %.gz
|
# %.clean.gz: %.gz
|
||||||
# mkdir -p ${TMPDIR}/${LANGSTR}/cleanup
|
# mkdir -p ${TMPDIR}/${LANGPAIRSTR}/cleanup
|
||||||
# gzip -cd < $< > ${TMPDIR}/${LANGSTR}/cleanup/$(notdir $@).${SRCEXT}
|
# gzip -cd < $< > ${TMPDIR}/${LANGPAIRSTR}/cleanup/$(notdir $@).${SRCEXT}
|
||||||
|
|
||||||
|
|
||||||
########################
|
########################
|
||||||
@ -605,6 +608,22 @@ endif
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
${LOCAL_MONO_DATA}.raw:
|
||||||
|
mkdir -p ${dir $@}
|
||||||
|
rm -f $@
|
||||||
|
-for l in ${LANGS}; do \
|
||||||
|
${MAKE} DATASET=${DATASET} LANGID:=$$l \
|
||||||
|
add-to-local-mono-data; \
|
||||||
|
done
|
||||||
|
|
||||||
|
add-to-local-mono-data:
|
||||||
|
for c in ${MONOSET}; do \
|
||||||
|
if [ -e ${OPUSHOME}/$$c/latest/mono/${LANGID}.txt.gz ]; then \
|
||||||
|
zcat ${OPUSHOME}/$$c/latest/mono/${LANGID}.txt.gz |\
|
||||||
|
python3 mono-match-lang.py -l ${LANGID} >> ${LOCAL_MONO_DATA}.raw; \
|
||||||
|
fi \
|
||||||
|
done
|
||||||
|
|
||||||
##----------------------------------------------
|
##----------------------------------------------
|
||||||
## tokenization
|
## tokenization
|
||||||
##----------------------------------------------
|
##----------------------------------------------
|
||||||
@ -739,7 +758,7 @@ BPETRGMODEL = ${WORKDIR}/train/opus.trg.bpe${TRGBPESIZE:000=}k-model
|
|||||||
.PRECIOUS: ${BPESRCMODEL} ${BPETRGMODEL}
|
.PRECIOUS: ${BPESRCMODEL} ${BPETRGMODEL}
|
||||||
.INTERMEDIATE: ${LOCAL_TRAIN_SRC} ${LOCAL_TRAIN_TRG}
|
.INTERMEDIATE: ${LOCAL_TRAIN_SRC} ${LOCAL_TRAIN_TRG}
|
||||||
|
|
||||||
# ${BPESRCMODEL}: ${WORKDIR}/%.bpe${SRCBPESIZE:000=}k-model: ${TMPDIR}/${LANGSTR}/%
|
# ${BPESRCMODEL}: ${WORKDIR}/%.bpe${SRCBPESIZE:000=}k-model: ${TMPDIR}/${LANGPAIRSTR}/%
|
||||||
${BPESRCMODEL}: ${LOCAL_TRAIN_SRC}
|
${BPESRCMODEL}: ${LOCAL_TRAIN_SRC}
|
||||||
ifeq ($(wildcard ${BPESRCMODEL}),)
|
ifeq ($(wildcard ${BPESRCMODEL}),)
|
||||||
mkdir -p ${dir $@}
|
mkdir -p ${dir $@}
|
||||||
@ -757,7 +776,7 @@ else
|
|||||||
endif
|
endif
|
||||||
|
|
||||||
## no labels on the target language side
|
## no labels on the target language side
|
||||||
# ${BPETRGMODEL}: ${WORKDIR}/%.bpe${TRGBPESIZE:000=}k-model: ${TMPDIR}/${LANGSTR}/%
|
# ${BPETRGMODEL}: ${WORKDIR}/%.bpe${TRGBPESIZE:000=}k-model: ${TMPDIR}/${LANGPAIRSTR}/%
|
||||||
${BPETRGMODEL}: ${LOCAL_TRAIN_TRG}
|
${BPETRGMODEL}: ${LOCAL_TRAIN_TRG}
|
||||||
ifeq ($(wildcard ${BPETRGMODEL}),)
|
ifeq ($(wildcard ${BPETRGMODEL}),)
|
||||||
mkdir -p ${dir $@}
|
mkdir -p ${dir $@}
|
||||||
@ -816,14 +835,19 @@ endif
|
|||||||
## --> avoid overwriting validation/test data with new segmentation models
|
## --> avoid overwriting validation/test data with new segmentation models
|
||||||
## if a new data set is used
|
## if a new data set is used
|
||||||
SPMSRCMODEL = ${WORKDIR}/train/opus.src.spm${SRCBPESIZE:000=}k-model
|
SPMSRCMODEL = ${WORKDIR}/train/opus.src.spm${SRCBPESIZE:000=}k-model
|
||||||
SPMTRGMODEL = ${WORKDIR}/train/opus.trg.spm${SRCBPESIZE:000=}k-model
|
SPMTRGMODEL = ${WORKDIR}/train/opus.trg.spm${TRGBPESIZE:000=}k-model
|
||||||
|
|
||||||
|
## sentence piece model trained on monolingual data
|
||||||
|
SPMMODEL = ${SPMDIR}/${LANGSTR}/opus.spm${BPESIZE:000=}k-model
|
||||||
|
SPMSRCMONO = ${SPMDIR}/${LANGSRCSTR}/opus.spm${SRCBPESIZE:000=}k-model
|
||||||
|
SPMTRGMONO = ${SPMDIR}/${LANGTRGSTR}/opus.spm${TRGBPESIZE:000=}k-model
|
||||||
|
|
||||||
|
|
||||||
.PRECIOUS: ${SPMSRCMODEL} ${SPMTRGMODEL}
|
.PRECIOUS: ${SPMSRCMODEL} ${SPMTRGMODEL} ${SPMMODEL} ${SPMSRCMONO} ${SPMTRGMONO}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# ${SPMSRCMODEL}: ${WORKDIR}/%.spm${SRCBPESIZE:000=}k-model: ${TMPDIR}/${LANGSTR}/%
|
# ${SPMSRCMODEL}: ${WORKDIR}/%.spm${SRCBPESIZE:000=}k-model: ${TMPDIR}/${LANGPAIRSTR}/%
|
||||||
${SPMSRCMODEL}: ${LOCAL_TRAIN_SRC}
|
${SPMSRCMODEL}: ${LOCAL_TRAIN_SRC}
|
||||||
ifeq ($(wildcard ${SPMSRCMODEL}),)
|
ifeq ($(wildcard ${SPMSRCMODEL}),)
|
||||||
mkdir -p ${dir $@}
|
mkdir -p ${dir $@}
|
||||||
@ -847,7 +871,7 @@ else
|
|||||||
endif
|
endif
|
||||||
|
|
||||||
## no labels on the target language side
|
## no labels on the target language side
|
||||||
# ${SPMTRGMODEL}: ${WORKDIR}/%.spm${TRGBPESIZE:000=}k-model: ${TMPDIR}/${LANGSTR}/%
|
# ${SPMTRGMODEL}: ${WORKDIR}/%.spm${TRGBPESIZE:000=}k-model: ${TMPDIR}/${LANGPAIRSTR}/%
|
||||||
${SPMTRGMODEL}: ${LOCAL_TRAIN_TRG}
|
${SPMTRGMODEL}: ${LOCAL_TRAIN_TRG}
|
||||||
ifeq ($(wildcard ${SPMTRGMODEL}),)
|
ifeq ($(wildcard ${SPMTRGMODEL}),)
|
||||||
mkdir -p ${dir $@}
|
mkdir -p ${dir $@}
|
||||||
@ -865,6 +889,35 @@ endif
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
## sentence piece model trained on monolingual data
|
||||||
|
|
||||||
|
mono-spm-model: ${SPMMODEL}
|
||||||
|
|
||||||
|
${SPMSRCMONO}:
|
||||||
|
${MAKE} LANGS=${SRCLANGS} BPESIZE=${SRCBPESIZE} mono-spm-model
|
||||||
|
|
||||||
|
${SPMTRGMONO}:
|
||||||
|
${MAKE} LANGS=${TRGLANGS} BPESIZE=${TRGBPESIZE} mono-spm-model
|
||||||
|
|
||||||
|
${SPMMODEL}: ${LOCAL_MONO_DATA}.${PRE}
|
||||||
|
ifeq ($(wildcard ${SPMMODEL}),)
|
||||||
|
mkdir -p ${dir $@}
|
||||||
|
grep . $< > $<.text
|
||||||
|
${SPM_HOME}/spm_train \
|
||||||
|
--model_prefix=$@ --vocab_size=$(TRGBPESIZE) --input=$<.text \
|
||||||
|
--character_coverage=1.0 --hard_vocab_limit=false
|
||||||
|
mv $@.model $@
|
||||||
|
rm -f $<.text
|
||||||
|
else
|
||||||
|
@echo "$@ already exists!"
|
||||||
|
@echo "WARNING! No new SPM model created!"
|
||||||
|
@echo "WARNING! Delete the file if you want to start from scratch!"
|
||||||
|
endif
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
%.src.spm${SRCBPESIZE:000=}k: %.src ${SPMSRCMODEL}
|
%.src.spm${SRCBPESIZE:000=}k: %.src ${SPMSRCMODEL}
|
||||||
ifeq ($(TRGLANGS),${firstword ${TRGLANGS}})
|
ifeq ($(TRGLANGS),${firstword ${TRGLANGS}})
|
||||||
${SPM_HOME}/spm_encode --model $(word 2,$^) < $< > $@
|
${SPM_HOME}/spm_encode --model $(word 2,$^) < $< > $@
|
||||||
@ -920,12 +973,12 @@ endif
|
|||||||
##----------------------------------------------
|
##----------------------------------------------
|
||||||
## get data from local space and compress ...
|
## get data from local space and compress ...
|
||||||
|
|
||||||
${WORKDIR}/%.clean.${PRE_SRC}.gz: ${TMPDIR}/${LANGSTR}/%.clean.${PRE_SRC}
|
${WORKDIR}/%.clean.${PRE_SRC}.gz: ${TMPDIR}/${LANGPAIRSTR}/%.clean.${PRE_SRC}
|
||||||
mkdir -p ${dir $@}
|
mkdir -p ${dir $@}
|
||||||
gzip -c < $< > $@
|
gzip -c < $< > $@
|
||||||
|
|
||||||
ifneq (${PRE_SRC},${PRE_TRG})
|
ifneq (${PRE_SRC},${PRE_TRG})
|
||||||
${WORKDIR}/%.clean.${PRE_TRG}.gz: ${TMPDIR}/${LANGSTR}/%.clean.${PRE_TRG}
|
${WORKDIR}/%.clean.${PRE_TRG}.gz: ${TMPDIR}/${LANGPAIRSTR}/%.clean.${PRE_TRG}
|
||||||
mkdir -p ${dir $@}
|
mkdir -p ${dir $@}
|
||||||
gzip -c < $< > $@
|
gzip -c < $< > $@
|
||||||
endif
|
endif
|
||||||
|
@ -5,7 +5,7 @@
|
|||||||
#
|
#
|
||||||
|
|
||||||
MODELSHOME = ${WORKHOME}/models
|
MODELSHOME = ${WORKHOME}/models
|
||||||
DIST_PACKAGE = ${MODELSHOME}/${LANGSTR}/${DATASET}.zip
|
DIST_PACKAGE = ${MODELSHOME}/${LANGPAIRSTR}/${DATASET}.zip
|
||||||
|
|
||||||
|
|
||||||
## minimum BLEU score for models to be accepted as distribution package
|
## minimum BLEU score for models to be accepted as distribution package
|
||||||
@ -56,13 +56,13 @@ best_dist:
|
|||||||
@m=0;\
|
@m=0;\
|
||||||
s=''; \
|
s=''; \
|
||||||
echo "------------------------------------------------"; \
|
echo "------------------------------------------------"; \
|
||||||
echo "search best model for ${LANGSTR}"; \
|
echo "search best model for ${LANGPAIRSTR}"; \
|
||||||
for d in ${ALT_MODEL_DIR}; do \
|
for d in ${ALT_MODEL_DIR}; do \
|
||||||
e=`ls work-$$d/${LANGSTR}/val/*.trg | xargs basename | sed 's/\.trg//'`; \
|
e=`ls work-$$d/${LANGPAIRSTR}/val/*.trg | xargs basename | sed 's/\.trg//'`; \
|
||||||
echo "evaldata = $$e"; \
|
echo "evaldata = $$e"; \
|
||||||
if [ "$$e" != "GNOME" ]; then \
|
if [ "$$e" != "GNOME" ]; then \
|
||||||
if ls work-$$d/${LANGSTR}/$$e*.eval 1> /dev/null 2>&1; then \
|
if ls work-$$d/${LANGPAIRSTR}/$$e*.eval 1> /dev/null 2>&1; then \
|
||||||
b=`grep 'BLEU+' work-$$d/${LANGSTR}/$$e*.eval | cut -f3 -d' '`; \
|
b=`grep 'BLEU+' work-$$d/${LANGPAIRSTR}/$$e*.eval | cut -f3 -d' '`; \
|
||||||
if (( $$(echo "$$m-$$b < 1" |bc -l) )); then \
|
if (( $$(echo "$$m-$$b < 1" |bc -l) )); then \
|
||||||
echo "$$d ($$b) is better or not much worse than $$s ($$m)!"; \
|
echo "$$d ($$b) is better or not much worse than $$s ($$m)!"; \
|
||||||
m=$$b; \
|
m=$$b; \
|
||||||
@ -136,10 +136,10 @@ endif
|
|||||||
echo '* a sentence initial language token is required in the form of `>>id<<` (id = valid target language ID)' \
|
echo '* a sentence initial language token is required in the form of `>>id<<` (id = valid target language ID)' \
|
||||||
>> ${WORKDIR}/README.md; \
|
>> ${WORKDIR}/README.md; \
|
||||||
fi
|
fi
|
||||||
@echo "* download: [$(notdir ${@:.zip=})-${DATE}.zip](${MODELS_URL}/${LANGSTR}/$(notdir ${@:.zip=})-${DATE}.zip)" >> ${WORKDIR}/README.md
|
@echo "* download: [$(notdir ${@:.zip=})-${DATE}.zip](${MODELS_URL}/${LANGPAIRSTR}/$(notdir ${@:.zip=})-${DATE}.zip)" >> ${WORKDIR}/README.md
|
||||||
if [ -e $(TEST_EVALUATION) ]; then \
|
if [ -e $(TEST_EVALUATION) ]; then \
|
||||||
echo "* test set translations: [$(notdir ${@:.zip=})-${DATE}.test.txt](${MODELS_URL}/${LANGSTR}/$(notdir ${@:.zip=})-${DATE}.test.txt)" >> ${WORKDIR}/README.md; \
|
echo "* test set translations: [$(notdir ${@:.zip=})-${DATE}.test.txt](${MODELS_URL}/${LANGPAIRSTR}/$(notdir ${@:.zip=})-${DATE}.test.txt)" >> ${WORKDIR}/README.md; \
|
||||||
echo "* test set scores: [$(notdir ${@:.zip=})-${DATE}.eval.txt](${MODELS_URL}/${LANGSTR}/$(notdir ${@:.zip=})-${DATE}.eval.txt)" >> ${WORKDIR}/README.md; \
|
echo "* test set scores: [$(notdir ${@:.zip=})-${DATE}.eval.txt](${MODELS_URL}/${LANGPAIRSTR}/$(notdir ${@:.zip=})-${DATE}.eval.txt)" >> ${WORKDIR}/README.md; \
|
||||||
echo '' >> ${WORKDIR}/README.md; \
|
echo '' >> ${WORKDIR}/README.md; \
|
||||||
echo '## Benchmarks' >> ${WORKDIR}/README.md; \
|
echo '## Benchmarks' >> ${WORKDIR}/README.md; \
|
||||||
echo '' >> ${WORKDIR}/README.md; \
|
echo '' >> ${WORKDIR}/README.md; \
|
||||||
|
@ -55,14 +55,14 @@ LOADGPU = module load ${GPU_MODULES}
|
|||||||
|
|
||||||
ifeq (${shell hostname},dx6-ibs-p2)
|
ifeq (${shell hostname},dx6-ibs-p2)
|
||||||
APPLHOME = /opt/tools
|
APPLHOME = /opt/tools
|
||||||
WORKHOME = ${shell realpath ${PWD}/work-filter}
|
WORKHOME = ${shell realpath ${PWD}/work-langid}
|
||||||
OPUSHOME = tiedeman@taito.csc.fi:/proj/nlpl/data/OPUS/
|
OPUSHOME = tiedeman@taito.csc.fi:/proj/nlpl/data/OPUS/
|
||||||
MOSESHOME = ${APPLHOME}/mosesdecoder
|
MOSESHOME = ${APPLHOME}/mosesdecoder
|
||||||
MARIAN = ${APPLHOME}/marian/build
|
MARIAN = ${APPLHOME}/marian/build
|
||||||
LOADMODS = echo "nothing to load"
|
LOADMODS = echo "nothing to load"
|
||||||
else ifeq (${shell hostname},dx7-nkiel-4gpu)
|
else ifeq (${shell hostname},dx7-nkiel-4gpu)
|
||||||
APPLHOME = /opt/tools
|
APPLHOME = /opt/tools
|
||||||
WORKHOME = ${shell realpath ${PWD}/work-filter}
|
WORKHOME = ${shell realpath ${PWD}/work-langid}
|
||||||
OPUSHOME = tiedeman@taito.csc.fi:/proj/nlpl/data/OPUS/
|
OPUSHOME = tiedeman@taito.csc.fi:/proj/nlpl/data/OPUS/
|
||||||
MOSESHOME = ${APPLHOME}/mosesdecoder
|
MOSESHOME = ${APPLHOME}/mosesdecoder
|
||||||
MARIAN = ${APPLHOME}/marian/build
|
MARIAN = ${APPLHOME}/marian/build
|
||||||
@ -71,7 +71,7 @@ else ifneq ($(wildcard /wrk/tiedeman/research),)
|
|||||||
DATAHOME = /proj/OPUS/WMT19/data/${LANGPAIR}
|
DATAHOME = /proj/OPUS/WMT19/data/${LANGPAIR}
|
||||||
# APPLHOME = ${USERAPPL}/tools
|
# APPLHOME = ${USERAPPL}/tools
|
||||||
APPLHOME = /proj/memad/tools
|
APPLHOME = /proj/memad/tools
|
||||||
WORKHOME = /wrk/tiedeman/research/Opus-MT/work-filter
|
WORKHOME = /wrk/tiedeman/research/Opus-MT/work-langid
|
||||||
OPUSHOME = /proj/nlpl/data/OPUS
|
OPUSHOME = /proj/nlpl/data/OPUS
|
||||||
MOSESHOME = /proj/nlpl/software/moses/4.0-65c75ff/moses
|
MOSESHOME = /proj/nlpl/software/moses/4.0-65c75ff/moses
|
||||||
# MARIAN = /proj/nlpl/software/marian/1.2.0
|
# MARIAN = /proj/nlpl/software/marian/1.2.0
|
||||||
@ -83,7 +83,7 @@ else
|
|||||||
CSCPROJECT = project_2001194
|
CSCPROJECT = project_2001194
|
||||||
# CSCPROJECT = project_2000309
|
# CSCPROJECT = project_2000309
|
||||||
DATAHOME = ${HOME}/work/opentrans/data/${LANGPAIR}
|
DATAHOME = ${HOME}/work/opentrans/data/${LANGPAIR}
|
||||||
WORKHOME = ${shell realpath ${PWD}/work-filter}
|
WORKHOME = ${shell realpath ${PWD}/work-langid}
|
||||||
APPLHOME = ${HOME}/projappl
|
APPLHOME = ${HOME}/projappl
|
||||||
# OPUSHOME = /scratch/project_2000661/nlpl/data/OPUS
|
# OPUSHOME = /scratch/project_2000661/nlpl/data/OPUS
|
||||||
OPUSHOME = /projappl/nlpl/data/OPUS
|
OPUSHOME = /projappl/nlpl/data/OPUS
|
||||||
|
@ -131,6 +131,16 @@ endif
|
|||||||
PRE_TRG=spm${TRGBPESIZE:000=}k \
|
PRE_TRG=spm${TRGBPESIZE:000=}k \
|
||||||
${@:-spm=}
|
${@:-spm=}
|
||||||
|
|
||||||
|
## with SPM models trained on monolingual data
|
||||||
|
%-monospm: ${SPMSRCMONO} ${SPMTRGMONO}
|
||||||
|
${MAKE} WORKHOME=${shell realpath ${PWD}/work-monospm} \
|
||||||
|
SPMSRCMODEL=${SPMSRCMONO} \
|
||||||
|
SPMTRGMODEL=${SPMTRGMONO} \
|
||||||
|
PRE_SRC=spm${SRCBPESIZE:000=}k \
|
||||||
|
PRE_TRG=spm${TRGBPESIZE:000=}k \
|
||||||
|
${@:-monospm=}
|
||||||
|
|
||||||
|
|
||||||
%-spm-noalign:
|
%-spm-noalign:
|
||||||
${MAKE} WORKHOME=${shell realpath ${PWD}/work-spm-noalign} \
|
${MAKE} WORKHOME=${shell realpath ${PWD}/work-spm-noalign} \
|
||||||
MODELTYPE=transformer \
|
MODELTYPE=transformer \
|
||||||
@ -141,12 +151,21 @@ endif
|
|||||||
|
|
||||||
|
|
||||||
## sentence-piece models with langid-filtering (new default)
|
## sentence-piece models with langid-filtering (new default)
|
||||||
%-filter:
|
%-langid:
|
||||||
${MAKE} WORKHOME=${shell realpath ${PWD}/work-filter} \
|
${MAKE} WORKHOME=${shell realpath ${PWD}/work-langid} \
|
||||||
PRE=simple \
|
PRE=simple \
|
||||||
PRE_SRC=spm${SRCBPESIZE:000=}k \
|
PRE_SRC=spm${SRCBPESIZE:000=}k \
|
||||||
PRE_TRG=spm${TRGBPESIZE:000=}k \
|
PRE_TRG=spm${TRGBPESIZE:000=}k \
|
||||||
${@:-spm=}
|
${@:-langid=}
|
||||||
|
|
||||||
|
## sentence-piece models with langid-filtering (new default)
|
||||||
|
%-langid-noalign:
|
||||||
|
${MAKE} WORKHOME=${shell realpath ${PWD}/work-langid} \
|
||||||
|
MODELTYPE=transformer \
|
||||||
|
PRE=simple \
|
||||||
|
PRE_SRC=spm${SRCBPESIZE:000=}k \
|
||||||
|
PRE_TRG=spm${TRGBPESIZE:000=}k \
|
||||||
|
${@:-langid-noalign=}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@ -79,19 +79,19 @@ all2pivot:
|
|||||||
## --> change WORKSPACE, MEM, nr of GPUs, validation frequency, stopping criterion
|
## --> change WORKSPACE, MEM, nr of GPUs, validation frequency, stopping criterion
|
||||||
|
|
||||||
train-dynamic:
|
train-dynamic:
|
||||||
if [ ! -e "${WORKHOME}/${LANGSTR}/train.submit" ]; then \
|
if [ ! -e "${WORKHOME}/${LANGPAIRSTR}/train.submit" ]; then \
|
||||||
${MAKE} data; \
|
${MAKE} data; \
|
||||||
s=`zcat ${TRAIN_SRC}.clean.${PRE_SRC}${TRAINSIZE}.gz | head -10000001 | wc -l`; \
|
s=`zcat ${TRAIN_SRC}.clean.${PRE_SRC}${TRAINSIZE}.gz | head -10000001 | wc -l`; \
|
||||||
if [ $$s -gt 10000000 ]; then \
|
if [ $$s -gt 10000000 ]; then \
|
||||||
echo "${LANGSTR} bigger than 10 million"; \
|
echo "${LANGPAIRSTR} bigger than 10 million"; \
|
||||||
${MAKE} HPC_CORES=1 HPC_MEM=8g train.submit-multigpu; \
|
${MAKE} HPC_CORES=1 HPC_MEM=8g train.submit-multigpu; \
|
||||||
elif [ $$s -gt 1000000 ]; then \
|
elif [ $$s -gt 1000000 ]; then \
|
||||||
echo "${LANGSTR} bigger than 1 million"; \
|
echo "${LANGPAIRSTR} bigger than 1 million"; \
|
||||||
${MAKE} \
|
${MAKE} \
|
||||||
MARIAN_VALID_FREQ=2500 \
|
MARIAN_VALID_FREQ=2500 \
|
||||||
HPC_CORES=1 HPC_MEM=4g train.submit; \
|
HPC_CORES=1 HPC_MEM=4g train.submit; \
|
||||||
elif [ $$s -gt 100000 ]; then \
|
elif [ $$s -gt 100000 ]; then \
|
||||||
echo "${LANGSTR} bigger than 100k"; \
|
echo "${LANGPAIRSTR} bigger than 100k"; \
|
||||||
${MAKE} \
|
${MAKE} \
|
||||||
MARIAN_VALID_FREQ=1000 \
|
MARIAN_VALID_FREQ=1000 \
|
||||||
MARIAN_WORKSPACE=5000 \
|
MARIAN_WORKSPACE=5000 \
|
||||||
@ -99,7 +99,7 @@ train-dynamic:
|
|||||||
MARIAN_EARLY_STOPPING=5 \
|
MARIAN_EARLY_STOPPING=5 \
|
||||||
HPC_CORES=1 HPC_MEM=4g train.submit; \
|
HPC_CORES=1 HPC_MEM=4g train.submit; \
|
||||||
elif [ $$s -gt 10000 ]; then \
|
elif [ $$s -gt 10000 ]; then \
|
||||||
echo "${LANGSTR} bigger than 10k"; \
|
echo "${LANGPAIRSTR} bigger than 10k"; \
|
||||||
${MAKE} \
|
${MAKE} \
|
||||||
MARIAN_WORKSPACE=3500 \
|
MARIAN_WORKSPACE=3500 \
|
||||||
MARIAN_VALID_MINI_BATCH=4 \
|
MARIAN_VALID_MINI_BATCH=4 \
|
||||||
@ -108,7 +108,7 @@ train-dynamic:
|
|||||||
MARIAN_EARLY_STOPPING=5 \
|
MARIAN_EARLY_STOPPING=5 \
|
||||||
HPC_CORES=1 HPC_MEM=4g train.submit; \
|
HPC_CORES=1 HPC_MEM=4g train.submit; \
|
||||||
else \
|
else \
|
||||||
echo "${LANGSTR} too small"; \
|
echo "${LANGPAIRSTR} too small"; \
|
||||||
fi \
|
fi \
|
||||||
fi
|
fi
|
||||||
|
|
||||||
@ -120,18 +120,18 @@ bilingual-dynamic: train-dynamic
|
|||||||
fi
|
fi
|
||||||
|
|
||||||
# bilingual-dynamic:
|
# bilingual-dynamic:
|
||||||
# if [ ! -e "${WORKHOME}/${LANGSTR}/train.submit" ]; then \
|
# if [ ! -e "${WORKHOME}/${LANGPAIRSTR}/train.submit" ]; then \
|
||||||
# ${MAKE} data; \
|
# ${MAKE} data; \
|
||||||
# s=`zcat ${WORKHOME}/${LANGSTR}/train/*.src.clean.${PRE_SRC}.gz | head -10000001 | wc -l`; \
|
# s=`zcat ${WORKHOME}/${LANGPAIRSTR}/train/*.src.clean.${PRE_SRC}.gz | head -10000001 | wc -l`; \
|
||||||
# if [ $$s -gt 10000000 ]; then \
|
# if [ $$s -gt 10000000 ]; then \
|
||||||
# echo "${LANGSTR} bigger than 10 million"; \
|
# echo "${LANGPAIRSTR} bigger than 10 million"; \
|
||||||
# ${MAKE} HPC_CORES=1 HPC_MEM=8g train.submit-multigpu; \
|
# ${MAKE} HPC_CORES=1 HPC_MEM=8g train.submit-multigpu; \
|
||||||
# if [ "${SRCLANGS}" != "${TRGLANGS}" ]; then \
|
# if [ "${SRCLANGS}" != "${TRGLANGS}" ]; then \
|
||||||
# ${MAKE} reverse-data-spm; \
|
# ${MAKE} reverse-data-spm; \
|
||||||
# ${MAKE} TRGLANGS="${SRCLANGS}" SRCLANGS='${TRGLANGS}' HPC_CORES=1 HPC_MEM=8g train.submit-multigpu; \
|
# ${MAKE} TRGLANGS="${SRCLANGS}" SRCLANGS='${TRGLANGS}' HPC_CORES=1 HPC_MEM=8g train.submit-multigpu; \
|
||||||
# fi; \
|
# fi; \
|
||||||
# elif [ $$s -gt 1000000 ]; then \
|
# elif [ $$s -gt 1000000 ]; then \
|
||||||
# echo "${LANGSTR} bigger than 1 million"; \
|
# echo "${LANGPAIRSTR} bigger than 1 million"; \
|
||||||
# ${MAKE} \
|
# ${MAKE} \
|
||||||
# MARIAN_VALID_FREQ=2500 \
|
# MARIAN_VALID_FREQ=2500 \
|
||||||
# HPC_CORES=1 HPC_MEM=4g train.submit; \
|
# HPC_CORES=1 HPC_MEM=4g train.submit; \
|
||||||
@ -142,7 +142,7 @@ bilingual-dynamic: train-dynamic
|
|||||||
# HPC_CORES=1 HPC_MEM=4g train.submit; \
|
# HPC_CORES=1 HPC_MEM=4g train.submit; \
|
||||||
# fi; \
|
# fi; \
|
||||||
# elif [ $$s -gt 100000 ]; then \
|
# elif [ $$s -gt 100000 ]; then \
|
||||||
# echo "${LANGSTR} bigger than 100k"; \
|
# echo "${LANGPAIRSTR} bigger than 100k"; \
|
||||||
# ${MAKE} \
|
# ${MAKE} \
|
||||||
# MARIAN_VALID_FREQ=1000 \
|
# MARIAN_VALID_FREQ=1000 \
|
||||||
# MARIAN_WORKSPACE=5000 \
|
# MARIAN_WORKSPACE=5000 \
|
||||||
@ -159,7 +159,7 @@ bilingual-dynamic: train-dynamic
|
|||||||
# HPC_CORES=1 HPC_MEM=4g train.submit; \
|
# HPC_CORES=1 HPC_MEM=4g train.submit; \
|
||||||
# fi; \
|
# fi; \
|
||||||
# elif [ $$s -gt 10000 ]; then \
|
# elif [ $$s -gt 10000 ]; then \
|
||||||
# echo "${LANGSTR} bigger than 10k"; \
|
# echo "${LANGPAIRSTR} bigger than 10k"; \
|
||||||
# ${MAKE} \
|
# ${MAKE} \
|
||||||
# MARIAN_WORKSPACE=3500 \
|
# MARIAN_WORKSPACE=3500 \
|
||||||
# MARIAN_VALID_MINI_BATCH=4 \
|
# MARIAN_VALID_MINI_BATCH=4 \
|
||||||
@ -178,7 +178,7 @@ bilingual-dynamic: train-dynamic
|
|||||||
# HPC_CORES=1 HPC_MEM=4g train.submit; \
|
# HPC_CORES=1 HPC_MEM=4g train.submit; \
|
||||||
# fi; \
|
# fi; \
|
||||||
# else \
|
# else \
|
||||||
# echo "${LANGSTR} too small"; \
|
# echo "${LANGPAIRSTR} too small"; \
|
||||||
# fi \
|
# fi \
|
||||||
# fi
|
# fi
|
||||||
|
|
||||||
|
@ -11,6 +11,10 @@ parser.add_argument('-s','--srclang','--source-language', type=str, default='en'
|
|||||||
help='accepted language')
|
help='accepted language')
|
||||||
parser.add_argument('-t','--trglang','--target-language', type=str, default='de',
|
parser.add_argument('-t','--trglang','--target-language', type=str, default='de',
|
||||||
help='accepted language')
|
help='accepted language')
|
||||||
|
parser.add_argument('-l','--supported','--supported-languages', action='store_true',
|
||||||
|
help='list all supported languages')
|
||||||
|
parser.add_argument('-c','--checklang','--check-language-support', action='store_true',
|
||||||
|
help='show whether languages are supported')
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
|
||||||
@ -30,11 +34,28 @@ def is_accepted(line,accept,reject):
|
|||||||
if isReliable:
|
if isReliable:
|
||||||
return True
|
return True
|
||||||
else:
|
else:
|
||||||
if details[0][1] != 'un':
|
if details[0][1] != reject:
|
||||||
if details[0][1] != reject:
|
return True
|
||||||
return True
|
|
||||||
|
|
||||||
|
|
||||||
|
if args.supported:
|
||||||
|
print(cld2.LANGUAGES)
|
||||||
|
quit()
|
||||||
|
|
||||||
|
|
||||||
|
if args.checklang:
|
||||||
|
if args.srclang:
|
||||||
|
if supported_language(args.srclang):
|
||||||
|
print(args.srclang + " is supported")
|
||||||
|
else:
|
||||||
|
print(args.srclang + " is not supported")
|
||||||
|
if args.trglang:
|
||||||
|
if supported_language(args.trglang):
|
||||||
|
print(args.trglang + " is supported")
|
||||||
|
else:
|
||||||
|
print(args.trglang + " is not supported")
|
||||||
|
quit()
|
||||||
|
|
||||||
|
|
||||||
if not supported_language(args.srclang):
|
if not supported_language(args.srclang):
|
||||||
# print(args.srclang + " is not supported")
|
# print(args.srclang + " is not supported")
|
||||||
|
@ -10,6 +10,10 @@ import sys
|
|||||||
parser = argparse.ArgumentParser(description='language filter')
|
parser = argparse.ArgumentParser(description='language filter')
|
||||||
parser.add_argument('-l','--lang','--language', type=str, default='en',
|
parser.add_argument('-l','--lang','--language', type=str, default='en',
|
||||||
help='accepted language')
|
help='accepted language')
|
||||||
|
parser.add_argument('-s','--supported','--supported-languages', action='store_true',
|
||||||
|
help='list all supported languages')
|
||||||
|
parser.add_argument('-c','--checklang','--check-language-support', action='store_true',
|
||||||
|
help='show whether languages are supported')
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
def supported_language(lang):
|
def supported_language(lang):
|
||||||
@ -38,15 +42,10 @@ def is_accepted(line,accept,reject):
|
|||||||
# print(details, file=sys.stderr)
|
# print(details, file=sys.stderr)
|
||||||
# print(line, file=sys.stderr)
|
# print(line, file=sys.stderr)
|
||||||
else:
|
else:
|
||||||
if details[0][1] != 'un':
|
if details[0][1] != reject:
|
||||||
if details[0][1] != reject:
|
# print("ACCEPT")
|
||||||
# print("ACCEPT")
|
# print(details)
|
||||||
# print(details)
|
return True
|
||||||
return True
|
|
||||||
# else:
|
|
||||||
# print("REJECT", file=sys.stderr)
|
|
||||||
# print(details, file=sys.stderr)
|
|
||||||
# print(line, file=sys.stderr)
|
|
||||||
# else:
|
# else:
|
||||||
# print("REJECT", file=sys.stderr)
|
# print("REJECT", file=sys.stderr)
|
||||||
# print(details, file=sys.stderr)
|
# print(details, file=sys.stderr)
|
||||||
@ -54,6 +53,20 @@ def is_accepted(line,accept,reject):
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
if args.supported:
|
||||||
|
print(cld2.LANGUAGES)
|
||||||
|
quit()
|
||||||
|
|
||||||
|
|
||||||
|
if args.checklang:
|
||||||
|
if args.lang:
|
||||||
|
if supported_language(args.lang):
|
||||||
|
print(args.lang + " is supported")
|
||||||
|
else:
|
||||||
|
print(args.lang + " is not supported")
|
||||||
|
quit()
|
||||||
|
|
||||||
|
|
||||||
if not supported_language(args.lang):
|
if not supported_language(args.lang):
|
||||||
# print(args.lang + " is not supported")
|
# print(args.lang + " is not supported")
|
||||||
reject = 'en'
|
reject = 'en'
|
||||||
|
Loading…
Reference in New Issue
Block a user