From 811815064b3dde11a90d24087861a58a983b0462 Mon Sep 17 00:00:00 2001
From: Joerg Tiedemann
Date: Sat, 8 Feb 2020 15:21:37 +0200
Subject: [PATCH] new mode: SentencePieceModels trained on monolingual data

---
 Makefile.config      | 39 +++++++++++++++++------
 Makefile.data        | 73 ++++++++++++++++++++++++++++++++++++++------
 Makefile.dist        | 16 +++++-----
 Makefile.env         |  8 ++---
 Makefile.generic     | 25 +++++++++++++--
 Makefile.tasks       | 26 ++++++++--------
 bitext-match-lang.py | 27 ++++++++++++++--
 mono-match-lang.py   | 31 +++++++++++++------
 8 files changed, 186 insertions(+), 59 deletions(-)

diff --git a/Makefile.config b/Makefile.config
index c216249c..8475fecb 100644
--- a/Makefile.config
+++ b/Makefile.config
@@ -18,11 +18,26 @@ ifndef TRG
 endif

+
 # sorted languages and langpair used to match resources in OPUS
-SORTLANGS = $(sort ${SRC} ${TRG})
-SPACE = $(empty) $(empty)
-LANGPAIR = ${firstword ${SORTLANGS}}-${lastword ${SORTLANGS}}
-LANGSTR = ${subst ${SPACE},+,$(SRCLANGS)}-${subst ${SPACE},+,$(TRGLANGS)}
+SORTLANGS   = $(sort ${SRC} ${TRG})
+SPACE       = $(empty) $(empty)
+LANGPAIR    = ${firstword ${SORTLANGS}}-${lastword ${SORTLANGS}}
+LANGSRCSTR  = ${subst ${SPACE},+,$(SRCLANGS)}
+LANGTRGSTR  = ${subst ${SPACE},+,$(TRGLANGS)}
+LANGPAIRSTR = ${LANGSRCSTR}-${LANGTRGSTR}
+
+
+## for monolingual things
+ifndef LANGS
+  LANGS := ${SRCLANGS}
+endif
+ifndef LANGID
+  LANGID := ${firstword ${LANGS}}
+endif
+ifndef LANGSTR
+  LANGSTR = ${subst ${SPACE},+,$(LANGS)}
+endif

 ## for same language pairs: add numeric extension
@@ -43,6 +58,11 @@
 OPUSCORPORA = ${patsubst %/latest/xml/${LANGPAIR}.xml.gz,%,\
 		${patsubst ${OPUSHOME}/%,%,\
 		${shell ls ${OPUSHOME}/*/latest/xml/${LANGPAIR}.xml.gz}}}

+## monolingual data
+OPUSMONOCORPORA = ${patsubst %/latest/mono/${LANGID}.txt.gz,%,\
+		${patsubst ${OPUSHOME}/%,%,\
+		${shell ls ${OPUSHOME}/*/latest/mono/${LANGID}.txt.gz}}}
+
 ALL_LANG_PAIRS = ${shell ls ${WORKHOME} | grep -- '-' | grep -v old}
 ALL_BILINGUAL_MODELS = ${shell echo '${ALL_LANG_PAIRS}' | tr ' ' "\n" | grep -v -- '\+'}
@@ -119,7 +139,7 @@ EXTRA_TRAINSET =
 TESTSET  = ${DEVSET}
 TRAINSET = $(filter-out WMT-News ${DEVSET} ${TESTSET},${OPUSCORPORA} ${EXTRA_TRAINSET})
 TUNESET  = OpenSubtitles
-
+MONOSET  = $(filter-out WMT-News ${DEVSET} ${TESTSET},${OPUSMONOCORPORA} ${EXTRA_TRAINSET})

 ## 1 = use remaining data from dev/test data for training
 USE_REST_DEVDATA = 1
@@ -177,8 +197,8 @@ endif
 ## WORKDIR = directory used for training

 DATADIR = ${WORKHOME}/data
-WORKDIR = ${WORKHOME}/${LANGSTR}
-
+WORKDIR = ${WORKHOME}/${LANGPAIRSTR}
+SPMDIR  = ${WORKHOME}/SentencePieceModels

 ## data sets
 TRAIN_BASE = ${WORKDIR}/train/${DATASET}
@@ -187,8 +207,9 @@ TRAIN_TRG = ${TRAIN_BASE}.trg
 TRAIN_ALG = ${TRAIN_BASE}${TRAINSIZE}.${PRE_SRC}-${PRE_TRG}.src-trg.alg.gz

 ## training data in local space
-LOCAL_TRAIN_SRC = ${TMPDIR}/${LANGSTR}/train/${DATASET}.src
-LOCAL_TRAIN_TRG = ${TMPDIR}/${LANGSTR}/train/${DATASET}.trg
+LOCAL_TRAIN_SRC = ${TMPDIR}/${LANGPAIRSTR}/train/${DATASET}.src
+LOCAL_TRAIN_TRG = ${TMPDIR}/${LANGPAIRSTR}/train/${DATASET}.trg
+LOCAL_MONO_DATA = ${TMPDIR}/${LANGSTR}/train/${DATASET}.mono

 TUNE_SRC = ${WORKDIR}/tune/${TUNESET}.src
 TUNE_TRG = ${WORKDIR}/tune/${TUNESET}.trg
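To see what the renamed variables do, here is a minimal sketch of how they resolve for a hypothetical multilingual invocation (the language choices below are illustrative, not part of this patch):

    make SRCLANGS="fi et" TRGLANGS=en mono-data

    # LANGSRCSTR  = fi+et     (SRCLANGS joined with '+')
    # LANGTRGSTR  = en
    # LANGPAIRSTR = fi+et-en  (the old bilingual LANGSTR, under a new name)
    # LANGS       = fi et     (defaults to SRCLANGS unless set explicitly)
    # LANGID      = fi        (first entry of LANGS, used to find mono corpora)
    # LANGSTR     = fi+et     (now a purely monolingual identifier)

This keeps the bilingual paths (WORKDIR, LOCAL_TRAIN_*) keyed on LANGPAIRSTR, while monolingual data and the SentencePiece models under SPMDIR are keyed on LANGSTR alone.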
diff --git a/Makefile.data b/Makefile.data
index dc46044d..d59b5322 100644
--- a/Makefile.data
+++ b/Makefile.data
@@ -99,6 +99,9 @@ clean-data:
 clean-data-source: ${DATA_SRC} ${DATA_TRG}

+## monolingual data sets (for sentence piece models)
+mono-data: ${LOCAL_MONO_DATA}.${PRE}
+

 ## word alignment used for guided alignment
@@ -569,8 +572,8 @@ endif

 # %.clean.gz: %.gz
-# 	mkdir -p ${TMPDIR}/${LANGSTR}/cleanup
-# 	gzip -cd < $< > ${TMPDIR}/${LANGSTR}/cleanup/$(notdir $@).${SRCEXT}
+# 	mkdir -p ${TMPDIR}/${LANGPAIRSTR}/cleanup
+# 	gzip -cd < $< > ${TMPDIR}/${LANGPAIRSTR}/cleanup/$(notdir $@).${SRCEXT}


 ########################
@@ -605,6 +608,22 @@ endif


+${LOCAL_MONO_DATA}.raw:
+	mkdir -p ${dir $@}
+	rm -f $@
+	-for l in ${LANGS}; do \
+	  ${MAKE} DATASET=${DATASET} LANGID:=$$l \
+		add-to-local-mono-data; \
+	done
+
+add-to-local-mono-data:
+	for c in ${MONOSET}; do \
+	  if [ -e ${OPUSHOME}/$$c/latest/mono/${LANGID}.txt.gz ]; then \
+	    zcat ${OPUSHOME}/$$c/latest/mono/${LANGID}.txt.gz |\
+	    python3 mono-match-lang.py -l ${LANGID} >> ${LOCAL_MONO_DATA}.raw; \
+	  fi; \
+	done
+
 ##----------------------------------------------
 ## tokenization
 ##----------------------------------------------
@@ -739,7 +758,7 @@ BPETRGMODEL = ${WORKDIR}/train/opus.trg.bpe${TRGBPESIZE:000=}k-model
 .PRECIOUS: ${BPESRCMODEL} ${BPETRGMODEL}
 .INTERMEDIATE: ${LOCAL_TRAIN_SRC} ${LOCAL_TRAIN_TRG}

-# ${BPESRCMODEL}: ${WORKDIR}/%.bpe${SRCBPESIZE:000=}k-model: ${TMPDIR}/${LANGSTR}/%
+# ${BPESRCMODEL}: ${WORKDIR}/%.bpe${SRCBPESIZE:000=}k-model: ${TMPDIR}/${LANGPAIRSTR}/%
 ${BPESRCMODEL}: ${LOCAL_TRAIN_SRC}
 ifeq ($(wildcard ${BPESRCMODEL}),)
 	mkdir -p ${dir $@}
@@ -757,7 +776,7 @@ else
 endif

 ## no labels on the target language side
-# ${BPETRGMODEL}: ${WORKDIR}/%.bpe${TRGBPESIZE:000=}k-model: ${TMPDIR}/${LANGSTR}/%
+# ${BPETRGMODEL}: ${WORKDIR}/%.bpe${TRGBPESIZE:000=}k-model: ${TMPDIR}/${LANGPAIRSTR}/%
 ${BPETRGMODEL}: ${LOCAL_TRAIN_TRG}
 ifeq ($(wildcard ${BPETRGMODEL}),)
 	mkdir -p ${dir $@}
@@ -816,14 +835,19 @@ endif
 ## --> avoid overwriting validation/test data with new segmentation models
 ## if a new data set is used
 SPMSRCMODEL = ${WORKDIR}/train/opus.src.spm${SRCBPESIZE:000=}k-model
-SPMTRGMODEL = ${WORKDIR}/train/opus.trg.spm${SRCBPESIZE:000=}k-model
+SPMTRGMODEL = ${WORKDIR}/train/opus.trg.spm${TRGBPESIZE:000=}k-model
+
+## sentence piece model trained on monolingual data
+SPMMODEL   = ${SPMDIR}/${LANGSTR}/opus.spm${BPESIZE:000=}k-model
+SPMSRCMONO = ${SPMDIR}/${LANGSRCSTR}/opus.spm${SRCBPESIZE:000=}k-model
+SPMTRGMONO = ${SPMDIR}/${LANGTRGSTR}/opus.spm${TRGBPESIZE:000=}k-model

-.PRECIOUS: ${SPMSRCMODEL} ${SPMTRGMODEL}
+.PRECIOUS: ${SPMSRCMODEL} ${SPMTRGMODEL} ${SPMMODEL} ${SPMSRCMONO} ${SPMTRGMONO}

-# ${SPMSRCMODEL}: ${WORKDIR}/%.spm${SRCBPESIZE:000=}k-model: ${TMPDIR}/${LANGSTR}/%
+# ${SPMSRCMODEL}: ${WORKDIR}/%.spm${SRCBPESIZE:000=}k-model: ${TMPDIR}/${LANGPAIRSTR}/%
 ${SPMSRCMODEL}: ${LOCAL_TRAIN_SRC}
 ifeq ($(wildcard ${SPMSRCMODEL}),)
 	mkdir -p ${dir $@}
@@ -847,7 +871,7 @@ else
 endif

 ## no labels on the target language side
-# ${SPMTRGMODEL}: ${WORKDIR}/%.spm${TRGBPESIZE:000=}k-model: ${TMPDIR}/${LANGSTR}/%
+# ${SPMTRGMODEL}: ${WORKDIR}/%.spm${TRGBPESIZE:000=}k-model: ${TMPDIR}/${LANGPAIRSTR}/%
 ${SPMTRGMODEL}: ${LOCAL_TRAIN_TRG}
 ifeq ($(wildcard ${SPMTRGMODEL}),)
 	mkdir -p ${dir $@}
@@ -865,6 +889,35 @@ endif


+
+## sentence piece model trained on monolingual data
+
+mono-spm-model: ${SPMMODEL}
+
+${SPMSRCMONO}:
+	${MAKE} LANGS=${SRCLANGS} BPESIZE=${SRCBPESIZE} mono-spm-model
+
+${SPMTRGMONO}:
+	${MAKE} LANGS=${TRGLANGS} BPESIZE=${TRGBPESIZE} mono-spm-model
+
+${SPMMODEL}: ${LOCAL_MONO_DATA}.${PRE}
+ifeq ($(wildcard ${SPMMODEL}),)
+	mkdir -p ${dir $@}
+	grep . $< > $<.text
+	${SPM_HOME}/spm_train \
+		--model_prefix=$@ --vocab_size=$(BPESIZE) --input=$<.text \
+		--character_coverage=1.0 --hard_vocab_limit=false
+	mv $@.model $@
+	rm -f $<.text
+else
+	@echo "$@ already exists!"
+	@echo "WARNING! No new SPM model created!"
+	@echo "WARNING! Delete the file if you want to start from scratch!"
+endif
+
+
+
 %.src.spm${SRCBPESIZE:000=}k: %.src ${SPMSRCMODEL}
 ifeq ($(TRGLANGS),${firstword ${TRGLANGS}})
 	${SPM_HOME}/spm_encode --model $(word 2,$^) < $< > $@
@@ -920,12 +973,12 @@ endif
 ##----------------------------------------------

 ## get data from local space and compress ...
-${WORKDIR}/%.clean.${PRE_SRC}.gz: ${TMPDIR}/${LANGSTR}/%.clean.${PRE_SRC}
+${WORKDIR}/%.clean.${PRE_SRC}.gz: ${TMPDIR}/${LANGPAIRSTR}/%.clean.${PRE_SRC}
 	mkdir -p ${dir $@}
 	gzip -c < $< > $@

 ifneq (${PRE_SRC},${PRE_TRG})
-${WORKDIR}/%.clean.${PRE_TRG}.gz: ${TMPDIR}/${LANGSTR}/%.clean.${PRE_TRG}
+${WORKDIR}/%.clean.${PRE_TRG}.gz: ${TMPDIR}/${LANGPAIRSTR}/%.clean.${PRE_TRG}
 	mkdir -p ${dir $@}
 	gzip -c < $< > $@
 endif
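Taken together, the new targets form a small pipeline. A minimal sketch of how they chain, assuming OPUSHOME points at a local OPUS mirror (language and vocabulary settings below are illustrative):

    # 1. collect monolingual OPUS data and keep only lines that
    #    the language detector accepts for the requested language
    make SRCLANGS=fi TRGLANGS=en mono-data

    # 2. train a SentencePiece model on that data;
    #    with BPESIZE=32000 the result lands in ${SPMDIR}/fi/opus.spm32k-model
    make LANGS=fi BPESIZE=32000 mono-spm-model

${SPMSRCMONO} and ${SPMTRGMONO} run step 2 themselves with LANGS and BPESIZE taken from the source/target side, so one monolingual model can be shared by every language pair that includes that language.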
diff --git a/Makefile.dist b/Makefile.dist
index 32202008..443d57d1 100644
--- a/Makefile.dist
+++ b/Makefile.dist
@@ -5,7 +5,7 @@
 #

 MODELSHOME = ${WORKHOME}/models
-DIST_PACKAGE = ${MODELSHOME}/${LANGSTR}/${DATASET}.zip
+DIST_PACKAGE = ${MODELSHOME}/${LANGPAIRSTR}/${DATASET}.zip

 ## minimum BLEU score for models to be accepted as distribution package
@@ -56,13 +56,13 @@ best_dist:
	@m=0;\
	s=''; \
	echo "------------------------------------------------"; \
-	echo "search best model for ${LANGSTR}"; \
+	echo "search best model for ${LANGPAIRSTR}"; \
	for d in ${ALT_MODEL_DIR}; do \
-	  e=`ls work-$$d/${LANGSTR}/val/*.trg | xargs basename | sed 's/\.trg//'`; \
+	  e=`ls work-$$d/${LANGPAIRSTR}/val/*.trg | xargs basename | sed 's/\.trg//'`; \
	  echo "evaldata = $$e"; \
	  if [ "$$e" != "GNOME" ]; then \
-	    if ls work-$$d/${LANGSTR}/$$e*.eval 1> /dev/null 2>&1; then \
-	      b=`grep 'BLEU+' work-$$d/${LANGSTR}/$$e*.eval | cut -f3 -d' '`; \
+	    if ls work-$$d/${LANGPAIRSTR}/$$e*.eval 1> /dev/null 2>&1; then \
+	      b=`grep 'BLEU+' work-$$d/${LANGPAIRSTR}/$$e*.eval | cut -f3 -d' '`; \
	      if (( $$(echo "$$m-$$b < 1" |bc -l) )); then \
	        echo "$$d ($$b) is better or not much worse than $$s ($$m)!"; \
	        m=$$b; \
@@ -136,10 +136,10 @@ endif
	  echo '* a sentence initial language token is required in the form of `>>id<<` (id = valid target language ID)' \
	  >> ${WORKDIR}/README.md; \
	fi
-	@echo "* download: [$(notdir ${@:.zip=})-${DATE}.zip](${MODELS_URL}/${LANGSTR}/$(notdir ${@:.zip=})-${DATE}.zip)" >> ${WORKDIR}/README.md
+	@echo "* download: [$(notdir ${@:.zip=})-${DATE}.zip](${MODELS_URL}/${LANGPAIRSTR}/$(notdir ${@:.zip=})-${DATE}.zip)" >> ${WORKDIR}/README.md
	if [ -e $(TEST_EVALUATION) ]; then \
-	  echo "* test set translations: [$(notdir ${@:.zip=})-${DATE}.test.txt](${MODELS_URL}/${LANGSTR}/$(notdir ${@:.zip=})-${DATE}.test.txt)" >> ${WORKDIR}/README.md; \
-	  echo "* test set scores: [$(notdir ${@:.zip=})-${DATE}.eval.txt](${MODELS_URL}/${LANGSTR}/$(notdir ${@:.zip=})-${DATE}.eval.txt)" >> ${WORKDIR}/README.md; \
+	  echo "* test set translations: [$(notdir ${@:.zip=})-${DATE}.test.txt](${MODELS_URL}/${LANGPAIRSTR}/$(notdir ${@:.zip=})-${DATE}.test.txt)" >> ${WORKDIR}/README.md; \
+	  echo "* test set scores: [$(notdir ${@:.zip=})-${DATE}.eval.txt](${MODELS_URL}/${LANGPAIRSTR}/$(notdir ${@:.zip=})-${DATE}.eval.txt)" >> ${WORKDIR}/README.md; \
	  echo '' >> ${WORKDIR}/README.md; \
	  echo '## Benchmarks' >> ${WORKDIR}/README.md; \
	  echo '' >> ${WORKDIR}/README.md; \
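The acceptance test in best_dist is easy to misread: it does not require a strictly better BLEU score, only one less than a full point below the best seen so far. Illustrative numbers (not from any actual evaluation):

    m=35.2; b=34.6
    echo "$m-$b < 1" | bc -l    # prints 1 -> $b is kept as 'not much worse'
    m=35.2; b=33.9
    echo "$m-$b < 1" | bc -l    # prints 0 -> $b is rejected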
diff --git a/Makefile.env b/Makefile.env
index 12c295cd..6bbfb622 100644
--- a/Makefile.env
+++ b/Makefile.env
@@ -55,14 +55,14 @@ LOADGPU = module load ${GPU_MODULES}

 ifeq (${shell hostname},dx6-ibs-p2)
   APPLHOME = /opt/tools
-  WORKHOME = ${shell realpath ${PWD}/work-filter}
+  WORKHOME = ${shell realpath ${PWD}/work-langid}
   OPUSHOME = tiedeman@taito.csc.fi:/proj/nlpl/data/OPUS/
   MOSESHOME = ${APPLHOME}/mosesdecoder
   MARIAN = ${APPLHOME}/marian/build
   LOADMODS = echo "nothing to load"
 else ifeq (${shell hostname},dx7-nkiel-4gpu)
   APPLHOME = /opt/tools
-  WORKHOME = ${shell realpath ${PWD}/work-filter}
+  WORKHOME = ${shell realpath ${PWD}/work-langid}
   OPUSHOME = tiedeman@taito.csc.fi:/proj/nlpl/data/OPUS/
   MOSESHOME = ${APPLHOME}/mosesdecoder
   MARIAN = ${APPLHOME}/marian/build
@@ -71,7 +71,7 @@ else ifneq ($(wildcard /wrk/tiedeman/research),)
   DATAHOME = /proj/OPUS/WMT19/data/${LANGPAIR}
 #  APPLHOME = ${USERAPPL}/tools
   APPLHOME = /proj/memad/tools
-  WORKHOME = /wrk/tiedeman/research/Opus-MT/work-filter
+  WORKHOME = /wrk/tiedeman/research/Opus-MT/work-langid
   OPUSHOME = /proj/nlpl/data/OPUS
   MOSESHOME = /proj/nlpl/software/moses/4.0-65c75ff/moses
 #  MARIAN = /proj/nlpl/software/marian/1.2.0
@@ -83,7 +83,7 @@ else
   CSCPROJECT = project_2001194
 #  CSCPROJECT = project_2000309
   DATAHOME = ${HOME}/work/opentrans/data/${LANGPAIR}
-  WORKHOME = ${shell realpath ${PWD}/work-filter}
+  WORKHOME = ${shell realpath ${PWD}/work-langid}
   APPLHOME = ${HOME}/projappl
 #  OPUSHOME = /scratch/project_2000661/nlpl/data/OPUS
   OPUSHOME = /projappl/nlpl/data/OPUS
-e "${WORKHOME}/${LANGPAIRSTR}/train.submit" ]; then \ ${MAKE} data; \ s=`zcat ${TRAIN_SRC}.clean.${PRE_SRC}${TRAINSIZE}.gz | head -10000001 | wc -l`; \ if [ $$s -gt 10000000 ]; then \ - echo "${LANGSTR} bigger than 10 million"; \ + echo "${LANGPAIRSTR} bigger than 10 million"; \ ${MAKE} HPC_CORES=1 HPC_MEM=8g train.submit-multigpu; \ elif [ $$s -gt 1000000 ]; then \ - echo "${LANGSTR} bigger than 1 million"; \ + echo "${LANGPAIRSTR} bigger than 1 million"; \ ${MAKE} \ MARIAN_VALID_FREQ=2500 \ HPC_CORES=1 HPC_MEM=4g train.submit; \ elif [ $$s -gt 100000 ]; then \ - echo "${LANGSTR} bigger than 100k"; \ + echo "${LANGPAIRSTR} bigger than 100k"; \ ${MAKE} \ MARIAN_VALID_FREQ=1000 \ MARIAN_WORKSPACE=5000 \ @@ -99,7 +99,7 @@ train-dynamic: MARIAN_EARLY_STOPPING=5 \ HPC_CORES=1 HPC_MEM=4g train.submit; \ elif [ $$s -gt 10000 ]; then \ - echo "${LANGSTR} bigger than 10k"; \ + echo "${LANGPAIRSTR} bigger than 10k"; \ ${MAKE} \ MARIAN_WORKSPACE=3500 \ MARIAN_VALID_MINI_BATCH=4 \ @@ -108,7 +108,7 @@ train-dynamic: MARIAN_EARLY_STOPPING=5 \ HPC_CORES=1 HPC_MEM=4g train.submit; \ else \ - echo "${LANGSTR} too small"; \ + echo "${LANGPAIRSTR} too small"; \ fi \ fi @@ -120,18 +120,18 @@ bilingual-dynamic: train-dynamic fi # bilingual-dynamic: -# if [ ! -e "${WORKHOME}/${LANGSTR}/train.submit" ]; then \ +# if [ ! -e "${WORKHOME}/${LANGPAIRSTR}/train.submit" ]; then \ # ${MAKE} data; \ -# s=`zcat ${WORKHOME}/${LANGSTR}/train/*.src.clean.${PRE_SRC}.gz | head -10000001 | wc -l`; \ +# s=`zcat ${WORKHOME}/${LANGPAIRSTR}/train/*.src.clean.${PRE_SRC}.gz | head -10000001 | wc -l`; \ # if [ $$s -gt 10000000 ]; then \ -# echo "${LANGSTR} bigger than 10 million"; \ +# echo "${LANGPAIRSTR} bigger than 10 million"; \ # ${MAKE} HPC_CORES=1 HPC_MEM=8g train.submit-multigpu; \ # if [ "${SRCLANGS}" != "${TRGLANGS}" ]; then \ # ${MAKE} reverse-data-spm; \ # ${MAKE} TRGLANGS="${SRCLANGS}" SRCLANGS='${TRGLANGS}' HPC_CORES=1 HPC_MEM=8g train.submit-multigpu; \ # fi; \ # elif [ $$s -gt 1000000 ]; then \ -# echo "${LANGSTR} bigger than 1 million"; \ +# echo "${LANGPAIRSTR} bigger than 1 million"; \ # ${MAKE} \ # MARIAN_VALID_FREQ=2500 \ # HPC_CORES=1 HPC_MEM=4g train.submit; \ @@ -142,7 +142,7 @@ bilingual-dynamic: train-dynamic # HPC_CORES=1 HPC_MEM=4g train.submit; \ # fi; \ # elif [ $$s -gt 100000 ]; then \ -# echo "${LANGSTR} bigger than 100k"; \ +# echo "${LANGPAIRSTR} bigger than 100k"; \ # ${MAKE} \ # MARIAN_VALID_FREQ=1000 \ # MARIAN_WORKSPACE=5000 \ @@ -159,7 +159,7 @@ bilingual-dynamic: train-dynamic # HPC_CORES=1 HPC_MEM=4g train.submit; \ # fi; \ # elif [ $$s -gt 10000 ]; then \ -# echo "${LANGSTR} bigger than 10k"; \ +# echo "${LANGPAIRSTR} bigger than 10k"; \ # ${MAKE} \ # MARIAN_WORKSPACE=3500 \ # MARIAN_VALID_MINI_BATCH=4 \ @@ -178,7 +178,7 @@ bilingual-dynamic: train-dynamic # HPC_CORES=1 HPC_MEM=4g train.submit; \ # fi; \ # else \ -# echo "${LANGSTR} too small"; \ +# echo "${LANGPAIRSTR} too small"; \ # fi \ # fi diff --git a/bitext-match-lang.py b/bitext-match-lang.py index 64cec7ed..3ac215eb 100755 --- a/bitext-match-lang.py +++ b/bitext-match-lang.py @@ -11,6 +11,10 @@ parser.add_argument('-s','--srclang','--source-language', type=str, default='en' help='accepted language') parser.add_argument('-t','--trglang','--target-language', type=str, default='de', help='accepted language') +parser.add_argument('-l','--supported','--supported-languages', action='store_true', + help='list all supported languages') +parser.add_argument('-c','--checklang','--check-language-support', action='store_true', + help='show 
diff --git a/bitext-match-lang.py b/bitext-match-lang.py
index 64cec7ed..3ac215eb 100755
--- a/bitext-match-lang.py
+++ b/bitext-match-lang.py
@@ -11,6 +11,10 @@ parser.add_argument('-s','--srclang','--source-language', type=str, default='en'
                     help='accepted language')
 parser.add_argument('-t','--trglang','--target-language', type=str, default='de',
                     help='accepted language')
+parser.add_argument('-l','--supported','--supported-languages', action='store_true',
+                    help='list all supported languages')
+parser.add_argument('-c','--checklang','--check-language-support', action='store_true',
+                    help='show whether languages are supported')

 args = parser.parse_args()
@@ -30,11 +34,28 @@ def is_accepted(line,accept,reject):
     if isReliable:
         return True
     else:
-        if details[0][1] != 'un':
-            if details[0][1] != reject:
-                return True
+        if details[0][1] != reject:
+            return True


+if args.supported:
+    print(cld2.LANGUAGES)
+    quit()
+
+
+if args.checklang:
+    if args.srclang:
+        if supported_language(args.srclang):
+            print(args.srclang + " is supported")
+        else:
+            print(args.srclang + " is not supported")
+    if args.trglang:
+        if supported_language(args.trglang):
+            print(args.trglang + " is supported")
+        else:
+            print(args.trglang + " is not supported")
+    quit()
+
+
 if not supported_language(args.srclang):
 #    print(args.srclang + " is not supported")

diff --git a/mono-match-lang.py b/mono-match-lang.py
index a1694b92..8f47dca7 100755
--- a/mono-match-lang.py
+++ b/mono-match-lang.py
@@ -10,6 +10,10 @@ import sys
 parser = argparse.ArgumentParser(description='language filter')
 parser.add_argument('-l','--lang','--language', type=str, default='en',
                     help='accepted language')
+parser.add_argument('-s','--supported','--supported-languages', action='store_true',
+                    help='list all supported languages')
+parser.add_argument('-c','--checklang','--check-language-support', action='store_true',
+                    help='show whether languages are supported')

 args = parser.parse_args()

 def supported_language(lang):
@@ -38,15 +42,10 @@ def is_accepted(line,accept,reject):
 #            print(details, file=sys.stderr)
 #            print(line, file=sys.stderr)
     else:
-        if details[0][1] != 'un':
-            if details[0][1] != reject:
-#                print("ACCEPT")
-#                print(details)
-                return True
-#            else:
-#                print("REJECT", file=sys.stderr)
-#                print(details, file=sys.stderr)
-#                print(line, file=sys.stderr)
+        if details[0][1] != reject:
+#            print("ACCEPT")
+#            print(details)
+            return True
 #        else:
 #            print("REJECT", file=sys.stderr)
 #            print(details, file=sys.stderr)
 #            print(line, file=sys.stderr)


+if args.supported:
+    print(cld2.LANGUAGES)
+    quit()
+
+
+if args.checklang:
+    if args.lang:
+        if supported_language(args.lang):
+            print(args.lang + " is supported")
+        else:
+            print(args.lang + " is not supported")
+    quit()
+
+
 if not supported_language(args.lang):
 #    print(args.lang + " is not supported")
     reject = 'en'
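The new flags can be exercised straight from the shell before the scripts are wired into the data targets — a minimal sketch (file names are illustrative):

    # list the language codes the cld2 detector knows about
    python3 mono-match-lang.py -s

    # probe support for particular languages
    python3 mono-match-lang.py -c -l fi
    python3 bitext-match-lang.py -c -s de -t fi

    # normal filter mode: keep only lines detected as Finnish
    zcat mono/fi.txt.gz | python3 mono-match-lang.py -l fi > fi.filtered.txt

Note that dropping the old details[0][1] != 'un' guard means unreliably detected lines whose top guess is 'un' (unknown) are now accepted rather than silently discarded, which matches the more permissive filtering used for monolingual data.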