new mode: SentencePieceModels trained on monolingual data

This commit is contained in:
Joerg Tiedemann 2020-02-08 15:21:37 +02:00
parent ee8c27e3db
commit 811815064b
8 changed files with 186 additions and 59 deletions


@@ -18,11 +18,26 @@ ifndef TRG
endif
# sorted languages and langpair used to match resources in OPUS
SORTLANGS = $(sort ${SRC} ${TRG})
SPACE = $(empty) $(empty)
LANGPAIR = ${firstword ${SORTLANGS}}-${lastword ${SORTLANGS}}
LANGSTR = ${subst ${SPACE},+,$(SRCLANGS)}-${subst ${SPACE},+,$(TRGLANGS)}
SORTLANGS = $(sort ${SRC} ${TRG})
SPACE = $(empty) $(empty)
LANGPAIR = ${firstword ${SORTLANGS}}-${lastword ${SORTLANGS}}
LANGSRCSTR = ${subst ${SPACE},+,$(SRCLANGS)}
LANGTRGSTR = ${subst ${SPACE},+,$(TRGLANGS)}
LANGPAIRSTR = ${LANGSRCSTR}-${LANGTRGSTR}
## for monolingual things
ifndef LANGS
LANGS := ${SRCLANGS}
endif
ifndef LANGID
LANGID := ${firstword ${LANGS}}
endif
ifndef LANGSTR
LANGSTR = ${subst ${SPACE},+,$(LANGS)}
endif
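## a minimal sketch of how these new monolingual defaults expand; the target
## name 'print-langvars' is hypothetical and only added here for illustration.
## 'make SRCLANGS="da no sv" TRGLANGS=fi print-langvars' should print
## LANGS=da no sv, LANGID=da, LANGSTR=da+no+sv, LANGPAIRSTR=da+no+sv-fi
print-langvars:
	@echo "LANGS       = ${LANGS}"
	@echo "LANGID      = ${LANGID}"
	@echo "LANGSTR     = ${LANGSTR}"
	@echo "LANGPAIRSTR = ${LANGPAIRSTR}"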
## for same language pairs: add numeric extension
@@ -43,6 +58,11 @@ OPUSCORPORA = ${patsubst %/latest/xml/${LANGPAIR}.xml.gz,%,\
		${patsubst ${OPUSHOME}/%,%,\
		${shell ls ${OPUSHOME}/*/latest/xml/${LANGPAIR}.xml.gz}}}
## monolingual data
OPUSMONOCORPORA = ${patsubst %/latest/mono/${LANGID}.txt.gz,%,\
		${patsubst ${OPUSHOME}/%,%,\
		${shell ls ${OPUSHOME}/*/latest/mono/${LANGID}.txt.gz}}}
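## the two nested patsubst calls strip the ${OPUSHOME}/ prefix and the
## /latest/mono/${LANGID}.txt.gz suffix, leaving bare corpus names; a
## hypothetical check (the target name is illustrative only):
print-mono-corpora:
	@echo "monolingual corpora for ${LANGID}: ${OPUSMONOCORPORA}"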
ALL_LANG_PAIRS = ${shell ls ${WORKHOME} | grep -- '-' | grep -v old}
ALL_BILINGUAL_MODELS = ${shell echo '${ALL_LANG_PAIRS}' | tr ' ' "\n" | grep -v -- '\+'}
@@ -119,7 +139,7 @@ EXTRA_TRAINSET =
TESTSET = ${DEVSET}
TRAINSET = $(filter-out WMT-News ${DEVSET} ${TESTSET},${OPUSCORPORA} ${EXTRA_TRAINSET})
TUNESET = OpenSubtitles
MONOSET = $(filter-out WMT-News ${DEVSET} ${TESTSET},${OPUSMONOCORPORA} ${EXTRA_TRAINSET})
## 1 = use remaining data from dev/test data for training
USE_REST_DEVDATA = 1
@@ -177,8 +197,8 @@ endif
## WORKDIR = directory used for training
DATADIR = ${WORKHOME}/data
WORKDIR = ${WORKHOME}/${LANGSTR}
WORKDIR = ${WORKHOME}/${LANGPAIRSTR}
SPMDIR = ${WORKHOME}/SentencePieceModels
## data sets
TRAIN_BASE = ${WORKDIR}/train/${DATASET}
@@ -187,8 +207,9 @@ TRAIN_TRG = ${TRAIN_BASE}.trg
TRAIN_ALG = ${TRAIN_BASE}${TRAINSIZE}.${PRE_SRC}-${PRE_TRG}.src-trg.alg.gz
## training data in local space
LOCAL_TRAIN_SRC = ${TMPDIR}/${LANGSTR}/train/${DATASET}.src
LOCAL_TRAIN_TRG = ${TMPDIR}/${LANGSTR}/train/${DATASET}.trg
LOCAL_TRAIN_SRC = ${TMPDIR}/${LANGPAIRSTR}/train/${DATASET}.src
LOCAL_TRAIN_TRG = ${TMPDIR}/${LANGPAIRSTR}/train/${DATASET}.trg
LOCAL_MONO_DATA = ${TMPDIR}/${LANGSTR}/train/${DATASET}.mono
TUNE_SRC = ${WORKDIR}/tune/${TUNESET}.src
TUNE_TRG = ${WORKDIR}/tune/${TUNESET}.trg


@@ -99,6 +99,9 @@ clean-data:
clean-data-source: ${DATA_SRC} ${DATA_TRG}
## monolingual data sets (for sentence piece models)
mono-data: ${LOCAL_MONO_DATA}.${PRE}
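## hedged usage sketch (illustrative invocations; LANGS defaults to SRCLANGS):
##   make SRCLANGS=fi mono-data            # monolingual data for one language
##   make LANGS="da no sv" mono-data       # several languages into one data set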
## word alignment used for guided alignment
@@ -569,8 +572,8 @@ endif
# %.clean.gz: %.gz
# mkdir -p ${TMPDIR}/${LANGSTR}/cleanup
# gzip -cd < $< > ${TMPDIR}/${LANGSTR}/cleanup/$(notdir $@).${SRCEXT}
# mkdir -p ${TMPDIR}/${LANGPAIRSTR}/cleanup
# gzip -cd < $< > ${TMPDIR}/${LANGPAIRSTR}/cleanup/$(notdir $@).${SRCEXT}
########################
@@ -605,6 +608,22 @@ endif
${LOCAL_MONO_DATA}.raw:
	mkdir -p ${dir $@}
	rm -f $@
	-for l in ${LANGS}; do \
	  ${MAKE} DATASET=${DATASET} LANGID:=$$l \
		add-to-local-mono-data; \
	done

add-to-local-mono-data:
	for c in ${MONOSET}; do \
	  if [ -e ${OPUSHOME}/$$c/latest/mono/${LANGID}.txt.gz ]; then \
	    zcat ${OPUSHOME}/$$c/latest/mono/${LANGID}.txt.gz |\
	    python3 mono-match-lang.py -l ${LANGID} >> ${LOCAL_MONO_DATA}.raw; \
	  fi; \
	done
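## the loop above re-invokes make once per language so that ${LANGID} (and
## with it ${OPUSMONOCORPORA} and ${MONOSET}) is re-evaluated per language;
## the leading '-' keeps the loop going even if one sub-make fails; for
## LANGS="da sv" this effectively runs (the DATASET value is hypothetical):
##   make DATASET=opus LANGID:=da add-to-local-mono-data
##   make DATASET=opus LANGID:=sv add-to-local-mono-data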
##----------------------------------------------
## tokenization
##----------------------------------------------
@@ -739,7 +758,7 @@ BPETRGMODEL = ${WORKDIR}/train/opus.trg.bpe${TRGBPESIZE:000=}k-model
.PRECIOUS: ${BPESRCMODEL} ${BPETRGMODEL}
.INTERMEDIATE: ${LOCAL_TRAIN_SRC} ${LOCAL_TRAIN_TRG}
# ${BPESRCMODEL}: ${WORKDIR}/%.bpe${SRCBPESIZE:000=}k-model: ${TMPDIR}/${LANGSTR}/%
# ${BPESRCMODEL}: ${WORKDIR}/%.bpe${SRCBPESIZE:000=}k-model: ${TMPDIR}/${LANGPAIRSTR}/%
${BPESRCMODEL}: ${LOCAL_TRAIN_SRC}
ifeq ($(wildcard ${BPESRCMODEL}),)
	mkdir -p ${dir $@}
@@ -757,7 +776,7 @@ else
endif
## no labels on the target language side
# ${BPETRGMODEL}: ${WORKDIR}/%.bpe${TRGBPESIZE:000=}k-model: ${TMPDIR}/${LANGSTR}/%
# ${BPETRGMODEL}: ${WORKDIR}/%.bpe${TRGBPESIZE:000=}k-model: ${TMPDIR}/${LANGPAIRSTR}/%
${BPETRGMODEL}: ${LOCAL_TRAIN_TRG}
ifeq ($(wildcard ${BPETRGMODEL}),)
	mkdir -p ${dir $@}
@@ -816,14 +835,19 @@ endif
## --> avoid overwriting validation/test data with new segmentation models
## if a new data set is used
SPMSRCMODEL = ${WORKDIR}/train/opus.src.spm${SRCBPESIZE:000=}k-model
SPMTRGMODEL = ${WORKDIR}/train/opus.trg.spm${SRCBPESIZE:000=}k-model
SPMTRGMODEL = ${WORKDIR}/train/opus.trg.spm${TRGBPESIZE:000=}k-model
## sentence piece model trained on monolingual data
SPMMODEL = ${SPMDIR}/${LANGSTR}/opus.spm${BPESIZE:000=}k-model
SPMSRCMONO = ${SPMDIR}/${LANGSRCSTR}/opus.spm${SRCBPESIZE:000=}k-model
SPMTRGMONO = ${SPMDIR}/${LANGTRGSTR}/opus.spm${TRGBPESIZE:000=}k-model
.PRECIOUS: ${SPMSRCMODEL} ${SPMTRGMODEL}
.PRECIOUS: ${SPMSRCMODEL} ${SPMTRGMODEL} ${SPMMODEL} ${SPMSRCMONO} ${SPMTRGMONO}
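## resulting model locations (values hypothetical: SRCLANGS=de, TRGLANGS=en,
## SRCBPESIZE=TRGBPESIZE=32000):
##   ${WORKDIR}/train/opus.src.spm32k-model   bilingual, from parallel source data
##   ${WORKDIR}/train/opus.trg.spm32k-model   bilingual, from parallel target data
##   ${SPMDIR}/de/opus.spm32k-model           monolingual, reusable across language pairs
##   ${SPMDIR}/en/opus.spm32k-model           monolingual, reusable across language pairs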
# ${SPMSRCMODEL}: ${WORKDIR}/%.spm${SRCBPESIZE:000=}k-model: ${TMPDIR}/${LANGSTR}/%
# ${SPMSRCMODEL}: ${WORKDIR}/%.spm${SRCBPESIZE:000=}k-model: ${TMPDIR}/${LANGPAIRSTR}/%
${SPMSRCMODEL}: ${LOCAL_TRAIN_SRC}
ifeq ($(wildcard ${SPMSRCMODEL}),)
	mkdir -p ${dir $@}
@@ -847,7 +871,7 @@ else
endif
## no labels on the target language side
# ${SPMTRGMODEL}: ${WORKDIR}/%.spm${TRGBPESIZE:000=}k-model: ${TMPDIR}/${LANGSTR}/%
# ${SPMTRGMODEL}: ${WORKDIR}/%.spm${TRGBPESIZE:000=}k-model: ${TMPDIR}/${LANGPAIRSTR}/%
${SPMTRGMODEL}: ${LOCAL_TRAIN_TRG}
ifeq ($(wildcard ${SPMTRGMODEL}),)
	mkdir -p ${dir $@}
@@ -865,6 +889,35 @@ endif
## sentence piece model trained on monolingual data
mono-spm-model: ${SPMMODEL}
${SPMSRCMONO}:
	${MAKE} LANGS="${SRCLANGS}" BPESIZE=${SRCBPESIZE} mono-spm-model

${SPMTRGMONO}:
	${MAKE} LANGS="${TRGLANGS}" BPESIZE=${TRGBPESIZE} mono-spm-model

${SPMMODEL}: ${LOCAL_MONO_DATA}.${PRE}
ifeq ($(wildcard ${SPMMODEL}),)
	mkdir -p ${dir $@}
	grep . $< > $<.text
	${SPM_HOME}/spm_train \
		--model_prefix=$@ --vocab_size=$(BPESIZE) --input=$<.text \
		--character_coverage=1.0 --hard_vocab_limit=false
	mv $@.model $@
	rm -f $<.text
else
	@echo "$@ already exists!"
	@echo "WARNING! No new SPM model created!"
	@echo "WARNING! Delete the file if you want to start from scratch!"
endif
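## the recipe boils down to a single spm_train call; a stand-alone equivalent
## under hypothetical values (BPESIZE=32000, monolingual input in mono.txt):
##   ${SPM_HOME}/spm_train --model_prefix=opus.spm32k-model --vocab_size=32000 \
##       --input=mono.txt --character_coverage=1.0 --hard_vocab_limit=false
##   mv opus.spm32k-model.model opus.spm32k-model
## 'grep .' drops empty lines from the training input, and
## --hard_vocab_limit=false turns vocab_size into a soft limit so that
## training on small corpora does not fail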
%.src.spm${SRCBPESIZE:000=}k: %.src ${SPMSRCMODEL}
ifeq ($(TRGLANGS),${firstword ${TRGLANGS}})
	${SPM_HOME}/spm_encode --model $(word 2,$^) < $< > $@
@@ -920,12 +973,12 @@ endif
##----------------------------------------------
## get data from local space and compress ...
${WORKDIR}/%.clean.${PRE_SRC}.gz: ${TMPDIR}/${LANGSTR}/%.clean.${PRE_SRC}
${WORKDIR}/%.clean.${PRE_SRC}.gz: ${TMPDIR}/${LANGPAIRSTR}/%.clean.${PRE_SRC}
	mkdir -p ${dir $@}
	gzip -c < $< > $@
ifneq (${PRE_SRC},${PRE_TRG})
${WORKDIR}/%.clean.${PRE_TRG}.gz: ${TMPDIR}/${LANGSTR}/%.clean.${PRE_TRG}
${WORKDIR}/%.clean.${PRE_TRG}.gz: ${TMPDIR}/${LANGPAIRSTR}/%.clean.${PRE_TRG}
	mkdir -p ${dir $@}
	gzip -c < $< > $@
endif


@@ -5,7 +5,7 @@
#
MODELSHOME = ${WORKHOME}/models
DIST_PACKAGE = ${MODELSHOME}/${LANGSTR}/${DATASET}.zip
DIST_PACKAGE = ${MODELSHOME}/${LANGPAIRSTR}/${DATASET}.zip
## minimum BLEU score for models to be accepted as distribution package
@@ -56,13 +56,13 @@ best_dist:
	@m=0;\
	s=''; \
	echo "------------------------------------------------"; \
	echo "search best model for ${LANGSTR}"; \
	echo "search best model for ${LANGPAIRSTR}"; \
	for d in ${ALT_MODEL_DIR}; do \
	  e=`ls work-$$d/${LANGSTR}/val/*.trg | xargs basename | sed 's/\.trg//'`; \
	  e=`ls work-$$d/${LANGPAIRSTR}/val/*.trg | xargs basename | sed 's/\.trg//'`; \
	  echo "evaldata = $$e"; \
	  if [ "$$e" != "GNOME" ]; then \
	    if ls work-$$d/${LANGSTR}/$$e*.eval 1> /dev/null 2>&1; then \
	      b=`grep 'BLEU+' work-$$d/${LANGSTR}/$$e*.eval | cut -f3 -d' '`; \
	    if ls work-$$d/${LANGPAIRSTR}/$$e*.eval 1> /dev/null 2>&1; then \
	      b=`grep 'BLEU+' work-$$d/${LANGPAIRSTR}/$$e*.eval | cut -f3 -d' '`; \
	      if (( $$(echo "$$m-$$b < 1" |bc -l) )); then \
	        echo "$$d ($$b) is better or not much worse than $$s ($$m)!"; \
	        m=$$b; \
@@ -136,10 +136,10 @@ endif
	  echo '* a sentence initial language token is required in the form of `>>id<<` (id = valid target language ID)' \
	       >> ${WORKDIR}/README.md; \
	fi
	@echo "* download: [$(notdir ${@:.zip=})-${DATE}.zip](${MODELS_URL}/${LANGSTR}/$(notdir ${@:.zip=})-${DATE}.zip)" >> ${WORKDIR}/README.md
	@echo "* download: [$(notdir ${@:.zip=})-${DATE}.zip](${MODELS_URL}/${LANGPAIRSTR}/$(notdir ${@:.zip=})-${DATE}.zip)" >> ${WORKDIR}/README.md
	if [ -e $(TEST_EVALUATION) ]; then \
	  echo "* test set translations: [$(notdir ${@:.zip=})-${DATE}.test.txt](${MODELS_URL}/${LANGSTR}/$(notdir ${@:.zip=})-${DATE}.test.txt)" >> ${WORKDIR}/README.md; \
	  echo "* test set scores: [$(notdir ${@:.zip=})-${DATE}.eval.txt](${MODELS_URL}/${LANGSTR}/$(notdir ${@:.zip=})-${DATE}.eval.txt)" >> ${WORKDIR}/README.md; \
	  echo "* test set translations: [$(notdir ${@:.zip=})-${DATE}.test.txt](${MODELS_URL}/${LANGPAIRSTR}/$(notdir ${@:.zip=})-${DATE}.test.txt)" >> ${WORKDIR}/README.md; \
	  echo "* test set scores: [$(notdir ${@:.zip=})-${DATE}.eval.txt](${MODELS_URL}/${LANGPAIRSTR}/$(notdir ${@:.zip=})-${DATE}.eval.txt)" >> ${WORKDIR}/README.md; \
	  echo '' >> ${WORKDIR}/README.md; \
	  echo '## Benchmarks' >> ${WORKDIR}/README.md; \
	  echo '' >> ${WORKDIR}/README.md; \


@@ -55,14 +55,14 @@ LOADGPU = module load ${GPU_MODULES}
ifeq (${shell hostname},dx6-ibs-p2)
APPLHOME = /opt/tools
WORKHOME = ${shell realpath ${PWD}/work-filter}
WORKHOME = ${shell realpath ${PWD}/work-langid}
OPUSHOME = tiedeman@taito.csc.fi:/proj/nlpl/data/OPUS/
MOSESHOME = ${APPLHOME}/mosesdecoder
MARIAN = ${APPLHOME}/marian/build
LOADMODS = echo "nothing to load"
else ifeq (${shell hostname},dx7-nkiel-4gpu)
APPLHOME = /opt/tools
WORKHOME = ${shell realpath ${PWD}/work-filter}
WORKHOME = ${shell realpath ${PWD}/work-langid}
OPUSHOME = tiedeman@taito.csc.fi:/proj/nlpl/data/OPUS/
MOSESHOME = ${APPLHOME}/mosesdecoder
MARIAN = ${APPLHOME}/marian/build
@@ -71,7 +71,7 @@ else ifneq ($(wildcard /wrk/tiedeman/research),)
DATAHOME = /proj/OPUS/WMT19/data/${LANGPAIR}
# APPLHOME = ${USERAPPL}/tools
APPLHOME = /proj/memad/tools
WORKHOME = /wrk/tiedeman/research/Opus-MT/work-filter
WORKHOME = /wrk/tiedeman/research/Opus-MT/work-langid
OPUSHOME = /proj/nlpl/data/OPUS
MOSESHOME = /proj/nlpl/software/moses/4.0-65c75ff/moses
# MARIAN = /proj/nlpl/software/marian/1.2.0
@@ -83,7 +83,7 @@ else
CSCPROJECT = project_2001194
# CSCPROJECT = project_2000309
DATAHOME = ${HOME}/work/opentrans/data/${LANGPAIR}
WORKHOME = ${shell realpath ${PWD}/work-filter}
WORKHOME = ${shell realpath ${PWD}/work-langid}
APPLHOME = ${HOME}/projappl
# OPUSHOME = /scratch/project_2000661/nlpl/data/OPUS
OPUSHOME = /projappl/nlpl/data/OPUS


@@ -131,6 +131,16 @@ endif
		PRE_TRG=spm${TRGBPESIZE:000=}k \
	${@:-spm=}
## with SPM models trained on monolingual data
%-monospm: ${SPMSRCMONO} ${SPMTRGMONO}
	${MAKE} WORKHOME=${shell realpath ${PWD}/work-monospm} \
		SPMSRCMODEL=${SPMSRCMONO} \
		SPMTRGMODEL=${SPMTRGMONO} \
		PRE_SRC=spm${SRCBPESIZE:000=}k \
		PRE_TRG=spm${TRGBPESIZE:000=}k \
	${@:-monospm=}
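## hedged usage sketch (illustrative): any target can go through this wrapper,
## e.g. 'make SRCLANGS=de TRGLANGS=en train-monospm' first builds (or reuses)
## the two monolingual SPM models and then runs 'train' with them substituted
## for the bilingual models, writing all output below work-monospm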
%-spm-noalign:
	${MAKE} WORKHOME=${shell realpath ${PWD}/work-spm-noalign} \
		MODELTYPE=transformer \
@@ -141,12 +151,21 @@ endif
## sentence-piece models with langid-filtering (new default)
%-filter:
	${MAKE} WORKHOME=${shell realpath ${PWD}/work-filter} \
%-langid:
	${MAKE} WORKHOME=${shell realpath ${PWD}/work-langid} \
		PRE=simple \
		PRE_SRC=spm${SRCBPESIZE:000=}k \
		PRE_TRG=spm${TRGBPESIZE:000=}k \
	${@:-spm=}
	${@:-langid=}
## sentence-piece models with langid-filtering, without guided alignment
%-langid-noalign:
	${MAKE} WORKHOME=${shell realpath ${PWD}/work-langid} \
		MODELTYPE=transformer \
		PRE=simple \
		PRE_SRC=spm${SRCBPESIZE:000=}k \
		PRE_TRG=spm${TRGBPESIZE:000=}k \
	${@:-langid-noalign=}
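## hedged usage sketch (illustrative): the pattern stem is the actual target,
## e.g. 'make SRCLANGS=fi TRGLANGS=en train-langid' runs 'train' inside
## work-langid, and 'make ... train-langid-noalign' does the same without
## guided alignment (MODELTYPE=transformer overrides the aligned default)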


@@ -79,19 +79,19 @@ all2pivot:
## --> change WORKSPACE, MEM, nr of GPUs, validation frequency, stopping criterion
train-dynamic:
	if [ ! -e "${WORKHOME}/${LANGSTR}/train.submit" ]; then \
	if [ ! -e "${WORKHOME}/${LANGPAIRSTR}/train.submit" ]; then \
	  ${MAKE} data; \
	  s=`zcat ${TRAIN_SRC}.clean.${PRE_SRC}${TRAINSIZE}.gz | head -10000001 | wc -l`; \
	  if [ $$s -gt 10000000 ]; then \
	    echo "${LANGSTR} bigger than 10 million"; \
	    echo "${LANGPAIRSTR} bigger than 10 million"; \
	    ${MAKE} HPC_CORES=1 HPC_MEM=8g train.submit-multigpu; \
	  elif [ $$s -gt 1000000 ]; then \
	    echo "${LANGSTR} bigger than 1 million"; \
	    echo "${LANGPAIRSTR} bigger than 1 million"; \
	    ${MAKE} \
	      MARIAN_VALID_FREQ=2500 \
	      HPC_CORES=1 HPC_MEM=4g train.submit; \
	  elif [ $$s -gt 100000 ]; then \
	    echo "${LANGSTR} bigger than 100k"; \
	    echo "${LANGPAIRSTR} bigger than 100k"; \
	    ${MAKE} \
	      MARIAN_VALID_FREQ=1000 \
	      MARIAN_WORKSPACE=5000 \
@@ -99,7 +99,7 @@ train-dynamic:
	      MARIAN_EARLY_STOPPING=5 \
	      HPC_CORES=1 HPC_MEM=4g train.submit; \
	  elif [ $$s -gt 10000 ]; then \
	    echo "${LANGSTR} bigger than 10k"; \
	    echo "${LANGPAIRSTR} bigger than 10k"; \
	    ${MAKE} \
	      MARIAN_WORKSPACE=3500 \
	      MARIAN_VALID_MINI_BATCH=4 \
@@ -108,7 +108,7 @@ train-dynamic:
	      MARIAN_EARLY_STOPPING=5 \
	      HPC_CORES=1 HPC_MEM=4g train.submit; \
	  else \
	    echo "${LANGSTR} too small"; \
	    echo "${LANGPAIRSTR} too small"; \
	  fi; \
	fi
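## in short: train-dynamic counts training sentences (head -10000001 caps the
## count so that huge corpora are not read to the end) and submits the job
## with resources matched to the corpus size; a hedged example invocation
## (illustrative): 'make SRCLANGS=br TRGLANGS=en train-dynamic'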
@@ -120,18 +120,18 @@ bilingual-dynamic: train-dynamic
	fi
# bilingual-dynamic:
# if [ ! -e "${WORKHOME}/${LANGSTR}/train.submit" ]; then \
# if [ ! -e "${WORKHOME}/${LANGPAIRSTR}/train.submit" ]; then \
# ${MAKE} data; \
# s=`zcat ${WORKHOME}/${LANGSTR}/train/*.src.clean.${PRE_SRC}.gz | head -10000001 | wc -l`; \
# s=`zcat ${WORKHOME}/${LANGPAIRSTR}/train/*.src.clean.${PRE_SRC}.gz | head -10000001 | wc -l`; \
# if [ $$s -gt 10000000 ]; then \
# echo "${LANGSTR} bigger than 10 million"; \
# echo "${LANGPAIRSTR} bigger than 10 million"; \
# ${MAKE} HPC_CORES=1 HPC_MEM=8g train.submit-multigpu; \
# if [ "${SRCLANGS}" != "${TRGLANGS}" ]; then \
# ${MAKE} reverse-data-spm; \
# ${MAKE} TRGLANGS="${SRCLANGS}" SRCLANGS='${TRGLANGS}' HPC_CORES=1 HPC_MEM=8g train.submit-multigpu; \
# fi; \
# elif [ $$s -gt 1000000 ]; then \
# echo "${LANGSTR} bigger than 1 million"; \
# echo "${LANGPAIRSTR} bigger than 1 million"; \
# ${MAKE} \
# MARIAN_VALID_FREQ=2500 \
# HPC_CORES=1 HPC_MEM=4g train.submit; \
@@ -142,7 +142,7 @@ bilingual-dynamic: train-dynamic
# HPC_CORES=1 HPC_MEM=4g train.submit; \
# fi; \
# elif [ $$s -gt 100000 ]; then \
# echo "${LANGSTR} bigger than 100k"; \
# echo "${LANGPAIRSTR} bigger than 100k"; \
# ${MAKE} \
# MARIAN_VALID_FREQ=1000 \
# MARIAN_WORKSPACE=5000 \
@@ -159,7 +159,7 @@ bilingual-dynamic: train-dynamic
# HPC_CORES=1 HPC_MEM=4g train.submit; \
# fi; \
# elif [ $$s -gt 10000 ]; then \
# echo "${LANGSTR} bigger than 10k"; \
# echo "${LANGPAIRSTR} bigger than 10k"; \
# ${MAKE} \
# MARIAN_WORKSPACE=3500 \
# MARIAN_VALID_MINI_BATCH=4 \
@@ -178,7 +178,7 @@ bilingual-dynamic: train-dynamic
# HPC_CORES=1 HPC_MEM=4g train.submit; \
# fi; \
# else \
# echo "${LANGSTR} too small"; \
# echo "${LANGPAIRSTR} too small"; \
# fi \
# fi


@@ -11,6 +11,10 @@ parser.add_argument('-s','--srclang','--source-language', type=str, default='en',
                    help='accepted source language')
parser.add_argument('-t','--trglang','--target-language', type=str, default='de',
                    help='accepted target language')
parser.add_argument('-l','--supported','--supported-languages', action='store_true',
                    help='list all supported languages')
parser.add_argument('-c','--checklang','--check-language-support', action='store_true',
                    help='show whether languages are supported')
args = parser.parse_args()
@@ -30,11 +34,28 @@ def is_accepted(line,accept,reject):
    if isReliable:
        return True
    else:
        if details[0][1] != 'un':
            if details[0][1] != reject:
                return True
        if details[0][1] != reject:
            return True
if args.supported:
    print(cld2.LANGUAGES)
    quit()

if args.checklang:
    if args.srclang:
        if supported_language(args.srclang):
            print(args.srclang + " is supported")
        else:
            print(args.srclang + " is not supported")
    if args.trglang:
        if supported_language(args.trglang):
            print(args.trglang + " is supported")
        else:
            print(args.trglang + " is not supported")
    quit()

if not supported_language(args.srclang):
    # print(args.srclang + " is not supported")


@@ -10,6 +10,10 @@ import sys
parser = argparse.ArgumentParser(description='language filter')
parser.add_argument('-l','--lang','--language', type=str, default='en',
                    help='accepted language')
parser.add_argument('-s','--supported','--supported-languages', action='store_true',
                    help='list all supported languages')
parser.add_argument('-c','--checklang','--check-language-support', action='store_true',
                    help='show whether languages are supported')
args = parser.parse_args()
def supported_language(lang):
@@ -38,15 +42,10 @@ def is_accepted(line,accept,reject):
        # print(details, file=sys.stderr)
        # print(line, file=sys.stderr)
    else:
        if details[0][1] != 'un':
            if details[0][1] != reject:
                # print("ACCEPT")
                # print(details)
                return True
            # else:
            #     print("REJECT", file=sys.stderr)
            #     print(details, file=sys.stderr)
            #     print(line, file=sys.stderr)
        if details[0][1] != reject:
            # print("ACCEPT")
            # print(details)
            return True
        # else:
        #     print("REJECT", file=sys.stderr)
        #     print(details, file=sys.stderr)
@@ -54,6 +53,20 @@ def is_accepted(line,accept,reject):
if args.supported:
    print(cld2.LANGUAGES)
    quit()

if args.checklang:
    if args.lang:
        if supported_language(args.lang):
            print(args.lang + " is supported")
        else:
            print(args.lang + " is not supported")
    quit()

if not supported_language(args.lang):
    # print(args.lang + " is not supported")
    reject = 'en'
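# hedged smoke tests for the new flags (run from the shell; they require the
# cld2 bindings imported by these scripts, and the file names are hypothetical):
#   python3 mono-match-lang.py -s                  # list languages cld2 knows
#   python3 mono-match-lang.py -c -l fi            # is 'fi' supported?
#   zcat mono/fi.txt.gz | python3 mono-match-lang.py -l fi > fi.filtered
#   python3 bitext-match-lang.py -c -s de -t en    # same check for the bitext filter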