removed unnecessary pre-processing for chinese

This commit is contained in:
Joerg Tiedemann 2020-06-19 16:12:06 +03:00
parent b7f45e2a74
commit 844f8bf72a
3 changed files with 46 additions and 37 deletions

View File

@ -406,6 +406,7 @@ endif
## make some data size-specific configuration parameters
## TODO: is it OK to delete LOCAL_TRAIN data?
.PHONY: local-config
local-config: ${WORKDIR}/config.mk
SMALLEST_TRAINSIZE = 10000

View File

@ -222,6 +222,9 @@ endif
${notdir ${MODEL_VALIDLOG}} \
${notdir ${MODEL_TRAINLOG}} \
source.* target.* decoder.yml preprocess.sh postprocess.sh
@if [ -e ${WORKDIR}/config.mk ]; then \
cd ${WORKDIR} && zip -u ${notdir $@} config.mk; \
fi
@mkdir -p ${dir $@}
@mv -f ${WORKDIR}/${notdir $@} ${@:.zip=}-${DATE}.zip
if [ -e $(TEST_EVALUATION) ]; then \

View File

@ -492,43 +492,6 @@ FIXLANGIDS = | sed 's/zho\(.*\)_HK/yue\1/g;s/zho\(.*\)_CN/cmn\1/g;s/zho\(.*\)_T
touch ${dir $@}Tatoeba-train.${LANGPAIR}.clean.${SRCEXT}
touch ${dir $@}Tatoeba-train.${LANGPAIR}.clean.${TRGEXT}
#######################################
# special treatment for Chinese
# - simplified vs traditional script
#
# TODO: should not manipulate test data like this!!!!
# ---> do Chinese script detection properly in data releases!
#######################################
# Source-side variant: active only when every word of ${SRC} is one of the
# Chinese language codes listed in the filter (the filter result must equal
# ${SRC} itself).
# NOTE(review): an empty ${SRC} also satisfies this comparison -- confirm
# that SRC is always set when this part of the makefile is parsed.
#
# For each of the dev, test and train sets the loop
#   1. pipes the cleaned source-side text through detect_chinese_script.pl
#      into a temporary .script file
#      (presumably one script label per input line -- TODO confirm),
#   2. takes column 1 of the .clean.id file as the source language IDs and
#      glues the script output onto each ID using an empty paste delimiter,
#   3. takes column 2 of the .clean.id file as the target language IDs,
#   4. pastes both ID columns back together, overwriting the .clean.id file,
#   5. removes the four temporary files.
# NOTE(review): the steps are chained with ';', so a failing step does not
# stop the remaining steps of the loop.
ifeq ($(filter cjy cmn gan lzh nan wuu yue zho,${SRC}),${SRC})
@echo "treating source language Chinese"
for d in dev test train; do \
cat ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${SRCEXT} | \
${SCRIPTDIR}/detect_chinese_script.pl > ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${SRCEXT}.script; \
cut -f1 ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.id > ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${SRCEXT}.langid; \
paste -d '' ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${SRCEXT}.langid ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${SRCEXT}.script \
> ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${SRCEXT}.id; \
cut -f2 ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.id > ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${TRGEXT}.id; \
paste ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${SRCEXT}.id ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${TRGEXT}.id \
> ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.id; \
rm -f ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${SRCEXT}.langid ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${SRCEXT}.script; \
rm -f ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${SRCEXT}.id ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${TRGEXT}.id; \
done
endif
# Target-side variant: active only when every word of ${TRG} is one of the
# Chinese language codes listed in the filter (the filter result must equal
# ${TRG} itself).
# NOTE(review): an empty ${TRG} also satisfies this comparison -- confirm
# that TRG is always set when this part of the makefile is parsed.
#
# For each of the dev, test and train sets the loop
#   1. pipes the cleaned target-side text through detect_chinese_script.pl
#      into a temporary .script file
#      (presumably one script label per input line -- TODO confirm),
#   2. takes column 2 of the .clean.id file as the target language IDs and
#      glues the script output onto each ID using an empty paste delimiter,
#   3. takes column 1 of the .clean.id file as the source language IDs,
#   4. pastes both ID columns back together (source first, target second),
#      overwriting the .clean.id file,
#   5. removes the four temporary files.
# NOTE(review): the steps are chained with ';', so a failing step does not
# stop the remaining steps of the loop.
ifeq ($(filter cjy cmn gan lzh nan wuu yue zho,${TRG}),${TRG})
@echo "treating target language Chinese"
for d in dev test train; do \
cat ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${TRGEXT} | \
${SCRIPTDIR}/detect_chinese_script.pl > ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${TRGEXT}.script; \
cut -f2 ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.id > ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${TRGEXT}.langid; \
paste -d '' ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${TRGEXT}.langid ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${TRGEXT}.script \
> ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${TRGEXT}.id; \
cut -f1 ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.id > ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${SRCEXT}.id; \
paste ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${SRCEXT}.id ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${TRGEXT}.id \
> ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.id; \
rm -f ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${TRGEXT}.langid ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${TRGEXT}.script; \
rm -f ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${SRCEXT}.id ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${TRGEXT}.id; \
done
endif
#######################################
# labels in the data
# TODO: should we take all in all data sets?
# NOW: only look for the ones in test data
@ -608,6 +571,48 @@ endif
done
# #######################################
# # special treatment for Chinese
# # - simplified vs traditional script
# #
# # TODO: should not manipulate test data like this!!!!
# # ---> do Chinese script detection properly in data releases!
# #######################################
# ifeq ($(filter cjy cmn gan lzh nan wuu yue zho,${SRC}),${SRC})
# @echo "treating source language Chinese"
# for d in dev test train; do \
# cat ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${SRCEXT} | \
# ${SCRIPTDIR}/detect_chinese_script.pl > ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${SRCEXT}.script; \
# cut -f1 ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.id > ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${SRCEXT}.langid; \
# paste -d '' ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${SRCEXT}.langid ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${SRCEXT}.script \
# > ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${SRCEXT}.id; \
# cut -f2 ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.id > ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${TRGEXT}.id; \
# paste ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${SRCEXT}.id ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${TRGEXT}.id \
# > ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.id; \
# rm -f ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${SRCEXT}.langid ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${SRCEXT}.script; \
# rm -f ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${SRCEXT}.id ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${TRGEXT}.id; \
# done
# endif
# ifeq ($(filter cjy cmn gan lzh nan wuu yue zho,${TRG}),${TRG})
# @echo "treating target language Chinese"
# for d in dev test train; do \
# cat ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${TRGEXT} | \
# ${SCRIPTDIR}/detect_chinese_script.pl > ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${TRGEXT}.script; \
# cut -f2 ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.id > ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${TRGEXT}.langid; \
# paste -d '' ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${TRGEXT}.langid ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${TRGEXT}.script \
# > ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${TRGEXT}.id; \
# cut -f1 ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.id > ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${SRCEXT}.id; \
# paste ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${SRCEXT}.id ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${TRGEXT}.id \
# > ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.id; \
# rm -f ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${TRGEXT}.langid ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${TRGEXT}.script; \
# rm -f ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${SRCEXT}.id ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${TRGEXT}.id; \
# done
# endif
%/Tatoeba-train.${LANGPAIR}.clean.${TRGEXT}.gz: %/Tatoeba-train.${LANGPAIR}.clean.${SRCEXT}.gz
echo "done!"