mirror of
https://github.com/Helsinki-NLP/OPUS-MT-train.git
synced 2024-09-11 20:27:19 +03:00
Language-specific cleanup scripts are now supported
This commit is contained in:
parent
0ff0e625d5
commit
f5111a27a7
@ -18,11 +18,11 @@ endif
|
|||||||
## no further arguments are supported
|
## no further arguments are supported
|
||||||
|
|
||||||
ifneq (${wildcard scripts/cleanup/${SRC}},)
|
ifneq (${wildcard scripts/cleanup/${SRC}},)
|
||||||
SRC_CLEANUP_SCRIPTS = | ${subst ${SPACE}, | ,${shell find scripts/cleanup/${SRC} -executable -type f}} |
|
## Shell pipeline fragment "| script1 | script2 ..." built from every
## executable file found under scripts/cleanup/${SRC}; it is spliced directly
## after "cat $<" in the recipes that use it.
##  - foreach emits one "| script" per file, so the value is EMPTY when the
##    directory contains no executable file (a bare "|" would turn the recipe
##    into "cat file | | ..." and make the shell fail with a syntax error).
##  - sort makes the order in which the scripts are chained deterministic
##    (find returns entries in arbitrary directory order).
##  - ":=" runs find once at parse time, like the enclosing ifneq test,
##    instead of on every expansion of the variable.
SRC_CLEANUP_SCRIPTS := ${foreach s,${sort ${shell find scripts/cleanup/${SRC} -executable -type f}},| ${s}}
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifneq (${wildcard scripts/cleanup/${TRG}},)
|
ifneq (${wildcard scripts/cleanup/${TRG}},)
|
||||||
TRG_CLEANUP_SCRIPTS = | ${subst ${SPACE}, | ,${shell find scripts/cleanup/${TRG} -executable -type f}} |
|
## Shell pipeline fragment "| script1 | script2 ..." built from every
## executable file found under scripts/cleanup/${TRG}; it is spliced directly
## after "cat $<" in the recipes that use it.
##  - foreach emits one "| script" per file, so the value is EMPTY when the
##    directory contains no executable file (a bare "|" would turn the recipe
##    into "cat file | | ..." and make the shell fail with a syntax error).
##  - sort makes the order in which the scripts are chained deterministic
##    (find returns entries in arbitrary directory order).
##  - ":=" runs find once at parse time, like the enclosing ifneq test,
##    instead of on every expansion of the variable.
TRG_CLEANUP_SCRIPTS := ${foreach s,${sort ${shell find scripts/cleanup/${TRG} -executable -type f}},| ${s}}
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
|
||||||
@ -721,6 +721,20 @@ add-to-local-mono-data:
|
|||||||
$(TOKENIZER)/normalize-punctuation.perl |\
|
$(TOKENIZER)/normalize-punctuation.perl |\
|
||||||
sed 's/ */ /g;s/^ *//g;s/ *$$//g' > $@
|
sed 's/ */ /g;s/^ *//g;s/ *$$//g' > $@
|
||||||
|
|
||||||
|
## Normalisation of source-language files: %.${SRCEXT}.raw -> %.${SRCEXT}.norm
## The raw file is piped through ${SRC_CLEANUP_SCRIPTS} (expands to nothing
## when the variable is not set), then through the replace-unicode-punctuation,
## remove-non-printing-char and normalize-punctuation scripts in $(TOKENIZER),
## and finally through sed, which squeezes runs of spaces into one space and
## strips leading/trailing spaces from every line.
## $(LOAD_MOSES) is defined elsewhere and is prepended to the command as is.
## NOTE: the first sed pattern must require at least one space; "s/ */ /g"
## also matches the empty string and would put a space between all characters.
%.${SRCEXT}.norm: %.${SRCEXT}.raw
	$(LOAD_MOSES) cat $< ${SRC_CLEANUP_SCRIPTS} |\
	$(TOKENIZER)/replace-unicode-punctuation.perl |\
	$(TOKENIZER)/remove-non-printing-char.perl |\
	$(TOKENIZER)/normalize-punctuation.perl |\
	sed 's/ \{1,\}/ /g;s/^ *//g;s/ *$$//g' > $@
|
||||||
|
|
||||||
|
## Normalisation of target-language files: %.${TRGEXT}.raw -> %.${TRGEXT}.norm
## The raw file is piped through ${TRG_CLEANUP_SCRIPTS} (expands to nothing
## when the variable is not set), then through the replace-unicode-punctuation,
## remove-non-printing-char and normalize-punctuation scripts in $(TOKENIZER),
## and finally through sed, which squeezes runs of spaces into one space and
## strips leading/trailing spaces from every line.
## $(LOAD_MOSES) is defined elsewhere and is prepended to the command as is.
## NOTE: the first sed pattern must require at least one space; "s/ */ /g"
## also matches the empty string and would put a space between all characters.
%.${TRGEXT}.norm: %.${TRGEXT}.raw
	$(LOAD_MOSES) cat $< ${TRG_CLEANUP_SCRIPTS} |\
	$(TOKENIZER)/replace-unicode-punctuation.perl |\
	$(TOKENIZER)/remove-non-printing-char.perl |\
	$(TOKENIZER)/normalize-punctuation.perl |\
	sed 's/ \{1,\}/ /g;s/^ *//g;s/ *$$//g' > $@
|
||||||
|
|
||||||
|
|
||||||
## minimal pre-processing
|
## minimal pre-processing
|
||||||
%.simple: %.raw
|
%.simple: %.raw
|
||||||
@ -730,6 +744,21 @@ add-to-local-mono-data:
|
|||||||
$(TOKENIZER)/deescape-special-chars.perl |\
|
$(TOKENIZER)/deescape-special-chars.perl |\
|
||||||
sed 's/ */ /g;s/^ *//g;s/ *$$//g' > $@
|
sed 's/ */ /g;s/^ *//g;s/ *$$//g' > $@
|
||||||
|
|
||||||
|
## Minimal pre-processing of source-language files:
## %.${SRCEXT}.raw -> %.${SRCEXT}.simple
## The raw file is piped through ${SRC_CLEANUP_SCRIPTS} (expands to nothing
## when the variable is not set), then through the replace-unicode-punctuation,
## remove-non-printing-char and deescape-special-chars scripts in $(TOKENIZER),
## and finally through sed, which squeezes runs of spaces into one space and
## strips leading/trailing spaces from every line.
## $(LOAD_MOSES) is defined elsewhere and is prepended to the command as is.
## NOTE: the first sed pattern must require at least one space; "s/ */ /g"
## also matches the empty string and would put a space between all characters.
%.${SRCEXT}.simple: %.${SRCEXT}.raw
	$(LOAD_MOSES) cat $< ${SRC_CLEANUP_SCRIPTS} |\
	$(TOKENIZER)/replace-unicode-punctuation.perl |\
	$(TOKENIZER)/remove-non-printing-char.perl |\
	$(TOKENIZER)/deescape-special-chars.perl |\
	sed 's/ \{1,\}/ /g;s/^ *//g;s/ *$$//g' > $@
|
||||||
|
|
||||||
|
## Minimal pre-processing of target-language files:
## %.${TRGEXT}.raw -> %.${TRGEXT}.simple
## The raw file is piped through ${TRG_CLEANUP_SCRIPTS} (expands to nothing
## when the variable is not set), then through the replace-unicode-punctuation,
## remove-non-printing-char and deescape-special-chars scripts in $(TOKENIZER),
## and finally through sed, which squeezes runs of spaces into one space and
## strips leading/trailing spaces from every line.
## $(LOAD_MOSES) is defined elsewhere and is prepended to the command as is.
## NOTE: the first sed pattern must require at least one space; "s/ */ /g"
## also matches the empty string and would put a space between all characters.
%.${TRGEXT}.simple: %.${TRGEXT}.raw
	$(LOAD_MOSES) cat $< ${TRG_CLEANUP_SCRIPTS} |\
	$(TOKENIZER)/replace-unicode-punctuation.perl |\
	$(TOKENIZER)/remove-non-printing-char.perl |\
	$(TOKENIZER)/deescape-special-chars.perl |\
	sed 's/ \{1,\}/ /g;s/^ *//g;s/ *$$//g' > $@
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
## remove all spaces (treat everything as a long string)
|
## remove all spaces (treat everything as a long string)
|
||||||
%.nospace: %.raw
|
%.nospace: %.raw
|
||||||
|
Loading…
Reference in New Issue
Block a user