removed dependence on moses tools in preprocessing script for released spm packages

This commit is contained in:
Jörg Tiedemann 2020-09-12 14:42:10 +03:00
parent c0cb356417
commit ddafb43d66
12 changed files with 161 additions and 116 deletions

View File

@ -337,9 +337,10 @@ MODEL_VALIDLOG = ${MODEL}.${MODELTYPE}.valid${NR}.log
MODEL_TRAINLOG = ${MODEL}.${MODELTYPE}.train${NR}.log
MODEL_START = ${WORKDIR}/${MODEL_BASENAME}.npz
MODEL_FINAL = ${WORKDIR}/${MODEL_BASENAME}.npz.best-perplexity.npz
MODEL_DECODER = ${MODEL_FINAL}.decoder.yml
MODEL_VOCABTYPE = yml
MODEL_VOCAB = ${WORKDIR}/${MODEL}.vocab.${MODEL_VOCABTYPE}
MODEL_DECODER = ${MODEL_FINAL}.decoder.yml
## latest model with the same pre-processing but any data or modeltype
ifdef CONTINUE_EXISTING

View File

@ -159,12 +159,12 @@ else
endif
ifneq (${words ${TRGLANGS}},1)
PREPROCESS_SCRIPT = preprocess-${PREPROCESS_TYPE}-multi-target.sh
PREPROCESS_SCRIPT = scripts/preprocess-${PREPROCESS_TYPE}-multi-target.sh
else
PREPROCESS_SCRIPT = preprocess-${PREPROCESS_TYPE}.sh
PREPROCESS_SCRIPT = scripts/preprocess-${PREPROCESS_TYPE}.sh
endif
POSTPROCESS_SCRIPT = postprocess-${PREPROCESS_TYPE}.sh
POSTPROCESS_SCRIPT = scripts/postprocess-${PREPROCESS_TYPE}.sh

View File

@ -29,9 +29,11 @@ ifeq ($(wildcard ${MODEL_VOCAB}),)
ifneq (${MODEL_LATEST_VOCAB},)
cp ${MODEL_LATEST_VOCAB} ${MODEL_VOCAB}
else
ifneq (${MODEL_VOCABTYPE},spm)
mkdir -p ${dir $@}
${LOADMODS} && ${ZCAT} $^ | ${MARIAN_VOCAB} --max-size ${VOCABSIZE} > $@
endif
endif
else
@echo "$@ already exists!"
@echo "WARNING! No new vocabulary is created even though the data has changed!"
@ -112,6 +114,7 @@ endif
## NEW: take away dependency on ${MODEL_VOCAB}
## train transformer model with guided alignment

View File

@ -1,37 +0,0 @@
#!/bin/bash
#
# USAGE preprocess.sh source-langid target-langid bpecodes [noflags] < input > output
#
# Cleans the text on stdin with the Moses tokenizer scripts
# (replace-unicode-punctuation.perl, remove-non-printing-char.perl),
# squeezes whitespace and segments the result with spm_encode using the model
# given as third argument. Unless the fourth argument is "noflags", every
# output line is prefixed with the target-language token ">>target-langid<< ".
# The source-langid argument is accepted but unused; it was only needed by
# the disabled "normalize-punctuation.perl -l <source-langid>" step.
#
# replace MOSESHOME and SPMENCODE with your own setup!

# Let the exit status reflect a failure of any pipeline stage, not only the last.
set -o pipefail

# Set MOSESHOME, SPMENCODE and the derived tokenizer paths for the current host.
locate_tools() {
  local domain
  # `hostname -d` can print nothing (or be unsupported); an empty domain
  # simply selects the default paths below the current directory.
  domain=$(hostname -d 2>/dev/null || true)
  case "${domain}" in
    bullx)
      APPLHOME=/projappl/project_2001569
      MOSESHOME=${APPLHOME}/mosesdecoder
      SPMENCODE=${APPLHOME}/marian-dev/build-spm/spm_encode
      ;;
    csc.fi)
      APPLHOME=/proj/memad/tools
      MOSESHOME=/proj/nlpl/software/moses/4.0-65c75ff/moses
      SPMENCODE=${APPLHOME}/marian-dev/build-spm/spm_encode
      ;;
    *)
      MOSESHOME=${PWD}/mosesdecoder
      SPMENCODE=${PWD}/marian-dev/build/spm_encode
      ;;
  esac
  MOSESSCRIPTS=${MOSESHOME}/scripts
  TOKENIZER=${MOSESSCRIPTS}/tokenizer
}

# stdin -> stdout: run the two Moses character-cleanup scripts.
moses_cleanup() {
  "${TOKENIZER}/replace-unicode-punctuation.perl" \
    | "${TOKENIZER}/remove-non-printing-char.perl"
}

# stdin -> stdout: squeeze runs of spaces and trim both ends of every line.
# The interval needs at least two spaces: a pattern that can match the empty
# string (' *') would insert a space between all characters.
normalize_spaces() {
  sed -e 's/ \{2,\}/ /g' -e 's/^ *//' -e 's/ *$//'
}

# stdin -> stdout: prefix every line with ">>$1<< ".
# $1 - target language ID (assumed to contain no characters special to sed)
add_target_flag() {
  sed "s/^/>>$1<< /"
}

# Arguments: source-langid target-langid bpecodes [noflags]
# Returns:   2 on missing arguments, otherwise the status of the pipeline.
main() {
  if [[ $# -lt 3 ]]; then
    printf 'USAGE: %s source-langid target-langid bpecodes [noflags] < input > output\n' \
      "${0##*/}" >&2
    return 2
  fi
  local trglang=$2
  local spm_model=$3
  local mode=${4:-}

  locate_tools
  if [[ "${mode}" == "noflags" ]]; then
    moses_cleanup \
      | normalize_spaces \
      | "${SPMENCODE}" --model "${spm_model}"
  else
    moses_cleanup \
      | normalize_spaces \
      | "${SPMENCODE}" --model "${spm_model}" \
      | add_target_flag "${trglang}"
  fi
}

main "$@"

View File

@ -1,30 +0,0 @@
#!/bin/bash
#
# USAGE preprocess.sh langid bpecodes < input > output
#
# Cleans the text on stdin with the Moses tokenizer scripts
# (replace-unicode-punctuation.perl, remove-non-printing-char.perl),
# squeezes whitespace and segments the result with spm_encode using the model
# given as second argument. The langid argument is accepted but unused; it
# was only needed by the disabled "normalize-punctuation.perl -l <langid>" step.
#
# replace MOSESHOME and SPMENCODE with your own setup!

# Let the exit status reflect a failure of any pipeline stage, not only the last.
set -o pipefail

# Set MOSESHOME, SPMENCODE and the derived tokenizer paths for the current host.
locate_tools() {
  local domain
  # `hostname -d` can print nothing (or be unsupported); an empty domain
  # simply selects the default paths below the current directory.
  domain=$(hostname -d 2>/dev/null || true)
  case "${domain}" in
    bullx)
      APPLHOME=/projappl/project_2001569
      MOSESHOME=${APPLHOME}/mosesdecoder
      SPMENCODE=${APPLHOME}/marian-dev/build-spm/spm_encode
      ;;
    csc.fi)
      APPLHOME=/proj/memad/tools
      MOSESHOME=/proj/nlpl/software/moses/4.0-65c75ff/moses
      SPMENCODE=${APPLHOME}/marian-dev/build-spm/spm_encode
      ;;
    *)
      MOSESHOME=${PWD}/mosesdecoder
      SPMENCODE=${PWD}/marian-dev/build/spm_encode
      ;;
  esac
  MOSESSCRIPTS=${MOSESHOME}/scripts
  TOKENIZER=${MOSESSCRIPTS}/tokenizer
}

# stdin -> stdout: squeeze runs of spaces and trim both ends of every line.
# The interval needs at least two spaces: a pattern that can match the empty
# string (' *') would insert a space between all characters.
normalize_spaces() {
  sed -e 's/ \{2,\}/ /g' -e 's/^ *//' -e 's/ *$//'
}

# Arguments: langid bpecodes
# Returns:   2 on missing arguments, otherwise the status of the pipeline.
main() {
  if [[ $# -lt 2 ]]; then
    printf 'USAGE: %s langid bpecodes < input > output\n' "${0##*/}" >&2
    return 2
  fi
  local spm_model=$2

  locate_tools
  "${TOKENIZER}/replace-unicode-punctuation.perl" \
    | "${TOKENIZER}/remove-non-printing-char.perl" \
    | normalize_spaces \
    | "${SPMENCODE}" --model "${spm_model}"
}

main "$@"

View File

@ -1,45 +0,0 @@
#!/usr/bin/env bash
# OpenStack credential environment (Identity API v3).
#
# Authentication against an OpenStack cloud goes through the Identity service
# (keystone), which answers with a **Token** and a **Service Catalog**. The
# catalog lists the endpoints of every service the user/tenant may access:
# Compute (nova), Image Service (glance), Identity (keystone), Object Storage
# (swift), Block Storage (cinder) and Networking (neutron).
#
# *NOTE*: Selecting v3 of the *Identity API* says nothing about the versions
# of the other OpenStack APIs; a provider may still serve e.g. Image API v1.1,
# Block Storage API v2 and Compute API v2.0. OS_AUTH_URL only addresses the
# Identity API served through keystone.
export OS_AUTH_URL="https://pouta.csc.fi:5001/v3"
export OS_IDENTITY_API_VERSION=3
export OS_INTERFACE=public

# Since Keystone, the entity owning the resources is called a **project**.
export OS_PROJECT_ID="64cd34c22e5b479a92cd983b9c42202f"
export OS_PROJECT_NAME=project_2000661
export OS_USER_DOMAIN_NAME=Default
if [[ -z "${OS_USER_DOMAIN_NAME}" ]]; then
  unset OS_USER_DOMAIN_NAME
fi

# Drop the v2.0 variables in case they are still set.
unset OS_TENANT_ID OS_TENANT_NAME

# Besides the owning entity (tenant), OpenStack records the entity performing
# the action as the **user**; here this is the local login name.
# export OS_USERNAME="tiedeman"
export OS_USERNAME="$(whoami)"

# Keystone authenticates with the keystone password; read it without echo.
printf '%s\n' "Please enter your OpenStack Password for project $OS_PROJECT_NAME as user $OS_USERNAME: "
read -rs OS_PASSWORD_INPUT
export OS_PASSWORD="${OS_PASSWORD_INPUT}"

# Region for configurations with multiple regions. OS_REGION_NAME is optional
# and only valid in certain environments; never leave it as a blank variable.
export OS_REGION_NAME=regionOne
if [[ -z "${OS_REGION_NAME}" ]]; then
  unset OS_REGION_NAME
fi

View File

@ -0,0 +1,98 @@
#!/bin/bash
#
# USAGE preprocess.sh source-langid target-langid bpecodes [noflags] < input > output
#
#
# replace SPMENCODE with your own setup!
if [ `hostname -d` == "bullx" ]; then
APPLHOME=/projappl/project_2001569
SPMENCODE=${APPLHOME}/marian-dev/build-spm/spm_encode
else
SPMENCODE=`which spm_encode || echo "${PWD}/tools/marian-dev/build/spm_encode"`
fi
if [ "$4" == "noflags" ]; then
sed -e 's//,/g' \
-e 's/。 */. /g' \
-e 's/、/,/g' \
-e 's/”/"/g' \
-e 's/“/"/g' \
-e 's//:/g' \
-e 's//:/g' \
-e 's//\?/g' \
-e 's/《/"/g' \
-e 's/》/"/g' \
-e 's//\)/g' \
-e 's//\!/g' \
-e 's//\(/g' \
-e 's//;/g' \
-e 's//"/g' \
-e 's/」/"/g' \
-e 's/「/"/g' \
-e 's//0/g' \
-e 's//3/g' \
-e 's//2/g' \
-e 's//5/g' \
-e 's//6/g' \
-e 's//9/g' \
-e 's//7/g' \
-e 's//8/g' \
-e 's//4/g' \
-e 's/ */. /g' \
-e 's//\~/g' \
-e "s//\'/g" \
-e 's/…/\.\.\./g' \
-e 's/━/\-/g' \
-e 's/〈/\</g' \
-e 's/〉/\>/g' \
-e 's/【/\[/g' \
-e 's/】/\]/g' \
-e 's//\%/g' |
perl -C -pe 's/\p{C}/ /g;' |
sed 's/ */ /g;s/^ *//g;s/ *$//g' |
${SPMENCODE} --model $3
else
sed -e 's//,/g' \
-e 's/。 */. /g' \
-e 's/、/,/g' \
-e 's/”/"/g' \
-e 's/“/"/g' \
-e 's//:/g' \
-e 's//:/g' \
-e 's//\?/g' \
-e 's/《/"/g' \
-e 's/》/"/g' \
-e 's//\)/g' \
-e 's//\!/g' \
-e 's//\(/g' \
-e 's//;/g' \
-e 's//"/g' \
-e 's/」/"/g' \
-e 's/「/"/g' \
-e 's//0/g' \
-e 's//3/g' \
-e 's//2/g' \
-e 's//5/g' \
-e 's//6/g' \
-e 's//9/g' \
-e 's//7/g' \
-e 's//8/g' \
-e 's//4/g' \
-e 's/ */. /g' \
-e 's//\~/g' \
-e "s//\'/g" \
-e 's/…/\.\.\./g' \
-e 's/━/\-/g' \
-e 's/〈/\</g' \
-e 's/〉/\>/g' \
-e 's/【/\[/g' \
-e 's/】/\]/g' \
-e 's//\%/g' |
perl -C -pe 's/\p{C}/ /g;' |
sed 's/ */ /g;s/^ *//g;s/ *$//g' |
${SPMENCODE} --model $3 |
sed "s/^/>>$2<< /"
fi

55
scripts/preprocess-spm.sh Executable file
View File

@ -0,0 +1,55 @@
#!/bin/bash
#
# USAGE preprocess.sh langid bpecodes < input > output
#
# replace SPMENCODE with your own setup!
if [ `hostname -d` == "bullx" ]; then
APPLHOME=/projappl/project_2001569
SPMENCODE=${APPLHOME}/marian-dev/build-spm/spm_encode
else
SPMENCODE=`which spm_encode || echo "${PWD}/tools/marian-dev/build/spm_encode"`
fi
## simple pre-processing steps adapted from Moses tools
sed -e 's//,/g' \
-e 's/。 */. /g' \
-e 's/、/,/g' \
-e 's/”/"/g' \
-e 's/“/"/g' \
-e 's//:/g' \
-e 's//:/g' \
-e 's//\?/g' \
-e 's/《/"/g' \
-e 's/》/"/g' \
-e 's//\)/g' \
-e 's//\!/g' \
-e 's//\(/g' \
-e 's//;/g' \
-e 's//"/g' \
-e 's/」/"/g' \
-e 's/「/"/g' \
-e 's//0/g' \
-e 's//3/g' \
-e 's//2/g' \
-e 's//5/g' \
-e 's//6/g' \
-e 's//9/g' \
-e 's//7/g' \
-e 's//8/g' \
-e 's//4/g' \
-e 's/ */. /g' \
-e 's//\~/g' \
-e "s//\'/g" \
-e 's/…/\.\.\./g' \
-e 's/━/\-/g' \
-e 's/〈/\</g' \
-e 's/〉/\>/g' \
-e 's/【/\[/g' \
-e 's/】/\]/g' \
-e 's//\%/g' |
perl -C -pe 's/\p{C}/ /g;' |
sed 's/ */ /g;s/^ *//g;s/ *$//g' |
${SPMENCODE} --model $2