mirror of
https://github.com/Helsinki-NLP/OPUS-MT-train.git
synced 2024-07-14 15:40:33 +03:00
removed dependence on moses tools in preprocessing script for released spm packages
This commit is contained in:
parent
c0cb356417
commit
ddafb43d66
@ -337,9 +337,10 @@ MODEL_VALIDLOG = ${MODEL}.${MODELTYPE}.valid${NR}.log
|
||||
MODEL_TRAINLOG = ${MODEL}.${MODELTYPE}.train${NR}.log
|
||||
MODEL_START = ${WORKDIR}/${MODEL_BASENAME}.npz
|
||||
MODEL_FINAL = ${WORKDIR}/${MODEL_BASENAME}.npz.best-perplexity.npz
|
||||
MODEL_DECODER = ${MODEL_FINAL}.decoder.yml
|
||||
MODEL_VOCABTYPE = yml
|
||||
MODEL_VOCAB = ${WORKDIR}/${MODEL}.vocab.${MODEL_VOCABTYPE}
|
||||
MODEL_DECODER = ${MODEL_FINAL}.decoder.yml
|
||||
|
||||
|
||||
## latest model with the same pre-processing but any data or modeltype
|
||||
ifdef CONTINUE_EXISTING
|
||||
|
@ -159,12 +159,12 @@ else
|
||||
endif
|
||||
|
||||
ifneq (${words ${TRGLANGS}},1)
|
||||
PREPROCESS_SCRIPT = preprocess-${PREPROCESS_TYPE}-multi-target.sh
|
||||
PREPROCESS_SCRIPT = scripts/preprocess-${PREPROCESS_TYPE}-multi-target.sh
|
||||
else
|
||||
PREPROCESS_SCRIPT = preprocess-${PREPROCESS_TYPE}.sh
|
||||
PREPROCESS_SCRIPT = scripts/preprocess-${PREPROCESS_TYPE}.sh
|
||||
endif
|
||||
|
||||
POSTPROCESS_SCRIPT = postprocess-${PREPROCESS_TYPE}.sh
|
||||
POSTPROCESS_SCRIPT = scripts/postprocess-${PREPROCESS_TYPE}.sh
|
||||
|
||||
|
||||
|
||||
|
@ -29,9 +29,11 @@ ifeq ($(wildcard ${MODEL_VOCAB}),)
|
||||
ifneq (${MODEL_LATEST_VOCAB},)
|
||||
cp ${MODEL_LATEST_VOCAB} ${MODEL_VOCAB}
|
||||
else
|
||||
ifneq (${MODEL_VOCABTYPE},spm)
|
||||
mkdir -p ${dir $@}
|
||||
${LOADMODS} && ${ZCAT} $^ | ${MARIAN_VOCAB} --max-size ${VOCABSIZE} > $@
|
||||
endif
|
||||
endif
|
||||
else
|
||||
@echo "$@ already exists!"
|
||||
@echo "WARNING! No new vocabulary is created even though the data has changed!"
|
||||
@ -112,6 +114,7 @@ endif
|
||||
|
||||
|
||||
|
||||
|
||||
## NEW: take away dependency on ${MODEL_VOCAB}
|
||||
|
||||
## train transformer model with guided alignment
|
||||
|
@ -1,37 +0,0 @@
|
||||
#!/bin/bash
#
# USAGE preprocess.sh source-langid target-langid bpecodes [noflags] < input > output
#
# Filters stdin to stdout:
#   1. Moses replace-unicode-punctuation.perl
#   2. Moses remove-non-printing-char.perl
#   3. whitespace clean-up (collapse runs of spaces, trim both ends)
#   4. spm_encode with the model given as third argument ("bpecodes")
#   5. unless the fourth argument is "noflags": prefix every line with the
#      target-language token >>target-langid<<
#
# source-langid ($1) is only referenced by the disabled normalisation step
# kept as a comment at the end of this file.
#
# replace MOSESHOME and SPMENCODE with your own setup!

# Look the DNS domain up once. The value is always used quoted, so an empty
# answer (or a hostname without -d support) simply selects the default
# branch instead of breaking the test with "unary operator expected".
DOMAIN=$(hostname -d 2>/dev/null)

case "${DOMAIN}" in
  bullx)
    APPLHOME=/projappl/project_2001569
    MOSESHOME=${APPLHOME}/mosesdecoder
    SPMENCODE=${APPLHOME}/marian-dev/build-spm/spm_encode
    ;;
  csc.fi)
    APPLHOME=/proj/memad/tools
    MOSESHOME=/proj/nlpl/software/moses/4.0-65c75ff/moses
    SPMENCODE=${APPLHOME}/marian-dev/build-spm/spm_encode
    ;;
  *)
    MOSESHOME=${PWD}/mosesdecoder
    SPMENCODE=${PWD}/marian-dev/build/spm_encode
    ;;
esac

MOSESSCRIPTS=${MOSESHOME}/scripts
TOKENIZER=${MOSESSCRIPTS}/tokenizer

# Collapse every run of spaces into one space and trim both line ends.
# The first pattern needs TWO spaces before the star: with a single one
# ('s/ */ /g') the regex also matches the empty string and sed inserts a
# space between all characters.
squeeze_spaces() {
  sed 's/  */ /g;s/^ *//g;s/ *$//g'
}

# Prefix every input line with the target-language token >>$1<<.
add_target_label() {
  sed "s/^/>>$1<< /"
}

# Moses clean-up followed by SentencePiece encoding with model $1.
normalize_and_encode() {
  "${TOKENIZER}/replace-unicode-punctuation.perl" |
    "${TOKENIZER}/remove-non-printing-char.perl" |
    squeeze_spaces |
    "${SPMENCODE}" --model "$1"
}

# Entry point; arguments as described in USAGE above.
# Returns 2 without touching stdin when the model argument is missing.
main() {
  if [[ $# -lt 3 ]]; then
    echo "USAGE: ${0##*/} source-langid target-langid bpecodes [noflags] < input > output" >&2
    return 2
  fi
  if [[ "${4:-}" == "noflags" ]]; then
    normalize_and_encode "$3"
  else
    normalize_and_encode "$3" | add_target_label "$2"
  fi
}

main "$@"

# ${TOKENIZER}/normalize-punctuation.perl -l $1 |
|
@ -1,30 +0,0 @@
|
||||
#!/bin/bash
#
# USAGE preprocess.sh langid bpecodes < input > output
#
# Filters stdin to stdout:
#   1. Moses replace-unicode-punctuation.perl
#   2. Moses remove-non-printing-char.perl
#   3. whitespace clean-up (collapse runs of spaces, trim both ends)
#   4. spm_encode with the model given as second argument ("bpecodes")
#
# langid ($1) is only referenced by the disabled normalisation step kept as
# a comment at the end of this file.
#
# replace MOSESHOME and SPMENCODE with your own setup!

# Look the DNS domain up once. The value is always used quoted, so an empty
# answer (or a hostname without -d support) simply selects the default
# branch instead of breaking the test with "unary operator expected".
DOMAIN=$(hostname -d 2>/dev/null)

case "${DOMAIN}" in
  bullx)
    APPLHOME=/projappl/project_2001569
    MOSESHOME=${APPLHOME}/mosesdecoder
    SPMENCODE=${APPLHOME}/marian-dev/build-spm/spm_encode
    ;;
  csc.fi)
    APPLHOME=/proj/memad/tools
    MOSESHOME=/proj/nlpl/software/moses/4.0-65c75ff/moses
    SPMENCODE=${APPLHOME}/marian-dev/build-spm/spm_encode
    ;;
  *)
    MOSESHOME=${PWD}/mosesdecoder
    SPMENCODE=${PWD}/marian-dev/build/spm_encode
    ;;
esac

MOSESSCRIPTS=${MOSESHOME}/scripts
TOKENIZER=${MOSESSCRIPTS}/tokenizer

# Collapse every run of spaces into one space and trim both line ends.
# The first pattern needs TWO spaces before the star: with a single one
# ('s/ */ /g') the regex also matches the empty string and sed inserts a
# space between all characters.
squeeze_spaces() {
  sed 's/  */ /g;s/^ *//g;s/ *$//g'
}

# Entry point; arguments as described in USAGE above.
# Returns 2 without touching stdin when the model argument is missing.
main() {
  if [[ $# -lt 2 ]]; then
    echo "USAGE: ${0##*/} langid bpecodes < input > output" >&2
    return 2
  fi
  "${TOKENIZER}/replace-unicode-punctuation.perl" |
    "${TOKENIZER}/remove-non-printing-char.perl" |
    squeeze_spaces |
    "${SPMENCODE}" --model "$2"
}

main "$@"

# ${TOKENIZER}/normalize-punctuation.perl -l $1 |
|
@ -1,45 +0,0 @@
|
||||
#!/usr/bin/env bash

# OpenStack RC file: exports the OS_* variables read by the OpenStack
# command line clients and asks for the password on stdin.
# The exports only reach your interactive shell when this file is sourced
# (". <file>") rather than executed.

# To use an OpenStack cloud you need to authenticate against the Identity
# service named keystone, which returns a **Token** and **Service Catalog**.
# The catalog contains the endpoints for all services the user/tenant has
# access to - such as Compute, Image Service, Identity, Object Storage, Block
# Storage, and Networking (code-named nova, glance, keystone, swift,
# cinder, and neutron).
#
# *NOTE*: Using the 3 *Identity API* does not necessarily mean any other
# OpenStack API is version 3. For example, your cloud provider may implement
# Image API v1.1, Block Storage API v2, and Compute API v2.0. OS_AUTH_URL is
# only for the Identity API served through keystone.
export OS_AUTH_URL=https://pouta.csc.fi:5001/v3

# With the addition of Keystone we have standardized on the term **project**
# as the entity that owns the resources.
export OS_PROJECT_ID=64cd34c22e5b479a92cd983b9c42202f
export OS_PROJECT_NAME="project_2000661"
export OS_USER_DOMAIN_NAME="Default"
# Don't leave a blank variable, unset it if it was edited to be empty
if [ -z "$OS_USER_DOMAIN_NAME" ]; then unset OS_USER_DOMAIN_NAME; fi

# unset v2.0 items in case set
unset OS_TENANT_ID
unset OS_TENANT_NAME

# In addition to the owning entity (tenant), OpenStack stores the entity
# performing the action as the **user**.
# export OS_USERNAME="tiedeman"
OS_USERNAME=$(whoami)
export OS_USERNAME

# With Keystone you pass the keystone password.
echo "Please enter your OpenStack Password for project $OS_PROJECT_NAME as user $OS_USERNAME: "
# IFS= keeps leading/trailing whitespace that belongs to the password,
# -r keeps backslashes, -s disables terminal echo.
if IFS= read -sr OS_PASSWORD_INPUT; then
  export OS_PASSWORD=$OS_PASSWORD_INPUT
else
  # No input (e.g. EOF on stdin): do not export an empty password silently.
  echo "WARNING: no password read, OS_PASSWORD is not set" >&2
fi
# Do not keep a second plaintext copy of the password in the shell.
unset OS_PASSWORD_INPUT

# If your configuration has multiple regions, we set that information here.
# OS_REGION_NAME is optional and only valid in certain environments.
export OS_REGION_NAME="regionOne"
# Don't leave a blank variable, unset it if it was empty
if [ -z "$OS_REGION_NAME" ]; then unset OS_REGION_NAME; fi

export OS_INTERFACE=public
export OS_IDENTITY_API_VERSION=3
|
98
scripts/preprocess-spm-multi-target.sh
Executable file
98
scripts/preprocess-spm-multi-target.sh
Executable file
@ -0,0 +1,98 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# USAGE preprocess.sh source-langid target-langid bpecodes [noflags] < input > output
|
||||
#
|
||||
#
|
||||
# replace SPMENCODE with your own setup!
|
||||
|
||||
if [ `hostname -d` == "bullx" ]; then
|
||||
APPLHOME=/projappl/project_2001569
|
||||
SPMENCODE=${APPLHOME}/marian-dev/build-spm/spm_encode
|
||||
else
|
||||
SPMENCODE=`which spm_encode || echo "${PWD}/tools/marian-dev/build/spm_encode"`
|
||||
fi
|
||||
|
||||
|
||||
if [ "$4" == "noflags" ]; then
|
||||
sed -e 's/,/,/g' \
|
||||
-e 's/。 */. /g' \
|
||||
-e 's/、/,/g' \
|
||||
-e 's/”/"/g' \
|
||||
-e 's/“/"/g' \
|
||||
-e 's/∶/:/g' \
|
||||
-e 's/:/:/g' \
|
||||
-e 's/?/\?/g' \
|
||||
-e 's/《/"/g' \
|
||||
-e 's/》/"/g' \
|
||||
-e 's/)/\)/g' \
|
||||
-e 's/!/\!/g' \
|
||||
-e 's/(/\(/g' \
|
||||
-e 's/;/;/g' \
|
||||
-e 's/1/"/g' \
|
||||
-e 's/」/"/g' \
|
||||
-e 's/「/"/g' \
|
||||
-e 's/0/0/g' \
|
||||
-e 's/3/3/g' \
|
||||
-e 's/2/2/g' \
|
||||
-e 's/5/5/g' \
|
||||
-e 's/6/6/g' \
|
||||
-e 's/9/9/g' \
|
||||
-e 's/7/7/g' \
|
||||
-e 's/8/8/g' \
|
||||
-e 's/4/4/g' \
|
||||
-e 's/. */. /g' \
|
||||
-e 's/~/\~/g' \
|
||||
-e "s/’/\'/g" \
|
||||
-e 's/…/\.\.\./g' \
|
||||
-e 's/━/\-/g' \
|
||||
-e 's/〈/\</g' \
|
||||
-e 's/〉/\>/g' \
|
||||
-e 's/【/\[/g' \
|
||||
-e 's/】/\]/g' \
|
||||
-e 's/%/\%/g' |
|
||||
perl -C -pe 's/\p{C}/ /g;' |
|
||||
sed 's/ */ /g;s/^ *//g;s/ *$//g' |
|
||||
${SPMENCODE} --model $3
|
||||
else
|
||||
sed -e 's/,/,/g' \
|
||||
-e 's/。 */. /g' \
|
||||
-e 's/、/,/g' \
|
||||
-e 's/”/"/g' \
|
||||
-e 's/“/"/g' \
|
||||
-e 's/∶/:/g' \
|
||||
-e 's/:/:/g' \
|
||||
-e 's/?/\?/g' \
|
||||
-e 's/《/"/g' \
|
||||
-e 's/》/"/g' \
|
||||
-e 's/)/\)/g' \
|
||||
-e 's/!/\!/g' \
|
||||
-e 's/(/\(/g' \
|
||||
-e 's/;/;/g' \
|
||||
-e 's/1/"/g' \
|
||||
-e 's/」/"/g' \
|
||||
-e 's/「/"/g' \
|
||||
-e 's/0/0/g' \
|
||||
-e 's/3/3/g' \
|
||||
-e 's/2/2/g' \
|
||||
-e 's/5/5/g' \
|
||||
-e 's/6/6/g' \
|
||||
-e 's/9/9/g' \
|
||||
-e 's/7/7/g' \
|
||||
-e 's/8/8/g' \
|
||||
-e 's/4/4/g' \
|
||||
-e 's/. */. /g' \
|
||||
-e 's/~/\~/g' \
|
||||
-e "s/’/\'/g" \
|
||||
-e 's/…/\.\.\./g' \
|
||||
-e 's/━/\-/g' \
|
||||
-e 's/〈/\</g' \
|
||||
-e 's/〉/\>/g' \
|
||||
-e 's/【/\[/g' \
|
||||
-e 's/】/\]/g' \
|
||||
-e 's/%/\%/g' |
|
||||
perl -C -pe 's/\p{C}/ /g;' |
|
||||
sed 's/ */ /g;s/^ *//g;s/ *$//g' |
|
||||
${SPMENCODE} --model $3 |
|
||||
sed "s/^/>>$2<< /"
|
||||
fi
|
||||
|
55
scripts/preprocess-spm.sh
Executable file
55
scripts/preprocess-spm.sh
Executable file
@ -0,0 +1,55 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# USAGE preprocess.sh langid bpecodes < input > output
|
||||
#
|
||||
# replace SPMENCODE with your own setup!
|
||||
|
||||
if [ `hostname -d` == "bullx" ]; then
|
||||
APPLHOME=/projappl/project_2001569
|
||||
SPMENCODE=${APPLHOME}/marian-dev/build-spm/spm_encode
|
||||
else
|
||||
SPMENCODE=`which spm_encode || echo "${PWD}/tools/marian-dev/build/spm_encode"`
|
||||
fi
|
||||
|
||||
## simple pre-processing steps adapted from Moses tools
|
||||
|
||||
sed -e 's/,/,/g' \
|
||||
-e 's/。 */. /g' \
|
||||
-e 's/、/,/g' \
|
||||
-e 's/”/"/g' \
|
||||
-e 's/“/"/g' \
|
||||
-e 's/∶/:/g' \
|
||||
-e 's/:/:/g' \
|
||||
-e 's/?/\?/g' \
|
||||
-e 's/《/"/g' \
|
||||
-e 's/》/"/g' \
|
||||
-e 's/)/\)/g' \
|
||||
-e 's/!/\!/g' \
|
||||
-e 's/(/\(/g' \
|
||||
-e 's/;/;/g' \
|
||||
-e 's/1/"/g' \
|
||||
-e 's/」/"/g' \
|
||||
-e 's/「/"/g' \
|
||||
-e 's/0/0/g' \
|
||||
-e 's/3/3/g' \
|
||||
-e 's/2/2/g' \
|
||||
-e 's/5/5/g' \
|
||||
-e 's/6/6/g' \
|
||||
-e 's/9/9/g' \
|
||||
-e 's/7/7/g' \
|
||||
-e 's/8/8/g' \
|
||||
-e 's/4/4/g' \
|
||||
-e 's/. */. /g' \
|
||||
-e 's/~/\~/g' \
|
||||
-e "s/’/\'/g" \
|
||||
-e 's/…/\.\.\./g' \
|
||||
-e 's/━/\-/g' \
|
||||
-e 's/〈/\</g' \
|
||||
-e 's/〉/\>/g' \
|
||||
-e 's/【/\[/g' \
|
||||
-e 's/】/\]/g' \
|
||||
-e 's/%/\%/g' |
|
||||
perl -C -pe 's/\p{C}/ /g;' |
|
||||
sed 's/ */ /g;s/^ *//g;s/ *$//g' |
|
||||
${SPMENCODE} --model $2
|
||||
|
Loading…
Reference in New Issue
Block a user