From ddafb43d66a9b1264fb5edaf76240f9ff4877c91 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rg=20Tiedemann?= Date: Sat, 12 Sep 2020 14:42:10 +0300 Subject: [PATCH] removed dependence on moses tools in preprocessing script for released spm packages --- lib/config.mk | 3 +- lib/dist.mk | 6 +- lib/train.mk | 3 + preprocess-spm-multi-target.sh | 37 ------- preprocess-spm.sh | 30 ------ project_2000661-openrc-backup.sh | 45 --------- .../postprocess-bpe.sh | 0 .../postprocess-spm.sh | 0 .../preprocess-bpe-multi-target.sh | 0 .../preprocess-bpe.sh | 0 scripts/preprocess-spm-multi-target.sh | 98 +++++++++++++++++++ scripts/preprocess-spm.sh | 55 +++++++++++ 12 files changed, 161 insertions(+), 116 deletions(-) delete mode 100755 preprocess-spm-multi-target.sh delete mode 100755 preprocess-spm.sh delete mode 100755 project_2000661-openrc-backup.sh rename postprocess-bpe.sh => scripts/postprocess-bpe.sh (100%) rename postprocess-spm.sh => scripts/postprocess-spm.sh (100%) rename preprocess-bpe-multi-target.sh => scripts/preprocess-bpe-multi-target.sh (100%) rename preprocess-bpe.sh => scripts/preprocess-bpe.sh (100%) create mode 100755 scripts/preprocess-spm-multi-target.sh create mode 100755 scripts/preprocess-spm.sh diff --git a/lib/config.mk b/lib/config.mk index 11b6d3f7..361eb3bd 100644 --- a/lib/config.mk +++ b/lib/config.mk @@ -337,9 +337,10 @@ MODEL_VALIDLOG = ${MODEL}.${MODELTYPE}.valid${NR}.log MODEL_TRAINLOG = ${MODEL}.${MODELTYPE}.train${NR}.log MODEL_START = ${WORKDIR}/${MODEL_BASENAME}.npz MODEL_FINAL = ${WORKDIR}/${MODEL_BASENAME}.npz.best-perplexity.npz +MODEL_DECODER = ${MODEL_FINAL}.decoder.yml MODEL_VOCABTYPE = yml MODEL_VOCAB = ${WORKDIR}/${MODEL}.vocab.${MODEL_VOCABTYPE} -MODEL_DECODER = ${MODEL_FINAL}.decoder.yml + ## latest model with the same pre-processing but any data or modeltype ifdef CONTINUE_EXISTING diff --git a/lib/dist.mk b/lib/dist.mk index 154039ea..52b1a10f 100644 --- a/lib/dist.mk +++ b/lib/dist.mk @@ -159,12 +159,12 @@ else endif ifneq (${words ${TRGLANGS}},1) - PREPROCESS_SCRIPT = preprocess-${PREPROCESS_TYPE}-multi-target.sh + PREPROCESS_SCRIPT = scripts/preprocess-${PREPROCESS_TYPE}-multi-target.sh else - PREPROCESS_SCRIPT = preprocess-${PREPROCESS_TYPE}.sh + PREPROCESS_SCRIPT = scripts/preprocess-${PREPROCESS_TYPE}.sh endif -POSTPROCESS_SCRIPT = postprocess-${PREPROCESS_TYPE}.sh +POSTPROCESS_SCRIPT = scripts/postprocess-${PREPROCESS_TYPE}.sh diff --git a/lib/train.mk b/lib/train.mk index 66a2e17f..7fb26891 100644 --- a/lib/train.mk +++ b/lib/train.mk @@ -29,9 +29,11 @@ ifeq ($(wildcard ${MODEL_VOCAB}),) ifneq (${MODEL_LATEST_VOCAB},) cp ${MODEL_LATEST_VOCAB} ${MODEL_VOCAB} else +ifneq (${MODEL_VOCABTYPE},spm) mkdir -p ${dir $@} ${LOADMODS} && ${ZCAT} $^ | ${MARIAN_VOCAB} --max-size ${VOCABSIZE} > $@ endif +endif else @echo "$@ already exists!" @echo "WARNING! No new vocabulary is created even though the data has changed!" @@ -112,6 +114,7 @@ endif + ## NEW: take away dependency on ${MODEL_VOCAB} ## train transformer model with guided alignment diff --git a/preprocess-spm-multi-target.sh b/preprocess-spm-multi-target.sh deleted file mode 100755 index 2bc9fa07..00000000 --- a/preprocess-spm-multi-target.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/bash -# -# USAGE preprocess.sh source-langid target-langid bpecodes [noflags] < input > output -# -# -# replace MOSESHOME and SPMENCODE with your own setup! - -if [ `hostname -d` == "bullx" ]; then - APPLHOME=/projappl/project_2001569 - MOSESHOME=${APPLHOME}/mosesdecoder - SPMENCODE=${APPLHOME}/marian-dev/build-spm/spm_encode -elif [ `hostname -d` == "csc.fi" ]; then - APPLHOME=/proj/memad/tools - MOSESHOME=/proj/nlpl/software/moses/4.0-65c75ff/moses - SPMENCODE=${APPLHOME}/marian-dev/build-spm/spm_encode -else - MOSESHOME=${PWD}/mosesdecoder - SPMENCODE=${PWD}/marian-dev/build/spm_encode -fi - -MOSESSCRIPTS=${MOSESHOME}/scripts -TOKENIZER=${MOSESSCRIPTS}/tokenizer - -if [ "$4" == "noflags" ]; then - ${TOKENIZER}/replace-unicode-punctuation.perl | - ${TOKENIZER}/remove-non-printing-char.perl | - sed 's/ */ /g;s/^ *//g;s/ *$//g' | - ${SPMENCODE} --model $3 -else - ${TOKENIZER}/replace-unicode-punctuation.perl | - ${TOKENIZER}/remove-non-printing-char.perl | - sed 's/ */ /g;s/^ *//g;s/ *$//g' | - ${SPMENCODE} --model $3 | - sed "s/^/>>$2<< /" -fi - -# ${TOKENIZER}/normalize-punctuation.perl -l $1 | diff --git a/preprocess-spm.sh b/preprocess-spm.sh deleted file mode 100755 index bf9a3f75..00000000 --- a/preprocess-spm.sh +++ /dev/null @@ -1,30 +0,0 @@ -#!/bin/bash -# -# USAGE preprocess.sh langid bpecodes < input > output -# -# -# replace MOSESHOME and SPMENCODE with your own setup! - -if [ `hostname -d` == "bullx" ]; then - APPLHOME=/projappl/project_2001569 - MOSESHOME=${APPLHOME}/mosesdecoder - SPMENCODE=${APPLHOME}/marian-dev/build-spm/spm_encode -elif [ `hostname -d` == "csc.fi" ]; then - APPLHOME=/proj/memad/tools - MOSESHOME=/proj/nlpl/software/moses/4.0-65c75ff/moses - SPMENCODE=${APPLHOME}/marian-dev/build-spm/spm_encode -else - MOSESHOME=${PWD}/mosesdecoder - SPMENCODE=${PWD}/marian-dev/build/spm_encode -fi - -MOSESSCRIPTS=${MOSESHOME}/scripts -TOKENIZER=${MOSESSCRIPTS}/tokenizer - - -${TOKENIZER}/replace-unicode-punctuation.perl | -${TOKENIZER}/remove-non-printing-char.perl | -sed 's/ */ /g;s/^ *//g;s/ *$//g' | -${SPMENCODE} --model $2 - -# ${TOKENIZER}/normalize-punctuation.perl -l $1 | diff --git a/project_2000661-openrc-backup.sh b/project_2000661-openrc-backup.sh deleted file mode 100755 index 2a86dc10..00000000 --- a/project_2000661-openrc-backup.sh +++ /dev/null @@ -1,45 +0,0 @@ -#!/usr/bin/env bash - -# To use an OpenStack cloud you need to authenticate against the Identity -# service named keystone, which returns a **Token** and **Service Catalog**. -# The catalog contains the endpoints for all services the user/tenant has -# access to - such as Compute, Image Service, Identity, Object Storage, Block -# Storage, and Networking (code-named nova, glance, keystone, swift, -# cinder, and neutron). -# -# *NOTE*: Using the 3 *Identity API* does not necessarily mean any other -# OpenStack API is version 3. For example, your cloud provider may implement -# Image API v1.1, Block Storage API v2, and Compute API v2.0. OS_AUTH_URL is -# only for the Identity API served through keystone. -export OS_AUTH_URL=https://pouta.csc.fi:5001/v3 - -# With the addition of Keystone we have standardized on the term **project** -# as the entity that owns the resources. -export OS_PROJECT_ID=64cd34c22e5b479a92cd983b9c42202f -export OS_PROJECT_NAME="project_2000661" -export OS_USER_DOMAIN_NAME="Default" -if [ -z "$OS_USER_DOMAIN_NAME" ]; then unset OS_USER_DOMAIN_NAME; fi - -# unset v2.0 items in case set -unset OS_TENANT_ID -unset OS_TENANT_NAME - -# In addition to the owning entity (tenant), OpenStack stores the entity -# performing the action as the **user**. -# export OS_USERNAME="tiedeman" -export OS_USERNAME=`whoami` - - -# With Keystone you pass the keystone password. -echo "Please enter your OpenStack Password for project $OS_PROJECT_NAME as user $OS_USERNAME: " -read -sr OS_PASSWORD_INPUT -export OS_PASSWORD=$OS_PASSWORD_INPUT - -# If your configuration has multiple regions, we set that information here. -# OS_REGION_NAME is optional and only valid in certain environments. -export OS_REGION_NAME="regionOne" -# Don't leave a blank variable, unset it if it was empty -if [ -z "$OS_REGION_NAME" ]; then unset OS_REGION_NAME; fi - -export OS_INTERFACE=public -export OS_IDENTITY_API_VERSION=3 diff --git a/postprocess-bpe.sh b/scripts/postprocess-bpe.sh similarity index 100% rename from postprocess-bpe.sh rename to scripts/postprocess-bpe.sh diff --git a/postprocess-spm.sh b/scripts/postprocess-spm.sh similarity index 100% rename from postprocess-spm.sh rename to scripts/postprocess-spm.sh diff --git a/preprocess-bpe-multi-target.sh b/scripts/preprocess-bpe-multi-target.sh similarity index 100% rename from preprocess-bpe-multi-target.sh rename to scripts/preprocess-bpe-multi-target.sh diff --git a/preprocess-bpe.sh b/scripts/preprocess-bpe.sh similarity index 100% rename from preprocess-bpe.sh rename to scripts/preprocess-bpe.sh diff --git a/scripts/preprocess-spm-multi-target.sh b/scripts/preprocess-spm-multi-target.sh new file mode 100755 index 00000000..0dba53f4 --- /dev/null +++ b/scripts/preprocess-spm-multi-target.sh @@ -0,0 +1,98 @@ +#!/bin/bash +# +# USAGE preprocess.sh source-langid target-langid bpecodes [noflags] < input > output +# +# +# replace SPMENCODE with your own setup! + +if [ `hostname -d` == "bullx" ]; then + APPLHOME=/projappl/project_2001569 + SPMENCODE=${APPLHOME}/marian-dev/build-spm/spm_encode +else + SPMENCODE=`which spm_encode || echo "${PWD}/tools/marian-dev/build/spm_encode"` +fi + + +if [ "$4" == "noflags" ]; then + sed -e 's/,/,/g' \ + -e 's/。 */. /g' \ + -e 's/、/,/g' \ + -e 's/”/"/g' \ + -e 's/“/"/g' \ + -e 's/∶/:/g' \ + -e 's/:/:/g' \ + -e 's/?/\?/g' \ + -e 's/《/"/g' \ + -e 's/》/"/g' \ + -e 's/)/\)/g' \ + -e 's/!/\!/g' \ + -e 's/(/\(/g' \ + -e 's/;/;/g' \ + -e 's/1/"/g' \ + -e 's/」/"/g' \ + -e 's/「/"/g' \ + -e 's/0/0/g' \ + -e 's/3/3/g' \ + -e 's/2/2/g' \ + -e 's/5/5/g' \ + -e 's/6/6/g' \ + -e 's/9/9/g' \ + -e 's/7/7/g' \ + -e 's/8/8/g' \ + -e 's/4/4/g' \ + -e 's/. */. /g' \ + -e 's/~/\~/g' \ + -e "s/’/\'/g" \ + -e 's/…/\.\.\./g' \ + -e 's/━/\-/g' \ + -e 's/〈/\/g' \ + -e 's/【/\[/g' \ + -e 's/】/\]/g' \ + -e 's/%/\%/g' | + perl -C -pe 's/\p{C}/ /g;' | + sed 's/ */ /g;s/^ *//g;s/ *$//g' | + ${SPMENCODE} --model $3 +else + sed -e 's/,/,/g' \ + -e 's/。 */. /g' \ + -e 's/、/,/g' \ + -e 's/”/"/g' \ + -e 's/“/"/g' \ + -e 's/∶/:/g' \ + -e 's/:/:/g' \ + -e 's/?/\?/g' \ + -e 's/《/"/g' \ + -e 's/》/"/g' \ + -e 's/)/\)/g' \ + -e 's/!/\!/g' \ + -e 's/(/\(/g' \ + -e 's/;/;/g' \ + -e 's/1/"/g' \ + -e 's/」/"/g' \ + -e 's/「/"/g' \ + -e 's/0/0/g' \ + -e 's/3/3/g' \ + -e 's/2/2/g' \ + -e 's/5/5/g' \ + -e 's/6/6/g' \ + -e 's/9/9/g' \ + -e 's/7/7/g' \ + -e 's/8/8/g' \ + -e 's/4/4/g' \ + -e 's/. */. /g' \ + -e 's/~/\~/g' \ + -e "s/’/\'/g" \ + -e 's/…/\.\.\./g' \ + -e 's/━/\-/g' \ + -e 's/〈/\/g' \ + -e 's/【/\[/g' \ + -e 's/】/\]/g' \ + -e 's/%/\%/g' | + perl -C -pe 's/\p{C}/ /g;' | + sed 's/ */ /g;s/^ *//g;s/ *$//g' | + ${SPMENCODE} --model $3 | + sed "s/^/>>$2<< /" +fi + diff --git a/scripts/preprocess-spm.sh b/scripts/preprocess-spm.sh new file mode 100755 index 00000000..60cb73b3 --- /dev/null +++ b/scripts/preprocess-spm.sh @@ -0,0 +1,55 @@ +#!/bin/bash +# +# USAGE preprocess.sh langid bpecodes < input > output +# +# replace SPMENCODE with your own setup! + +if [ `hostname -d` == "bullx" ]; then + APPLHOME=/projappl/project_2001569 + SPMENCODE=${APPLHOME}/marian-dev/build-spm/spm_encode +else + SPMENCODE=`which spm_encode || echo "${PWD}/tools/marian-dev/build/spm_encode"` +fi + +## simple pre-processing steps adapted from Moses tools + +sed -e 's/,/,/g' \ + -e 's/。 */. /g' \ + -e 's/、/,/g' \ + -e 's/”/"/g' \ + -e 's/“/"/g' \ + -e 's/∶/:/g' \ + -e 's/:/:/g' \ + -e 's/?/\?/g' \ + -e 's/《/"/g' \ + -e 's/》/"/g' \ + -e 's/)/\)/g' \ + -e 's/!/\!/g' \ + -e 's/(/\(/g' \ + -e 's/;/;/g' \ + -e 's/1/"/g' \ + -e 's/」/"/g' \ + -e 's/「/"/g' \ + -e 's/0/0/g' \ + -e 's/3/3/g' \ + -e 's/2/2/g' \ + -e 's/5/5/g' \ + -e 's/6/6/g' \ + -e 's/9/9/g' \ + -e 's/7/7/g' \ + -e 's/8/8/g' \ + -e 's/4/4/g' \ + -e 's/. */. /g' \ + -e 's/~/\~/g' \ + -e "s/’/\'/g" \ + -e 's/…/\.\.\./g' \ + -e 's/━/\-/g' \ + -e 's/〈/\/g' \ + -e 's/【/\[/g' \ + -e 's/】/\]/g' \ + -e 's/%/\%/g' | +perl -C -pe 's/\p{C}/ /g;' | +sed 's/ */ /g;s/^ *//g;s/ *$//g' | +${SPMENCODE} --model $2 +