Removed the dependency on Moses tools from the preprocessing scripts for released SPM packages

2024-07-14 15:40:33 +03:00 · 2020-09-12 14:42:10 +03:00 · 2020-09-12 14:42:10 +03:00 · ddafb43d66
commit ddafb43d66
parent c0cb356417
12 changed files with 161 additions and 116 deletions
--- a/lib/config.mk
+++ b/lib/config.mk
@ -337,9 +337,10 @@ MODEL_VALIDLOG  = ${MODEL}.${MODELTYPE}.valid${NR}.log
 MODEL_TRAINLOG  = ${MODEL}.${MODELTYPE}.train${NR}.log
 MODEL_START     = ${WORKDIR}/${MODEL_BASENAME}.npz
 MODEL_FINAL     = ${WORKDIR}/${MODEL_BASENAME}.npz.best-perplexity.npz
+MODEL_DECODER   = ${MODEL_FINAL}.decoder.yml
 MODEL_VOCABTYPE = yml
 MODEL_VOCAB     = ${WORKDIR}/${MODEL}.vocab.${MODEL_VOCABTYPE}
-MODEL_DECODER   = ${MODEL_FINAL}.decoder.yml
+

 ## latest model with the same pre-processing but any data or modeltype
 ifdef CONTINUE_EXISTING
--- a/lib/dist.mk
+++ b/lib/dist.mk
@ -159,12 +159,12 @@ else
 endif

 ifneq (${words ${TRGLANGS}},1)
-  PREPROCESS_SCRIPT = preprocess-${PREPROCESS_TYPE}-multi-target.sh
+  PREPROCESS_SCRIPT = scripts/preprocess-${PREPROCESS_TYPE}-multi-target.sh
 else
-  PREPROCESS_SCRIPT = preprocess-${PREPROCESS_TYPE}.sh
+  PREPROCESS_SCRIPT = scripts/preprocess-${PREPROCESS_TYPE}.sh
 endif

-POSTPROCESS_SCRIPT = postprocess-${PREPROCESS_TYPE}.sh
+POSTPROCESS_SCRIPT = scripts/postprocess-${PREPROCESS_TYPE}.sh



--- a/lib/train.mk
+++ b/lib/train.mk
@ -29,9 +29,11 @@ ifeq ($(wildcard ${MODEL_VOCAB}),)
 ifneq (${MODEL_LATEST_VOCAB},)
 	cp ${MODEL_LATEST_VOCAB} ${MODEL_VOCAB}
 else
+ifneq (${MODEL_VOCABTYPE},spm)
 	mkdir -p ${dir $@}
 	${LOADMODS} && ${ZCAT} $^ | ${MARIAN_VOCAB} --max-size ${VOCABSIZE} > $@
 endif
+endif
 else
 	@echo "$@ already exists!"
 	@echo "WARNING! No new vocabulary is created even though the data has changed!"
@ -112,6 +114,7 @@ endif



+
 ## NEW: take away dependency on ${MODEL_VOCAB}

 ## train transformer model with guided alignment
--- a/preprocess-spm-multi-target.sh
+++ b/preprocess-spm-multi-target.sh
@ -1,37 +0,0 @@
-#!/bin/bash
-#
-# USAGE preprocess.sh source-langid target-langid bpecodes [noflags] < input > output
-#
-#
-# replace MOSESHOME and SPMENCODE with your own setup! 
-
-if [ `hostname -d` == "bullx" ]; then
-  APPLHOME=/projappl/project_2001569
-  MOSESHOME=${APPLHOME}/mosesdecoder
-  SPMENCODE=${APPLHOME}/marian-dev/build-spm/spm_encode
-elif [ `hostname -d` == "csc.fi" ]; then
-  APPLHOME=/proj/memad/tools
-  MOSESHOME=/proj/nlpl/software/moses/4.0-65c75ff/moses
-  SPMENCODE=${APPLHOME}/marian-dev/build-spm/spm_encode
-else
-  MOSESHOME=${PWD}/mosesdecoder
-  SPMENCODE=${PWD}/marian-dev/build/spm_encode
-fi
-
-MOSESSCRIPTS=${MOSESHOME}/scripts
-TOKENIZER=${MOSESSCRIPTS}/tokenizer
-
-if [ "$4" == "noflags" ]; then
-  ${TOKENIZER}/replace-unicode-punctuation.perl |
-  ${TOKENIZER}/remove-non-printing-char.perl |
-  sed 's/  */ /g;s/^ *//g;s/ *$//g' |
-  ${SPMENCODE} --model $3
-else
-  ${TOKENIZER}/replace-unicode-punctuation.perl |
-  ${TOKENIZER}/remove-non-printing-char.perl |
-  sed 's/  */ /g;s/^ *//g;s/ *$//g' |
-  ${SPMENCODE} --model $3 |
-  sed "s/^/>>$2<< /"
-fi
-
-# ${TOKENIZER}/normalize-punctuation.perl -l $1 |
--- a/preprocess-spm.sh
+++ b/preprocess-spm.sh
@ -1,30 +0,0 @@
-#!/bin/bash
-#
-# USAGE preprocess.sh langid bpecodes < input > output
-#
-#
-# replace MOSESHOME and SPMENCODE with your own setup! 
-
-if [ `hostname -d` == "bullx" ]; then
-  APPLHOME=/projappl/project_2001569
-  MOSESHOME=${APPLHOME}/mosesdecoder
-  SPMENCODE=${APPLHOME}/marian-dev/build-spm/spm_encode
-elif [ `hostname -d` == "csc.fi" ]; then
-  APPLHOME=/proj/memad/tools
-  MOSESHOME=/proj/nlpl/software/moses/4.0-65c75ff/moses
-  SPMENCODE=${APPLHOME}/marian-dev/build-spm/spm_encode
-else
-  MOSESHOME=${PWD}/mosesdecoder
-  SPMENCODE=${PWD}/marian-dev/build/spm_encode
-fi
-
-MOSESSCRIPTS=${MOSESHOME}/scripts
-TOKENIZER=${MOSESSCRIPTS}/tokenizer
-
-
-${TOKENIZER}/replace-unicode-punctuation.perl |
-${TOKENIZER}/remove-non-printing-char.perl |
-sed 's/  */ /g;s/^ *//g;s/ *$//g' |
-${SPMENCODE} --model $2
-
-# ${TOKENIZER}/normalize-punctuation.perl -l $1 |
--- a/project_2000661-openrc-backup.sh
+++ b/project_2000661-openrc-backup.sh
@ -1,45 +0,0 @@
-#!/usr/bin/env bash
-
-# To use an OpenStack cloud you need to authenticate against the Identity
-# service named keystone, which returns a **Token** and **Service Catalog**.
-# The catalog contains the endpoints for all services the user/tenant has
-# access to - such as Compute, Image Service, Identity, Object Storage, Block
-# Storage, and Networking (code-named nova, glance, keystone, swift,
-# cinder, and neutron).
-#
-# *NOTE*: Using the 3 *Identity API* does not necessarily mean any other
-# OpenStack API is version 3. For example, your cloud provider may implement
-# Image API v1.1, Block Storage API v2, and Compute API v2.0. OS_AUTH_URL is
-# only for the Identity API served through keystone.
-export OS_AUTH_URL=https://pouta.csc.fi:5001/v3
-
-# With the addition of Keystone we have standardized on the term **project**
-# as the entity that owns the resources.
-export OS_PROJECT_ID=64cd34c22e5b479a92cd983b9c42202f
-export OS_PROJECT_NAME="project_2000661"
-export OS_USER_DOMAIN_NAME="Default"
-if [ -z "$OS_USER_DOMAIN_NAME" ]; then unset OS_USER_DOMAIN_NAME; fi
-
-# unset v2.0 items in case set
-unset OS_TENANT_ID
-unset OS_TENANT_NAME
-
-# In addition to the owning entity (tenant), OpenStack stores the entity
-# performing the action as the **user**.
-# export OS_USERNAME="tiedeman"
-export OS_USERNAME=`whoami`
-
-
-# With Keystone you pass the keystone password.
-echo "Please enter your OpenStack Password for project $OS_PROJECT_NAME as user $OS_USERNAME: "
-read -sr OS_PASSWORD_INPUT
-export OS_PASSWORD=$OS_PASSWORD_INPUT
-
-# If your configuration has multiple regions, we set that information here.
-# OS_REGION_NAME is optional and only valid in certain environments.
-export OS_REGION_NAME="regionOne"
-# Don't leave a blank variable, unset it if it was empty
-if [ -z "$OS_REGION_NAME" ]; then unset OS_REGION_NAME; fi
-
-export OS_INTERFACE=public
-export OS_IDENTITY_API_VERSION=3
--- a/scripts/postprocess-bpe.sh
+++ b/scripts/postprocess-bpe.sh
--- a/scripts/postprocess-spm.sh
+++ b/scripts/postprocess-spm.sh
--- a/scripts/preprocess-bpe-multi-target.sh
+++ b/scripts/preprocess-bpe-multi-target.sh
--- a/scripts/preprocess-bpe.sh
+++ b/scripts/preprocess-bpe.sh
--- a/scripts/preprocess-spm-multi-target.sh
+++ b/scripts/preprocess-spm-multi-target.sh
@ -0,0 +1,98 @@
+#!/bin/bash
+#
+# USAGE preprocess.sh source-langid target-langid spm-model [noflags] < input > output
+# normalizes punctuation and whitespace, applies the SentencePiece model ($3) and, unless
+# "noflags" is given as $4, prepends the target-language flag >>$2<< to every line
+# replace SPMENCODE with your own setup!
+
+if [ "$(hostname -d 2>/dev/null)" == "bullx" ]; then
+  APPLHOME=/projappl/project_2001569
+  SPMENCODE=${APPLHOME}/marian-dev/build-spm/spm_encode
+else
+  SPMENCODE=$(command -v spm_encode || echo "${PWD}/tools/marian-dev/build/spm_encode")
+fi
+
+## NOTE: (?!\n) in the perl step keeps the line break (it is part of \p{C}), so lines are not merged
+if [ "$4" == "noflags" ]; then
+    sed -e 's/，/,/g' \
+	-e 's/。 */. /g' \
+	-e 's/、/,/g' \
+	-e 's/”/"/g' \
+	-e 's/“/"/g' \
+	-e 's/∶/:/g' \
+	-e 's/：/:/g' \
+	-e 's/？/\?/g' \
+	-e 's/《/"/g' \
+	-e 's/》/"/g' \
+	-e 's/）/\)/g' \
+	-e 's/！/\!/g' \
+	-e 's/（/\(/g' \
+	-e 's/；/;/g' \
+	-e 's/１/1/g' \
+	-e 's/」/"/g' \
+	-e 's/「/"/g' \
+	-e 's/０/0/g' \
+	-e 's/３/3/g' \
+	-e 's/２/2/g' \
+	-e 's/５/5/g' \
+	-e 's/６/6/g' \
+	-e 's/９/9/g' \
+	-e 's/７/7/g' \
+	-e 's/８/8/g' \
+	-e 's/４/4/g' \
+	-e 's/． */. /g' \
+	-e 's/～/\~/g' \
+	-e "s/’/\'/g" \
+	-e 's/…/\.\.\./g' \
+	-e 's/━/\-/g' \
+	-e 's/〈/\</g' \
+	-e 's/〉/\>/g' \
+	-e 's/【/\[/g' \
+	-e 's/】/\]/g' \
+	-e 's/％/\%/g' |
+	perl -C -pe 's/(?!\n)\p{C}/ /g;' |
+	sed 's/  */ /g;s/^ *//g;s/ *$//g' |
+	"${SPMENCODE}" --model "$3"
+else
+    sed -e 's/，/,/g' \
+	-e 's/。 */. /g' \
+	-e 's/、/,/g' \
+	-e 's/”/"/g' \
+	-e 's/“/"/g' \
+	-e 's/∶/:/g' \
+	-e 's/：/:/g' \
+	-e 's/？/\?/g' \
+	-e 's/《/"/g' \
+	-e 's/》/"/g' \
+	-e 's/）/\)/g' \
+	-e 's/！/\!/g' \
+	-e 's/（/\(/g' \
+	-e 's/；/;/g' \
+	-e 's/１/1/g' \
+	-e 's/」/"/g' \
+	-e 's/「/"/g' \
+	-e 's/０/0/g' \
+	-e 's/３/3/g' \
+	-e 's/２/2/g' \
+	-e 's/５/5/g' \
+	-e 's/６/6/g' \
+	-e 's/９/9/g' \
+	-e 's/７/7/g' \
+	-e 's/８/8/g' \
+	-e 's/４/4/g' \
+	-e 's/． */. /g' \
+	-e 's/～/\~/g' \
+	-e "s/’/\'/g" \
+	-e 's/…/\.\.\./g' \
+	-e 's/━/\-/g' \
+	-e 's/〈/\</g' \
+	-e 's/〉/\>/g' \
+	-e 's/【/\[/g' \
+	-e 's/】/\]/g' \
+	-e 's/％/\%/g' |
+	perl -C -pe 's/(?!\n)\p{C}/ /g;' |
+	sed 's/  */ /g;s/^ *//g;s/ *$//g' |
+	"${SPMENCODE}" --model "$3" |
+	sed "s/^/>>$2<< /"
+fi
+
--- a/scripts/preprocess-spm.sh
+++ b/scripts/preprocess-spm.sh
@ -0,0 +1,55 @@
+#!/bin/bash
+#
+# USAGE preprocess.sh langid spm-model < input > output
+# normalizes punctuation and whitespace, then applies the SentencePiece model given as $2
+# replace SPMENCODE with your own setup!
+
+if [ "$(hostname -d 2>/dev/null)" == "bullx" ]; then
+  APPLHOME=/projappl/project_2001569
+  SPMENCODE=${APPLHOME}/marian-dev/build-spm/spm_encode
+else
+  SPMENCODE=$(command -v spm_encode || echo "${PWD}/tools/marian-dev/build/spm_encode")
+fi
+
+## simple pre-processing steps adapted from Moses tools
+## NOTE: (?!\n) in the perl step keeps the line break (it is part of \p{C}), so lines are not merged
+sed -e 's/，/,/g' \
+    -e 's/。 */. /g' \
+    -e 's/、/,/g' \
+    -e 's/”/"/g' \
+    -e 's/“/"/g' \
+    -e 's/∶/:/g' \
+    -e 's/：/:/g' \
+    -e 's/？/\?/g' \
+    -e 's/《/"/g' \
+    -e 's/》/"/g' \
+    -e 's/）/\)/g' \
+    -e 's/！/\!/g' \
+    -e 's/（/\(/g' \
+    -e 's/；/;/g' \
+    -e 's/１/1/g' \
+    -e 's/」/"/g' \
+    -e 's/「/"/g' \
+    -e 's/０/0/g' \
+    -e 's/３/3/g' \
+    -e 's/２/2/g' \
+    -e 's/５/5/g' \
+    -e 's/６/6/g' \
+    -e 's/９/9/g' \
+    -e 's/７/7/g' \
+    -e 's/８/8/g' \
+    -e 's/４/4/g' \
+    -e 's/． */. /g' \
+    -e 's/～/\~/g' \
+    -e "s/’/\'/g" \
+    -e 's/…/\.\.\./g' \
+    -e 's/━/\-/g' \
+    -e 's/〈/\</g' \
+    -e 's/〉/\>/g' \
+    -e 's/【/\[/g' \
+    -e 's/】/\]/g' \
+    -e 's/％/\%/g' |
+perl -C -pe 's/(?!\n)\p{C}/ /g;' |
+sed 's/  */ /g;s/^ *//g;s/ *$//g' |
+"${SPMENCODE}" --model "$2"
+