Patch summary (free text ahead of the first "diff --git" header; patch tools skip it).

  * compile.sh: the third-party prefix can be supplied through $OPT and
    falls back to ./opt, the directory named in the script's own comment.
    Expansions on the bjam line are quoted.
  * contrib/Makefiles/install-dependencies.gmake: recipes reuse ${nproc}
    (set once via $(shell getconf _NPROCESSORS_ONLN)) for -j, and each
    per-target PREFIX assignment is marked "override".
  * scripts/ems/example/config.*: Farasa is mentioned as an optional Arabic
    tokenizer; the setting is left commented out so the default
    input-tokenizer stays in effect. config.basic also drops -T from the
    commented lmplz example.
  * scripts/generic/moses-parallel.pl, qsub-wrapper.pl: open() failures die,
    "$1" inside backticks is escaped as "\$1" so awk receives it, and
    $mosescmd is declared unconditionally (undef when MOSESBIN is unset).
  * scripts/training/clone_moses_model.pl: drop the duplicate "my $src".

NOTE(review): line breaks, blank context lines and leading whitespace were
reconstructed to satisfy the hunk counts; verify against the target tree
before applying.

diff --git a/compile.sh b/compile.sh
index 10de8c406..45c10325c 100755
--- a/compile.sh
+++ b/compile.sh
@@ -1,8 +1,8 @@
 #!/bin/bash
-# this script assumes that all 3rd-party dependencies are installed under ./opt
+# if not supplied otherwise, this script assumes that all 3rd-party dependencies are installed under ./opt
 # you can install all 3rd-party dependencies by running make -f contrib/Makefiles/install-dependencies.gmake
 
 set -e -o pipefail
 
-opt=$(pwd)/opt
-./bjam --with-irstlm=$opt/irstlm-5.80.08 --with-boost=$opt --with-cmph=$opt --with-xmlrpc-c=$opt --with-mm --with-probing-pt -j$(getconf _NPROCESSORS_ONLN) $@
+OPT=${OPT:-$(pwd)/opt}
+./bjam --with-irstlm="$OPT/irstlm-5.80.08" --with-boost="$OPT" --with-cmph="$OPT" --with-xmlrpc-c="$OPT" --with-mm --with-probing-pt -j"$(getconf _NPROCESSORS_ONLN)" "$@"
diff --git a/contrib/Makefiles/install-dependencies.gmake b/contrib/Makefiles/install-dependencies.gmake
index ce29e5172..8262368a8 100644
--- a/contrib/Makefiles/install-dependencies.gmake
+++ b/contrib/Makefiles/install-dependencies.gmake
@@ -56,12 +56,12 @@ sourceforge = http://downloads.sourceforge.net/project
 nproc := $(shell getconf _NPROCESSORS_ONLN)
 sfget = mkdir -p '${TMP}' && cd '${TMP}' && wget -qO- ${URL} | tar xz
 configure-make-install = cd '$1' && ./configure --prefix='${PREFIX}'
-configure-make-install += && make -j$(getconf _NPROCESSORS_ONLN) && make install
+configure-make-install += && make -j${nproc} && make install
 
 # XMLRPC-C for moses server
 xmlrpc: URL=$(sourceforge)/xmlrpc-c/Xmlrpc-c%20Super%20Stable/1.33.17/xmlrpc-c-1.33.17.tgz
 xmlrpc: TMP=$(CWD)/build/xmlrpc
-xmlrpc: PREFIX=${XMLRPC_PREFIX}
+xmlrpc: override PREFIX=${XMLRPC_PREFIX}
 xmlrpc: | $(call safepath,${XMLRPC_PREFIX}/bin/xmlrpc-c-config)
 $(call safepath,${XMLRPC_PREFIX}/bin/xmlrpc-c-config):
 	$(sfget)
@@ -71,7 +71,7 @@ $(call safepath,${XMLRPC_PREFIX}/bin/xmlrpc-c-config):
 # CMPH for CompactPT
 cmph: URL=$(sourceforge)/cmph/cmph/cmph-2.0.tar.gz
 cmph: TMP=$(CWD)/build/cmph
-cmph: PREFIX=${CMPH_PREFIX}
+cmph: override PREFIX=${CMPH_PREFIX}
 cmph: | $(call safepath,${CMPH_PREFIX}/bin/cmph)
 $(call safepath,${CMPH_PREFIX}/bin/cmph):
 	$(sfget)
@@ -82,20 +82,20 @@ $(call safepath,${CMPH_PREFIX}/bin/cmph):
 irstlm: URL=$(sourceforge)/irstlm/irstlm/irstlm-5.80/irstlm-5.80.08.tgz
 irstlm: TMP=$(CWD)/build/irstlm
 irstlm: VERSION=$(basename $(notdir $(irstlm_url)))
-irstlm: PREFIX=${IRSTLM_PREFIX}
+irstlm: override PREFIX=${IRSTLM_PREFIX}
 irstlm: | $(call safepath,$(IRSTLM_PREFIX)/bin/build-lm.sh)
 $(call safepath,$(IRSTLM_PREFIX)/bin/build-lm.sh):
 	$(sfget)
 	cd $$(find '${TMP}' -name trunk) && ./regenerate-makefiles.sh \
-	&& ./configure --prefix='${PREFIX}' && make -j${shell getconf _NPROCESSORS_ONLN} && make install -j$(shell getconf _NPROCESSORS_ONLN)
+	&& ./configure --prefix='${PREFIX}' && make -j${nproc} && make install -j${nproc}
 	rm -rf ${TMP}
 
 # boost
 boost: URL=http://sourceforge.net/projects/boost/files/boost/1.59.0/boost_1_59_0.tar.gz/download
 boost: TMP=$(CWD)/build/boost
-boost: PREFIX=${BOOST_PREFIX}
+boost: override PREFIX=${BOOST_PREFIX}
 boost: | $(call safepath,${BOOST_PREFIX}/include/boost)
 $(call safepath,${BOOST_PREFIX}/include/boost):
 	$(sfget)
-	cd '${TMP}/boost_1_59_0' && ./bootstrap.sh && ./b2 --prefix=${PREFIX} -j$(shell getconf _NPROCESSORS_ONLN) install
+	cd '${TMP}/boost_1_59_0' && ./bootstrap.sh && ./b2 --prefix=${PREFIX} -j${nproc} install
 	rm -rf ${TMP}
diff --git a/scripts/ems/example/config.basic b/scripts/ems/example/config.basic
index 006d7022e..257166721 100644
--- a/scripts/ems/example/config.basic
+++ b/scripts/ems/example/config.basic
@@ -51,6 +51,12 @@ ttable-binarizer = "$moses-bin-dir/CreateOnDiskPt 1 1 4 100 2"
 input-tokenizer = "$moses-script-dir/tokenizer/tokenizer.perl -a -l $input-extension"
 output-tokenizer = "$moses-script-dir/tokenizer/tokenizer.perl -a -l $output-extension"
 
+# For Arabic tokenizer try Farasa (download: http://qatsdemo.cloudapp.net/farasa/)
+# Abdelali, Darwish, Durrani, Mubarak (NAACL demo 2016)
+# "Farasa: A Fast and Furious Segmenter for Arabic"
+#input-tokenizer = "$farasa-dir/farasa_moses.sh"
+
+
 # truecasers - comment out if you do not use the truecaser
 input-truecaser = $moses-script-dir/recaser/truecase.perl
 output-truecaser = $moses-script-dir/recaser/truecase.perl
@@ -389,7 +395,7 @@ alignment-symmetrization-method = grow-diag-final-and
 #
 #operation-sequence-model = "yes"
 #operation-sequence-model-order = 5
-#operation-sequence-model-settings = "-lmplz '$moses-src-dir/bin/lmplz -S 40% -T $working-dir/model/tmp'"
+#operation-sequence-model-settings = "-lmplz '$moses-src-dir/bin/lmplz -S 40% '"
 #
 # OR if you want to use with SRILM
 #
diff --git a/scripts/ems/example/config.factored b/scripts/ems/example/config.factored
index ef5b81010..6f7beb438 100644
--- a/scripts/ems/example/config.factored
+++ b/scripts/ems/example/config.factored
@@ -51,6 +51,11 @@ ttable-binarizer = "$moses-bin-dir/CreateOnDiskPt 1 1 4 100 2"
 input-tokenizer = "$moses-script-dir/tokenizer/tokenizer.perl -a -l $input-extension"
 output-tokenizer = "$moses-script-dir/tokenizer/tokenizer.perl -a -l $output-extension"
 
+# For Arabic tokenizer try Farasa (download: http://qatsdemo.cloudapp.net/farasa/)
+# Abdelali, Darwish, Durrani, Mubarak (NAACL demo 2016)
+# "Farasa: A Fast and Furious Segmenter for Arabic"
+#input-tokenizer = "$farasa-dir/farasa_moses.sh"
+
 # truecasers - comment out if you do not use the truecaser
 input-truecaser = $moses-script-dir/recaser/truecase.perl
 output-truecaser = $moses-script-dir/recaser/truecase.perl
diff --git a/scripts/ems/example/config.hierarchical b/scripts/ems/example/config.hierarchical
index 9f389710b..6fb77a18a 100644
--- a/scripts/ems/example/config.hierarchical
+++ b/scripts/ems/example/config.hierarchical
@@ -54,6 +54,11 @@ ttable-binarizer = "$moses-bin-dir/CreateOnDiskPt 1 1 4 100 2"
 input-tokenizer = "$moses-script-dir/tokenizer/tokenizer.perl -a -l $input-extension"
 output-tokenizer = "$moses-script-dir/tokenizer/tokenizer.perl -a -l $output-extension"
 
+# For Arabic tokenizer try Farasa (download: http://qatsdemo.cloudapp.net/farasa/)
+# Abdelali, Darwish, Durrani, Mubarak (NAACL demo 2016)
+# "Farasa: A Fast and Furious Segmenter for Arabic"
+#input-tokenizer = "$farasa-dir/farasa_moses.sh"
+
 # truecasers - comment out if you do not use the truecaser
 input-truecaser = $moses-script-dir/recaser/truecase.perl
 output-truecaser = $moses-script-dir/recaser/truecase.perl
diff --git a/scripts/ems/example/config.syntax b/scripts/ems/example/config.syntax
index c6133784a..ddde6baad 100644
--- a/scripts/ems/example/config.syntax
+++ b/scripts/ems/example/config.syntax
@@ -54,6 +54,11 @@ ttable-binarizer = "$moses-bin-dir/CreateOnDiskPt 1 1 4 100 2"
 input-tokenizer = "$moses-script-dir/tokenizer/tokenizer.perl -a -l $input-extension"
 output-tokenizer = "$moses-script-dir/tokenizer/tokenizer.perl -a -l $output-extension"
 
+# For Arabic tokenizer try Farasa (download: http://qatsdemo.cloudapp.net/farasa/)
+# Abdelali, Darwish, Durrani, Mubarak (NAACL demo 2016)
+# "Farasa: A Fast and Furious Segmenter for Arabic"
+#input-tokenizer = "$farasa-dir/farasa_moses.sh"
+
 # truecasers - comment out if you do not use the truecaser
 input-truecaser = $moses-script-dir/recaser/truecase.perl
 output-truecaser = $moses-script-dir/recaser/truecase.perl
diff --git a/scripts/ems/example/config.toy b/scripts/ems/example/config.toy
index 0b2975b22..dff4ed10d 100644
--- a/scripts/ems/example/config.toy
+++ b/scripts/ems/example/config.toy
@@ -51,6 +51,11 @@ ttable-binarizer = "$moses-bin-dir/CreateOnDiskPt 1 1 4 100 2"
 input-tokenizer = "$moses-script-dir/tokenizer/tokenizer.perl -a -l $input-extension"
 output-tokenizer = "$moses-script-dir/tokenizer/tokenizer.perl -a -l $output-extension"
 
+# For Arabic tokenizer try Farasa (download: http://qatsdemo.cloudapp.net/farasa/)
+# Abdelali, Darwish, Durrani, Mubarak (NAACL demo 2016)
+# "Farasa: A Fast and Furious Segmenter for Arabic"
+#input-tokenizer = "$farasa-dir/farasa_moses.sh"
+
 # truecasers - comment out if you do not use the truecaser
 input-truecaser = $moses-script-dir/recaser/truecase.perl
 output-truecaser = $moses-script-dir/recaser/truecase.perl
diff --git a/scripts/ems/example/config.toy.bilinguallm b/scripts/ems/example/config.toy.bilinguallm
index abda2adc2..f4730a80f 100644
--- a/scripts/ems/example/config.toy.bilinguallm
+++ b/scripts/ems/example/config.toy.bilinguallm
@@ -51,6 +51,11 @@ ttable-binarizer = "$moses-bin-dir/CreateOnDiskPt 1 1 4 100 2"
 input-tokenizer = "$moses-script-dir/tokenizer/tokenizer.perl -a -l $input-extension"
 output-tokenizer = "$moses-script-dir/tokenizer/tokenizer.perl -a -l $output-extension"
 
+# For Arabic tokenizer try Farasa (download: http://qatsdemo.cloudapp.net/farasa/)
+# Abdelali, Darwish, Durrani, Mubarak (NAACL demo 2016)
+# "Farasa: A Fast and Furious Segmenter for Arabic"
+#input-tokenizer = "$farasa-dir/farasa_moses.sh"
+
 # truecasers - comment out if you do not use the truecaser
 input-truecaser = $moses-script-dir/recaser/truecase.perl
 output-truecaser = $moses-script-dir/recaser/truecase.perl
diff --git a/scripts/generic/moses-parallel.pl b/scripts/generic/moses-parallel.pl
index 7d9d61658..f548bf799 100755
--- a/scripts/generic/moses-parallel.pl
+++ b/scripts/generic/moses-parallel.pl
@@ -59,7 +59,7 @@ my $help=0;
 my $dbg=0;
 my $jobs=4;
 my $cache_model=undef;
-my $mosescmd="$ENV{MOSESBIN}/moses"; #decoder in use
+my $mosescmd = defined $ENV{"MOSESBIN"} ? "$ENV{MOSESBIN}/moses" : undef; #decoder in use
 my $inputlist=undef;
 my $inputfile=undef;
 my $inputtype=0;
@@ -276,7 +276,7 @@ sub getNbestParameters(){
 #get parameters for search graph computation (possibly from configuration file)
 sub getSearchGraphParameters(){
   if (!$searchgraphlist){
-    open (CFG, "$cfgfile");
+    open (CFG, $cfgfile) or die "Can't read '$cfgfile'";
     while (chomp($_=<CFG>)){
       if (/^\[output-search-graph\]/ || /^\[osg\]/){
         my $tmp;
@@ -299,7 +299,7 @@ sub getSearchGraphParameters(){
 #get parameters for word graph computation (possibly from configuration file)
 sub getWordGraphParameters(){
   if (!$wordgraphlist){
-    open (CFG, "$cfgfile");
+    open (CFG, $cfgfile) or die "Can't read '$cfgfile'";
     while (chomp($_=<CFG>)){
       if (/^\[output-word-graph\]/ || /^\[owg\]/){
         my $tmp;
@@ -843,12 +843,14 @@ sub concatenate_nbest(){
 
     #computing the length of each input file
     my @in=();
-    open (IN, "${inputfile}.${splitpfx}${idx}.trans");
+    open (IN, "${inputfile}.${splitpfx}${idx}.trans")
+      or die "Failed to open '${inputfile}.${splitpfx}${idx}.trans'";
     @in=<IN>;
     close(IN);
     $inplength{$idx} = scalar(@in);
 
-    open (IN, "${nbestfile}.${splitpfx}${idx}");
+    open (IN, "${nbestfile}.${splitpfx}${idx}")
+      or die "Failed to open '${nbestfile}.${splitpfx}${idx}'";
     while (<IN>){
       my ($code,@extra)=split(/\|\|\|/,$_);
       $code += $offset;
@@ -1078,7 +1080,7 @@ sub safesystem {
 sub getPwdCmd(){
 	my $pwdcmd="pwd";
 	my $a;
-	chomp($a=`which pawd 2> /dev/null | head -1 | awk '{print $1}'`);
+	chomp($a=`which pawd 2> /dev/null | head -1 | awk '{print \$1}'`);
 	if ($a && -e $a){ $pwdcmd=$a; }
 	return $pwdcmd;
 }
diff --git a/scripts/generic/qsub-wrapper.pl b/scripts/generic/qsub-wrapper.pl
index c282b0600..b419f27fb 100755
--- a/scripts/generic/qsub-wrapper.pl
+++ b/scripts/generic/qsub-wrapper.pl
@@ -252,7 +252,7 @@ sub safesystem {
 sub getPwdCmd(){
 	my $pwdcmd="pwd";
 	my $a;
-	chomp($a=`which pawd 2> /dev/null | head -1 | awk '{print $1}'`);
+	chomp($a=`which pawd 2> /dev/null | head -1 | awk '{print \$1}'`);
 	if ($a && -e $a){ $pwdcmd=$a; }
 	return $pwdcmd;
 }
diff --git a/scripts/training/clone_moses_model.pl b/scripts/training/clone_moses_model.pl
index 18dc4aa41..488415f8b 100755
--- a/scripts/training/clone_moses_model.pl
+++ b/scripts/training/clone_moses_model.pl
@@ -123,7 +123,7 @@ sub clone_file_or_die {
   my $src = shift;
   my $tgt = shift;
 
-  my $src = resolve($src); # resolve symlinks
+  $src = resolve($src); # resolve symlinks
   my $ok = 0;
 
   if ($symlink) {