Merge branch 'master' of github.com:moses-smt/mosesdecoder

2024-12-27 22:14:57 +03:00 · 2015-04-03 15:47:22 +01:00 · 2015-04-03 15:47:22 +01:00 · 44c5ae344f
commit 44c5ae344f
parent 217f389230 8d8097632b
152 changed files with 2057 additions and 1515 deletions
--- a/contrib/c++tokenizer/Parameters.cpp
+++ b/contrib/c++tokenizer/Parameters.cpp
@ -5,11 +5,14 @@ namespace TOKENIZER_NAMESPACE {
 #endif

 Parameters::Parameters()
-: cfg_path(0)
+: nthreads(0)
+, chunksize(2000)
+, cfg_path(0)
 , verbose_p(false)
 , detag_p(false)
 , alltag_p(false)
-, escape_p(true)
+, entities_p(false)
+, escape_p(false)
 , aggro_p(false)
 , supersub_p(false)
 , url_p(true)
@ -23,6 +26,10 @@ Parameters::Parameters()
 , refined_p(false)
 , unescape_p(false)
 , drop_bad_p(false)
+, split_p(false)
+, notokenization_p(false)
+, para_marks_p(false)
+, split_breaks_p(false)
 {
 }

--- a/contrib/c++tokenizer/Parameters.h
+++ b/contrib/c++tokenizer/Parameters.h
@ -12,10 +12,13 @@ struct Parameters
    std::string lang_iso;
    std::vector<std::string> args;
    std::string out_path;
+    int nthreads;
+    int chunksize;
    const char *cfg_path;
    bool verbose_p;
    bool detag_p;
    bool alltag_p;
+    bool entities_p;
    bool escape_p;
    bool aggro_p;
    bool supersub_p;
@ -30,6 +33,10 @@ struct Parameters
    bool refined_p;
    bool unescape_p;
    bool drop_bad_p;
+    bool split_p;
+    bool notokenization_p;
+    bool para_marks_p;
+    bool split_breaks_p;

 	Parameters();

--- a/contrib/c++tokenizer/tokenizer.cpp
+++ b/contrib/c++tokenizer/tokenizer.cpp
--- a/contrib/c++tokenizer/tokenizer.h
+++ b/contrib/c++tokenizer/tokenizer.h
@ -26,12 +26,37 @@ class Tokenizer {

 private:

-    static std::string cfg_dir;
+    typedef enum { 
+        empty = 0,
+        blank,
+        upper, // upper case
+        letta, // extended word class (includes number, hyphen)
+        numba,
+        hyphn,
+        stops, // blank to stops are "extended word class" variants
+        quote, // init & fini = {',"}
+        pinit, // init (includes INVERT_*)
+        pfini, // fini
+        pfpct, // fini + pct
+        marks,
+        limit
+    } charclass_t;

+    std::size_t nthreads;
+    std::size_t chunksize;
+    std::string cfg_dir;
+
+    // non-breaking prefixes (numeric) utf8
    std::set<std::string> nbpre_num_set;
+    // non-breaking prefixes (other) utf8
    std::set<std::string> nbpre_gen_set;
+
+    // non-breaking prefixes (numeric) ucs4
    std::set<std::wstring> nbpre_num_ucs4;
+    // non-breaking prefixes (other) ucs4
    std::set<std::wstring> nbpre_gen_ucs4;
+
+    // compiled protected patterns 
    std::vector<re2::RE2 *> prot_pat_vec;

 protected:
@ -42,6 +67,7 @@ protected:
    bool latin_p; // is lang_iso "fr" or "it"
    bool skip_xml_p;
    bool skip_alltags_p;
+    bool entities_p;
    bool escape_p;
    bool unescape_p;
    bool aggressive_hyphen_p;
@ -54,20 +80,44 @@ protected:
    bool narrow_kana_p;
    bool refined_p;
    bool drop_bad_p;
+    bool splits_p;
    bool verbose_p;
+    bool para_marks_p;
+    bool split_breaks_p;

+    // return counts of general and numeric prefixes loaded
    std::pair<int,int> load_prefixes(std::ifstream& ifs); // used by init(), parameterized by lang_iso

-    // escapes specials into entities from the set &|"'[] (after tokenization, when enabled)
-    bool escape(std::string& inplace);
-
    // in-place 1 line tokenizer, replaces input string, depends on wrapper to set-up invariants
    void protected_tokenize(std::string& inplace);

-public:
+    // used for boost::thread
+    struct VectorTokenizerCallable {
+        Tokenizer *tokenizer;
+        std::vector<std::string>& in;
+        std::vector<std::string>& out;
+        
+        VectorTokenizerCallable(Tokenizer *_tokenizer, 
+                                std::vector<std::string>& _in, 
+                                std::vector<std::string>& _out) 
+        : tokenizer(_tokenizer)
+        , in(_in)
+        , out(_out) {
+        };

-    // cfg_dir is assumed shared by all languages
-    static void set_config_dir(const std::string& _cfg_dir);
+        void operator()() {
+            out.resize(in.size());
+            for (std::size_t ii = 0; ii < in.size(); ++ii) 
+                if (in[ii].empty())
+                    out[ii] = in[ii];
+                else if (tokenizer->penn_p) 
+                    out[ii] = tokenizer->penn_tokenize(in[ii]);
+                else
+                    out[ii] = tokenizer->quik_tokenize(in[ii]);
+        };
+    };
+
+public:

    Tokenizer(); // UNIMPL

@ -78,21 +128,46 @@ public:
    ~Tokenizer();

    // required before other methods, may throw
-    void init();
+    void init(const char *cfg_dir_path = 0);

-    // streaming tokenizer reads from is, writes to os, preserving line breaks
+    void set_config_dir(const std::string& _cfg_dir);
+
+    // required after processing a contiguous sequence of lines when sentence splitting is on
+    void reset();
+
+    // simultaneous sentence splitting not yet implemented
+    bool splitting() const { return splits_p; }
+
+    // escapes chars in the set &|"'<> after tokenization (moses special characters)
+    bool escape(std::string& inplace);
+
+    // used in detokenizer, converts entities into characters
+    // if escape_p is set, does not unescape moses special tokens, thus
+    // escape_p and unescape_p can be used together usefully
+    bool unescape(std::string& inplace);
+
+    // streaming select-tokenizer reads from is, writes to os, preserving line breaks (unless splitting)
    std::size_t tokenize(std::istream& is, std::ostream& os);

-    // tokenize padded line buffer to return string
-    std::string tokenize(const std::string& buf);
+    // quik-tokenize padded line buffer to return string
+    std::string quik_tokenize(const std::string& buf);

+    // penn-tokenize padded line buffer to return string // untested
+    std::string penn_tokenize(const std::string& buf);
+
+    // select-tokenize padded line buffer to return string
+    std::string tokenize(const std::string& buf) {
+        return penn_p ? penn_tokenize(buf) : quik_tokenize(buf);
+    }
+
+    // tokenize with output argument
    void tokenize(const std::string& buf, std::string& outs) {
        outs = tokenize(buf);
    }

    // tokenize to a vector
    std::vector<std::string> tokens(const std::string& in) {
-        std::istringstream tokss(tokenize(in));
+        std::istringstream tokss(penn_p ? penn_tokenize(in) : tokenize(in));
        std::vector<std::string> outv;
        std::copy(std::istream_iterator<std::string>(tokss),
                  std::istream_iterator<std::string>(),
@ -117,6 +192,12 @@ public:
        return detokenize(oss.str());
    }

+    // split a string on sentence boundaries (approximately)
+    std::vector<std::string> splitter(const std::string &istr,bool *continuation_p = 0);
+
+    // split sentences from input stream and write one per line on output stream
+    std::pair<std::size_t,std::size_t> splitter(std::istream& is, std::ostream& os);
+
 }; // end class Tokenizer

 #ifdef TOKENIZER_NAMESPACE
--- a/contrib/c++tokenizer/tokenizer_main.cpp
+++ b/contrib/c++tokenizer/tokenizer_main.cpp
@ -16,10 +16,12 @@ usage(const char *path)
    std::cerr << "Usage: " << path << "[-{v|x|p|a|e|s|u|n|N]* [LL] [-{c|o} PATH]* INFILE*" << std::endl;
    std::cerr << " -a -- aggressive hyphenization" << std::endl;
    std::cerr << " -b -- drop bad bytes" << std::endl;
+    std::cerr << " -B -- splitter will split on linebreak" << std::endl;
    std::cerr << " -c DIR -- config (pattern) file directory" << std::endl;
    std::cerr << " -d -- downcase" << std::endl;
    std::cerr << " -D -- detokenize" << std::endl;
    std::cerr << " -e -- do not escape entities during tokenization" << std::endl;
+    std::cerr << " -E -- preserve entities during tokenization" << std::endl;
    std::cerr << " -k -- narrow kana" << std::endl;
    std::cerr << " -n -- narrow latin" << std::endl;
    std::cerr << " -N -- normalize" << std::endl;
@ -27,12 +29,16 @@ usage(const char *path)
    std::cerr << " -p -- penn treebank style" << std::endl;
    std::cerr << " -r -- refined contraction and quantity conjoining" << std::endl;
    std::cerr << " -s -- super- and sub-script conjoining" << std::endl;
+    std::cerr << " -S -- buffer and sentence-split lines" << std::endl;
+    std::cerr << " -T -- do not tokenize, just split, no <P> marks" << std::endl;
+    std::cerr << " -t N[,C] -- use N threads (1), chunksize C lines" << std::endl;
    std::cerr << " -u -- disable url handling" << std::endl;
    std::cerr << " -U -- unescape entities before tokenization, after detokenization" << std::endl;
    std::cerr << " -v -- verbose" << std::endl;
    std::cerr << " -w -- word filter" << std::endl;
    std::cerr << " -x -- skip xml tag lines" << std::endl;
    std::cerr << " -y -- skip all xml tags" << std::endl;
+    std::cerr << " -X -- split only, with <P> marks" << std::endl;
    std::cerr << "Default is -c ., stdin, stdout." << std::endl;
    std::cerr << "LL in en,fr,it affect contraction.  LL selects nonbreaking prefix file" << std::endl;
    std::cerr << "nonbreaking_prefix.LL is sought in getenv('TOKENIZER_SHARED_DIR')." << std::endl;
@ -83,15 +89,35 @@ copy_words(Tokenizer& tize, std::istream& ifs, std::ostream& ofs) {
    int nlines = 0;
    std::string line;
    while (ifs.good() && std::getline(ifs,line)) {
-        if (line.empty()) continue;
+        if (line.empty()) 
+            continue;
        std::vector<std::string> tokens(tize.tokens(line));
        int count = 0;
+        bool was_break = false;
+
        for (auto& token: tokens) {
+            if (token.empty()) {
+                if (count || was_break) {
+                    ofs << std::endl;
+                    count = 0;
+                    nlines++;
+                    was_break = true;
+                    continue;
+                }
+            }
+            was_break = false;
+
            std::string word(token_word(token));
-            if (word.empty()) continue;
-            ofs << word << ' ';
-            count++;
+            if (word.empty()) {
+                continue;
+            }
+
+            if (count++) {
+                ofs << ' ';
+            }
+            ofs << word;
        }
+
        if (count) {
            ofs << std::endl;
            nlines++;
@ -104,13 +130,16 @@ copy_words(Tokenizer& tize, std::istream& ifs, std::ostream& ofs) {
 int main(int ac, char **av) 
 {
    int rc = 0;
-		Parameters params;
+    Parameters params;

    const char *prog = av[0];
    bool next_cfg_p = false;
    bool next_output_p = false;
+    bool next_threads_p = false;
    bool detokenize_p = std::strstr(av[0],"detokenize") != 0;
-    
+    if (!detokenize_p)
+        params.split_p = std::strstr(av[0],"splitter") != 0;
+
    while (++av,--ac) { 
        if (**av == '-') {
            switch (av[0][1]) {
@ -120,6 +149,9 @@ int main(int ac, char **av)
            case 'b':
                params.drop_bad_p = true;
                break;
+            case 'B':
+                params.split_breaks_p = true;
+                break;
            case 'c':
                next_cfg_p = true;
                break;
@ -127,10 +159,13 @@ int main(int ac, char **av)
                params.downcase_p = true;
                break;
            case 'D':
-                detokenize_p = true;
+                detokenize_p = !detokenize_p;
                break;
            case 'e':
-                params.escape_p = false;
+                params.escape_p = !params.escape_p;
+                break;
+            case 'E':
+                params.entities_p = true;
                break;
            case 'h':
                usage(prog);
@ -156,6 +191,16 @@ int main(int ac, char **av)
            case 's':
                params.supersub_p = true;
                break;
+            case 'S':
+                params.split_p = !params.split_p;
+                break;
+            case 'T':
+                params.notokenization_p = true;
+                params.para_marks_p = false;
+                break;
+            case 't':
+                next_threads_p = true;
+                break;
            case 'U':
                params.unescape_p = true;
                break;
@ -171,6 +216,10 @@ int main(int ac, char **av)
            case 'x':
                params.detag_p = true;
                break;
+            case 'X':
+                params.notokenization_p = true;
+                params.para_marks_p = true;
+                break;
            case 'y':
                params.alltag_p = true;
                break;
@ -181,7 +230,7 @@ int main(int ac, char **av)
                std::cerr << "Unknown option: " << *av << std::endl;
                ::exit(1);
            }
-        } else if (params.lang_iso.empty() && strlen(*av) == 2) {
+        } else if (params.lang_iso.empty() && strlen(*av) == 2 && !isdigit(**av)) {
            params.lang_iso = *av;
        } else if (next_output_p) {
            next_output_p = false;
@ -189,6 +238,14 @@ int main(int ac, char **av)
        } else if (next_cfg_p) {
            next_cfg_p = false;
            params.cfg_path = *av;
+        } else if (next_threads_p) {
+            next_threads_p = false;
+            char *comma = strchr(*av,',');
+            if (comma) {
+                *comma++ = 0;
+                params.chunksize = std::strtoul(comma,0,0);
+            } 
+            params.nthreads = std::strtoul(*av,0,0);
        } else {
            params.args.push_back(std::string(*av));
        }
@ -230,7 +287,6 @@ int main(int ac, char **av)
        if (params.verbose_p) {
            std::cerr << "config path: " << params.cfg_path << std::endl;
        }
-        Tokenizer::set_config_dir(std::string(params.cfg_path));
    } 

    std::unique_ptr<std::ofstream> pofs = 0;
@ -244,16 +300,16 @@ int main(int ac, char **av)

    Tokenizer tize(params);
    tize.init();
-    size_t nlines = 0;
+    std::pair<std::size_t,std::size_t> plines = { 0, 0 };

    if (params.words_p) {
        if (params.args.empty()) {
-            nlines += copy_words(tize,std::cin,ofs);
+            plines.first += copy_words(tize,std::cin,ofs);
        } else {
            for (std::string& arg : params.args) {
                try {
                    std::ifstream ifs(arg.c_str());
-                    nlines += copy_words(tize,ifs,ofs);
+                    plines.first += copy_words(tize,ifs,ofs);
                } catch (...) {
                    std::cerr << "Exception extracting words from path " << arg << std::endl;
                }
@ -261,18 +317,22 @@ int main(int ac, char **av)
        }
    } else if (params.args.empty()) {
        if (detokenize_p) {
-            nlines = tize.detokenize(std::cin,ofs);
+            plines.first = tize.detokenize(std::cin,ofs);
+        } else if (params.notokenization_p) {
+            plines = tize.splitter(std::cin,ofs);
        } else {
-            nlines = tize.tokenize(std::cin,ofs);
+            plines.first = tize.tokenize(std::cin,ofs);
        }
    } else {
        for (std::string& arg : params.args) {
            try {
                std::ifstream ifs(arg.c_str());
                if (detokenize_p) {
-                    nlines = tize.detokenize(ifs,ofs);
+                    plines.first = tize.detokenize(ifs,ofs);
+                } else if (params.notokenization_p) {
+                    plines = tize.splitter(ifs,ofs);
                } else {
-                    nlines = tize.tokenize(ifs,ofs);
+                    plines.first = tize.tokenize(ifs,ofs);
                }
            } catch (...) {
                std::cerr << "Exception tokenizing from path " << arg << std::endl;
@ -280,9 +340,12 @@ int main(int ac, char **av)
        }
    }

-    if (params.verbose_p)
-        std::cerr << "%%% " << nlines << " lines." << std::endl;
-    
+    if (params.verbose_p) {
+        std::cerr << "%%% " << plines.first << " lines." << std::endl;
+        if (plines.second) {
+            std::cerr << "%%% " << plines.second << " sentences." << std::endl;
+        }
+    }    
    return rc;
 }

--- a/contrib/other-builds/manual-label/EnOpenNLPChunker.cpp
+++ b/contrib/other-builds/manual-label/EnOpenNLPChunker.cpp
@ -9,6 +9,7 @@
 #include <algorithm>
 #include <fstream>
 #include <boost/algorithm/string/predicate.hpp>
+#include <boost/filesystem.hpp>
 #include "EnOpenNLPChunker.h"
 #include "moses/Util.h"

@ -28,10 +29,11 @@ EnOpenNLPChunker::~EnOpenNLPChunker() {

 void EnOpenNLPChunker::Process(std::istream &in, std::ostream &out, const vector<string> &filterList)
 {
+        const boost::filesystem::path
+            inPath = boost::filesystem::unique_path(),
+            outPath = boost::filesystem::unique_path();
 	// read all input to a temp file
-	char *ptr = tmpnam(NULL);
-	string inStr(ptr);
-	ofstream inFile(ptr);
+	ofstream inFile(inPath.c_str());

 	string line;
 	while (getline(in, line)) {
@ -40,21 +42,18 @@ void EnOpenNLPChunker::Process(std::istream &in, std::ostream &out, const vector
 	}
 	inFile.close();

-	ptr = tmpnam(NULL);
-	string outStr(ptr);
-
 	// execute chunker
-	string cmd = "cat " + inStr + " | "
+	string cmd = "cat " + inPath.native() + " | "
 			+ m_openNLPPath + "/bin/opennlp POSTagger "
 				+ m_openNLPPath + "/models/en-pos-maxent.bin | "
 			+ m_openNLPPath + "/bin/opennlp ChunkerME "
 				+ m_openNLPPath + "/models/en-chunker.bin > "
-			+ outStr;
+			+ outPath.native();
 	//g << "Executing:" << cmd << endl;
 	int ret = system(cmd.c_str());

 	// read result of chunker and output as Moses xml trees
-	ifstream outFile(outStr.c_str());
+	ifstream outFile(outPath.c_str());

 	size_t lineNum = 0;
 	while (getline(outFile, line)) {
@ -66,8 +65,8 @@ void EnOpenNLPChunker::Process(std::istream &in, std::ostream &out, const vector
 	outFile.close();

 	// clean up temporary files
-	remove(inStr.c_str());
-	remove(outStr.c_str());
+	remove(inPath.c_str());
+	remove(outPath.c_str());
 }

 void EnOpenNLPChunker::MosesReformat(const string &line, std::ostream &out, const vector<string> &filterList)
--- a/moses/FF/VW/VWFeatureBase.h
+++ b/moses/FF/VW/VWFeatureBase.h
@ -3,7 +3,7 @@
 #include <string>
 #include <boost/thread/tss.hpp>

-#include "Classifier.h"
+#include "vw/Classifier.h"
 #include "moses/TypeDef.h"
 #include "moses/Util.h"
 #include "moses/FF/StatelessFeatureFunction.h"
--- a/moses/TranslationModel/PhraseDictionaryTransliteration.cpp
+++ b/moses/TranslationModel/PhraseDictionaryTransliteration.cpp
@ -70,10 +70,9 @@ void PhraseDictionaryTransliteration::GetTargetPhraseCollection(InputPath &input
    inputPath.SetTargetPhrases(*this, tpColl, NULL);
  } else {
    // TRANSLITERATE
-    char *ptr = tmpnam(NULL);
-    string inFile(ptr);
-    ptr = tmpnam(NULL);
-    string outDir(ptr);
+    const boost::filesystem::path
+        inFile = boost::filesystem::unique_path(),
+        outDir = boost::filesystem::unique_path();

    ofstream inStream(inFile.c_str());
    inStream << sourcePhrase.ToString() << endl;
@ -85,14 +84,14 @@ void PhraseDictionaryTransliteration::GetTargetPhraseCollection(InputPath &input
                 " --external-bin-dir " + m_externalDir +
                 " --input-extension " + m_inputLang +
                 " --output-extension " + m_outputLang +
-                 " --oov-file " + inFile +
-                 " --out-dir " + outDir;
+                 " --oov-file " + inFile.native() +
+                 " --out-dir " + outDir.native();

    int ret = system(cmd.c_str());
    UTIL_THROW_IF2(ret != 0, "Transliteration script error");

    TargetPhraseCollection *tpColl = new TargetPhraseCollection();
-    vector<TargetPhrase*> targetPhrases = CreateTargetPhrases(sourcePhrase, outDir);
+    vector<TargetPhrase*> targetPhrases = CreateTargetPhrases(sourcePhrase, outDir.native());
    vector<TargetPhrase*>::const_iterator iter;
    for (iter = targetPhrases.begin(); iter != targetPhrases.end(); ++iter) {
      TargetPhrase *tp = *iter;
--- a/moses/TranslationModel/ProbingPT/tests/tokenization_tests.cpp
+++ b/moses/TranslationModel/ProbingPT/tests/tokenization_tests.cpp
@ -1,206 +0,0 @@
-#include "line_splitter.hh"
-
-bool test_vectorinsert()
-{
-  StringPiece line1 = StringPiece("! ! ! ! ||| ! ! ! ! ||| 0.0804289 0.141656 0.0804289 0.443409 2.718 ||| 0-0 1-1 2-2 3-3 ||| 1 1 1");
-  StringPiece line2 = StringPiece("! ! ! ) , has ||| ! ! ! ) - , a ||| 0.0804289 0.0257627 0.0804289 0.00146736 2.718 ||| 0-0 1-1 2-2 3-3 4-4 4-5 5-6 ||| 1 1 1");
-  line_text output = splitLine(line1);
-  line_text output2 = splitLine(line2);
-
-  //Init container vector and iterator.
-  std::vector<char> container;
-  container.reserve(10000); //Reserve vector
-  std::vector<char>::iterator it = container.begin();
-  std::pair<std::vector<char>::iterator, int> binary_append_ret; //Return values from vector_append
-
-  //Put a value into the vector
-  binary_append_ret = vector_append(&output, &container, it, false);
-  it = binary_append_ret.first;
-  binary_append_ret = vector_append(&output2, &container, it, false);
-  it = binary_append_ret.first;
-
-  std::string test(container.begin(), container.end());
-  std::string should_be = "! ! ! ! 0.0804289 0.141656 0.0804289 0.443409 2.718 0-0 1-1 2-2 3-3 1 1 1! ! ! ) - , a 0.0804289 0.0257627 0.0804289 0.00146736 2.718 0-0 1-1 2-2 3-3 4-4 4-5 5-6 1 1 1";
-  if (test == should_be) {
-    return true;
-  } else {
-    return false;
-  }
-}
-
-bool probabilitiesTest()
-{
-  StringPiece line1 = StringPiece("0.536553 0.75961 0.634108 0.532927 2.718");
-  StringPiece line2 = StringPiece("1.42081e-05 3.91895e-09 0.0738539 0.749514 2.718");
-
-  std::vector<double> pesho;
-  bool peshobool = false;
-  bool kirobool = false;
-  std::vector<double> kiro;
-
-  pesho = splitProbabilities(line1);
-  kiro = splitProbabilities(line2);
-
-  if (pesho[0] == 0.536553 && pesho[1] == 0.75961 && pesho[2] == 0.634108 && pesho[3] == 0.532927 && pesho[4] == 2.718 && pesho.size() == 5) {
-    peshobool = true;
-  } else {
-    std::cout << "Processed: " << pesho[0] << " " << pesho[1] << " " << pesho[2] << " " << pesho[3] << " " << pesho[4] << std::endl;
-    std::cout << "Size is: " << pesho.size() << " Expected 5." << std::endl;
-    std::cout << "Expected: " << "0.536553 0.75961 0.634108 0.532927 2.718" << std::endl;
-  }
-
-  if (kiro[0] == 1.42081e-05 && kiro[1] == 3.91895e-09 && kiro[2] == 0.0738539 && kiro[3] == 0.749514 && kiro[4] == 2.718 && kiro.size() == 5) {
-    kirobool = true;
-  } else {
-    std::cout << "Processed: " << kiro[0] << " " << kiro[1] << " " << kiro[2] << " " << kiro[3] << " " << kiro[4] << std::endl;
-    std::cout << "Size is: " << kiro.size() << " Expected 5." << std::endl;
-    std::cout << "Expected: " << "1.42081e-05 3.91895e-09 0.0738539 0.749514 2.718" << std::endl;
-  }
-
-  return (peshobool && kirobool);
-}
-
-bool wordAll1test()
-{
-  StringPiece line1 = StringPiece("2-0 3-1 4-2 5-2");
-  StringPiece line2 = StringPiece("0-0 1-1 2-2 3-3 4-3 6-4 5-5");
-
-  std::vector<int> pesho;
-  bool peshobool = false;
-  bool kirobool = false;
-  std::vector<int> kiro;
-
-  pesho = splitWordAll1(line1);
-  kiro = splitWordAll1(line2);
-
-  if (pesho[0] == 2 && pesho[1] == 0 && pesho[2] == 3 && pesho[3] == 1 && pesho[4] == 4
-      && pesho[5] == 2 && pesho[6] == 5 && pesho[7] == 2 && pesho.size() == 8) {
-    peshobool = true;
-  } else {
-    std::cout << "Processed: " << pesho[0] << "-" << pesho[1] << " " << pesho[2] << "-" << pesho[3] << " "
-              << pesho[4] << "-" << pesho[5] << " " << pesho[6] << "-" << pesho[7] << std::endl;
-    std::cout << "Size is: " << pesho.size() << " Expected: 8." << std::endl;
-    std::cout << "Expected: " << "2-0 3-1 4-2 5-2" << std::endl;
-  }
-
-  if (kiro[0] == 0 && kiro[1] == 0 && kiro[2] == 1 && kiro[3] == 1 && kiro[4] == 2 && kiro[5] == 2
-      && kiro[6] == 3 && kiro[7] == 3 && kiro[8] == 4 && kiro[9] == 3 && kiro[10] == 6 && kiro[11] == 4
-      && kiro[12] == 5 && kiro[13] == 5 && kiro.size() == 14) {
-    kirobool = true;
-  } else {
-    std::cout << "Processed: " << kiro[0] << "-" << kiro[1] << " " << kiro[2] << "-" << kiro[3] << " "
-              << kiro[4] << "-" << kiro[5] << " " << kiro[6] << "-" << kiro[7] << " " << kiro[8] << "-" << kiro[9]
-              << " " << kiro[10] << "-" << kiro[11] << " " << kiro[12] << "-" << kiro[13] << std::endl;
-    std::cout << "Size is: " << kiro.size() << " Expected: 14" << std::endl;
-    std::cout << "Expected: " << "0-0 1-1 2-2 3-3 4-3 6-4 5-5" << std::endl;
-  }
-
-  return (peshobool && kirobool);
-}
-
-bool wordAll2test()
-{
-  StringPiece line1 = StringPiece("4 9 1");
-  StringPiece line2 = StringPiece("3255 9 1");
-
-  std::vector<int> pesho;
-  bool peshobool = false;
-  bool kirobool = false;
-  std::vector<int> kiro;
-
-  pesho = splitWordAll2(line1);
-  kiro = splitWordAll2(line2);
-
-  if (pesho[0] == 4 && pesho[1] == 9 && pesho[2] == 1 && pesho.size() == 3) {
-    peshobool = true;
-  } else {
-    std::cout << "Processed: " << pesho[0] << " " << pesho[1] << " " << pesho[2] << std::endl;
-    std::cout << "Size: " << pesho.size() << " Expected: 3" << std::endl;
-    std::cout << "Expected: " << "4 9 1" << std::endl;
-  }
-
-  if (kiro[0] == 3255 && kiro[1] == 9 && kiro[2] == 1 && kiro.size() == 3) {
-    kirobool = true;
-  } else {
-    std::cout << "Processed: " << kiro[0] << " " << kiro[1] << " " << kiro[2] << std::endl;
-    std::cout << "Size: " << kiro.size() << " Expected: 3" << std::endl;
-    std::cout << "Expected: " << "3255 9 1" << std::endl;
-  }
-
-  return (peshobool && kirobool);
-
-}
-
-bool test_tokenization()
-{
-  StringPiece line1 = StringPiece("! ! ! ! ||| ! ! ! ! ||| 0.0804289 0.141656 0.0804289 0.443409 2.718 ||| 0-0 1-1 2-2 3-3 ||| 1 1 1");
-  StringPiece line2 = StringPiece("! ! ! ) , has ||| ! ! ! ) - , a ||| 0.0804289 0.0257627 0.0804289 0.00146736 2.718 ||| 0-0 1-1 2-2 3-3 4-4 4-5 5-6 ||| 1 1 1");
-  StringPiece line3 = StringPiece("! ! ! ) , ||| ! ! ! ) - , ||| 0.0804289 0.075225 0.0804289 0.00310345 2.718 ||| 0-0 1-1 2-2 3-3 4-4 4-5 ||| 1 1 1");
-  StringPiece line4 = StringPiece("! ! ! ) ||| ! ! ! ) . ||| 0.0804289 0.177547 0.0268096 0.000872597 2.718 ||| 0-0 1-1 2-2 3-3 ||| 1 3 1");
-
-  line_text output1 = splitLine(line1);
-  line_text output2 = splitLine(line2);
-  line_text output3 = splitLine(line3);
-  line_text output4 = splitLine(line4);
-
-  bool test1 = output1.prob == StringPiece("0.0804289 0.141656 0.0804289 0.443409 2.718");
-  bool test2 = output2.word_all1 == StringPiece("0-0 1-1 2-2 3-3 4-4 4-5 5-6");
-  bool test3 = output2.target_phrase == StringPiece("! ! ! ) - , a");
-  bool test4 = output3.source_phrase == StringPiece("! ! ! ) ,");
-  bool test5 = output4.word_all2 == StringPiece("1 3 1");
-
-  //std::cout << test1 << " " << test2 << " " << test3 << " " << test4 << std::endl;
-
-  return (test1 && test2 && test3 && test4 && test5);
-
-}
-
-bool test_linesplitter()
-{
-  StringPiece line1 = StringPiece("! &#93;    0.0738539 0.901133 0.0738539 0.65207 2.718  0-0 1-1 1 1 1");
-  target_text ans1;
-  ans1 = splitSingleTargetLine(line1);
-
-  /* For testing purposes
-  std::cout << ans1.target_phrase[0] << " " <<ans1.target_phrase[1] << " Size: " << ans1.target_phrase.size() << std::endl;
-  std::cout << ans1.word_all1[3] << " " << ans1.word_all2[2] << " " << ans1.prob[3] << std::endl; */
-
-  return (ans1.target_phrase.size() == 2 && ans1.prob.size() == 5 && ans1.word_all1.size() == 4 && ans1.word_all2.size() == 3);
-}
-
-bool test_linessplitter()
-{
-  StringPiece line1 = StringPiece("! &#93;    0.0738539 0.901133 0.0738539 0.65207 2.718  0-0 1-1 1 1 1\n\n! ) . proto doÅ¡lo 0.0738539 7.14446e-06");
-  StringPiece line2 = StringPiece("! &quot; ) 0.536553 0.75961 0.634108 0.532927 2.718    0-0 1-1 2-2 13 11 8\n! ) .  0.0369269 0.00049839 0.00671399 0.00372884 2.718    0-0 1-1 2-1 2-2 2 11 1\n&quot; ! )  0.0738539 0.75961 0.00671399 0.532927 2.718 1-0 0-1 2-2 1 11 1\nse ! &quot; )   0.0738539 0.75961 0.00671399 0.0225211 2.718    0-1 1-2 2-3 1 11 1\n\n! &quot; , a to   0.0738539 0.0894238 0.0738539 0.048");
-
-  std::vector<target_text> ans1;
-  std::vector<target_text> ans2;
-
-  ans1 = splitTargetLine(line1);
-  ans2 = splitTargetLine(line2);
-
-  bool sizes = ans1.size() == 1 && ans2.size() == 4;
-  bool prob = ans1[0].prob[3] == 0.65207 && ans2[1].prob[1] == 0.00049839;
-  bool word_alls = ans2[0].word_all2[1] == 11 && ans2[3].word_all1[5] == 3;
-
-  /* FOr testing
-  std::cout << ans1.size() << std::endl;
-  std::cout << ans2.size() << std::endl;
-  std::cout << ans1[0].prob[3] << std::endl;
-  std::cout << ans2[1].prob[1] << std::endl;
-  std::cout << ans2[0].word_all2[1] << std::endl;
-  std::cout << ans2[3].word_all1[5] << std::endl; */
-
-  return sizes && prob && word_alls;
-}
-
-int main()
-{
-  if (probabilitiesTest() && wordAll1test() && wordAll2test() && test_tokenization() && test_linesplitter() && test_linessplitter() && test_vectorinsert()) {
-    std::cout << "All tests pass!" << std::endl;
-  } else {
-    std::cout << "Failiure in some tests!" << std::endl;
-  }
-
-  return 1;
-}
--- a/moses/TranslationModel/ProbingPT/tests/vocabid_test.cpp
+++ b/moses/TranslationModel/ProbingPT/tests/vocabid_test.cpp
@ -1,46 +0,0 @@
-#include <map> //Map for vocab ids
-
-#include "hash.hh"
-#include "vocabid.hh"
-
-int main(int argc, char* argv[])
-{
-
-  //Create a map and serialize it
-  std::map<uint64_t, std::string> vocabids;
-  StringPiece demotext = StringPiece("Demo text with 3 elements");
-  add_to_map(&vocabids, demotext);
-  //Serialize map
-  serialize_map(&vocabids, "/tmp/testmap.bin");
-
-  //Read the map and test if the values are the same
-  std::map<uint64_t, std::string> newmap;
-  read_map(&newmap, "/tmp/testmap.bin");
-
-  //Used hashes
-  uint64_t num1 = getHash(StringPiece("Demo"));
-  uint64_t num2 = getVocabID("text");
-  uint64_t num3 = getHash(StringPiece("with"));
-  uint64_t num4 = getVocabID("3");
-  uint64_t num5 = getHash(StringPiece("elements"));
-  uint64_t num6 = 0;
-
-  //Tests
-  bool test1 = getStringFromID(&newmap, num1) == getStringFromID(&vocabids, num1);
-  bool test2 = getStringFromID(&newmap, num2) == getStringFromID(&vocabids, num2);
-  bool test3 = getStringFromID(&newmap, num3) == getStringFromID(&vocabids, num3);
-  bool test4 = getStringFromID(&newmap, num4) == getStringFromID(&vocabids, num4);
-  bool test5 = getStringFromID(&newmap, num5) == getStringFromID(&vocabids, num5);
-  bool test6 = getStringFromID(&newmap, num6) == getStringFromID(&vocabids, num6);
-
-
-  if (test1 && test2 && test3 && test4 && test5 && test6) {
-    std::cout << "Map was successfully written and read!" << std::endl;
-  } else {
-    std::cout << "Error! " << test1 << " " << test2 << " " << test3 << " " << test4 << " " << test5 << " " << test6 << std::endl;
-  }
-
-
-  return 1;
-
-}
--- a/scripts/OSM/OSM-Train.perl
+++ b/scripts/OSM/OSM-Train.perl
@ -1,4 +1,4 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl 

 use strict;
 use Getopt::Long "GetOptions";
--- a/scripts/OSM/extract-singletons.perl
+++ b/scripts/OSM/extract-singletons.perl
@ -1,4 +1,4 @@
-#!/usr/bin/perl
+#!/usr/bin/env perl 

 use Getopt::Std;
 getopts('q');
--- a/scripts/OSM/flipAlignment.perl
+++ b/scripts/OSM/flipAlignment.perl
@ -1,4 +1,4 @@
-#! /usr/bin/perl
+#!/usr/bin/env perl 
  use strict;

  my $file = shift(@ARGV);
--- a/scripts/Transliteration/clean.pl
+++ b/scripts/Transliteration/clean.pl
@ -1,4 +1,4 @@
-#!/usr/bin/perl
+#!/usr/bin/env perl 

 #input hindi word urdu word, delete all those entries that have number on any side
 use utf8;
--- a/scripts/Transliteration/corpusCreator.pl
+++ b/scripts/Transliteration/corpusCreator.pl
@ -1,4 +1,4 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl 

 use strict;

--- a/scripts/Transliteration/in-decoding-transliteration.pl
+++ b/scripts/Transliteration/in-decoding-transliteration.pl
@ -1,4 +1,4 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl 

 use strict;

--- a/scripts/Transliteration/post-decoding-transliteration.pl
+++ b/scripts/Transliteration/post-decoding-transliteration.pl
@ -1,4 +1,4 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl 

 use strict;

--- a/scripts/Transliteration/prepare-transliteration-phrase-table.pl
+++ b/scripts/Transliteration/prepare-transliteration-phrase-table.pl
@ -1,4 +1,4 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl 

 use strict;

--- a/scripts/Transliteration/threshold.pl
+++ b/scripts/Transliteration/threshold.pl
@ -1,4 +1,4 @@
-#!/usr/bin/perl
+#!/usr/bin/env perl 

 use utf8;
 require Encode;
--- a/scripts/Transliteration/train-transliteration-module.pl
+++ b/scripts/Transliteration/train-transliteration-module.pl
@ -1,4 +1,4 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl 

 use utf8;
 use strict;
--- a/scripts/analysis/bootstrap-hypothesis-difference-significance.pl
+++ b/scripts/analysis/bootstrap-hypothesis-difference-significance.pl
@ -1,4 +1,4 @@
-#!/usr/bin/perl
+#!/usr/bin/env perl 
 use utf8;

 ###############################################
--- a/scripts/analysis/nontranslated_words.pl
+++ b/scripts/analysis/nontranslated_words.pl
@ -1,4 +1,4 @@
-#!/usr/bin/perl
+#!/usr/bin/env perl 

 # $Id$
 # Reads a source and hypothesis file and counts equal tokens. Some of these
--- a/scripts/analysis/oov.pl
+++ b/scripts/analysis/oov.pl
@ -1,4 +1,4 @@
-#!/usr/bin/perl
+#!/usr/bin/env perl 
 # Display OOV rate of a test set against a training corpus or a phrase table.
 # Ondrej Bojar

--- a/scripts/analysis/sentence-by-sentence.pl
+++ b/scripts/analysis/sentence-by-sentence.pl
@ -1,4 +1,4 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl 

 # $Id$
 #sentence-by-sentence: take in a system output, with any number of factors, and a reference translation, also maybe with factors, and show each sentence and its errors
--- a/scripts/analysis/sg2dot.perl
+++ b/scripts/analysis/sg2dot.perl
@ -1,4 +1,4 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl 
 # 
 # Author : Loic BARRAULT
 # Script to convert MOSES searchgraph to DOT format
--- a/scripts/analysis/show-phrases-used.pl
+++ b/scripts/analysis/show-phrases-used.pl
@ -1,4 +1,4 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl 

 # $Id$
 #show-phrases-used: display all source and target phrases for each sentence in a corpus, and give average phrase length used
--- a/scripts/analysis/smtgui/filter-phrase-table.pl
+++ b/scripts/analysis/smtgui/filter-phrase-table.pl
@ -1,4 +1,4 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl 

 # $Id$
 #by Philipp Koehn, de-augmented by Evan Herbst
--- a/scripts/analysis/suspicious_tokenization.pl
+++ b/scripts/analysis/suspicious_tokenization.pl
@ -1,4 +1,4 @@
-#!/usr/bin/perl
+#!/usr/bin/env perl 
 # Collects and prints all n-grams that appear in the given corpus both
 # tokenized as well as untokenized.
 # Ondrej Bojar
--- a/scripts/analysis/weight-scan.pl
+++ b/scripts/analysis/weight-scan.pl
@ -1,4 +1,4 @@
-#!/usr/bin/perl
+#!/usr/bin/env perl 
 # runs Moses many times changing the values of one weight, all others fixed
 # nbest lists are always produced to allow for comparison of real and
 # 'projected' BLEU (BLEU estimated from n-best lists collected at a neighouring
--- a/scripts/ems/experiment.meta
+++ b/scripts/ems/experiment.meta
@ -185,7 +185,7 @@ lowercase
 	default-name: lm/lowercased
 	pass-unless: output-lowercaser
 	ignore-if: output-truecaser
-	only-factor-0: yes
+	#only-factor-0: yes
 	template: $output-lowercaser < IN > OUT
 	parallelizable: yes
 truecase
--- a/scripts/ems/experiment.perl
+++ b/scripts/ems/experiment.perl
@ -1,4 +1,4 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl 

 # Experiment Management System
 # Documentation at http://www.statmt.org/moses/?n=FactoredTraining.EMS
@ -18,7 +18,18 @@ sub trim($)
 my $host = `hostname`; chop($host);
 print STDERR "STARTING UP AS PROCESS $$ ON $host AT ".`date`;

-my ($CONFIG_FILE,$EXECUTE,$NO_GRAPH,$CONTINUE,$FINAL_STEP,$FINAL_OUT,$VERBOSE,$IGNORE_TIME,$DELETE_CRASHED,$DELETE_VERSION);
+my ($CONFIG_FILE,
+		$EXECUTE,
+		$NO_GRAPH,
+		$CONTINUE,
+		$FINAL_STEP,
+		$FINAL_OUT,
+		$VERBOSE,
+		$IGNORE_TIME,
+		$DELETE_CRASHED,
+		$DELETE_VERSION
+		);
+		
 my $SLEEP = 2;
 my $META = "$RealBin/experiment.meta";

@ -3442,7 +3453,7 @@ sub create_step {
    $subdir = "lm" if $subdir eq "interpolated-lm";
    open(STEP,">$file") or die "Cannot open: $!";
    print STEP "#!/bin/bash\n\n";
-    print STEP "PATH=\"".$ENV{"PATH"}."\"\n";
+    print STEP "PATH=\"".$ENV{"PATH"}."\"\n";  	
    print STEP "cd $dir\n";
    print STEP "echo 'starting at '`date`' on '`hostname`\n";
    print STEP "mkdir -p $dir/$subdir\n\n";
--- a/scripts/ems/fix-info.perl
+++ b/scripts/ems/fix-info.perl
@ -1,4 +1,4 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl 

 use strict;

--- a/scripts/ems/support/analysis.perl
+++ b/scripts/ems/support/analysis.perl
@ -1,4 +1,4 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl 

 use strict;
 use Getopt::Long "GetOptions";
--- a/scripts/ems/support/build-domain-file-from-subcorpora.perl
+++ b/scripts/ems/support/build-domain-file-from-subcorpora.perl
@ -1,4 +1,4 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl 

 use strict;

--- a/scripts/ems/support/build-sparse-features.perl
+++ b/scripts/ems/support/build-sparse-features.perl
@ -1,4 +1,4 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl 

 use strict;

--- a/scripts/ems/support/consolidate-training-data.perl
+++ b/scripts/ems/support/consolidate-training-data.perl
@ -1,4 +1,4 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl 

 # $Id: consolidate-training-data.perl 928 2009-09-02 02:58:01Z philipp $

--- a/scripts/ems/support/generic-multicore-parallelizer.perl
+++ b/scripts/ems/support/generic-multicore-parallelizer.perl
@ -1,4 +1,4 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl 

 use strict;

--- a/scripts/ems/support/generic-parallelizer.perl
+++ b/scripts/ems/support/generic-parallelizer.perl
@ -1,4 +1,4 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl 

 use strict;

--- a/scripts/ems/support/input-from-sgm.perl
+++ b/scripts/ems/support/input-from-sgm.perl
@ -1,4 +1,4 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl 

 use strict;

--- a/scripts/ems/support/interpolate-lm.perl
+++ b/scripts/ems/support/interpolate-lm.perl
@ -1,4 +1,4 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl 

 use strict;
 use IPC::Open3;
--- a/scripts/ems/support/lmplz-wrapper.perl
+++ b/scripts/ems/support/lmplz-wrapper.perl
@ -1,4 +1,4 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl 

 use strict;
 use Getopt::Long "GetOptions";
--- a/scripts/ems/support/mml-filter.perl
+++ b/scripts/ems/support/mml-filter.perl
@ -1,4 +1,4 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl 

 use strict;
 use FindBin qw($RealBin);
--- a/scripts/ems/support/mml-score.perl
+++ b/scripts/ems/support/mml-score.perl
@ -1,4 +1,4 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl 

 use strict;

--- a/scripts/ems/support/mml-train.perl
+++ b/scripts/ems/support/mml-train.perl
@ -1,4 +1,4 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl 

 use strict;

--- a/scripts/ems/support/prepare-fast-align.perl
+++ b/scripts/ems/support/prepare-fast-align.perl
@ -1,4 +1,4 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl 

 use strict;

--- a/scripts/ems/support/reference-from-sgm.perl
+++ b/scripts/ems/support/reference-from-sgm.perl
@ -1,4 +1,4 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl 

 use strict;

--- a/scripts/ems/support/remove-segmentation-markup.perl
+++ b/scripts/ems/support/remove-segmentation-markup.perl
@ -1,4 +1,4 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl 

 use strict;

--- a/scripts/ems/support/report-experiment-scores.perl
+++ b/scripts/ems/support/report-experiment-scores.perl
@ -1,4 +1,4 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl 

 # $Id: report-experiment-scores.perl 407 2008-11-10 14:43:31Z philipp $

--- a/scripts/ems/support/run-command-on-multiple-refsets.perl
+++ b/scripts/ems/support/run-command-on-multiple-refsets.perl
@ -1,4 +1,4 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl 

 use strict;

--- a/scripts/ems/support/run-wade.perl
+++ b/scripts/ems/support/run-wade.perl
@ -1,4 +1,4 @@
-#!/usr/bin/perl
+#!/usr/bin/env perl 

 use strict;
 use File::Temp qw/ tempfile tempdir /;
--- a/scripts/ems/support/split-sentences.perl
+++ b/scripts/ems/support/split-sentences.perl
@ -1,4 +1,4 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl 

 # Based on Preprocessor written by Philipp Koehn

--- a/scripts/ems/support/substitute-filtered-tables-and-weights.perl
+++ b/scripts/ems/support/substitute-filtered-tables-and-weights.perl
@ -1,4 +1,4 @@
-#!/usr/bin/perl -w 
+#!/usr/bin/env perl 

 use strict;
 use Getopt::Long "GetOptions";
--- a/scripts/ems/support/substitute-filtered-tables.perl
+++ b/scripts/ems/support/substitute-filtered-tables.perl
@ -1,4 +1,4 @@
-#!/usr/bin/perl -w 
+#!/usr/bin/env perl 

 # experiment.perl support script
 # get filtered rule and reordering tables and place them into a configuration file
--- a/scripts/ems/support/substitute-weights.perl
+++ b/scripts/ems/support/substitute-weights.perl
@ -1,4 +1,4 @@
-#!/usr/bin/perl -w 
+#!/usr/bin/env perl 

 # experiment.perl support script
 # get filtered rule and reordering tables and place them into a configuration file
--- a/scripts/ems/support/symmetrize-fast-align.perl
+++ b/scripts/ems/support/symmetrize-fast-align.perl
@ -1,4 +1,4 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl 

 use strict;

--- a/scripts/ems/support/thot-lm-wrapper.perl
+++ b/scripts/ems/support/thot-lm-wrapper.perl
@ -1,4 +1,4 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl 

 use strict;
 use Getopt::Long "GetOptions";
--- a/scripts/ems/support/tree-converter-wrapper.perl
+++ b/scripts/ems/support/tree-converter-wrapper.perl
@ -1,4 +1,4 @@
-#!/usr/bin/env perl
+#!/usr/bin/env perl 

 use warnings;
 use strict;
--- a/scripts/ems/support/wrap-xml.perl
+++ b/scripts/ems/support/wrap-xml.perl
@ -1,4 +1,4 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl 

 use strict;

--- a/scripts/ems/web/progress.perl
+++ b/scripts/ems/web/progress.perl
@ -1,4 +1,4 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl 

 use strict;
 use Date::Parse;
--- a/scripts/fuzzy-match/create_xml.perl
+++ b/scripts/fuzzy-match/create_xml.perl
@ -1,4 +1,4 @@
-#!/usr/bin/perl -w 
+#!/usr/bin/env perl 

 binmode( STDIN,  ":utf8" );
 binmode( STDOUT, ":utf8" );
--- a/scripts/generic/compound-splitter.perl
+++ b/scripts/generic/compound-splitter.perl
@ -1,4 +1,4 @@
-#!/usr/bin/perl -w 
+#!/usr/bin/env perl 

 use strict;
 use Getopt::Long "GetOptions";
--- a/scripts/generic/extract-factors.pl
+++ b/scripts/generic/extract-factors.pl
@ -1,4 +1,4 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl 

 # $Id$
 #extract-factors.pl: extract only the desired factors from a factored corpus
--- a/scripts/generic/extract-parallel.perl
+++ b/scripts/generic/extract-parallel.perl
@ -1,4 +1,4 @@
-#! /usr/bin/perl -w 
+#!/usr/bin/env perl 

 # example
 #  ./extract-parallel.perl 8 ./coreutils-8.9/src/split "./coreutils-8.9/src/sort --batch-size=253" ./extract ./corpus.5.en ./corpus.5.ar ./align.ar-en.grow-diag-final-and ./extracted 7 --NoFileLimit orientation --GZOutput
--- a/scripts/generic/fsa2fsal.pl
+++ b/scripts/generic/fsa2fsal.pl
@ -1,4 +1,4 @@
-#!/usr/bin/env perl
+#!/usr/bin/env perl 
 # A very simple script that converts fsa format (openfst lattices) to the same
 # thing represented one sentence per line. It uses '|||' to delimit columns and
 # ' ' to delimit nodes (i.e. original lines).
--- a/scripts/generic/fsa2plf.pl
+++ b/scripts/generic/fsa2plf.pl
@ -1,4 +1,4 @@
-#!/usr/bin/perl
+#!/usr/bin/env perl 
 # Converts AT&T FSA format to 'python lattice format'.
 # Note that the input FSA needs to be epsilon-free and topologically sorted.
 # This script checks for topological sortedness.
--- a/scripts/generic/fsal2fsa.pl
+++ b/scripts/generic/fsal2fsa.pl
@ -1,4 +1,4 @@
-#!/usr/bin/env perl
+#!/usr/bin/env perl 
 # A very simple script that converts fsal back to fsa format (openfst lattices)
 # Ondrej Bojar, bojar@ufal.mff.cuni.cz

--- a/scripts/generic/generic-parallel.perl
+++ b/scripts/generic/generic-parallel.perl
@ -1,4 +1,4 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl 

 use strict;
 use utf8;
--- a/scripts/generic/giza-parallel.perl
+++ b/scripts/generic/giza-parallel.perl
@ -1,4 +1,4 @@
-#! /usr/bin/perl
+#!/usr/bin/env perl 

 # example
 # ~/giza-parallel.perl 10 split ~/workspace/sourceforge/trunk/scripts/training/train-model.perl ar en train align
--- a/scripts/generic/lopar2pos.pl
+++ b/scripts/generic/lopar2pos.pl
@ -1,4 +1,4 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl 

 # $Id$
 #lopar2pos: extract POSs from LOPAR output
--- a/scripts/generic/moses-parallel.pl
+++ b/scripts/generic/moses-parallel.pl
@ -1,4 +1,4 @@
-#! /usr/bin/perl
+#!/usr/bin/env perl 

 # $Id$
 #######################
--- a/scripts/generic/mteval-v12.pl
+++ b/scripts/generic/mteval-v12.pl
@ -1,4 +1,4 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl 
 
 use strict;
 use utf8;
--- a/scripts/generic/mteval-v13a.pl
+++ b/scripts/generic/mteval-v13a.pl
@ -1,4 +1,4 @@
-#!/usr/bin/perl
+#!/usr/bin/env perl 

 use warnings;
 use strict;
--- a/scripts/generic/multi-bleu.perl
+++ b/scripts/generic/multi-bleu.perl
@ -1,4 +1,4 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl 

 # $Id$
 use strict;
--- a/scripts/generic/ph_numbers.perl
+++ b/scripts/generic/ph_numbers.perl
@ -1,4 +1,5 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl 
+
 package ph_numbers;

 # Script to recognize and replace numbers in Moses training corpora
--- a/scripts/generic/qsub-wrapper.pl
+++ b/scripts/generic/qsub-wrapper.pl
@ -1,4 +1,4 @@
-#! /usr/bin/perl
+#!/usr/bin/env perl 

 # $Id$
 use strict;
--- a/scripts/generic/reverse-alignment.perl
+++ b/scripts/generic/reverse-alignment.perl
@ -1,4 +1,4 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl 

 use strict;

--- a/scripts/generic/score-parallel.perl
+++ b/scripts/generic/score-parallel.perl
@ -1,4 +1,4 @@
-#! /usr/bin/perl -w 
+#!/usr/bin/env perl 

 # example
 # ./score-parallel.perl 8 "gsort --batch-size=253" ./score ./extract.2.sorted.gz ./lex.2.f2e ./phrase-table.2.half.f2e  --GoodTuring ./phrase-table.2.coc 0
--- a/scripts/generic/strip-xml.perl
+++ b/scripts/generic/strip-xml.perl
@ -1,4 +1,4 @@
-#!/usr/bin/perl -w 
+#!/usr/bin/env perl 

 use strict;

--- a/scripts/generic/trainlm-irst2.perl
+++ b/scripts/generic/trainlm-irst2.perl
@ -1,4 +1,4 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl 

 # Compatible with sri LM-creating script, eg.
 #    ngram-count -order 5 -interpolate -wbdiscount -unk -text corpus.txt -lm lm.txt
--- a/scripts/generic/trainlm-lmplz.perl
+++ b/scripts/generic/trainlm-lmplz.perl
@ -1,4 +1,4 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl 

 # Compatible with sri LM-creating script, eg.
 #    ngram-count -order 5 -interpolate -wbdiscount -unk -text corpus.txt -lm lm.txt
--- a/scripts/other/beautify.perl
+++ b/scripts/other/beautify.perl
@ -1,4 +1,4 @@
-#!/usr/bin/perl 
+#!/usr/bin/env perl 

 use strict;
 use File::Basename;
--- a/scripts/other/convert-pt.perl
+++ b/scripts/other/convert-pt.perl
@ -1,4 +1,4 @@
-#!/usr/bin/perl
+#!/usr/bin/env perl 

 # $Id$
 # convert a phrase-table with alignment in Moses' dead-end format
--- a/scripts/other/delete-scores.perl
+++ b/scripts/other/delete-scores.perl
@ -1,4 +1,4 @@
-#!/usr/bin/perl 
+#!/usr/bin/env perl 

 use strict;
 use Getopt::Long "GetOptions";
--- a/scripts/other/get_many_translations_from_google.perl
+++ b/scripts/other/get_many_translations_from_google.perl
@ -1,4 +1,5 @@
-#!/usr/bin/perl
+#!/usr/bin/env perl 
+
 # Uses Google AJAX API to collect many translations, i.e. create a parallel
 # corpus of Google translations.
 # Expects one sentence per line, not tokenized!
--- a/scripts/other/retain-lines.perl
+++ b/scripts/other/retain-lines.perl
@ -1,4 +1,4 @@
-#!/usr/bin/perl
+#!/usr/bin/env perl 

 #retain lines in clean.lines-retained.1
 use strict;
--- a/scripts/other/translate_by_microsoft_bing.perl
+++ b/scripts/other/translate_by_microsoft_bing.perl
@ -1,4 +1,5 @@
-#!/usr/bin/perl
+#!/usr/bin/env perl 
+
 # Script implemented by Pranava Swaroop Madhyastha (a student at Charles
 # University, UFAL)

--- a/scripts/recaser/detruecase.perl
+++ b/scripts/recaser/detruecase.perl
@ -1,4 +1,4 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl 

 use strict;
 use Getopt::Long "GetOptions";
--- a/scripts/recaser/recase.perl
+++ b/scripts/recaser/recase.perl
@ -1,4 +1,4 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl 

 # $Id$
 use strict;
--- a/scripts/recaser/train-recaser.perl
+++ b/scripts/recaser/train-recaser.perl
@ -1,4 +1,4 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl 

 # $Id$
 use strict;
--- a/scripts/recaser/train-truecaser.perl
+++ b/scripts/recaser/train-truecaser.perl
@ -1,4 +1,4 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl 

 # $Id: train-recaser.perl 1326 2007-03-26 05:44:27Z bojar $

--- a/scripts/recaser/truecase.perl
+++ b/scripts/recaser/truecase.perl
@ -1,4 +1,4 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl 

 # $Id: train-recaser.perl 1326 2007-03-26 05:44:27Z bojar $
 use strict;
--- a/scripts/regression-testing/compare-results.pl
+++ b/scripts/regression-testing/compare-results.pl
@ -1,4 +1,4 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl 

 use strict;
 my ($results, $truth) = @ARGV;
--- a/scripts/regression-testing/create_localized_moses_ini.pl
+++ b/scripts/regression-testing/create_localized_moses_ini.pl
@ -1,4 +1,4 @@
-#! /usr/bin/perl -w
+#!/usr/bin/env perl 

 use strict;
 my $script_dir; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $script_dir = dirname(abs_path($0)); push @INC, $script_dir; }
--- a/scripts/regression-testing/modify-pars.pl
+++ b/scripts/regression-testing/modify-pars.pl
@ -1,4 +1,4 @@
-#! /usr/bin/perl
+#!/usr/bin/env perl 

 use strict;
 	
--- a/scripts/regression-testing/moses-virtual.pl
+++ b/scripts/regression-testing/moses-virtual.pl
@ -1,4 +1,4 @@
-#! /usr/bin/perl
+#!/usr/bin/env perl 

 use strict;

--- a/scripts/regression-testing/run-single-test.pl
+++ b/scripts/regression-testing/run-single-test.pl
@ -1,4 +1,4 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl 

 use strict;
 my $script_dir; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $script_dir = dirname(abs_path($0)); push @INC, $script_dir; }
--- a/scripts/regression-testing/run-test-suite.pl
+++ b/scripts/regression-testing/run-test-suite.pl
@ -1,4 +1,4 @@
-#!/usr/bin/perl
+#!/usr/bin/env perl 

 use strict;
 my $script_dir; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $script_dir = dirname(abs_path($0)); push @INC, $script_dir; }
--- a/scripts/tokenizer/deescape-special-chars-PTB.perl
+++ b/scripts/tokenizer/deescape-special-chars-PTB.perl
@ -1,4 +1,4 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl 

 use strict;

--- a/scripts/tokenizer/deescape-special-chars.perl
+++ b/scripts/tokenizer/deescape-special-chars.perl
@ -1,4 +1,4 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl 

 use strict;

--- a/scripts/tokenizer/detokenizer.perl
+++ b/scripts/tokenizer/detokenizer.perl
@ -1,4 +1,4 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl 

 # $Id: detokenizer.perl 4134 2011-08-08 15:30:54Z bgottesman $
 # Sample De-Tokenizer
--- a/Show More
+++ b/Show More