This commit is contained in:
Maarten van Gompel 2014-06-02 14:05:40 +02:00
commit bae4ad6253
10 changed files with 161 additions and 245 deletions


@ -1,158 +1,3 @@
PRELIMINARIES
Please see the Moses website on how to compile and run Moses
http://www.statmt.org/moses/?n=Development.GetStarted
Moses is primarily targeted at gcc on UNIX.
Moses requires gcc, Boost >= 1.36, and zlib including the headers that some
distributions package separately (i.e. -dev or -devel packages). Source is
available at http://boost.org .
There are several optional dependencies:
GIZA++ from http://code.google.com/p/giza-pp/ is used to align words in the parallel corpus during training.
Moses server requires xmlrpc-c with abyss-server. Source is available from
http://xmlrpc-c.sourceforge.net/.
The scripts support building ARPA format language models with SRILM or IRSTLM.
To apply models inside the decoder, you can use SRILM, IRSTLM, or KenLM. The
ARPA format is exchangeable so that e.g. you can build a model with SRILM and
run the decoder with IRSTLM or KenLM.
If you want to use SRILM, you will need to download its source and build it.
SRILM can be downloaded from
http://www.speech.sri.com/projects/srilm/download.html .
On x86_64, the default machine type is broken. Edit sbin/machine-type, find
this code
else if (`uname -m` == x86_64) then
set MACHINE_TYPE = i686
and change it to
else if (`uname -m` == x86_64) then
set MACHINE_TYPE = i686-m64
You may have to chmod +w sbin/machine-type first.
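The edit above can be scripted; here is a hedged sketch (GNU sed assumed, demonstrated on a sample copy, since the real sbin/machine-type contains more branches than the snippet shown):

```shell
# Reproduce the relevant csh snippet in a scratch copy.
mkdir -p /tmp/srilm-demo/sbin
cat > /tmp/srilm-demo/sbin/machine-type <<'EOF'
else if (`uname -m` == x86_64) then
    set MACHINE_TYPE = i686
EOF
chmod +w /tmp/srilm-demo/sbin/machine-type
# On the line following the x86_64 test, append -m64 to the machine type.
sed -i '/x86_64/{n;s/i686$/i686-m64/;}' /tmp/srilm-demo/sbin/machine-type
cat /tmp/srilm-demo/sbin/machine-type
```

Run the same sed against the real sbin/machine-type and inspect the result afterwards to confirm only the x86_64 branch changed.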
If you want to use IRSTLM, you will need to download its source and build it.
IRSTLM can be downloaded from either the SourceForge website
http://sourceforge.net/projects/irstlm
or the official IRSTLM website
http://hlt.fbk.eu/en/irstlm
KenLM is included with Moses.
--------------------------------------------------------------------------
ADVICE ON INSTALLING EXTERNAL LIBRARIES
Generally, if you have trouble installing an external library, you should get
support directly from the library's maker:
Boost: http://www.boost.org/doc/libs/release/more/getting_started/unix-variants.html
IRSTLM: https://list.fbk.eu/sympa/subscribe/user-irstlm
SRILM: http://www.speech.sri.com/projects/srilm/#srilm-user
However, here's some general advice on installing software (for bash users):
#Determine where you want to install packages
PREFIX=$HOME/usr
#If your system has lib64 directories, use lib64 and not lib
if [ -d /lib64 ]; then
LIBDIR=$PREFIX/lib64
else
LIBDIR=$PREFIX/lib
fi
#If you're installing to a non-standard path, tell programs where to find things:
export PATH=$PREFIX/bin${PATH:+:$PATH}
export LD_LIBRARY_PATH=$LIBDIR${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}
export LIBRARY_PATH=$LIBDIR${LIBRARY_PATH:+:$LIBRARY_PATH}
export CPATH=$PREFIX/include${CPATH:+:$CPATH}
Add all the above code to your .bashrc or .bash_login as appropriate. Then
you're ready to install packages in non-standard paths:
#For autotools packages e.g. xmlrpc-c and zlib
./configure --prefix=$PREFIX --libdir=$LIBDIR [other options here]
#tcmalloc is a malloc implementation with better multithreaded performance. To see how it
#improves Moses performance, read
# http://www.mail-archive.com/moses-support@mit.edu/msg07303.html
#It is part of gperftools which can be downloaded from
# https://code.google.com/p/gperftools/downloads/list
#configure with this:
./configure --prefix=$PREFIX --libdir=$LIBDIR --enable-shared --enable-static --enable-minimal
#For bzip2:
wget http://www.bzip.org/1.0.6/bzip2-1.0.6.tar.gz
tar xzvf bzip2-1.0.6.tar.gz
cd bzip2-1.0.6/
#Compile and install libbz2.a (static library)
make
make install PREFIX=$PREFIX
mkdir -p $LIBDIR
#Note this may be the same file; you can ignore the error
mv $PREFIX/lib/libbz2.a $LIBDIR 2>/dev/null
#Compile and install libbz2.so (dynamic library)
make clean
make -f Makefile-libbz2_so
cp libbz2.so.* $LIBDIR
ln -sf libbz2.so.1.0 $LIBDIR/libbz2.so
#For Boost:
./bootstrap.sh
./b2 --prefix=$PWD --libdir=$PWD/lib64 --layout=tagged link=static,shared threading=multi,single install || echo FAILURE
This will put the header files and library files in the current directory, rather than the system directory.
For most Linux systems, you should replace
link=static,shared
with
link=static
so it will only create static libraries. This minimises headaches when linking with Moses.
To link Moses to your version of boost,
./bjam --with-boost=[boost/path]
Alternatively, you can run
./b2 --prefix=/usr/ --libdir=/usr/lib
to install boost in the system folder. However, this may override the built-in Boost and cause problems for your OS, so it is not recommended.
--------------------------------------------------------------------------
BUILDING
Building consists of running
./bjam [options]
Common options are:
--with-srilm=/path/to/srilm to compile the decoder with SRILM support
--with-irstlm=/path/to/irstlm to compile the decoder with IRSTLM support
-jN where N is the number of CPUs
--with-macports=/path/to/macports use MacPorts on Mac OS X.
If you leave out /path/to/macports, bjam will use /opt/local as the default.
You don't have to use --with-boost with --with-macports as it is implicitly set.
Also note that using --with-macports automatically triggers "using darwin".
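Putting the options above together, a typical invocation might be assembled like this (the SRILM/IRSTLM paths are illustrative assumptions; drop the leading echo to actually run the build):

```shell
SRILM_DIR=$HOME/srilm      # assumed location of your SRILM build
IRSTLM_DIR=$HOME/irstlm    # assumed location of your IRSTLM build
echo ./bjam --with-srilm=$SRILM_DIR --with-irstlm=$IRSTLM_DIR -j4
```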
Binaries will appear in dist/bin.
You can clean up data from previous builds using
./bjam --clean
For further documentation, run
./bjam --help
--------------------------------------------------------------------------
ALTERNATIVE WAYS TO BUILD ON UNIX AND OTHER PLATFORMS
Microsoft Windows
-----------------
Moses is primarily targeted at gcc on UNIX. Windows users should
install using Cygwin. Outdated instructions can be found here:
http://ssli.ee.washington.edu/people/amittai/Moses-on-Win7.pdf .
Binaries for all external libraries needed can be downloaded from
http://www.statmt.org/moses/?n=Moses.LibrariesUsed
Only the decoder is developed and tested under Windows. There are
difficulties using the training scripts under Windows, even with
Cygwin, but it can be done.


@ -279,6 +279,12 @@ public:
manager.ProcessSentence();
const ChartHypothesis *hypo = manager.GetBestHypothesis();
outputChartHypo(out,hypo);
if (addGraphInfo) {
const size_t translationId = tinput.GetTranslationId();
std::ostringstream sgstream;
manager.GetSearchGraph(translationId,sgstream);
retData.insert(pair<string, xmlrpc_c::value>("sg", xmlrpc_c::value_string(sgstream.str())));
}
} else {
Sentence sentence;
const vector<FactorType> &inputFactorOrder =
@ -310,7 +316,7 @@ public:
retData.insert(pair<string, xmlrpc_c::value_array>("word-align", alignments));
}
if(addGraphInfo) {
if (addGraphInfo) {
insertGraphInfo(manager,retData);
(const_cast<StaticData&>(staticData)).SetOutputSearchGraph(false);
}


@ -25,7 +25,7 @@ ConstrainedDecodingState::ConstrainedDecodingState(const ChartHypothesis &hypo)
int ConstrainedDecodingState::Compare(const FFState& other) const
{
const ConstrainedDecodingState &otherFF = static_cast<const ConstrainedDecodingState&>(other);
int ret = m_outputPhrase.Compare(otherFF.m_outputPhrase);
int ret = m_outputPhrase.Compare(otherFF.m_outputPhrase);
return ret;
}
@ -34,6 +34,7 @@ ConstrainedDecoding::ConstrainedDecoding(const std::string &line)
:StatefulFeatureFunction(1, line)
,m_maxUnknowns(0)
,m_negate(false)
,m_soft(false)
{
m_tuneable = false;
ReadParameters();
@ -44,47 +45,48 @@ void ConstrainedDecoding::Load()
const StaticData &staticData = StaticData::Instance();
bool addBeginEndWord = (staticData.GetSearchAlgorithm() == ChartDecoding) || (staticData.GetSearchAlgorithm() == ChartIncremental);
InputFileStream constraintFile(m_path);
std::string line;
long sentenceID = staticData.GetStartTranslationId() - 1;
while (getline(constraintFile, line)) {
vector<string> vecStr = Tokenize(line, "\t");
Phrase phrase(0);
if (vecStr.size() == 1) {
sentenceID++;
phrase.CreateFromString(Output, staticData.GetOutputFactorOrder(), vecStr[0], staticData.GetFactorDelimiter(), NULL);
} else if (vecStr.size() == 2) {
sentenceID = Scan<long>(vecStr[0]);
phrase.CreateFromString(Output, staticData.GetOutputFactorOrder(), vecStr[1], staticData.GetFactorDelimiter(), NULL);
} else {
UTIL_THROW(util::Exception, "Reference file not loaded");
for(size_t i = 0; i < m_paths.size(); ++i) {
InputFileStream constraintFile(m_paths[i]);
std::string line;
long sentenceID = staticData.GetStartTranslationId() - 1;
while (getline(constraintFile, line)) {
vector<string> vecStr = Tokenize(line, "\t");
Phrase phrase(0);
if (vecStr.size() == 1) {
sentenceID++;
phrase.CreateFromString(Output, staticData.GetOutputFactorOrder(), vecStr[0], staticData.GetFactorDelimiter(), NULL);
} else if (vecStr.size() == 2) {
sentenceID = Scan<long>(vecStr[0]);
phrase.CreateFromString(Output, staticData.GetOutputFactorOrder(), vecStr[1], staticData.GetFactorDelimiter(), NULL);
} else {
UTIL_THROW(util::Exception, "Reference file not loaded");
}
if (addBeginEndWord) {
phrase.InitStartEndWord();
}
m_constraints[sentenceID].push_back(phrase);
}
if (addBeginEndWord) {
phrase.InitStartEndWord();
}
m_constraints.insert(make_pair(sentenceID,phrase));
}
}
std::vector<float> ConstrainedDecoding::DefaultWeights() const
{
UTIL_THROW_IF2(m_numScoreComponents != 1,
"ConstrainedDecoding must only have 1 score");
"ConstrainedDecoding must only have 1 score");
vector<float> ret(1, 1);
return ret;
}
template <class H, class M>
const Phrase *GetConstraint(const std::map<long,Phrase> &constraints, const H &hypo)
const std::vector<Phrase> *GetConstraint(const std::map<long,std::vector<Phrase> > &constraints, const H &hypo)
{
const M &mgr = hypo.GetManager();
const InputType &input = mgr.GetSource();
long id = input.GetTranslationId();
map<long,Phrase>::const_iterator iter;
map<long,std::vector<Phrase> >::const_iterator iter;
iter = constraints.find(id);
if (iter == constraints.end()) {
@ -101,30 +103,37 @@ FFState* ConstrainedDecoding::Evaluate(
const FFState* prev_state,
ScoreComponentCollection* accumulator) const
{
const Phrase *ref = GetConstraint<Hypothesis, Manager>(m_constraints, hypo);
const std::vector<Phrase> *ref = GetConstraint<Hypothesis, Manager>(m_constraints, hypo);
assert(ref);
ConstrainedDecodingState *ret = new ConstrainedDecodingState(hypo);
const Phrase &outputPhrase = ret->GetPhrase();
const Phrase &outputPhrase = ret->GetPhrase();
size_t searchPos = ref->Find(outputPhrase, m_maxUnknowns);
size_t searchPos = NOT_FOUND;
size_t i = 0;
size_t size = 0;
while(searchPos == NOT_FOUND && i < ref->size()) {
searchPos = (*ref)[i].Find(outputPhrase, m_maxUnknowns);
size = (*ref)[i].GetSize();
i++;
}
float score;
if (hypo.IsSourceCompleted()) {
// translated entire sentence.
bool match = (searchPos == 0) && (ref->GetSize() == outputPhrase.GetSize());
if (!m_negate) {
score = match ? 0 : - std::numeric_limits<float>::infinity();
}
else {
score = !match ? 0 : - std::numeric_limits<float>::infinity();
}
bool match = (searchPos == 0) && (size == outputPhrase.GetSize());
if (!m_negate) {
score = match ? 0 : - ( m_soft ? 1 : std::numeric_limits<float>::infinity());
}
else {
score = !match ? 0 : - ( m_soft ? 1 : std::numeric_limits<float>::infinity());
}
} else if (m_negate) {
// keep all derivations
score = 0;
// keep all derivations
score = 0;
}
else {
score = (searchPos != NOT_FOUND) ? 0 : - std::numeric_limits<float>::infinity();
score = (searchPos != NOT_FOUND) ? 0 : - ( m_soft ? 1 : std::numeric_limits<float>::infinity());
}
accumulator->PlusEquals(this, score);
@ -137,7 +146,7 @@ FFState* ConstrainedDecoding::EvaluateChart(
int /* featureID - used to index the state in the previous hypotheses */,
ScoreComponentCollection* accumulator) const
{
const Phrase *ref = GetConstraint<ChartHypothesis, ChartManager>(m_constraints, hypo);
const std::vector<Phrase> *ref = GetConstraint<ChartHypothesis, ChartManager>(m_constraints, hypo);
assert(ref);
const ChartManager &mgr = hypo.GetManager();
@ -145,25 +154,33 @@ FFState* ConstrainedDecoding::EvaluateChart(
ConstrainedDecodingState *ret = new ConstrainedDecodingState(hypo);
const Phrase &outputPhrase = ret->GetPhrase();
size_t searchPos = ref->Find(outputPhrase, m_maxUnknowns);
size_t searchPos = NOT_FOUND;
size_t i = 0;
size_t size = 0;
while(searchPos == NOT_FOUND && i < ref->size()) {
searchPos = (*ref)[i].Find(outputPhrase, m_maxUnknowns);
size = (*ref)[i].GetSize();
i++;
}
float score;
if (hypo.GetCurrSourceRange().GetStartPos() == 0 &&
hypo.GetCurrSourceRange().GetEndPos() == source.GetSize() - 1) {
// translated entire sentence.
bool match = (searchPos == 0) && (ref->GetSize() == outputPhrase.GetSize());
bool match = (searchPos == 0) && (size == outputPhrase.GetSize());
if (!m_negate) {
score = match ? 0 : - std::numeric_limits<float>::infinity();
}
else {
score = !match ? 0 : - std::numeric_limits<float>::infinity();
}
if (!m_negate) {
score = match ? 0 : - ( m_soft ? 1 : std::numeric_limits<float>::infinity());
}
else {
score = !match ? 0 : - ( m_soft ? 1 : std::numeric_limits<float>::infinity());
}
} else if (m_negate) {
// keep all derivations
score = 0;
// keep all derivations
score = 0;
} else {
score = (searchPos != NOT_FOUND) ? 0 : - std::numeric_limits<float>::infinity();
score = (searchPos != NOT_FOUND) ? 0 : - ( m_soft ? 1 : std::numeric_limits<float>::infinity());
}
accumulator->PlusEquals(this, score);
@ -174,11 +191,13 @@ FFState* ConstrainedDecoding::EvaluateChart(
void ConstrainedDecoding::SetParameter(const std::string& key, const std::string& value)
{
if (key == "path") {
m_path = value;
m_paths = Tokenize(value, ",");
} else if (key == "max-unknowns") {
m_maxUnknowns = Scan<int>(value);
} else if (key == "negate") {
m_negate = Scan<bool>(value);
m_negate = Scan<bool>(value);
} else if (key == "soft") {
m_soft = Scan<bool>(value);
} else {
StatefulFeatureFunction::SetParameter(key, value);
}
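Judging from the keys parsed above, the extended feature might be configured in moses.ini like this (a sketch: the reference file names are placeholders, and the comma-separated path list matches the new Tokenize(value, ",") call):

```ini
[feature]
ConstrainedDecoding path=ref.0.txt,ref.1.txt max-unknowns=0 soft=true
```

With soft=true the mismatch penalty becomes -1 instead of -infinity, so non-matching derivations are demoted rather than pruned.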


@ -46,13 +46,15 @@ public:
, ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection &estimatedFutureScore) const
{}
void Evaluate(const InputType &input
void Evaluate(const InputType &input
, const InputPath &inputPath
, const TargetPhrase &targetPhrase
, const StackVec *stackVec
, ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection *estimatedFutureScore = NULL) const
{}
FFState* Evaluate(
const Hypothesis& cur_hypo,
const FFState* prev_state,
@ -72,10 +74,11 @@ public:
void SetParameter(const std::string& key, const std::string& value);
protected:
std::string m_path;
std::map<long,Phrase> m_constraints;
std::vector<std::string> m_paths;
std::map<long, std::vector<Phrase> > m_constraints;
int m_maxUnknowns;
bool m_negate; // only keep translations which DON'T match the reference
bool m_soft;
};


@ -10,6 +10,13 @@ using namespace std;
namespace Moses
{
NieceTerminal::NieceTerminal(const std::string &line)
:StatelessFeatureFunction(line)
,m_hardConstraint(false)
{
ReadParameters();
}
void NieceTerminal::Evaluate(const Phrase &source
, const TargetPhrase &targetPhrase
, ScoreComponentCollection &scoreBreakdown
@ -25,6 +32,8 @@ void NieceTerminal::Evaluate(const InputType &input
, ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection *estimatedFutureScore) const
{
assert(stackVec);
const Phrase *ruleSource = targetPhrase.GetRuleSource();
assert(ruleSource);
@ -36,23 +45,19 @@ void NieceTerminal::Evaluate(const InputType &input
}
}
size_t ntInd = 0;
for (size_t i = 0; i < ruleSource->GetSize(); ++i) {
const Word &word = ruleSource->GetWord(i);
if (word.IsNonTerminal()) {
const ChartCellLabel &cell = *stackVec->at(ntInd);
const WordsRange &ntRange = cell.GetCoverage();
bool containTerm = ContainTerm(input, ntRange, terms);
for (size_t i = 0; i < stackVec->size(); ++i) {
const ChartCellLabel &cell = *stackVec->at(i);
const WordsRange &ntRange = cell.GetCoverage();
bool containTerm = ContainTerm(input, ntRange, terms);
if (containTerm) {
//cerr << "ruleSource=" << *ruleSource << " ";
//cerr << "ntRange=" << ntRange << endl;
if (containTerm) {
//cerr << "ruleSource=" << *ruleSource << " ";
//cerr << "ntRange=" << ntRange << endl;
// non-term contains 1 of the terms in the rule.
scoreBreakdown.PlusEquals(this, 1);
return;
}
++ntInd;
// non-term contains 1 of the terms in the rule.
float score = m_hardConstraint ? - std::numeric_limits<float>::infinity() : 1;
scoreBreakdown.PlusEquals(this, score);
return;
}
}
@ -83,5 +88,16 @@ bool NieceTerminal::ContainTerm(const InputType &input,
return false;
}
void NieceTerminal::SetParameter(const std::string& key, const std::string& value)
{
if (key == "hard-constraint") {
m_hardConstraint = Scan<bool>(value);
} else {
StatelessFeatureFunction::SetParameter(key, value);
}
}
}
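A hedged moses.ini sketch for the new option (the feature name follows the class name, as is usual for Moses feature functions):

```ini
[feature]
NieceTerminal hard-constraint=true
```

With hard-constraint=true, a non-terminal covering one of the rule's terminals scores -infinity instead of +1, pruning such derivations outright.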


@ -13,9 +13,7 @@ class Word;
class NieceTerminal : public StatelessFeatureFunction
{
public:
NieceTerminal(const std::string &line)
:StatelessFeatureFunction(line)
{}
NieceTerminal(const std::string &line);
bool IsUseable(const FactorMask &mask) const {
return true;
@ -36,7 +34,10 @@ public:
void EvaluateChart(const ChartHypothesis &hypo,
ScoreComponentCollection* accumulator) const;
void SetParameter(const std::string& key, const std::string& value);
protected:
bool m_hardConstraint;
bool ContainTerm(const InputType &input,
const WordsRange &ntRange,
const std::set<Word> &terms) const;
@ -44,3 +45,4 @@ protected:
}


@ -191,6 +191,7 @@ Parameter::Parameter()
AddParam("weight", "weights for ALL models, 1 per line 'WeightName value'. Weight names can be repeated");
AddParam("weight-overwrite", "special parameter for mert. All on 1 line. Overrides weights specified in 'weights' argument");
AddParam("feature-overwrite", "Override arguments in a particular feature function with a particular key. Format: -feature-overwrite \"FeatureName key=value\"");
AddParam("weight-add", "Add weight for FF if it doesn't exist, i.e. weights here are added 1st, and can be overridden by the ini file or on the command line. Used to specify initial weights for a FF that was also specified on the command line");
AddParam("feature-add", "Add a feature function on the command line. Used by mira to add BLEU feature");
AddParam("feature-name-overwrite", "Override feature name (NOT arguments). Eg. SRILM-->KENLM, PhraseDictionaryMemory-->PhraseDictionaryScope3");
@ -903,24 +904,28 @@ void Parameter::ConvertWeightArgs()
void Parameter::CreateWeightsMap()
{
PARAM_VEC &vec = m_setting["weight"];
CreateWeightsMap(m_setting["weight-add"]);
CreateWeightsMap(m_setting["weight"]);
}
void Parameter::CreateWeightsMap(const PARAM_VEC &vec)
{
for (size_t i = 0; i < vec.size(); ++i) {
const string &line = vec[i];
vector<string> toks = Tokenize(line);
UTIL_THROW_IF2(toks.size() < 2,
"Error in format of weights: " << line);
const string &line = vec[i];
vector<string> toks = Tokenize(line);
UTIL_THROW_IF2(toks.size() < 2,
"Error in format of weights: " << line);
string name = toks[0];
name = name.substr(0, name.size() - 1);
string name = toks[0];
name = name.substr(0, name.size() - 1);
vector<float> weights(toks.size() - 1);
for (size_t i = 1; i < toks.size(); ++i) {
float weight = Scan<float>(toks[i]);
weights[i - 1] = weight;
}
m_weights[name] = weights;
vector<float> weights(toks.size() - 1);
for (size_t i = 1; i < toks.size(); ++i) {
float weight = Scan<float>(toks[i]);
weights[i - 1] = weight;
}
m_weights[name] = weights;
}
}
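The new weight-add parameter accepts the same 'WeightName value' lines as weight (CreateWeightsMap strips the trailing character of the name, conventionally '='). An illustrative moses.ini fragment, with an assumed feature name:

```ini
[weight-add]
ConstrainedDecoding0= 1.0
```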
void Parameter::WeightOverwrite()


@ -76,6 +76,7 @@ protected:
void ConvertWeightArgsWordPenalty();
void ConvertPhrasePenalty();
void CreateWeightsMap();
void CreateWeightsMap(const PARAM_VEC &vec);
void WeightOverwrite();
void AddFeature(const std::string &line);
void AddFeaturesCmd();


@ -633,6 +633,7 @@ sub check_producability {
return 1 if defined($CONFIG{$out});
# find defined step that produces this
$out =~ s/:.+:/:/g;
my $defined_step;
foreach my $ds (keys %STEP_OUT) {
my ($ds_module) = &deconstruct_name($ds);


@ -0,0 +1,18 @@
#!/usr/bin/perl
use utf8;
binmode(STDIN, ":utf8");
binmode(STDOUT, ":utf8");
binmode(STDERR, ":utf8");
while (my $line = <STDIN>) {
chomp($line);
#$line =~ tr/\040-\176/ /c;
#$line =~ s/[^[:print:]]/ /g;
#$line =~ s/\s+/ /g;
$line =~ s/\p{C}/ /g;
print "$line\n";
}