source labels: integration into EMS

2024-12-26 05:14:36 +03:00 · 2014-08-07 21:02:51 +01:00 · 2014-08-07 21:02:51 +01:00 · c27cbf55ea
commit c27cbf55ea
parent cda9d1d5ae
11 changed files with 324 additions and 41 deletions
--- a/moses/PP/Factory.cpp
+++ b/moses/PP/Factory.cpp
@ -9,6 +9,7 @@
 #include "moses/PP/TreeStructurePhraseProperty.h"
 #include "moses/PP/SpanLengthPhraseProperty.h"
 #include "moses/PP/NonTermContextProperty.h"
 #include "moses/PP/OrientationPhraseProperty.h"
 namespace Moses
 {
@ -59,6 +60,7 @@ PhrasePropertyFactory::PhrasePropertyFactory()
  MOSES_PNAME2("Tree",TreeStructurePhraseProperty);
  MOSES_PNAME2("SpanLength", SpanLengthPhraseProperty);
  MOSES_PNAME2("NonTermContext", NonTermContextProperty);
  MOSES_PNAME2("Orientation", OrientationPhraseProperty);
 }
 PhrasePropertyFactory::~PhrasePropertyFactory()
--- a/moses/PP/SourceLabelsPhraseProperty.cpp
+++ b/moses/PP/SourceLabelsPhraseProperty.cpp
@ -16,12 +16,12 @@ void SourceLabelsPhraseProperty::ProcessValue(const std::string &value)
  std::istringstream tokenizer(value);
  if (! (tokenizer >> m_nNTs)) { // first token: number of non-terminals (incl. left-hand side)
-    UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read number of non-terminals. Flawed property?");
+    UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read number of non-terminals. Flawed property? " << value);
  }
  assert( m_nNTs > 0 );
  if (! (tokenizer >> m_totalCount)) { // second token: overall rule count
-    UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read overall rule count. Flawed property?");
+    UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read overall rule count. Flawed property? " << value);
  }
  assert( m_totalCount > 0.0 );
@ -32,7 +32,7 @@ void SourceLabelsPhraseProperty::ProcessValue(const std::string &value)
  std::priority_queue<float> ruleLabelledCountsPQ;
  while (tokenizer.peek() != EOF) {
-    try {
+//    try {
      SourceLabelsPhrasePropertyItem item;
      size_t numberOfLHSsGivenRHS = std::numeric_limits<std::size_t>::max();
@ -46,28 +46,28 @@ void SourceLabelsPhraseProperty::ProcessValue(const std::string &value)
        for (size_t i=0; i<m_nNTs-1; ++i) { // RHS source non-terminal labels
          size_t sourceLabelRHS;
          if (! (tokenizer >> sourceLabelRHS) ) { // RHS source non-terminal label
-            UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read right-hand side label index. Flawed property?");
+            UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read right-hand side label index. Flawed property? " << value);
          }
          item.m_sourceLabelsRHS.push_back(sourceLabelRHS);
        }
        if (! (tokenizer >> item.m_sourceLabelsRHSCount)) {
-          UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read right-hand side count. Flawed property?");
+          UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read right-hand side count. Flawed property? " << value);
        }
        if (! (tokenizer >> numberOfLHSsGivenRHS)) {
-          UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read number of left-hand sides. Flawed property?");
+          UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read number of left-hand sides. Flawed property? " << value);
        }
      }
      for (size_t i=0; i<numberOfLHSsGivenRHS && tokenizer.peek()!=EOF; ++i) { // LHS source non-terminal labels seen with this RHS
        size_t sourceLabelLHS;
        if (! (tokenizer >> sourceLabelLHS)) { // LHS source non-terminal label
-          UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read left-hand side label index. Flawed property?");
+          UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read left-hand side label index. Flawed property? " << value);
        }
        float ruleSourceLabelledCount;
        if (! (tokenizer >> ruleSourceLabelledCount)) {
-          UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read count. Flawed property?");
+          UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read count. Flawed property? " << value);
        }
        item.m_sourceLabelsLHSList.push_back( std::make_pair(sourceLabelLHS,ruleSourceLabelledCount) );
        ruleLabelledCountsPQ.push(ruleSourceLabelledCount);
@ -75,9 +75,9 @@ void SourceLabelsPhraseProperty::ProcessValue(const std::string &value)
      m_sourceLabelItems.push_back(item);
-    } catch (const std::exception &e) {
+//    } catch (const std::exception &e) {
-      UTIL_THROW2("SourceLabelsPhraseProperty: Read error. Flawed property?");
+//      UTIL_THROW2("SourceLabelsPhraseProperty: Read error. Flawed property?");
-    }
+//    }
  }
  // keep only top N label vectors
--- a/phrase-extract/PropertiesConsolidator.cpp
+++ b/phrase-extract/PropertiesConsolidator.cpp
@ -0,0 +1,159 @@
 /***********************************************************************
  Moses - factored phrase-based language decoder
  Copyright (C) University of Edinburgh
  This library is free software; you can redistribute it and/or
  modify it under the terms of the GNU Lesser General Public
  License as published by the Free Software Foundation; either
  version 2.1 of the License, or (at your option) any later version.
  This library is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  Lesser General Public License for more details.
  You should have received a copy of the GNU Lesser General Public
  License along with this library; if not, write to the Free Software
  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 ***********************************************************************/
 #include "PropertiesConsolidator.h"
 #include <sstream>
 #include <limits>
 #include <vector>
 #include "moses/Util.h"
 #include "phrase-extract/InputFileStream.h"
 #include "phrase-extract/OutputFileStream.h"
 namespace MosesTraining
 {
 /**
  * Loads the source label set and switches on SourceLabels processing.
  *
  * Each non-empty line of the file must hold a label followed by its numeric
  * index, separated by whitespace. Any previously loaded label set is
  * discarded first.
  *
  * @param sourceLabelSetFile path of the label set file to read
  *
  * Raises (via UTIL_THROW2 / UTIL_THROW_IF2) if a line cannot be parsed as
  * "label index" or if the same label occurs more than once.
  */
 void PropertiesConsolidator::ActivateSourceLabelsProcessing(const std::string &sourceLabelSetFile)
 {
  Moses::InputFileStream inFile(sourceLabelSetFile);
  // read source label set
  m_sourceLabels.clear();
  std::string line;
  while (getline(inFile, line)) {
    if (line.empty()) {
      continue; // nothing to parse on an empty line
    }
    std::istringstream tokenizer(line);
    std::string label;
    size_t index;
    // Stream extraction reports failure through the stream state, not through
    // exceptions, so the state has to be tested explicitly; otherwise a
    // malformed line would register a label with an unread index.
    if (! (tokenizer >> label >> index)) {
      UTIL_THROW2("Error reading source label set file " << sourceLabelSetFile << " .");
    }
    const bool isNewLabel = m_sourceLabels.insert( std::pair<std::string,size_t>(label,index) ).second;
    UTIL_THROW_IF2(!isNewLabel,"Source label set file " << sourceLabelSetFile << " should contain each syntactic label only once.");
  }
  inFile.Close();
  m_sourceLabelsFlag = true;
 }
 /**
  * Re-emits the additional-properties field of a phrase table entry, mapping
  * the labels inside a SourceLabels property to their vocabulary indices.
  *
  * The input is split at every "{{"; whatever precedes the first "{{" is not
  * emitted, and empty pieces are skipped. Every property is written back as
  * " {{key value}}". A SourceLabels property is rewritten only after
  * ActivateSourceLabelsProcessing() has been called; otherwise it is passed
  * through like any other property.
  *
  * Layout read from a SourceLabels value:
  *   nNTs totalCount, then repeated items consisting of
  *   (only if nNTs > 1) nNTs-1 RHS labels, an RHS count and the number of
  *   LHS labels, followed by (LHS label, count) pairs.
  * Labels are replaced by indices; counts are copied through as text.
  *
  * @param propertiesString properties field, e.g. " {{Key value}} {{...}}"
  * @return the rewritten properties string (the input itself if it is empty)
  *
  * Raises (via UTIL_THROW2 / UTIL_THROW_IF2) if a property has no value, if
  * a SourceLabels value is truncated or malformed, or if a label is missing
  * from the loaded label set.
  */
 std::string PropertiesConsolidator::ProcessPropertiesString(const std::string &propertiesString) const
 {
  if ( propertiesString.empty() ) {
    return propertiesString;
  }
  std::ostringstream out;
  std::vector<std::string> toks;
  Moses::TokenizeMultiCharSeparator(toks, propertiesString, "{{");
  // start at 1: toks[0] is the text in front of the first "{{"
  for (size_t i = 1; i < toks.size(); ++i) {
    std::string &tok = toks[i];
    if (tok.empty()) {
      continue;
    }
    // cut off the closing braces: rfind yields the last '}', so endPos-1 is
    // the first brace of the pair (assumes the piece is terminated by "}}")
    size_t endPos = tok.rfind("}");
    tok = tok.substr(0, endPos - 1);
    std::vector<std::string> keyValue = Moses::TokenizeFirstOnly(tok, " ");
    // checked at run time: an assert would vanish under NDEBUG and
    // keyValue[1] would then be read out of bounds
    UTIL_THROW_IF2(keyValue.size() != 2, "Flawed property (expected a key followed by a value): " << tok);
    if ( keyValue[0].compare("SourceLabels") != 0 || !m_sourceLabelsFlag ) {
      // any other property, or SourceLabels without activated processing:
      // output unchanged
      out << " {{" << keyValue[0] << " " << keyValue[1] << "}}";
      continue;
    }
    // SourceLabels additional property: replace strings with vocabulary indices
    out << " {{" << keyValue[0];
    std::istringstream tokenizer(keyValue[1]);
    size_t nNTs;
    double totalCount;
    if (! (tokenizer >> nNTs)) { // first token: number of non-terminals (incl. left-hand side)
      UTIL_THROW2("Not able to read number of non-terminals from SourceLabels property. "
                  << "Flawed SourceLabels property?");
    }
    assert( nNTs > 0 );
    out << " " << nNTs;
    if (! (tokenizer >> totalCount)) { // second token: overall rule count
      UTIL_THROW2("Not able to read overall rule count from SourceLabels property. "
                  << "Flawed SourceLabels property?");
    }
    assert( totalCount > 0.0 );
    out << " " << totalCount;
    // Every extraction below is checked explicitly. Streams signal failure
    // through their state rather than by throwing, and a failed read leaves
    // the target untouched, so an unchecked read would silently re-emit the
    // previous token.
    while (tokenizer.peek() != EOF) {
      size_t numberOfLHSsGivenRHS = std::numeric_limits<std::size_t>::max();
      std::string token;
      if (nNTs > 1) { // rule has right-hand side non-terminals, i.e. it's a hierarchical rule
        for (size_t j=0; j<nNTs-1; ++j) { // RHS source non-terminal labels
          if (! (tokenizer >> token)) { // RHS source non-terminal label
            UTIL_THROW2("Flawed item in SourceLabels property?");
          }
          std::map<std::string,size_t>::const_iterator found = m_sourceLabels.find(token);
          UTIL_THROW_IF2(found == m_sourceLabels.end(), "Label \"" << token << "\" from the phrase table not found in given label set.");
          out << " " << found->second;
        }
        if (! (tokenizer >> token)) { // sourceLabelsRHSCount
          UTIL_THROW2("Flawed item in SourceLabels property?");
        }
        out << " " << token;
        if (! (tokenizer >> numberOfLHSsGivenRHS)) {
          UTIL_THROW2("Flawed item in SourceLabels property?");
        }
        out << " " << numberOfLHSsGivenRHS;
      }
      for (size_t j=0; j<numberOfLHSsGivenRHS && tokenizer.peek()!=EOF; ++j) { // LHS source non-terminal labels seen with this RHS
        if (! (tokenizer >> token)) { // LHS source non-terminal label
          UTIL_THROW2("Flawed item in SourceLabels property?");
        }
        std::map<std::string,size_t>::const_iterator found = m_sourceLabels.find(token);
        UTIL_THROW_IF2(found == m_sourceLabels.end() ,"Label \"" << token << "\" from the phrase table not found in given label set.");
        out << " " << found->second;
        if (! (tokenizer >> token)) { // ruleSourceLabelledCount
          UTIL_THROW2("Flawed item in SourceLabels property?");
        }
        out << " " << token;
      }
    }
    out << "}}";
  }
  return out.str();
 }
 }  // namespace MosesTraining
--- a/phrase-extract/PropertiesConsolidator.h
+++ b/phrase-extract/PropertiesConsolidator.h
@ -0,0 +1,48 @@
 /***********************************************************************
  Moses - factored phrase-based language decoder
  Copyright (C) University of Edinburgh
  This library is free software; you can redistribute it and/or
  modify it under the terms of the GNU Lesser General Public
  License as published by the Free Software Foundation; either
  version 2.1 of the License, or (at your option) any later version.
  This library is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  Lesser General Public License for more details.
  You should have received a copy of the GNU Lesser General Public
  License along with this library; if not, write to the Free Software
  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 ***********************************************************************/
 #pragma once
 #include <string>
 #include <map>
 namespace MosesTraining
 {
 /**
  * Post-processes the additional-properties field of phrase table entries
  * during consolidation. The only property given special treatment is
  * SourceLabels, and only once a label set has been loaded.
  */
 class PropertiesConsolidator
 {
 public:
  /** Creates a consolidator with SourceLabels processing switched off. */
  PropertiesConsolidator()
    : m_sourceLabelsFlag(false)
  {
  }

  /** Reads the label set from the given file and enables SourceLabels processing. */
  void ActivateSourceLabelsProcessing(const std::string &sourceLabelSetFile);

  /** Returns the properties string to be written to the consolidated table. */
  std::string ProcessPropertiesString(const std::string &propertiesString) const;

 private:
  // true once ActivateSourceLabelsProcessing() has completed
  bool m_sourceLabelsFlag;
  // source label -> index, as read from the label set file
  std::map<std::string,size_t> m_sourceLabels;
 };
 }  // namespace MosesTraining
--- a/phrase-extract/consolidate-main.cpp
+++ b/phrase-extract/consolidate-main.cpp
@ -28,6 +28,7 @@
 #include "tables-core.h"
 #include "InputFileStream.h"
 #include "OutputFileStream.h"
 #include "PropertiesConsolidator.h"
 using namespace std;
@ -37,13 +38,14 @@ bool phraseCountFlag = false;
 bool lowCountFlag = false;
 bool goodTuringFlag = false;
 bool kneserNeyFlag = false;
 bool sourceLabelsFlag = false;
 // set by --LogProb: scores are written as log-probabilities
 bool logProbFlag = false;
 // Returns log(a) when log-probability output is enabled, otherwise a unchanged.
 inline float maybeLogProb( float a )
 {
  if (!logProbFlag) {
    return a;
  }
  return log(a);
 }
-void processFiles( char*, char*, char*, char* );
+void processFiles( char*, char*, char*, char*, char* );
 void loadCountOfCounts( char* );
 void breakdownCoreAndSparse( string combined, string &core, string &sparse );
 bool getLine( istream &fileP, vector< string > &item );
@ -57,13 +59,14 @@ int main(int argc, char* argv[])
       << "consolidating direct and indirect rule tables\n";
  if (argc < 4) {
-    cerr << "syntax: consolidate phrase-table.direct phrase-table.indirect phrase-table.consolidated [--Hierarchical] [--OnlyDirect] [--PhraseCount] \n";
+    cerr << "syntax: consolidate phrase-table.direct phrase-table.indirect phrase-table.consolidated [--Hierarchical] [--OnlyDirect] [--PhraseCount] [--GoodTuring counts-of-counts-file] [--KneserNey counts-of-counts-file] [--LowCountFeature] [--SourceLabels source-labels-file] \n";
    exit(1);
  }
  char* &fileNameDirect = argv[1];
  char* &fileNameIndirect = argv[2];
  char* &fileNameConsolidated = argv[3];
  char* fileNameCountOfCounts;
  char* fileNameSourceLabelSet;
  for(int i=4; i<argc; i++) {
    if (strcmp(argv[i],"--Hierarchical") == 0) {
@ -114,13 +117,21 @@ int main(int argc, char* argv[])
    } else if (strcmp(argv[i],"--LogProb") == 0) {
      logProbFlag = true;
      cerr << "using log-probabilities\n";
    } else if (strcmp(argv[i],"--SourceLabels") == 0) {
      sourceLabelsFlag = true;
      if (i+1==argc) {
        cerr << "ERROR: specify source label set file!\n";
        exit(1);
      }
      fileNameSourceLabelSet = argv[++i];
      cerr << "processing source labels property\n";
    } else {
      cerr << "ERROR: unknown option " << argv[i] << endl;
      exit(1);
    }
  }
-  processFiles( fileNameDirect, fileNameIndirect, fileNameConsolidated, fileNameCountOfCounts );
+  processFiles( fileNameDirect, fileNameIndirect, fileNameConsolidated, fileNameCountOfCounts, fileNameSourceLabelSet );
 }
 vector< float > countOfCounts;
@ -169,7 +180,7 @@ void loadCountOfCounts( char* fileNameCountOfCounts )
  if (kneserNey_D3 > 2.9) kneserNey_D3 = 2.9;
 }
-void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameConsolidated, char* fileNameCountOfCounts )
+void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameConsolidated, char* fileNameCountOfCounts, char* fileNameSourceLabelSet )
 {
  if (goodTuringFlag || kneserNeyFlag)
    loadCountOfCounts( fileNameCountOfCounts );
@ -198,6 +209,13 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC
    exit(1);
  }
  // create properties consolidator 
  // (in case any additional phrase property requires further processing)
  MosesTraining::PropertiesConsolidator propertiesConsolidator = MosesTraining::PropertiesConsolidator();
  if (sourceLabelsFlag) {
    propertiesConsolidator.ActivateSourceLabelsProcessing(fileNameSourceLabelSet);
  }
  // loop through all extracted phrase translations
  int i=0;
  while(true) {
@ -307,12 +325,13 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC
    // counts, for debugging
    fileConsolidated << "||| " << countE << " " << countF << " " << countEF;
-    // count bin feature (as a sparse feature)
+    // sparse features
    fileConsolidated << " |||";
    if (directSparseScores.compare("") != 0)
      fileConsolidated << " " << directSparseScores;
    if (indirectSparseScores.compare("") != 0)
      fileConsolidated << " " << indirectSparseScores;
    // count bin feature (as a sparse feature)
    if (sparseCountBinFeatureFlag) {
      bool foundBin = false;
      for(size_t i=0; i < countBin.size(); i++) {
@ -332,9 +351,13 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC
    }
    // arbitrary key-value pairs
-    fileConsolidated << " ||| ";
+    fileConsolidated << " |||";
    if (itemDirect.size() >= 6) {
-      fileConsolidated << itemDirect[5];
+      //if (sourceLabelsFlag) {
        fileConsolidated << propertiesConsolidator.ProcessPropertiesString(itemDirect[5]);
      //} else {
      //  fileConsolidated << itemDirect[5];
      //}
    }
    fileConsolidated << endl;
--- a/phrase-extract/extract-ghkm/ExtractGHKM.cpp
+++ b/phrase-extract/extract-ghkm/ExtractGHKM.cpp
@ -617,9 +617,8 @@ void ExtractGHKM::WriteGlueGrammar(
    }
  }
-  std::string sourceTopLabel = "TOPLABEL";
+  size_t sourceLabelGlueTop = 0;
-  std::string sourceSLabel = "S";
+  size_t sourceLabelGlueX = 1;
  std::string sourceSomeLabel = "SOMELABEL";
  // basic rules
  out << "<s> [X] ||| <s> [" << topLabel << "] ||| 1 ||| ||| ||| |||";
@ -627,7 +626,7 @@ void ExtractGHKM::WriteGlueGrammar(
    out << " {{Tree [" << topLabel << " <s>]}}";
  }
  if (options.sourceLabels) {
-    out << " {{SourceLabels 1 1 " << sourceTopLabel << " 1}}";
+    out << " {{SourceLabels 1 1 " << sourceLabelGlueTop << " 1}}";
  }
  out << std::endl;
@ -636,7 +635,7 @@ void ExtractGHKM::WriteGlueGrammar(
    out << " {{Tree [" << topLabel << " [" << topLabel << "] </s>]}}";
  }
  if (options.sourceLabels) {
-    out << " {{SourceLabels 2 1 " << sourceTopLabel << " 1 1 " << sourceTopLabel << " 1}}";
+    out << " {{SourceLabels 2 1 " << sourceLabelGlueTop << " 1 1 " << sourceLabelGlueTop << " 1}}";
  }
  out << std::endl;
@ -648,7 +647,7 @@ void ExtractGHKM::WriteGlueGrammar(
      out << " {{Tree [" << topLabel << " <s> [" << i->first << "] </s>]}}";
    }
    if (options.sourceLabels) {
-      out << " {{SourceLabels 2 1 " << sourceSLabel << " 1 1 " << sourceTopLabel << " 1}}";
+      out << " {{SourceLabels 2 1 " << sourceLabelGlueX << " 1 1 " << sourceLabelGlueTop << " 1}}";
    }
    out << std::endl;
  }
@ -661,7 +660,7 @@ void ExtractGHKM::WriteGlueGrammar(
      out << " {{Tree [" << topLabel << " ["<< topLabel << "] [" << *i << "]]}}";
    }
    if (options.sourceLabels) {
-      out << " {{SourceLabels 3 2.718 " << sourceTopLabel << " " << sourceSomeLabel << " 2.718 1 " << sourceTopLabel << " 2.718}}"; // TODO: there should be better options than using "SOMELABEL" 
+      out << " {{SourceLabels 3 2.718 " << sourceLabelGlueTop << " " << sourceLabelGlueX << " 2.718 1 " << sourceLabelGlueTop << " 2.718}}"; // TODO: there should be better options than using "SOMELABEL" 
    }
    out << std::endl;
  }
@ -672,7 +671,7 @@ void ExtractGHKM::WriteGlueGrammar(
    out << " {{Tree [" << topLabel << " [" << topLabel << "] [X]]}}";
  }
  if (options.sourceLabels) {
-    out << " {{SourceLabels 3 1 " << sourceTopLabel << " " << sourceSomeLabel << " 1 1 " << sourceTopLabel << " 1}}"; // TODO: there should be better options than using "SOMELABEL"
+    out << " {{SourceLabels 3 1 " << sourceLabelGlueTop << " " << sourceLabelGlueX << " 1 1 " << sourceLabelGlueTop << " 1}}"; // TODO: there should be better options than using "SOMELABEL"
  }
  out << std::endl;
 }
--- a/scripts/ems/experiment.perl
+++ b/scripts/ems/experiment.perl
@ -1860,7 +1860,7 @@ sub define_tuning_tune {
 	$cmd .= " --lambdas \"$lambda\"" if $lambda;
 	$cmd .= " --continue" if $tune_continue;
 	$cmd .= " --skip-decoder" if $skip_decoder;
-	$cmd .= " --inputtype $tune_inputtype" if $tune_inputtype;
+	$cmd .= " --inputtype $tune_inputtype" if defined($tune_inputtype);
 	my $qsub_args = &get_qsub_args("TUNING");
 	$cmd .= " --queue-flags=\"$qsub_args\"" if ($CLUSTER && $qsub_args);
@ -2217,6 +2217,10 @@ sub define_training_extract_phrases {
        my $phrase_orientation_priors_file = &versionize(&long_file_name("phrase-orientation-priors","model",""));
        $cmd .= "-phrase-orientation-priors-file $phrase_orientation_priors_file ";
      }
      if (&get("TRAINING:ghkm-source-labels")) {
        $cmd .= "-ghkm-source-labels ";
      }
    }
    my $extract_settings = &get("TRAINING:extract-settings");
@ -2254,6 +2258,11 @@ sub define_training_build_ttable {
        my $phrase_orientation_priors_file = &versionize(&long_file_name("phrase-orientation-priors","model",""));
        $cmd .= "-phrase-orientation-priors-file $phrase_orientation_priors_file ";
      }
      if (&get("TRAINING:ghkm-source-labels")) {
        $cmd .= "-ghkm-source-labels ";
        my $source_labels_file = &versionize(&long_file_name("source-labels","model",""));
        $cmd .= "-ghkm-source-labels-file $source_labels_file ";
      }
    }
    &create_step($step_id,$cmd);
@ -2438,6 +2447,12 @@ sub define_training_create_config {
      }
    }
    if (&get("TRAINING:ghkm-source-labels")) {
      $cmd .= "-ghkm-source-labels ";
      my $source_labels_file = &versionize(&long_file_name("source-labels","model",""));
      $cmd .= "-ghkm-source-labels-file $source_labels_file ";
    }
    # sparse lexical features provide additional content for config file
    $cmd .= "-additional-ini-file $sparse_lexical_features.ini " if $sparse_lexical_features;
@ -3412,7 +3427,7 @@ sub check_backoff_and_get_array {
 # the following two functions deal with getting information about
 # files that are passed between steps. these are either specified
 # in the meta file (default) or in the configuration file (here called
-# 'specified', in the step management refered to as 'given').
+# 'specified', in the step management referred to as 'given').
 sub get_specified_or_default_file {
    my ($specified_module,$specified_set,$specified_parameter,
--- a/scripts/generic/extract-parallel.perl
+++ b/scripts/generic/extract-parallel.perl
@ -219,14 +219,14 @@ foreach (@children) {
 	waitpid($_, 0);
 }
-# glue rules
+# merge glue rules
 if (defined($glueFile)) {
  my $cmd = "cat $TMPDIR/glue.* | LC_ALL=C sort | uniq > $glueFile";
  print STDERR "Merging glue rules: $cmd \n";
  print STDERR `$cmd`;
 }
-# phrase orientation priors (GHKM extraction)
+# merge phrase orientation priors (GHKM extraction)
 if ($phraseOrientation && defined($phraseOrientationPriorsFile)) {
  print STDERR "Merging phrase orientation priors\n";
--- a/scripts/generic/score-parallel.perl
+++ b/scripts/generic/score-parallel.perl
@ -27,10 +27,22 @@ my $scoreCmd		= $ARGV[2];
 my $extractFile = $ARGV[3]; # 1st arg of extract argument
 my $lexFile 		= $ARGV[4]; 
 my $ptHalf 			= $ARGV[5]; # output
 # Scan the optional arguments starting at index 6 and build the argument
 # string in $otherExtractArgs.
 # The loop bound is exclusive of $#ARGV, so the final element of @ARGV is
 # not visited here.
 my $inverse = 0;
 my $sourceLabelsFile;
 my $otherExtractArgs= "";
 for (my $i = 6; $i < $#ARGV; ++$i)
 {
  # --SourceLabels consumes the following argument as the label set file name
  # and is replaced by three switches in the forwarded arguments; the file
  # name itself is not forwarded.
  if ($ARGV[$i] eq '--SourceLabels') {
    $sourceLabelsFile = $ARGV[++$i];
    $otherExtractArgs .= "--SourceLabels --SourceLabelCountsLHS --SourceLabelSet ";
    next;
  }
  # --Inverse is remembered locally and also forwarded unchanged.
  if ($ARGV[$i] eq '--Inverse') {
    $inverse = 1;
    $otherExtractArgs .= $ARGV[$i] ." ";
    next;
  }
  # everything else is forwarded as is
  $otherExtractArgs .= $ARGV[$i] ." ";
 }
 #$scoreCmd $extractFile $lexFile $ptHalf $otherExtractArgs
@ -258,6 +270,14 @@ if (-e $cocPath)
  close(FHCOC);
 }
 # merge source label files
 # Only done for the non-inverse run, and only if --SourceLabels supplied a
 # file name. The resulting file starts with the two fixed entries
 # "GlueTop 0" and "GlueX 1", followed by the sorted, de-duplicated lines of
 # all per-chunk *.syntaxLabels.src files. The inner perl appends the line
 # number plus one to each of those lines, so their numbering starts at 2.
 if (!$inverse && defined($sourceLabelsFile)) 
 {
  my $cmd = "(echo \"GlueTop 0\"; echo \"GlueX 1\"; cat $TMPDIR/phrase-table.half.*.gz.syntaxLabels.src | LC_ALL=C sort | uniq | perl -pe \"s/\$/ \@{[\$.+1]}/\") > $sourceLabelsFile";
  print STDERR "Merging source label files: $cmd \n";
  # run through backticks; the captured output is discarded
  `$cmd`;
 }
 $cmd = "rm -rf $TMPDIR \n";
 print STDERR $cmd;
 systemCheck($cmd);
--- a/scripts/training/mert-moses.pl
+++ b/scripts/training/mert-moses.pl
@ -127,8 +127,8 @@ my $___NOCASE = 0;
 # Use "--nonorm" to non normalize translation before computing scores
 my $___NONORM = 0;
-# set 0 if input type is text, set 1 if input type is confusion network
+# set 0 if input type is text, set 1 if input type is confusion network, set 3 if input type is parse tree
-my $___INPUTTYPE = 0;
+my $___INPUTTYPE;
 my $mertdir = undef; # path to new mert directory
@ -1228,14 +1228,18 @@ sub run_decoder {
    if (defined $___JOBS && $___JOBS > 0) {
      die "Hypergraph mira not supported by moses-parallel" if $___HG_MIRA;
-      $decoder_cmd = "$moses_parallel_cmd $pass_old_sge -config $___CONFIG -inputtype $___INPUTTYPE -qsub-prefix mert$run -queue-parameters \"$queue_flags\" -decoder-parameters \"$___DECODER_FLAGS $decoder_config\" $lsamp_cmd -n-best-list \"$filename $___N_BEST_LIST_SIZE\" -input-file $___DEV_F -jobs $___JOBS -decoder $___DECODER > run$run.out";
+      $decoder_cmd = "$moses_parallel_cmd $pass_old_sge -config $___CONFIG";
      $decoder_cmd .= " -inputtype $___INPUTTYPE" if defined($___INPUTTYPE); 
      $decoder_cmd .= " -qsub-prefix mert$run -queue-parameters \"$queue_flags\" -decoder-parameters \"$___DECODER_FLAGS $decoder_config\" $lsamp_cmd -n-best-list \"$filename $___N_BEST_LIST_SIZE distinct\" -input-file $___DEV_F -jobs $___JOBS -decoder $___DECODER > run$run.out";
    } else {
-      my $nbest_list_cmd = "-n-best-list $filename $___N_BEST_LIST_SIZE";
+      my $nbest_list_cmd = "-n-best-list $filename $___N_BEST_LIST_SIZE distinct";
      if ($___HG_MIRA) {
        safesystem("rm -rf $hypergraph_dir");
        $nbest_list_cmd = "-output-search-graph-hypergraph true gz";
      }
-      $decoder_cmd = "$___DECODER $___DECODER_FLAGS  -config $___CONFIG -inputtype $___INPUTTYPE $decoder_config $lsamp_cmd $nbest_list_cmd  -input-file $___DEV_F > run$run.out";
+      $decoder_cmd = "$___DECODER $___DECODER_FLAGS  -config $___CONFIG";
      $decoder_cmd .= " -inputtype $___INPUTTYPE" if defined($___INPUTTYPE);
      $decoder_cmd .= " $decoder_config $lsamp_cmd $nbest_list_cmd  -input-file $___DEV_F > run$run.out";
    }
    print STDERR "Executing: $decoder_cmd \n";
@ -1309,7 +1313,9 @@ sub get_featlist_from_moses {
    print STDERR "Using cached features list: $featlistfn\n";
  } else {
    print STDERR "Asking moses for feature names and values from $___CONFIG\n";
-    my $cmd = "$___DECODER $___DECODER_FLAGS -config $configfn  -inputtype $___INPUTTYPE -show-weights > $featlistfn";
+    my $cmd = "$___DECODER $___DECODER_FLAGS -config $configfn";
    $cmd .= " -inputtype $___INPUTTYPE" if defined($___INPUTTYPE);
    $cmd .= " -show-weights > $featlistfn";
    print STDERR "Executing: $cmd\n";
    safesystem($cmd) or die "Failed to run moses with the config $configfn";
  }
--- a/scripts/training/train-model.perl
+++ b/scripts/training/train-model.perl
@ -32,7 +32,7 @@ my($_EXTERNAL_BINDIR, $_ROOT_DIR, $_CORPUS_DIR, $_GIZA_E2F, $_GIZA_F2E, $_MODEL_
   $_DECODING_STEPS, $_PARALLEL, $_FACTOR_DELIMITER, @_PHRASE_TABLE,
   @_REORDERING_TABLE, @_GENERATION_TABLE, @_GENERATION_TYPE, $_GENERATION_CORPUS,
   $_DONT_ZIP,  $_MGIZA, $_MGIZA_CPUS, $_SNT2COOC, $_HMM_ALIGN, $_CONFIG, $_OSM, $_OSM_FACTORS, $_POST_DECODING_TRANSLIT, $_TRANSLITERATION_PHRASE_TABLE,
-   $_HIERARCHICAL,$_XML,$_SOURCE_SYNTAX,$_TARGET_SYNTAX,$_GLUE_GRAMMAR,$_GLUE_GRAMMAR_FILE,$_UNKNOWN_WORD_LABEL_FILE,$_GHKM,$_GHKM_TREE_FRAGMENTS,$_GHKM_PHRASE_ORIENTATION,$_PHRASE_ORIENTATION_PRIORS_FILE,$_PCFG,@_EXTRACT_OPTIONS,@_SCORE_OPTIONS,
+   $_HIERARCHICAL,$_XML,$_SOURCE_SYNTAX,$_TARGET_SYNTAX,$_GLUE_GRAMMAR,$_GLUE_GRAMMAR_FILE,$_UNKNOWN_WORD_LABEL_FILE,$_GHKM,$_GHKM_TREE_FRAGMENTS,$_GHKM_PHRASE_ORIENTATION,$_PHRASE_ORIENTATION_PRIORS_FILE,$_GHKM_SOURCE_LABELS,$_GHKM_SOURCE_LABELS_FILE,$_PCFG,@_EXTRACT_OPTIONS,@_SCORE_OPTIONS,
   $_ALT_DIRECT_RULE_SCORE_1, $_ALT_DIRECT_RULE_SCORE_2, $_UNKNOWN_WORD_SOFT_MATCHES_FILE,
   $_OMIT_WORD_ALIGNMENT,$_FORCE_FACTORED_FILENAMES,
   $_MEMSCORE, $_FINAL_ALIGNMENT_MODEL,
@ -112,6 +112,8 @@ $_HELP = 1
 		       'ghkm-tree-fragments' => \$_GHKM_TREE_FRAGMENTS,
 		       'ghkm-phrase-orientation' => \$_GHKM_PHRASE_ORIENTATION,
 		       'phrase-orientation-priors-file=s' => \$_PHRASE_ORIENTATION_PRIORS_FILE, # currently relevant for GHKM extraction only; phrase orientation for PBT has different implementation
               'ghkm-source-labels' => \$_GHKM_SOURCE_LABELS,
               'ghkm-source-labels-file=s' => \$_GHKM_SOURCE_LABELS_FILE,
 		       'pcfg' => \$_PCFG,
 		       'alt-direct-rule-score-1' => \$_ALT_DIRECT_RULE_SCORE_1,
 		       'alt-direct-rule-score-2' => \$_ALT_DIRECT_RULE_SCORE_2,
@ -1427,10 +1429,15 @@ sub extract_phrase {
        $cmd .= " --PCFG" if $_PCFG;
        $cmd .= " --UnpairedExtractFormat" if $_ALT_DIRECT_RULE_SCORE_1 || $_ALT_DIRECT_RULE_SCORE_2;
        $cmd .= " --ConditionOnTargetLHS" if $_ALT_DIRECT_RULE_SCORE_1;
-        $cmd .= " --TreeFragments" if $_GHKM_TREE_FRAGMENTS;
+        if (defined($_GHKM)) 
-        $cmd .= " --PhraseOrientation" if $_GHKM_PHRASE_ORIENTATION;
+        {
-        $cmd .= " --PhraseOrientationPriors $_PHRASE_ORIENTATION_PRIORS_FILE" if defined($_PHRASE_ORIENTATION_PRIORS_FILE);
+          $cmd .= " --TreeFragments" if $_GHKM_TREE_FRAGMENTS;
-        if (!defined($_GHKM)) {
+          $cmd .= " --PhraseOrientation" if $_GHKM_PHRASE_ORIENTATION;
          $cmd .= " --PhraseOrientationPriors $_PHRASE_ORIENTATION_PRIORS_FILE" if defined($_PHRASE_ORIENTATION_PRIORS_FILE);
          $cmd .= " --SourceLabels" if $_GHKM_SOURCE_LABELS;
        }
        else
        {
          $cmd .= " --SourceSyntax" if $_SOURCE_SYNTAX;
          $cmd .= " --TargetSyntax" if $_TARGET_SYNTAX;
          $cmd .= " --MaxSpan $max_length";
@ -1609,6 +1616,7 @@ sub score_phrase_phrase_extract {
        $cmd .= " --TreeFragments" if $_GHKM_TREE_FRAGMENTS;
        $cmd .= " --PhraseOrientation" if $_GHKM_PHRASE_ORIENTATION;
        $cmd .= " --PhraseOrientationPriors $_PHRASE_ORIENTATION_PRIORS_FILE" if $_GHKM_PHRASE_ORIENTATION && defined($_PHRASE_ORIENTATION_PRIORS_FILE);
        $cmd .= " --SourceLabels $_GHKM_SOURCE_LABELS_FILE" if $_GHKM_SOURCE_LABELS && defined($_GHKM_SOURCE_LABELS_FILE);
        $cmd .= " $DOMAIN" if $DOMAIN;
        $cmd .= " $CORE_SCORE_OPTIONS" if defined($_SCORE_OPTIONS);
        $cmd .= " --FlexibilityScore=$FLEX_SCORER" if $_FLEXIBILITY_SCORE;
@ -1659,6 +1667,7 @@ sub score_phrase_phrase_extract {
    $cmd .= " --SparseCountBinFeature $SPARSE_COUNT_BIN" if $SPARSE_COUNT_BIN;
    $cmd .= " --GoodTuring $ttable_file.half.f2e.gz.coc" if $GOOD_TURING;
    $cmd .= " --KneserNey $ttable_file.half.f2e.gz.coc" if $KNESER_NEY;
    $cmd .= " --SourceLabels $_GHKM_SOURCE_LABELS_FILE" if $_GHKM_SOURCE_LABELS && defined($_GHKM_SOURCE_LABELS_FILE);
    $cmd .= " | gzip -c > $ttable_file.gz";
@ -2164,6 +2173,7 @@ sub create_ini {
  print INI "WordPenalty\n";
  print INI "PhrasePenalty\n";
  print INI "SoftMatchingFeature name=SM0 path=$_UNKNOWN_WORD_SOFT_MATCHES_FILE\n" if $_TARGET_SYNTAX && defined($_UNKNOWN_WORD_SOFT_MATCHES_FILE);
  print INI "SoftSourceSyntacticConstraintsFeature sourceLabelSetFile=$_GHKM_SOURCE_LABELS_FILE\n" if $_GHKM_SOURCE_LABELS && defined($_GHKM_SOURCE_LABELS_FILE);
  print INI $feature_spec;
  print INI "\n# dense weights for feature functions\n";
@ -2171,6 +2181,7 @@ sub create_ini {
  print INI "UnknownWordPenalty0= 1\n";
  print INI "WordPenalty0= -1\n";
  print INI "PhrasePenalty0= 0.2\n";
  print INI "SoftSourceSyntacticConstraintsFeature0= 0.3 -0.3 -0.3\n" if $_GHKM_SOURCE_LABELS && defined($_GHKM_SOURCE_LABELS_FILE);
  print INI $weight_spec;
  close(INI);
 }