From c27cbf55eacd4c72685507b9bab624437d9adb4b Mon Sep 17 00:00:00 2001
From: Matthias Huck <huck@i6.informatik.rwth-aachen.de>
Date: Thu, 7 Aug 2014 21:02:51 +0100
Subject: [PATCH] source labels: integration into EMS

---
 moses/PP/Factory.cpp                        |   2 +
 moses/PP/SourceLabelsPhraseProperty.cpp     |  22 +--
 phrase-extract/PropertiesConsolidator.cpp   | 159 ++++++++++++++++++++
 phrase-extract/PropertiesConsolidator.h     |  48 ++++++
 phrase-extract/consolidate-main.cpp         |  37 ++++-
 phrase-extract/extract-ghkm/ExtractGHKM.cpp |  15 +-
 scripts/ems/experiment.perl                 |  19 ++-
 scripts/generic/extract-parallel.perl       |   4 +-
 scripts/generic/score-parallel.perl         |  20 +++
 scripts/training/mert-moses.pl              |  18 ++-
 scripts/training/train-model.perl           |  21 ++-
 11 files changed, 324 insertions(+), 41 deletions(-)
 create mode 100644 phrase-extract/PropertiesConsolidator.cpp
 create mode 100644 phrase-extract/PropertiesConsolidator.h
diff --git a/moses/PP/Factory.cpp b/moses/PP/Factory.cpp
index 4e9bfbf0e..fd146005b 100644
--- a/moses/PP/Factory.cpp
+++ b/moses/PP/Factory.cpp
@@ -9,6 +9,7 @@
 #include "moses/PP/TreeStructurePhraseProperty.h"
 #include "moses/PP/SpanLengthPhraseProperty.h"
 #include "moses/PP/NonTermContextProperty.h"
+#include "moses/PP/OrientationPhraseProperty.h"
 
 namespace Moses
 {
@@ -59,6 +60,7 @@ PhrasePropertyFactory::PhrasePropertyFactory()
   MOSES_PNAME2("Tree",TreeStructurePhraseProperty);
   MOSES_PNAME2("SpanLength", SpanLengthPhraseProperty);
   MOSES_PNAME2("NonTermContext", NonTermContextProperty);
+  MOSES_PNAME2("Orientation", OrientationPhraseProperty);
 }
 
 PhrasePropertyFactory::~PhrasePropertyFactory()
diff --git a/moses/PP/SourceLabelsPhraseProperty.cpp b/moses/PP/SourceLabelsPhraseProperty.cpp
index bca5c9a30..8e6a5dd6d 100644
--- a/moses/PP/SourceLabelsPhraseProperty.cpp
+++ b/moses/PP/SourceLabelsPhraseProperty.cpp
@@ -16,12 +16,12 @@ void SourceLabelsPhraseProperty::ProcessValue(const std::string &value)
   std::istringstream tokenizer(value);
 
   if (! (tokenizer >> m_nNTs)) { // first token: number of non-terminals (incl. left-hand side)
-    UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read number of non-terminals. Flawed property?");
+    UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read number of non-terminals. Flawed property? " << value);
   }
   assert( m_nNTs > 0 );
 
   if (! (tokenizer >> m_totalCount)) { // second token: overall rule count
-    UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read overall rule count. Flawed property?");
+    UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read overall rule count. Flawed property? " << value);
   }
   assert( m_totalCount > 0.0 );
 
@@ -32,7 +32,7 @@ void SourceLabelsPhraseProperty::ProcessValue(const std::string &value)
   std::priority_queue<float> ruleLabelledCountsPQ;
 
   while (tokenizer.peek() != EOF) {
-    try {
+//    try {
 
       SourceLabelsPhrasePropertyItem item;
       size_t numberOfLHSsGivenRHS = std::numeric_limits<std::size_t>::max();
@@ -46,28 +46,28 @@ void SourceLabelsPhraseProperty::ProcessValue(const std::string &value)
         for (size_t i=0; i<m_nNTs-1; ++i) { // RHS source non-terminal labels
           size_t sourceLabelRHS;
           if (! (tokenizer >> sourceLabelRHS) ) { // RHS source non-terminal label
-            UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read right-hand side label index. Flawed property?");
+            UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read right-hand side label index. Flawed property? " << value);
           }
           item.m_sourceLabelsRHS.push_back(sourceLabelRHS);
         }
 
         if (! (tokenizer >> item.m_sourceLabelsRHSCount)) {
-          UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read right-hand side count. Flawed property?");
+          UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read right-hand side count. Flawed property? " << value);
         }
 
         if (! (tokenizer >> numberOfLHSsGivenRHS)) {
-          UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read number of left-hand sides. Flawed property?");
+          UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read number of left-hand sides. Flawed property? " << value);
         }
       }
 
       for (size_t i=0; i<numberOfLHSsGivenRHS && tokenizer.peek()!=EOF; ++i) { // LHS source non-terminal labels seen with this RHS
         size_t sourceLabelLHS;
         if (! (tokenizer >> sourceLabelLHS)) { // LHS source non-terminal label
-          UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read left-hand side label index. Flawed property?");
+          UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read left-hand side label index. Flawed property? " << value);
         }
         float ruleSourceLabelledCount;
         if (! (tokenizer >> ruleSourceLabelledCount)) {
-          UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read count. Flawed property?");
+          UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read count. Flawed property? " << value);
         }
         item.m_sourceLabelsLHSList.push_back( std::make_pair(sourceLabelLHS,ruleSourceLabelledCount) );
         ruleLabelledCountsPQ.push(ruleSourceLabelledCount);
@@ -75,9 +75,9 @@ void SourceLabelsPhraseProperty::ProcessValue(const std::string &value)
 
       m_sourceLabelItems.push_back(item);
 
-    } catch (const std::exception &e) {
-      UTIL_THROW2("SourceLabelsPhraseProperty: Read error. Flawed property?");
-    }
+//    } catch (const std::exception &e) {
+//      UTIL_THROW2("SourceLabelsPhraseProperty: Read error. Flawed property?");
+//    }
   }
 
   // keep only top N label vectors
diff --git a/phrase-extract/PropertiesConsolidator.cpp b/phrase-extract/PropertiesConsolidator.cpp
new file mode 100644
index 000000000..642c48672
--- /dev/null
+++ b/phrase-extract/PropertiesConsolidator.cpp
@@ -0,0 +1,159 @@
+/***********************************************************************
+  Moses - factored phrase-based language decoder
+  Copyright (C) University of Edinburgh
+
+  This library is free software; you can redistribute it and/or
+  modify it under the terms of the GNU Lesser General Public
+  License as published by the Free Software Foundation; either
+  version 2.1 of the License, or (at your option) any later version.
+
+  This library is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  Lesser General Public License for more details.
+
+  You should have received a copy of the GNU Lesser General Public
+  License along with this library; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ ***********************************************************************/
+
+#include "PropertiesConsolidator.h"
+
+#include <sstream>
+#include <limits>
+#include <vector>
+
+#include "moses/Util.h"
+#include "phrase-extract/InputFileStream.h"
+#include "phrase-extract/OutputFileStream.h"
+
+
+namespace MosesTraining
+{
+
+// Loads the source label set (one "<label> <index>" pair per line) from the given
+// file and enables source labels processing. Throws on malformed or duplicate entries.
+void PropertiesConsolidator::ActivateSourceLabelsProcessing(const std::string &sourceLabelSetFile)
+{
+  Moses::InputFileStream inFile(sourceLabelSetFile);
+
+  // read source label set
+  m_sourceLabels.clear();
+  std::string line;
+  while (getline(inFile, line)) {
+    std::istringstream tokenizer(line);
+    std::string label;
+    size_t index;
+    // a failed extraction only sets the stream state (no exception is thrown), so test it
+    if (! (tokenizer >> label >> index)) {
+      UTIL_THROW2("Error reading source label set file " << sourceLabelSetFile << " .");
+    }
+    std::pair< std::map<std::string,size_t>::iterator, bool > inserted = m_sourceLabels.insert( std::pair<std::string,size_t>(label,index) );
+    UTIL_THROW_IF2(!inserted.second,"Source label set file " << sourceLabelSetFile << " should contain each syntactic label only once.");
+  }
+
+  inFile.Close();
+  m_sourceLabelsFlag = true;
+}
+
+
+// Reads the next token from `in` as a source label and appends " <index>" to `out`,
+// where <index> is the value stored for that label in `labels`. Throws if no token
+// can be read or if the label is not contained in `labels`.
+static void WriteSourceLabelIndex(std::istream &in, const std::map<std::string,size_t> &labels, std::ostream &out)
+{
+  std::string label;
+  UTIL_THROW_IF2(!(in >> label), "Not able to read label from SourceLabels property. Flawed SourceLabels property?");
+  const std::map<std::string,size_t>::const_iterator found = labels.find(label);
+  UTIL_THROW_IF2(found == labels.end(), "Label \"" << label << "\" from the phrase table not found in given label set.");
+  out << " " << found->second;
+}
+
+// Copies the next token (a count) from `in` to `out` verbatim, preceded by a space.
+// Throws if no token can be read.
+static void CopySourceLabelsCount(std::istream &in, std::ostream &out)
+{
+  std::string count;
+  UTIL_THROW_IF2(!(in >> count), "Not able to read count from SourceLabels property. Flawed SourceLabels property?");
+  out << " " << count;
+}
+
+// Rewrites the additional properties ("{{Key Value}}" items) of a phrase table entry.
+// If source labels processing has been activated, the label strings of a SourceLabels
+// property are replaced with their indices from the label set; any other property is
+// written back as " {{Key Value}}". An empty input string is returned unchanged.
+// Throws (UTIL_THROW2 / UTIL_THROW_IF2) on a malformed property or an unknown label.
+std::string PropertiesConsolidator::ProcessPropertiesString(const std::string &propertiesString) const
+{
+  if ( propertiesString.empty() ) {
+    return propertiesString;
+  }
+
+  std::ostringstream out;
+  std::vector<std::string> toks;
+  Moses::TokenizeMultiCharSeparator(toks, propertiesString, "{{");
+  for (size_t i = 1; i < toks.size(); ++i) { // toks[0] is the text before the first "{{"
+    std::string &tok = toks[i];
+    if (tok.empty()) {
+      continue;
+    }
+    // cut off the closing "}}" together with anything that follows it
+    const size_t endPos = tok.rfind("}}");
+    UTIL_THROW_IF2(endPos == std::string::npos, "Property is not terminated by \"}}\": {{" << tok);
+    tok = tok.substr(0, endPos);
+    // checked at run time: an assert vanishes with NDEBUG and keyValue[1] would be read out of bounds
+    const std::vector<std::string> keyValue = Moses::TokenizeFirstOnly(tok, " ");
+    UTIL_THROW_IF2(keyValue.size() != 2, "Property without value: {{" << tok << "}}");
+
+    if ( keyValue[0] != "SourceLabels" || !m_sourceLabelsFlag ) {
+      // other additional property, or source labels are not to be processed: pass through
+      out << " {{" << keyValue[0] << " " << keyValue[1] << "}}";
+      continue;
+    }
+
+    // SourceLabels additional property: replace strings with vocabulary indices
+    out << " {{" << keyValue[0];
+
+    std::istringstream tokenizer(keyValue[1]);
+
+    size_t nNTs;
+    if (! (tokenizer >> nNTs)) { // first token: number of non-terminals (incl. left-hand side)
+      UTIL_THROW2("Not able to read number of non-terminals from SourceLabels property. "
+                  << "Flawed SourceLabels property?");
+    }
+    UTIL_THROW_IF2(nNTs == 0, "Number of non-terminals in SourceLabels property must be positive.");
+    out << " " << nNTs;
+
+    double totalCount;
+    if (! (tokenizer >> totalCount)) { // second token: overall rule count
+      UTIL_THROW2("Not able to read overall rule count from SourceLabels property. "
+                  << "Flawed SourceLabels property?");
+    }
+    UTIL_THROW_IF2(totalCount <= 0.0, "Overall rule count in SourceLabels property must be positive.");
+    out << " " << totalCount;
+
+    while (tokenizer.peek() != EOF) {
+      size_t numberOfLHSsGivenRHS = std::numeric_limits<std::size_t>::max();
+      if (nNTs > 1) { // rule has right-hand side non-terminals, i.e. it's a hierarchical rule
+        for (size_t j=0; j<nNTs-1; ++j) { // RHS source non-terminal labels
+          WriteSourceLabelIndex(tokenizer, m_sourceLabels, out);
+        }
+        CopySourceLabelsCount(tokenizer, out); // sourceLabelsRHSCount
+        UTIL_THROW_IF2(!(tokenizer >> numberOfLHSsGivenRHS), "Not able to read number of left-hand sides from SourceLabels property.");
+        out << " " << numberOfLHSsGivenRHS;
+      }
+
+      for (size_t j=0; j<numberOfLHSsGivenRHS && tokenizer.peek()!=EOF; ++j) { // LHS source non-terminal labels seen with this RHS
+        WriteSourceLabelIndex(tokenizer, m_sourceLabels, out); // LHS source non-terminal label
+        CopySourceLabelsCount(tokenizer, out); // ruleSourceLabelledCount
+      }
+    }
+
+    out << "}}";
+  }
+
+  return out.str();
+}
+
+}  // namespace MosesTraining
+
diff --git a/phrase-extract/PropertiesConsolidator.h b/phrase-extract/PropertiesConsolidator.h
new file mode 100644
index 000000000..cc6a7a835
--- /dev/null
+++ b/phrase-extract/PropertiesConsolidator.h
@@ -0,0 +1,48 @@
+/***********************************************************************
+  Moses - factored phrase-based language decoder
+  Copyright (C) University of Edinburgh
+
+  This library is free software; you can redistribute it and/or
+  modify it under the terms of the GNU Lesser General Public
+  License as published by the Free Software Foundation; either
+  version 2.1 of the License, or (at your option) any later version.
+
+  This library is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  Lesser General Public License for more details.
+
+  You should have received a copy of the GNU Lesser General Public
+  License along with this library; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ ***********************************************************************/
+
+
+#pragma once
+
+#include <string>
+#include <map>
+
+
+namespace MosesTraining
+{
+
+// Post-processes the property strings of phrase table entries for consolidate.
+class PropertiesConsolidator
+{
+public:
+  // Source labels processing is off until ActivateSourceLabelsProcessing() is called.
+  PropertiesConsolidator()
+    : m_sourceLabelsFlag(false)
+  {}
+  // Loads the label set from the given file and switches source labels processing on.
+  void ActivateSourceLabelsProcessing(const std::string &sourceLabelSetFile);
+  // Returns the processed form of the given properties string.
+  std::string ProcessPropertiesString(const std::string &propertiesString) const;
+private:
+  bool m_sourceLabelsFlag; // set by ActivateSourceLabelsProcessing()
+  std::map<std::string,size_t> m_sourceLabels; // source label -> index
+};
+
+}  // namespace MosesTraining
+
diff --git a/phrase-extract/consolidate-main.cpp b/phrase-extract/consolidate-main.cpp
index a2174805c..10697a956 100644
--- a/phrase-extract/consolidate-main.cpp
+++ b/phrase-extract/consolidate-main.cpp
@@ -28,6 +28,7 @@
 #include "tables-core.h"
 #include "InputFileStream.h"
 #include "OutputFileStream.h"
+#include "PropertiesConsolidator.h"
 
 using namespace std;
 
@@ -37,13 +38,14 @@ bool phraseCountFlag = false;
 bool lowCountFlag = false;
 bool goodTuringFlag = false;
 bool kneserNeyFlag = false;
+bool sourceLabelsFlag = false;
 bool logProbFlag = false;
 inline float maybeLogProb( float a )
 {
   return logProbFlag ? log(a) : a;
 }
 
-void processFiles( char*, char*, char*, char* );
+void processFiles( char*, char*, char*, char*, char* );
 void loadCountOfCounts( char* );
 void breakdownCoreAndSparse( string combined, string &core, string &sparse );
 bool getLine( istream &fileP, vector< string > &item );
@@ -57,13 +59,14 @@ int main(int argc, char* argv[])
        << "consolidating direct and indirect rule tables\n";
 
   if (argc < 4) {
-    cerr << "syntax: consolidate phrase-table.direct phrase-table.indirect phrase-table.consolidated [--Hierarchical] [--OnlyDirect] [--PhraseCount] \n";
+    cerr << "syntax: consolidate phrase-table.direct phrase-table.indirect phrase-table.consolidated [--Hierarchical] [--OnlyDirect] [--PhraseCount] [--GoodTuring counts-of-counts-file] [--KneserNey counts-of-counts-file] [--LowCountFeature] [--SourceLabels source-labels-file] \n";
     exit(1);
   }
   char* &fileNameDirect = argv[1];
   char* &fileNameIndirect = argv[2];
   char* &fileNameConsolidated = argv[3];
   char* fileNameCountOfCounts;
+  char* fileNameSourceLabelSet;
 
   for(int i=4; i<argc; i++) {
     if (strcmp(argv[i],"--Hierarchical") == 0) {
@@ -114,13 +117,21 @@ int main(int argc, char* argv[])
     } else if (strcmp(argv[i],"--LogProb") == 0) {
       logProbFlag = true;
       cerr << "using log-probabilities\n";
+    } else if (strcmp(argv[i],"--SourceLabels") == 0) {
+      sourceLabelsFlag = true;
+      if (i+1==argc) {
+        cerr << "ERROR: specify source label set file!\n";
+        exit(1);
+      }
+      fileNameSourceLabelSet = argv[++i];
+      cerr << "processing source labels property\n";
     } else {
       cerr << "ERROR: unknown option " << argv[i] << endl;
       exit(1);
     }
   }
 
-  processFiles( fileNameDirect, fileNameIndirect, fileNameConsolidated, fileNameCountOfCounts );
+  processFiles( fileNameDirect, fileNameIndirect, fileNameConsolidated, fileNameCountOfCounts, fileNameSourceLabelSet );
 }
 
 vector< float > countOfCounts;
@@ -169,7 +180,7 @@ void loadCountOfCounts( char* fileNameCountOfCounts )
   if (kneserNey_D3 > 2.9) kneserNey_D3 = 2.9;
 }
 
-void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameConsolidated, char* fileNameCountOfCounts )
+void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameConsolidated, char* fileNameCountOfCounts, char* fileNameSourceLabelSet )
 {
   if (goodTuringFlag || kneserNeyFlag)
     loadCountOfCounts( fileNameCountOfCounts );
@@ -198,6 +209,13 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC
     exit(1);
   }
 
+  // create properties consolidator 
+  // (in case any additional phrase property requires further processing)
+  MosesTraining::PropertiesConsolidator propertiesConsolidator = MosesTraining::PropertiesConsolidator();
+  if (sourceLabelsFlag) {
+    propertiesConsolidator.ActivateSourceLabelsProcessing(fileNameSourceLabelSet);
+  }
+
   // loop through all extracted phrase translations
   int i=0;
   while(true) {
@@ -307,12 +325,13 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC
     // counts, for debugging
     fileConsolidated << "||| " << countE << " " << countF << " " << countEF;
 
-    // count bin feature (as a sparse feature)
+    // sparse features
     fileConsolidated << " |||";
     if (directSparseScores.compare("") != 0)
       fileConsolidated << " " << directSparseScores;
     if (indirectSparseScores.compare("") != 0)
       fileConsolidated << " " << indirectSparseScores;
+    // count bin feature (as a sparse feature)
     if (sparseCountBinFeatureFlag) {
       bool foundBin = false;
       for(size_t i=0; i < countBin.size(); i++) {
@@ -332,9 +351,13 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC
     }
 
     // arbitrary key-value pairs
-    fileConsolidated << " ||| ";
+    fileConsolidated << " |||";
     if (itemDirect.size() >= 6) {
-      fileConsolidated << itemDirect[5];
+      //if (sourceLabelsFlag) {
+        fileConsolidated << propertiesConsolidator.ProcessPropertiesString(itemDirect[5]);
+      //} else {
+      //  fileConsolidated << itemDirect[5];
+      //}
     }
 
     fileConsolidated << endl;
diff --git a/phrase-extract/extract-ghkm/ExtractGHKM.cpp b/phrase-extract/extract-ghkm/ExtractGHKM.cpp
index 36dfee2e5..774416079 100644
--- a/phrase-extract/extract-ghkm/ExtractGHKM.cpp
+++ b/phrase-extract/extract-ghkm/ExtractGHKM.cpp
@@ -617,9 +617,8 @@ void ExtractGHKM::WriteGlueGrammar(
     }
   }
 
-  std::string sourceTopLabel = "TOPLABEL";
-  std::string sourceSLabel = "S";
-  std::string sourceSomeLabel = "SOMELABEL";
+  size_t sourceLabelGlueTop = 0;
+  size_t sourceLabelGlueX = 1;
 
   // basic rules
   out << "<s> [X] ||| <s> [" << topLabel << "] ||| 1 ||| ||| ||| |||";
@@ -627,7 +626,7 @@ void ExtractGHKM::WriteGlueGrammar(
     out << " {{Tree [" << topLabel << " <s>]}}";
   }
   if (options.sourceLabels) {
-    out << " {{SourceLabels 1 1 " << sourceTopLabel << " 1}}";
+    out << " {{SourceLabels 1 1 " << sourceLabelGlueTop << " 1}}";
   }
   out << std::endl;
 
@@ -636,7 +635,7 @@ void ExtractGHKM::WriteGlueGrammar(
     out << " {{Tree [" << topLabel << " [" << topLabel << "] </s>]}}";
   }
   if (options.sourceLabels) {
-    out << " {{SourceLabels 2 1 " << sourceTopLabel << " 1 1 " << sourceTopLabel << " 1}}";
+    out << " {{SourceLabels 2 1 " << sourceLabelGlueTop << " 1 1 " << sourceLabelGlueTop << " 1}}";
   }
   out << std::endl;
 
@@ -648,7 +647,7 @@ void ExtractGHKM::WriteGlueGrammar(
       out << " {{Tree [" << topLabel << " <s> [" << i->first << "] </s>]}}";
     }
     if (options.sourceLabels) {
-      out << " {{SourceLabels 2 1 " << sourceSLabel << " 1 1 " << sourceTopLabel << " 1}}";
+      out << " {{SourceLabels 2 1 " << sourceLabelGlueX << " 1 1 " << sourceLabelGlueTop << " 1}}";
     }
     out << std::endl;
   }
@@ -661,7 +660,7 @@ void ExtractGHKM::WriteGlueGrammar(
       out << " {{Tree [" << topLabel << " ["<< topLabel << "] [" << *i << "]]}}";
     }
     if (options.sourceLabels) {
-      out << " {{SourceLabels 3 2.718 " << sourceTopLabel << " " << sourceSomeLabel << " 2.718 1 " << sourceTopLabel << " 2.718}}"; // TODO: there should be better options than using "SOMELABEL" 
+      out << " {{SourceLabels 3 2.718 " << sourceLabelGlueTop << " " << sourceLabelGlueX << " 2.718 1 " << sourceLabelGlueTop << " 2.718}}"; // TODO: there should be better options than using "SOMELABEL" 
     }
     out << std::endl;
   }
@@ -672,7 +671,7 @@ void ExtractGHKM::WriteGlueGrammar(
     out << " {{Tree [" << topLabel << " [" << topLabel << "] [X]]}}";
   }
   if (options.sourceLabels) {
-    out << " {{SourceLabels 3 1 " << sourceTopLabel << " " << sourceSomeLabel << " 1 1 " << sourceTopLabel << " 1}}"; // TODO: there should be better options than using "SOMELABEL"
+    out << " {{SourceLabels 3 1 " << sourceLabelGlueTop << " " << sourceLabelGlueX << " 1 1 " << sourceLabelGlueTop << " 1}}"; // TODO: there should be better options than using "SOMELABEL"
   }
   out << std::endl;
 }
diff --git a/scripts/ems/experiment.perl b/scripts/ems/experiment.perl
index 1108bec1b..59c064224 100755
--- a/scripts/ems/experiment.perl
+++ b/scripts/ems/experiment.perl
@@ -1860,7 +1860,7 @@ sub define_tuning_tune {
 	$cmd .= " --lambdas \"$lambda\"" if $lambda;
 	$cmd .= " --continue" if $tune_continue;
 	$cmd .= " --skip-decoder" if $skip_decoder;
-	$cmd .= " --inputtype $tune_inputtype" if $tune_inputtype;
+	$cmd .= " --inputtype $tune_inputtype" if defined($tune_inputtype);
     
 	my $qsub_args = &get_qsub_args("TUNING");
 	$cmd .= " --queue-flags=\"$qsub_args\"" if ($CLUSTER && $qsub_args);
@@ -2217,6 +2217,10 @@ sub define_training_extract_phrases {
         my $phrase_orientation_priors_file = &versionize(&long_file_name("phrase-orientation-priors","model",""));
         $cmd .= "-phrase-orientation-priors-file $phrase_orientation_priors_file ";
       }
+
+      if (&get("TRAINING:ghkm-source-labels")) {
+        $cmd .= "-ghkm-source-labels ";
+      }
     }
 
     my $extract_settings = &get("TRAINING:extract-settings");
@@ -2254,6 +2258,11 @@ sub define_training_build_ttable {
         my $phrase_orientation_priors_file = &versionize(&long_file_name("phrase-orientation-priors","model",""));
         $cmd .= "-phrase-orientation-priors-file $phrase_orientation_priors_file ";
       }
+      if (&get("TRAINING:ghkm-source-labels")) {
+        $cmd .= "-ghkm-source-labels ";
+        my $source_labels_file = &versionize(&long_file_name("source-labels","model",""));
+        $cmd .= "-ghkm-source-labels-file $source_labels_file ";
+      }
     }
     
     &create_step($step_id,$cmd);
@@ -2438,6 +2447,12 @@ sub define_training_create_config {
       }
     }
 
+    if (&get("TRAINING:ghkm-source-labels")) {
+      $cmd .= "-ghkm-source-labels ";
+      my $source_labels_file = &versionize(&long_file_name("source-labels","model",""));
+      $cmd .= "-ghkm-source-labels-file $source_labels_file ";
+    }
+
     # sparse lexical features provide additional content for config file
     $cmd .= "-additional-ini-file $sparse_lexical_features.ini " if $sparse_lexical_features;
 
@@ -3412,7 +3427,7 @@ sub check_backoff_and_get_array {
 # the following two functions deal with getting information about
 # files that are passed between steps. this are either specified
 # in the meta file (default) or in the configuration file (here called
-# 'specified', in the step management refered to as 'given').
+# 'specified', in the step management referred to as 'given').
 
 sub get_specified_or_default_file {
     my ($specified_module,$specified_set,$specified_parameter,
diff --git a/scripts/generic/extract-parallel.perl b/scripts/generic/extract-parallel.perl
index ff6a058b5..433e95b9d 100755
--- a/scripts/generic/extract-parallel.perl
+++ b/scripts/generic/extract-parallel.perl
@@ -219,14 +219,14 @@ foreach (@children) {
 	waitpid($_, 0);
 }
 
-# glue rules
+# merge glue rules
 if (defined($glueFile)) {
   my $cmd = "cat $TMPDIR/glue.* | LC_ALL=C sort | uniq > $glueFile";
   print STDERR "Merging glue rules: $cmd \n";
   print STDERR `$cmd`;
 }
 
-# phrase orientation priors (GHKM extraction)
+# merge phrase orientation priors (GHKM extraction)
 if ($phraseOrientation && defined($phraseOrientationPriorsFile)) {
   print STDERR "Merging phrase orientation priors\n";
 
diff --git a/scripts/generic/score-parallel.perl b/scripts/generic/score-parallel.perl
index 213b9e90e..7835d3826 100755
--- a/scripts/generic/score-parallel.perl
+++ b/scripts/generic/score-parallel.perl
@@ -27,10 +27,22 @@ my $scoreCmd		= $ARGV[2];
 my $extractFile = $ARGV[3]; # 1st arg of extract argument
 my $lexFile 		= $ARGV[4]; 
 my $ptHalf 			= $ARGV[5]; # output
+my $inverse = 0;
+my $sourceLabelsFile;
 
 my $otherExtractArgs= "";
 for (my $i = 6; $i < $#ARGV; ++$i)
 {
+  if ($ARGV[$i] eq '--SourceLabels') {
+    $sourceLabelsFile = $ARGV[++$i];
+    $otherExtractArgs .= "--SourceLabels --SourceLabelCountsLHS --SourceLabelSet ";
+    next;
+  }
+  if ($ARGV[$i] eq '--Inverse') {
+    $inverse = 1;
+    $otherExtractArgs .= $ARGV[$i] ." ";
+    next;
+  }
   $otherExtractArgs .= $ARGV[$i] ." ";
 }
 #$scoreCmd $extractFile $lexFile $ptHalf $otherExtractArgs
@@ -258,6 +270,14 @@ if (-e $cocPath)
   close(FHCOC);
 }
 
+# merge source label files
+if (!$inverse && defined($sourceLabelsFile)) 
+{
+  my $cmd = "(echo \"GlueTop 0\"; echo \"GlueX 1\"; cat $TMPDIR/phrase-table.half.*.gz.syntaxLabels.src | LC_ALL=C sort | uniq | perl -pe \"s/\$/ \@{[\$.+1]}/\") > $sourceLabelsFile";
+  print STDERR "Merging source label files: $cmd \n";
+  `$cmd`;
+}
+
 $cmd = "rm -rf $TMPDIR \n";
 print STDERR $cmd;
 systemCheck($cmd);
diff --git a/scripts/training/mert-moses.pl b/scripts/training/mert-moses.pl
index dd41538f5..b965fbdd5 100755
--- a/scripts/training/mert-moses.pl
+++ b/scripts/training/mert-moses.pl
@@ -127,8 +127,8 @@ my $___NOCASE = 0;
 # Use "--nonorm" to non normalize translation before computing scores
 my $___NONORM = 0;
 
-# set 0 if input type is text, set 1 if input type is confusion network
-my $___INPUTTYPE = 0;
+# set 0 if input type is text, set 1 if input type is confusion network, set 3 if input type is parse tree
+my $___INPUTTYPE;
 
 
 my $mertdir = undef; # path to new mert directory
@@ -1228,14 +1228,18 @@ sub run_decoder {
 
     if (defined $___JOBS && $___JOBS > 0) {
       die "Hypergraph mira not supported by moses-parallel" if $___HG_MIRA;
-      $decoder_cmd = "$moses_parallel_cmd $pass_old_sge -config $___CONFIG -inputtype $___INPUTTYPE -qsub-prefix mert$run -queue-parameters \"$queue_flags\" -decoder-parameters \"$___DECODER_FLAGS $decoder_config\" $lsamp_cmd -n-best-list \"$filename $___N_BEST_LIST_SIZE\" -input-file $___DEV_F -jobs $___JOBS -decoder $___DECODER > run$run.out";
+      $decoder_cmd = "$moses_parallel_cmd $pass_old_sge -config $___CONFIG";
+      $decoder_cmd .= " -inputtype $___INPUTTYPE" if defined($___INPUTTYPE); 
+      $decoder_cmd .= " -qsub-prefix mert$run -queue-parameters \"$queue_flags\" -decoder-parameters \"$___DECODER_FLAGS $decoder_config\" $lsamp_cmd -n-best-list \"$filename $___N_BEST_LIST_SIZE distinct\" -input-file $___DEV_F -jobs $___JOBS -decoder $___DECODER > run$run.out";
     } else {
-      my $nbest_list_cmd = "-n-best-list $filename $___N_BEST_LIST_SIZE";
+      my $nbest_list_cmd = "-n-best-list $filename $___N_BEST_LIST_SIZE distinct";
       if ($___HG_MIRA) {
         safesystem("rm -rf $hypergraph_dir");
         $nbest_list_cmd = "-output-search-graph-hypergraph true gz";
       }
-      $decoder_cmd = "$___DECODER $___DECODER_FLAGS  -config $___CONFIG -inputtype $___INPUTTYPE $decoder_config $lsamp_cmd $nbest_list_cmd  -input-file $___DEV_F > run$run.out";
+      $decoder_cmd = "$___DECODER $___DECODER_FLAGS  -config $___CONFIG";
+      $decoder_cmd .= " -inputtype $___INPUTTYPE" if defined($___INPUTTYPE);
+      $decoder_cmd .= " $decoder_config $lsamp_cmd $nbest_list_cmd  -input-file $___DEV_F > run$run.out";
     }
 
     print STDERR "Executing: $decoder_cmd \n";
@@ -1309,7 +1313,9 @@ sub get_featlist_from_moses {
     print STDERR "Using cached features list: $featlistfn\n";
   } else {
     print STDERR "Asking moses for feature names and values from $___CONFIG\n";
-    my $cmd = "$___DECODER $___DECODER_FLAGS -config $configfn  -inputtype $___INPUTTYPE -show-weights > $featlistfn";
+    my $cmd = "$___DECODER $___DECODER_FLAGS -config $configfn";
+    $cmd .= " -inputtype $___INPUTTYPE" if defined($___INPUTTYPE);
+    $cmd .= " -show-weights > $featlistfn";
     print STDERR "Executing: $cmd\n";
     safesystem($cmd) or die "Failed to run moses with the config $configfn";
   }
diff --git a/scripts/training/train-model.perl b/scripts/training/train-model.perl
index da8e677bc..8f661b812 100755
--- a/scripts/training/train-model.perl
+++ b/scripts/training/train-model.perl
@@ -32,7 +32,7 @@ my($_EXTERNAL_BINDIR, $_ROOT_DIR, $_CORPUS_DIR, $_GIZA_E2F, $_GIZA_F2E, $_MODEL_
    $_DECODING_STEPS, $_PARALLEL, $_FACTOR_DELIMITER, @_PHRASE_TABLE,
    @_REORDERING_TABLE, @_GENERATION_TABLE, @_GENERATION_TYPE, $_GENERATION_CORPUS,
    $_DONT_ZIP,  $_MGIZA, $_MGIZA_CPUS, $_SNT2COOC, $_HMM_ALIGN, $_CONFIG, $_OSM, $_OSM_FACTORS, $_POST_DECODING_TRANSLIT, $_TRANSLITERATION_PHRASE_TABLE,
-   $_HIERARCHICAL,$_XML,$_SOURCE_SYNTAX,$_TARGET_SYNTAX,$_GLUE_GRAMMAR,$_GLUE_GRAMMAR_FILE,$_UNKNOWN_WORD_LABEL_FILE,$_GHKM,$_GHKM_TREE_FRAGMENTS,$_GHKM_PHRASE_ORIENTATION,$_PHRASE_ORIENTATION_PRIORS_FILE,$_PCFG,@_EXTRACT_OPTIONS,@_SCORE_OPTIONS,
+   $_HIERARCHICAL,$_XML,$_SOURCE_SYNTAX,$_TARGET_SYNTAX,$_GLUE_GRAMMAR,$_GLUE_GRAMMAR_FILE,$_UNKNOWN_WORD_LABEL_FILE,$_GHKM,$_GHKM_TREE_FRAGMENTS,$_GHKM_PHRASE_ORIENTATION,$_PHRASE_ORIENTATION_PRIORS_FILE,$_GHKM_SOURCE_LABELS,$_GHKM_SOURCE_LABELS_FILE,$_PCFG,@_EXTRACT_OPTIONS,@_SCORE_OPTIONS,
    $_ALT_DIRECT_RULE_SCORE_1, $_ALT_DIRECT_RULE_SCORE_2, $_UNKNOWN_WORD_SOFT_MATCHES_FILE,
    $_OMIT_WORD_ALIGNMENT,$_FORCE_FACTORED_FILENAMES,
    $_MEMSCORE, $_FINAL_ALIGNMENT_MODEL,
@@ -112,6 +112,8 @@ $_HELP = 1
 		       'ghkm-tree-fragments' => \$_GHKM_TREE_FRAGMENTS,
 		       'ghkm-phrase-orientation' => \$_GHKM_PHRASE_ORIENTATION,
 		       'phrase-orientation-priors-file=s' => \$_PHRASE_ORIENTATION_PRIORS_FILE, # currently relevant for GHKM extraction only; phrase orientation for PBT has different implementation
+               'ghkm-source-labels' => \$_GHKM_SOURCE_LABELS,
+               'ghkm-source-labels-file=s' => \$_GHKM_SOURCE_LABELS_FILE,
 		       'pcfg' => \$_PCFG,
 		       'alt-direct-rule-score-1' => \$_ALT_DIRECT_RULE_SCORE_1,
 		       'alt-direct-rule-score-2' => \$_ALT_DIRECT_RULE_SCORE_2,
@@ -1427,10 +1429,15 @@ sub extract_phrase {
         $cmd .= " --PCFG" if $_PCFG;
         $cmd .= " --UnpairedExtractFormat" if $_ALT_DIRECT_RULE_SCORE_1 || $_ALT_DIRECT_RULE_SCORE_2;
         $cmd .= " --ConditionOnTargetLHS" if $_ALT_DIRECT_RULE_SCORE_1;
-        $cmd .= " --TreeFragments" if $_GHKM_TREE_FRAGMENTS;
-        $cmd .= " --PhraseOrientation" if $_GHKM_PHRASE_ORIENTATION;
-        $cmd .= " --PhraseOrientationPriors $_PHRASE_ORIENTATION_PRIORS_FILE" if defined($_PHRASE_ORIENTATION_PRIORS_FILE);
-        if (!defined($_GHKM)) {
+        if (defined($_GHKM)) 
+        {
+          $cmd .= " --TreeFragments" if $_GHKM_TREE_FRAGMENTS;
+          $cmd .= " --PhraseOrientation" if $_GHKM_PHRASE_ORIENTATION;
+          $cmd .= " --PhraseOrientationPriors $_PHRASE_ORIENTATION_PRIORS_FILE" if defined($_PHRASE_ORIENTATION_PRIORS_FILE);
+          $cmd .= " --SourceLabels" if $_GHKM_SOURCE_LABELS;
+        }
+        else
+        {
           $cmd .= " --SourceSyntax" if $_SOURCE_SYNTAX;
           $cmd .= " --TargetSyntax" if $_TARGET_SYNTAX;
           $cmd .= " --MaxSpan $max_length";
@@ -1609,6 +1616,7 @@ sub score_phrase_phrase_extract {
         $cmd .= " --TreeFragments" if $_GHKM_TREE_FRAGMENTS;
         $cmd .= " --PhraseOrientation" if $_GHKM_PHRASE_ORIENTATION;
         $cmd .= " --PhraseOrientationPriors $_PHRASE_ORIENTATION_PRIORS_FILE" if $_GHKM_PHRASE_ORIENTATION && defined($_PHRASE_ORIENTATION_PRIORS_FILE);
+        $cmd .= " --SourceLabels $_GHKM_SOURCE_LABELS_FILE" if $_GHKM_SOURCE_LABELS && defined($_GHKM_SOURCE_LABELS_FILE);
         $cmd .= " $DOMAIN" if $DOMAIN;
         $cmd .= " $CORE_SCORE_OPTIONS" if defined($_SCORE_OPTIONS);
         $cmd .= " --FlexibilityScore=$FLEX_SCORER" if $_FLEXIBILITY_SCORE;
@@ -1659,6 +1667,7 @@ sub score_phrase_phrase_extract {
     $cmd .= " --SparseCountBinFeature $SPARSE_COUNT_BIN" if $SPARSE_COUNT_BIN;
     $cmd .= " --GoodTuring $ttable_file.half.f2e.gz.coc" if $GOOD_TURING;
     $cmd .= " --KneserNey $ttable_file.half.f2e.gz.coc" if $KNESER_NEY;
+    $cmd .= " --SourceLabels $_GHKM_SOURCE_LABELS_FILE" if $_GHKM_SOURCE_LABELS && defined($_GHKM_SOURCE_LABELS_FILE);
     
     $cmd .= " | gzip -c > $ttable_file.gz";
     
@@ -2164,6 +2173,7 @@ sub create_ini {
   print INI "WordPenalty\n";
   print INI "PhrasePenalty\n";
   print INI "SoftMatchingFeature name=SM0 path=$_UNKNOWN_WORD_SOFT_MATCHES_FILE\n" if $_TARGET_SYNTAX && defined($_UNKNOWN_WORD_SOFT_MATCHES_FILE);
+  print INI "SoftSourceSyntacticConstraintsFeature sourceLabelSetFile=$_GHKM_SOURCE_LABELS_FILE\n" if $_GHKM_SOURCE_LABELS && defined($_GHKM_SOURCE_LABELS_FILE);
   print INI $feature_spec;
 
   print INI "\n# dense weights for feature functions\n";
@@ -2171,6 +2181,7 @@ sub create_ini {
   print INI "UnknownWordPenalty0= 1\n";
   print INI "WordPenalty0= -1\n";
   print INI "PhrasePenalty0= 0.2\n";
+  print INI "SoftSourceSyntacticConstraintsFeature0= 0.3 -0.3 -0.3\n" if $_GHKM_SOURCE_LABELS && defined($_GHKM_SOURCE_LABELS_FILE);
   print INI $weight_spec;
   close(INI);
 }