From ac51e9f0a86314e430e5c1f95e3b8b5b91a2818e Mon Sep 17 00:00:00 2001 From: Phil Williams Date: Wed, 18 Mar 2015 09:56:46 +0000 Subject: [PATCH 1/4] Always use "SyntaxInputWeight0" as name of SyntaxInputWeight feature --- scripts/training/train-model.perl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/training/train-model.perl b/scripts/training/train-model.perl index 265847c3d..f92e545be 100755 --- a/scripts/training/train-model.perl +++ b/scripts/training/train-model.perl @@ -2153,8 +2153,8 @@ sub create_ini { # SyntaxInputWeight FF if ($_USE_SYNTAX_INPUT_WEIGHT_FEATURE) { - $feature_spec .= "SyntaxInputWeight name=SyntaxInputWeight$i\n"; - $weight_spec .= "SyntaxInputWeight$i= 0.1\n"; + $feature_spec .= "SyntaxInputWeight name=SyntaxInputWeight0\n"; + $weight_spec .= "SyntaxInputWeight0= 0.1\n"; } # generation model From fc15e03ebe42d2e15c8be53eeda5b795fffb6e1e Mon Sep 17 00:00:00 2001 From: Phil Williams Date: Wed, 18 Mar 2015 09:57:42 +0000 Subject: [PATCH 2/4] Replace truecase-egret.sh with more general tree-converter-wrapper.perl --- .../ems/support/tree-converter-wrapper.perl | 21 +++++++++++++++++++ scripts/recaser/truecase-egret.sh | 7 ------- 2 files changed, 21 insertions(+), 7 deletions(-) create mode 100755 scripts/ems/support/tree-converter-wrapper.perl delete mode 100755 scripts/recaser/truecase-egret.sh diff --git a/scripts/ems/support/tree-converter-wrapper.perl b/scripts/ems/support/tree-converter-wrapper.perl new file mode 100755 index 000000000..aae55991a --- /dev/null +++ b/scripts/ems/support/tree-converter-wrapper.perl @@ -0,0 +1,21 @@ +#!/usr/bin/env perl + +use warnings; +use strict; +use utf8; +use Getopt::Long "GetOptions"; + +Getopt::Long::config("pass_through"); + +my ($BIN,$MODEL); + +&GetOptions('bin=s' => \$BIN, + 'model=s' => \$MODEL); # À la truecase.perl + +die("ERROR: specify at least --bin BIN!") unless defined($BIN); + +my $cmd = "$BIN"; +$cmd .= " -case true:model=$MODEL" if defined($MODEL); +$cmd .= " " . join(' ', @ARGV) if scalar(@ARGV); # Pass other args to $BIN. 
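+# Example (illustrative): the deleted truecase-egret.sh below can be replicated with
+#   tree-converter-wrapper.perl --bin BIN --model MODEL -input_format egret -output_format egret -no_egret_weight_normalization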
+ +system $cmd; diff --git a/scripts/recaser/truecase-egret.sh b/scripts/recaser/truecase-egret.sh deleted file mode 100755 index 137d27e7d..000000000 --- a/scripts/recaser/truecase-egret.sh +++ /dev/null @@ -1,7 +0,0 @@ -#!/usr/bin/env sh - -$1 \ - -input_format egret \ - -output_format egret \ - -no_egret_weight_normalization \ - -case true:model=$3 From 1568afb73741cf6a464d6efe0b46e4c4c9d9327d Mon Sep 17 00:00:00 2001 From: Rico Sennrich Date: Wed, 18 Mar 2015 17:36:32 +0000 Subject: [PATCH 3/4] on-the-fly unbinarization of internal tree structure (for translation models extracted from binarized treebanks) --- moses/ChartKBestExtractor.cpp | 3 +- moses/FF/InternalTree.cpp | 38 +++++++++ moses/FF/InternalTree.h | 128 +++++++++++++++--------------- moses/FF/TreeStructureFeature.cpp | 18 +++++ moses/FF/TreeStructureFeature.h | 6 +- 5 files changed, 127 insertions(+), 66 deletions(-) diff --git a/moses/ChartKBestExtractor.cpp b/moses/ChartKBestExtractor.cpp index 60e4e7f2b..bd5d7cbcd 100644 --- a/moses/ChartKBestExtractor.cpp +++ b/moses/ChartKBestExtractor.cpp @@ -168,9 +168,10 @@ TreePointer ChartKBestExtractor::GetOutputTree(const Derivation &d) } mytree->Combine(previous_trees); + mytree->Unbinarize(); return mytree; } else { - UTIL_THROW2("Error: TreeStructureFeature active, but no internal tree structure found"); + UTIL_THROW2("Error: k-best tree output active, but no internal tree structure found"); } } diff --git a/moses/FF/InternalTree.cpp b/moses/FF/InternalTree.cpp index 9e974d0cd..95730f018 100644 --- a/moses/FF/InternalTree.cpp +++ b/moses/FF/InternalTree.cpp @@ -115,6 +115,44 @@ void InternalTree::Combine(const std::vector &previous) } } +//take tree with virtual nodes (created with relax-parse --RightBinarize or --LeftBinarize) and reconstruct original tree. 
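+// e.g. "[NP [DT the] [^NP [JJ old] [NN man]]]" becomes "[NP [DT the] [JJ old] [NN man]]" (illustrative example)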
+void InternalTree::Unbinarize()
+{
+
+  // nodes with virtual label cannot be unbinarized
+  if (m_value.empty() || m_value[0] == '^') {
+    return;
+  }
+
+  //if node has child that is virtual node, get unbinarized list of children
+  for (std::vector<TreePointer>::iterator it = m_children.begin(); it != m_children.end(); ++it) {
+    if (!(*it)->IsTerminal() && (*it)->GetLabel()[0] == '^') {
+      std::vector<TreePointer> new_children;
+      GetUnbinarizedChildren(new_children);
+      m_children = new_children;
+      break;
+    }
+  }
+
+  //recursion
+  for (std::vector<TreePointer>::iterator it = m_children.begin(); it != m_children.end(); ++it) {
+    (*it)->Unbinarize();
+  }
+}
+
+//get the children of a node in a binarized tree; if a child is virtual, (transitively) replace it with its children
+void InternalTree::GetUnbinarizedChildren(std::vector<TreePointer> &ret) const
+{
+  for (std::vector<TreePointer>::const_iterator itx = m_children.begin(); itx != m_children.end(); ++itx) {
+    const std::string &label = (*itx)->GetLabel();
+    if (!label.empty() && label[0] == '^') {
+      (*itx)->GetUnbinarizedChildren(ret);
+    } else {
+      ret.push_back(*itx);
+    }
+  }
+}
 
 bool InternalTree::FlatSearch(const std::string & label, std::vector<TreePointer>::const_iterator & it) const
 {
diff --git a/moses/FF/InternalTree.h b/moses/FF/InternalTree.h
index 722c5832f..f9a8ba5d8 100644
--- a/moses/FF/InternalTree.h
+++ b/moses/FF/InternalTree.h
@@ -38,6 +38,8 @@ public:
   std::string GetString(bool start = true) const;
   void Combine(const std::vector<TreePointer> &previous);
+  void Unbinarize();
+  void GetUnbinarizedChildren(std::vector<TreePointer> &children) const;
   const std::string & GetLabel() const {
     return m_value;
   }
@@ -93,6 +95,68 @@ public:
   // if found, 'it' is iterator to first tree node that matches search string, and 'parent' to its parent node
   bool RecursiveSearch(const std::vector<std::string> & labels, std::vector<TreePointer>::const_iterator & it, InternalTree const* &parent) const;
+
+  // Python-like generator that yields next nonterminal leaf on every call
+  $generator(leafNT)
+  {
+    std::vector<TreePointer>::iterator it;
+    InternalTree* tree;
+    leafNT(InternalTree* root = 0): tree(root) {}
+    $emit(std::vector<TreePointer>::iterator)
+    for (it = tree->GetChildren().begin(); it != tree->GetChildren().end(); ++it) {
+      if (!(*it)->IsTerminal() && (*it)->GetLength() == 0) {
+        $yield(it);
+      } else if ((*it)->GetLength() > 0) {
+        if ((*it).get()) { // normal pointer to same object that TreePointer points to
+          $restart(tree = (*it).get());
+        }
+      }
+    }
+    $stop;
+  };
+
+  // Python-like generator that yields the parent of the next nonterminal leaf on every call
+  $generator(leafNTParent)
+  {
+    std::vector<TreePointer>::iterator it;
+    InternalTree* tree;
+    leafNTParent(InternalTree* root = 0): tree(root) {}
+    $emit(InternalTree*)
+    for (it = tree->GetChildren().begin(); it != tree->GetChildren().end(); ++it) {
+      if (!(*it)->IsTerminal() && (*it)->GetLength() == 0) {
+        $yield(tree);
+      } else if ((*it)->GetLength() > 0) {
+        if ((*it).get()) {
+          $restart(tree = (*it).get());
+        }
+      }
+    }
+    $stop;
+  };
+
+  // Python-like generator that yields the next nonterminal leaf on every call, and also stores the path from the root of the tree to the nonterminal
+  $generator(leafNTPath)
+  {
+    std::vector<TreePointer>::iterator it;
+    InternalTree* tree;
+    std::vector<InternalTree*> * path;
+    leafNTPath(InternalTree* root = NULL, std::vector<InternalTree*> * orig = NULL): tree(root), path(orig) {}
+    $emit(std::vector<TreePointer>::iterator)
+    path->push_back(tree);
+    for (it = tree->GetChildren().begin(); it != tree->GetChildren().end(); ++it) {
+      if (!(*it)->IsTerminal() && (*it)->GetLength() == 0) {
+        path->push_back((*it).get());
+        $yield(it);
+        path->pop_back();
+      } else if ((*it)->GetLength() > 0) {
+        if ((*it).get()) {
+          $restart(tree = (*it).get());
+        }
+      }
+    }
+    path->pop_back();
+    $stop;
+  };
 };
@@ -113,68 +177,4 @@ public:
   };
 };
 
-// Python-like generator that yields next nonterminal leaf on every call
-$generator(leafNT)
-{
-  std::vector<TreePointer>::iterator it;
-  InternalTree* tree;
-  leafNT(InternalTree* root = 0): tree(root) {}
-  $emit(std::vector<TreePointer>::iterator)
-  for (it = tree->GetChildren().begin(); it != tree->GetChildren().end(); ++it) {
-    if (!(*it)->IsTerminal() && (*it)->GetLength() == 0) {
-      $yield(it);
-    } else if ((*it)->GetLength() > 0) {
-      if ((*it).get()) { // normal pointer to same object that TreePointer points to
-        $restart(tree = (*it).get());
-      }
-    }
-  }
-  $stop;
-};
-
-// Python-like generator that yields the parent of the next nonterminal leaf on every call
-$generator(leafNTParent)
-{
-  std::vector<TreePointer>::iterator it;
-  InternalTree* tree;
-  leafNTParent(InternalTree* root = 0): tree(root) {}
-  $emit(InternalTree*)
-  for (it = tree->GetChildren().begin(); it != tree->GetChildren().end(); ++it) {
-    if (!(*it)->IsTerminal() && (*it)->GetLength() == 0) {
-      $yield(tree);
-    } else if ((*it)->GetLength() > 0) {
-      if ((*it).get()) {
-        $restart(tree = (*it).get());
-      }
-    }
-  }
-  $stop;
-};
-
-// Python-like generator that yields the next nonterminal leaf on every call, and also stores the path from the root of the tree to the nonterminal
-$generator(leafNTPath)
-{
-  std::vector<TreePointer>::iterator it;
-  InternalTree* tree;
-  std::vector<InternalTree*> * path;
-  leafNTPath(InternalTree* root = NULL, std::vector<InternalTree*> * orig = NULL): tree(root), path(orig) {}
-  $emit(std::vector<TreePointer>::iterator)
-  path->push_back(tree);
-  for (it = tree->GetChildren().begin(); it != tree->GetChildren().end(); ++it) {
-    if (!(*it)->IsTerminal() && (*it)->GetLength() == 0) {
-      path->push_back((*it).get());
-      $yield(it);
-      path->pop_back();
-    } else if ((*it)->GetLength() > 0) {
-      if ((*it).get()) {
-        $restart(tree = (*it).get());
-      }
-    }
-  }
-  path->pop_back();
-  $stop;
-};
-
 }
\ No newline at end of file
diff --git a/moses/FF/TreeStructureFeature.cpp b/moses/FF/TreeStructureFeature.cpp
index e558b06bc..f2988f2b9 100644
--- a/moses/FF/TreeStructureFeature.cpp
+++ b/moses/FF/TreeStructureFeature.cpp
@@ -70,6 +70,11 @@ FFState* TreeStructureFeature::EvaluateWhenApplied(const ChartHypothesis& cur_hy
     }
     mytree->Combine(previous_trees);
+    bool full_sentence = (mytree->GetChildren().back()->GetLabel() == "</s>" || (mytree->GetChildren().back()->GetLabel() == "SEND" && mytree->GetChildren().back()->GetChildren().back()->GetLabel() == "</s>"));
+    if (m_binarized && full_sentence) {
+      mytree->Unbinarize();
+    }
+
     return new TreeState(mytree);
   } else {
     UTIL_THROW2("Error: TreeStructureFeature active, but no internal tree structure found");
@@ -77,4 +82,17 @@ FFState* TreeStructureFeature::EvaluateWhenApplied(const ChartHypothesis& cur_hy
 }
 
+void TreeStructureFeature::SetParameter(const std::string& key, const std::string& value)
+{
+  std::cerr << "setting: " << this->GetScoreProducerDescription() << " - " << key << "\n";
+  if (key == "tuneable") {
+    m_tuneable = Scan<bool>(value);
+  } else if (key == "filterable") { //ignore
+  } else if (key == "binarized") { // if trees have been binarized before learning translation model; output unbinarized trees
+    m_binarized = true;
+  } else {
+    UTIL_THROW(util::Exception, "Unknown argument " << key << "=" << value);
+  }
+}
+
 }
diff --git a/moses/FF/TreeStructureFeature.h b/moses/FF/TreeStructureFeature.h
index d5ec4edda..ecb2ce7cb 100644
--- a/moses/FF/TreeStructureFeature.h
+++ 
b/moses/FF/TreeStructureFeature.h @@ -34,9 +34,11 @@ class TreeStructureFeature : public StatefulFeatureFunction { SyntaxConstraints* m_constraints; LabelSet* m_labelset; + bool m_binarized; public: TreeStructureFeature(const std::string &line) - :StatefulFeatureFunction(0, line) { + :StatefulFeatureFunction(0, line) + , m_binarized(false) { ReadParameters(); } ~TreeStructureFeature() { @@ -53,6 +55,8 @@ public: return true; } + void SetParameter(const std::string& key, const std::string& value); + void EvaluateInIsolation(const Phrase &source , const TargetPhrase &targetPhrase , ScoreComponentCollection &scoreBreakdown From eab513b63527da05eee21a61a0c85c1a218c9e3e Mon Sep 17 00:00:00 2001 From: Rico Sennrich Date: Wed, 18 Mar 2015 17:39:45 +0000 Subject: [PATCH 4/4] relational dependency language model --- moses/FF/Factory.cpp | 2 + moses/LM/Jamfile | 7 +- moses/LM/RDLM.cpp | 832 ++++++++++++++++++ moses/LM/RDLM.h | 245 ++++++ scripts/training/rdlm/README | 49 ++ .../training/rdlm/average_null_embedding.py | 45 + .../training/rdlm/extract_syntactic_ngrams.py | 262 ++++++ scripts/training/rdlm/extract_vocab.py | 169 ++++ scripts/training/rdlm/train_model_head.sh | 65 ++ scripts/training/rdlm/train_model_label.sh | 72 ++ 10 files changed, 1747 insertions(+), 1 deletion(-) create mode 100644 moses/LM/RDLM.cpp create mode 100644 moses/LM/RDLM.h create mode 100644 scripts/training/rdlm/README create mode 100755 scripts/training/rdlm/average_null_embedding.py create mode 100755 scripts/training/rdlm/extract_syntactic_ngrams.py create mode 100755 scripts/training/rdlm/extract_vocab.py create mode 100755 scripts/training/rdlm/train_model_head.sh create mode 100755 scripts/training/rdlm/train_model_label.sh diff --git a/moses/FF/Factory.cpp b/moses/FF/Factory.cpp index 37a818c08..218e458ff 100644 --- a/moses/FF/Factory.cpp +++ b/moses/FF/Factory.cpp @@ -114,6 +114,7 @@ #ifdef LM_NEURAL #include "moses/LM/NeuralLMWrapper.h" +#include "moses/LM/RDLM.h" #include "moses/LM/bilingual-lm/BiLM_NPLM.h" #endif @@ -296,6 +297,7 @@ FeatureRegistry::FeatureRegistry() #endif #ifdef LM_NEURAL MOSES_FNAME2("NeuralLM", NeuralLMWrapper); + MOSES_FNAME(RDLM); MOSES_FNAME2("BilingualNPLM", BilingualLM_NPLM); #endif #ifdef LM_DALM diff --git a/moses/LM/Jamfile b/moses/LM/Jamfile index 44bbecdd1..6dac9179f 100644 --- a/moses/LM/Jamfile +++ b/moses/LM/Jamfile @@ -90,8 +90,13 @@ if $(with-nplm) { lib nplm : : $(with-nplm)/lib $(with-nplm)/lib64 ; obj NeuralLMWrapper.o : NeuralLMWrapper.cpp nplm ..//headers : $(with-nplm)/src $(with-nplm)/3rdparty/eigen ; obj BiLM_NPLM.o : bilingual-lm/BiLM_NPLM.cpp nplm ..//headers : $(with-nplm)/src $(with-nplm)/3rdparty/eigen ; - alias neural : NeuralLMWrapper.o BiLM_NPLM.o nplm : : : -fopenmp -fopenmp LM_NEURAL ; + obj RDLM.o : RDLM.cpp nplm ..//headers : $(with-nplm)/src $(with-nplm)/3rdparty/eigen ; + alias neural : NeuralLMWrapper.o nplm : : : -fopenmp -fopenmp LM_NEURAL ; + alias bilinguallm : BiLM_NPLM.o nplm : : : -fopenmp -fopenmp LM_NEURAL ; + alias rdlm : RDLM.o nplm : : : -fopenmp -fopenmp LM_NEURAL ; dependencies += neural ; + dependencies += bilinguallm ; + dependencies += rdlm ; lmmacros += LM_NEURAL ; } diff --git a/moses/LM/RDLM.cpp b/moses/LM/RDLM.cpp new file mode 100644 index 000000000..f531ade28 --- /dev/null +++ b/moses/LM/RDLM.cpp @@ -0,0 +1,832 @@ +#include "RDLM.h" +#include +#include "moses/StaticData.h" +#include "moses/ScoreComponentCollection.h" +#include "moses/ChartHypothesis.h" +#include "moses/InputFileStream.h" +#include "moses/Util.h" +#include 
"util/exception.hh" +#include "neuralTM.h" + +namespace Moses +{ + +typedef Eigen::Map > EigenMap; + +RDLM::~RDLM() { + delete lm_head_base_instance_; + delete lm_label_base_instance_; +} + +void RDLM::Load() { + + lm_head_base_instance_ = new nplm::neuralTM(); + lm_head_base_instance_->read(m_path_head_lm); + + m_sharedVocab = lm_head_base_instance_->get_input_vocabulary().words() == lm_head_base_instance_->get_output_vocabulary().words(); +// std::cerr << "Does head RDLM share vocabulary for input/output? " << m_sharedVocab << std::endl; + + lm_label_base_instance_ = new nplm::neuralTM(); + lm_label_base_instance_->read(m_path_label_lm); + + if (m_premultiply) { + lm_head_base_instance_->premultiply(); + lm_label_base_instance_->premultiply(); + } + + lm_head_base_instance_->set_cache(m_cacheSize); + lm_label_base_instance_->set_cache(m_cacheSize); + + StaticData &staticData = StaticData::InstanceNonConst(); + if (staticData.GetTreeStructure() == NULL) { + staticData.SetTreeStructure(this); + } + + offset_up_head = 2*m_context_left + 2*m_context_right; + offset_up_label = 2*m_context_left + 2*m_context_right + m_context_up; + + size_head = 2*m_context_left + 2*m_context_right + 2*m_context_up + 2; + size_label = 2*m_context_left + 2*m_context_right + 2*m_context_up + 1; + + UTIL_THROW_IF2(size_head != lm_head_base_instance_->get_order(), + "Error: order of head LM (" << lm_head_base_instance_->get_order() << ") does not match context size specified (left_context=" << m_context_left << " , right_context=" << m_context_right << " , up_context=" << m_context_up << " for a total order of " << size_head); + UTIL_THROW_IF2(size_label != lm_label_base_instance_->get_order(), + "Error: order of label LM (" << lm_label_base_instance_->get_order() << ") does not match context size specified (left_context=" << m_context_left << " , right_context=" << m_context_right << " , up_context=" << m_context_up << " for a total order of " << size_label); + + //get int value of commonly used tokens + static_head_null.resize(size_head); + for (unsigned int i = 0; i < size_head; i++) { + char numstr[20]; + sprintf(numstr, "", i); + static_head_null[i] = lm_head_base_instance_->lookup_input_word(numstr); + } + + static_label_null.resize(size_label); + for (unsigned int i = 0; i < size_label; i++) { + char numstr[20]; + sprintf(numstr, "", i); + static_label_null[i] = lm_label_base_instance_->lookup_input_word(numstr); + } + + static_dummy_head = lm_head_base_instance_->lookup_input_word(dummy_head); + + static_start_head = lm_head_base_instance_->lookup_input_word(""); + static_start_label = lm_head_base_instance_->lookup_input_word(""); + + static_head_head = lm_head_base_instance_->lookup_input_word(""); + static_head_label = lm_head_base_instance_->lookup_input_word(""); + static_head_label_output = lm_label_base_instance_->lookup_output_word(""); + + static_stop_head = lm_head_base_instance_->lookup_input_word(""); + static_stop_label = lm_head_base_instance_->lookup_input_word(""); + static_stop_label_output = lm_label_base_instance_->lookup_output_word(""); + static_start_label_output = lm_label_base_instance_->lookup_output_word(""); + + static_root_head = lm_head_base_instance_->lookup_input_word(""); + static_root_label = lm_head_base_instance_->lookup_input_word(""); + + // just score provided file, then exit. 
+ if (!m_debugPath.empty()) { + ScoreFile(m_debugPath); + exit(1); + } + +// { +// TreePointer mytree (new InternalTree("[vroot [subj [PPER ich]] [VAFIN bin] [pred [det [ART die]] [attr [adv [adv [PTKNEG nicht]] [ADV fast]] [ADJA neue]] [attr [ADJA europäische]] [NN Zeit]]]")); +// TreePointer mytree3 (new InternalTree("[ADJA europäische]")); +// TreePointer mytree4 (new InternalTree("[pred [det [ART die]] [attr [adv [adv [PTKNEG nicht]] [ADV fast]] [ADJA neue]] [attr [ADJA]] [NN Zeit]]]")); +// TreePointer mytree2 (new InternalTree("[vroot [subj [PPER ich]] [VAFIN bin] [pred]]")); +// +// std::vector ancestor_heads; +// std::vector ancestor_labels; +// +// size_t boundary_hash(0); +// boost::array score; +// score.fill(0); +// std::cerr << "scoring: " << mytree3->GetString() << std::endl; +// std::vector previous_trees; +// TreePointerMap back_pointers = AssociateLeafNTs(mytree3.get(), previous_trees); +// Score(mytree3.get(), back_pointers, score, ancestor_heads, ancestor_labels, boundary_hash); +// std::cerr << "head LM: " << score[0] << "label LM: " << score[2] << " approx: " << score[1] << " - " << score[3] << std::endl; +// +// previous_trees.push_back(mytree3); +// back_pointers = AssociateLeafNTs(mytree4.get(), previous_trees); +// std::cerr << "scoring: " << mytree4->GetString() << std::endl; +// Score(mytree4.get(), back_pointers, score, ancestor_heads, ancestor_labels, boundary_hash); +// std::cerr << "head LM: " << score[0] << "label LM: " << score[2] << " approx: " << score[1] << " - " << score[3] << std::endl; +// +// mytree4->Combine(previous_trees); +// previous_trees.clear(); +// previous_trees.push_back(mytree4); +// back_pointers = AssociateLeafNTs(mytree2.get(), previous_trees); +// std::cerr << "scoring: " << mytree2->GetString() << std::endl; +// +// score[1] = 0; +// score[3] = 0; +// Score(mytree2.get(), back_pointers, score, ancestor_heads, ancestor_labels, boundary_hash); +// std::cerr << "head LM: " << score[0] << "label LM: " << score[2] << " approx: " << score[1] << " - " << score[3] << std::endl; +// +// score[0] = 0; +// score[1] = 0; +// score[2] = 0; +// score[3] = 0; +// std::cerr << "scoring: " << mytree->GetString() << std::endl; +// +// Score(mytree.get(), back_pointers, score, ancestor_heads, ancestor_labels, boundary_hash); +// std::cerr << "head LM: " << score[0] << "label LM: " << score[2] << " approx: " << score[1] << " - " << score[3] << std::endl; +// +// } +// UTIL_THROW2("Finished"); +// +// } +// +// { +// std::cerr << "BINARIZED\n\n"; +// TreePointer mytree (new InternalTree("[vroot [subj [PPER ich]] [^vroot [VAFIN bin] [pred [det [ART die]] [^pred [attr [adv [adv [PTKNEG nicht]] [ADV fast]] [ADJA neue]] [^pred [attr [ADJA europäische]] [NN Zeit]]]]]]")); +// TreePointer mytree3 (new InternalTree("[ADJA europäische]")); +// TreePointer mytree4 (new InternalTree("[^pred [attr [adv [adv [PTKNEG nicht]] [ADV fast]] [ADJA neue]] [^pred [attr [ADJA]] [NN Zeit]]]")); +// TreePointer mytree2 (new InternalTree("[vroot [subj [PPER ich]] [^vroot [VAFIN bin] [pred [det [ART die]] [^pred]]]]")); +// +// std::vector ancestor_heads; +// std::vector ancestor_labels; +// +// size_t boundary_hash(0); +// boost::array score; +// score.fill(0); +// std::cerr << "scoring: " << mytree3->GetString() << std::endl; +// std::vector previous_trees; +// TreePointerMap back_pointers = AssociateLeafNTs(mytree3.get(), previous_trees); +// Score(mytree3.get(), back_pointers, score, ancestor_heads, ancestor_labels, boundary_hash); +// std::cerr << "head LM: " << score[0] << 
" label LM: " << score[2] << " approx: " << score[1] << " - " << score[3] << std::endl; +// +// previous_trees.push_back(mytree3); +// back_pointers = AssociateLeafNTs(mytree4.get(), previous_trees); +// std::cerr << "scoring: " << mytree4->GetString() << std::endl; +// Score(mytree4.get(), back_pointers, score, ancestor_heads, ancestor_labels, boundary_hash); +// std::cerr << "head LM: " << score[0] << " label LM: " << score[2] << " approx: " << score[1] << " - " << score[3] << std::endl; +// +// mytree4->Combine(previous_trees); +// previous_trees.clear(); +// previous_trees.push_back(mytree4); +// back_pointers = AssociateLeafNTs(mytree2.get(), previous_trees); +// std::cerr << "scoring: " << mytree2->GetString() << std::endl; +// +// score[1] = 0; +// score[3] = 0; +// Score(mytree2.get(), back_pointers, score, ancestor_heads, ancestor_labels, boundary_hash); +// std::cerr << "head LM: " << score[0] << " label LM: " << score[2] << " approx: " << score[1] << " - " << score[3] << std::endl; +// +// score[0] = 0; +// score[1] = 0; +// score[2] = 0; +// score[3] = 0; +// std::cerr << "scoring: " << mytree->GetString() << std::endl; +// +// Score(mytree.get(), back_pointers, score, ancestor_heads, ancestor_labels, boundary_hash); +// std::cerr << "head LM: " << score[0] << " label LM: " << score[2] << " approx: " << score[1] << " - " << score[3] << std::endl; +// +// } +// UTIL_THROW2("Finished"); + +} + + +void RDLM::Score(InternalTree* root, const TreePointerMap & back_pointers, boost::array &score, std::vector &ancestor_heads, std::vector &ancestor_labels, size_t &boundary_hash, int num_virtual, int rescoring_levels) const +{ + + // ignore terminal nodes + if (root->IsTerminal()) { + return; + } + + // ignore glue rules + if (root->GetLabel() == m_glueSymbol) { + // recursion + for (std::vector::const_iterator it = root->GetChildren().begin(); it != root->GetChildren().end(); ++it) + { + Score(it->get(), back_pointers, score, ancestor_heads, ancestor_labels, boundary_hash, num_virtual, rescoring_levels); + } + return; + } + + // ignore virtual nodes (in binarization; except if it's the root) + if (m_binarized && root->GetLabel()[0] == '^' && !ancestor_heads.empty()) { + // recursion + if (root->IsLeafNT() && m_context_up > 1 && ancestor_heads.size()) { + root = back_pointers.find(root)->second.get(); + rescoring_levels = m_context_up-1; + } + for (std::vector::const_iterator it = root->GetChildren().begin(); it != root->GetChildren().end(); ++it) { + Score(it->get(), back_pointers, score, ancestor_heads, ancestor_labels, boundary_hash, num_virtual, rescoring_levels); + } + return; + } + + // ignore start/end of sentence tags + if (root->GetLabel() == m_startSymbol || root->GetLabel() == m_endSymbol) { + return; + } + + nplm::neuralTM *lm_head = lm_head_backend_.get(); + if (!lm_head) { + lm_head = new nplm::neuralTM(*lm_head_base_instance_); + lm_head->set_normalization(m_normalizeHeadLM); + lm_head->set_cache(m_cacheSize); + lm_head_backend_.reset(lm_head); + } + + // ignore preterminal node (except if we're scoring root nodes) + if (root->GetLength() == 1 && root->GetChildren()[0]->IsTerminal()) { + // root of tree: score without context + if (ancestor_heads.empty() || (ancestor_heads.size() == m_context_up && ancestor_heads.back() == static_root_head)) { + std::vector ngram_head_null (static_head_null); + ngram_head_null.back() = lm_head->lookup_output_word(root->GetChildren()[0]->GetLabel()); + if (m_isPretermBackoff && ngram_head_null.back() == 0) { + ngram_head_null.back() = 
lm_head->lookup_output_word(root->GetLabel()); + } + if (ancestor_heads.size() == m_context_up && ancestor_heads.back() == static_root_head) { + std::vector::iterator it = ngram_head_null.begin(); + std::fill_n(it, m_context_left, static_start_head); + it += m_context_left; + std::fill_n(it, m_context_left, static_start_label); + it += m_context_left; + std::fill_n(it, m_context_right, static_stop_head); + it += m_context_right; + std::fill_n(it, m_context_right, static_stop_label); + it += m_context_right; + size_t context_up_nonempty = std::min(m_context_up, ancestor_heads.size()); + it = std::copy(ancestor_heads.end()-context_up_nonempty, ancestor_heads.end(), it); + it = std::copy(ancestor_labels.end()-context_up_nonempty, ancestor_labels.end(), it); + } + if (ancestor_labels.size() >= m_context_up && !num_virtual) { + score[0] += FloorScore(lm_head->lookup_ngram(EigenMap(ngram_head_null.data(), ngram_head_null.size()))); + } + else { + boost::hash_combine(boundary_hash, ngram_head_null.back()); + score[1] += FloorScore(lm_head->lookup_ngram(EigenMap(ngram_head_null.data(), ngram_head_null.size()))); + } + } + return; + // we only need to re-visit previous hypotheses if we have more context available. + } else if (root->IsLeafNT()) { + if (m_context_up > 1 && ancestor_heads.size()) { + root = back_pointers.find(root)->second.get(); + // ignore preterminal node + if (root->GetLength() == 1 && root->GetChildren()[0]->IsTerminal()) { + return; + } + rescoring_levels = m_context_up-1; + } + else { + return; + } + } + + nplm::neuralTM *lm_label = lm_label_backend_.get(); + if (!lm_label) { + lm_label = new nplm::neuralTM(*lm_label_base_instance_); + lm_label->set_normalization(m_normalizeLabelLM); + lm_label->set_cache(m_cacheSize); + lm_label_backend_.reset(lm_label); + } + + std::pair head_ids; + InternalTree* found = GetHead(root, back_pointers, head_ids); + if (found == NULL) { + head_ids = std::make_pair(static_dummy_head, static_dummy_head); + } + + size_t context_up_nonempty = std::min(m_context_up, ancestor_heads.size()); + const std::string & head_label = root->GetLabel(); + bool virtual_head = false; + int reached_end = 0; + int label_idx, label_idx_out; + if (m_binarized && head_label[0] == '^') { + virtual_head = true; + if (m_binarized == 1 || (m_binarized == 3 && head_label[2] == 'l')) { + reached_end = 1; //indicate that we've seen the first symbol of the RHS + } + else if (m_binarized == 2 || (m_binarized == 3 && head_label[2] == 'r')) { + reached_end = 2; // indicate that we've seen the last symbol of the RHS + } + // with 'full' binarization, direction is encoded in 2nd char + std::string clipped_label = (m_binarized == 3) ? head_label.substr(2,head_label.size()-2) : head_label.substr(1,head_label.size()-1); + label_idx = lm_label->lookup_input_word(clipped_label); + label_idx_out = lm_label->lookup_output_word(clipped_label); + } + else { + reached_end = 3; // indicate that we've seen first and last symbol of the RHS + label_idx = lm_label->lookup_input_word(head_label); + label_idx_out = lm_label->lookup_output_word(head_label); + } + + int head_idx = (virtual_head && head_ids.first == static_dummy_head) ? 
static_label_null[offset_up_head+m_context_up-1] : head_ids.first; + + // root of tree: score without context + if (ancestor_heads.empty() || (ancestor_heads.size() == m_context_up && ancestor_heads.back() == static_root_head)) { + if (head_idx != static_dummy_head && head_idx != static_head_head) { + std::vector ngram_head_null (static_head_null); + *(ngram_head_null.end()-2) = label_idx; + ngram_head_null.back() = head_ids.second; + if (ancestor_heads.size() == m_context_up && ancestor_heads.back() == static_root_head && !num_virtual) { + std::vector::iterator it = ngram_head_null.begin(); + std::fill_n(it, m_context_left, static_start_head); + it += m_context_left; + std::fill_n(it, m_context_left, static_start_label); + it += m_context_left; + std::fill_n(it, m_context_right, static_stop_head); + it += m_context_right; + std::fill_n(it, m_context_right, static_stop_label); + it += m_context_right; + it = std::copy(ancestor_heads.end()-context_up_nonempty, ancestor_heads.end(), it); + it = std::copy(ancestor_labels.end()-context_up_nonempty, ancestor_labels.end(), it); + score[0] += FloorScore(lm_head->lookup_ngram(EigenMap(ngram_head_null.data(), ngram_head_null.size()))); + } + else { + boost::hash_combine(boundary_hash, ngram_head_null.back()); + score[1] += FloorScore(lm_head->lookup_ngram(EigenMap(ngram_head_null.data(), ngram_head_null.size()))); + } + } + std::vector ngram_label_null (static_label_null); + ngram_label_null.back() = label_idx_out; + if (ancestor_heads.size() == m_context_up && ancestor_heads.back() == static_root_head && !num_virtual) { + std::vector::iterator it = ngram_label_null.begin(); + std::fill_n(it, m_context_left, static_start_head); + it += m_context_left; + std::fill_n(it, m_context_left, static_start_label); + it += m_context_left; + std::fill_n(it, m_context_right, static_stop_head); + it += m_context_right; + std::fill_n(it, m_context_right, static_stop_label); + it += m_context_right; + it = std::copy(ancestor_heads.end()-context_up_nonempty, ancestor_heads.end(), it); + it = std::copy(ancestor_labels.end()-context_up_nonempty, ancestor_labels.end(), it); + score[2] += FloorScore(lm_label->lookup_ngram(EigenMap(ngram_label_null.data(), ngram_label_null.size()))); + } + else { + boost::hash_combine(boundary_hash, ngram_label_null.back()); + score[3] += FloorScore(lm_label->lookup_ngram(EigenMap(ngram_label_null.data(), ngram_label_null.size()))); + } + } + + ancestor_heads.push_back(head_idx); + ancestor_labels.push_back(label_idx); + + if (virtual_head) { + num_virtual = m_context_up; + } + else if (num_virtual) { + --num_virtual; + } + + + // fill ancestor context (same for all children) + if (context_up_nonempty < m_context_up) { + ++context_up_nonempty; + } + size_t up_padding = m_context_up - context_up_nonempty; + + std::vector ngram (static_label_null); + + std::vector::iterator it = ngram.begin() + offset_up_head; + if (up_padding > 0) { + it += up_padding; + } + + it = std::copy(ancestor_heads.end() - context_up_nonempty, ancestor_heads.end(), it); + + if (up_padding > 0) { + it += up_padding; + } + + it = std::copy(ancestor_labels.end() - context_up_nonempty, ancestor_labels.end(), it); + + // create vectors of head/label IDs of all children + int num_children = root->GetLength(); + + // get number of children after unbinarization + if (m_binarized) { + num_children = 0; + UnbinarizedChildren real_children(root, back_pointers, m_binarized); + for (std::vector::const_iterator it = real_children.begin(); it != real_children.end(); it = 
++real_children) { + num_children++; + } + } + + if (m_context_right && (reached_end == 1 || reached_end == 3)) num_children++; //also predict start label + if (m_context_left && (reached_end == 2 || reached_end == 3)) num_children++; //also predict end label + + std::vector heads(num_children); + std::vector labels(num_children); + std::vector heads_output(num_children); + std::vector labels_output(num_children); + + GetChildHeadsAndLabels(root, back_pointers, reached_end, lm_head, lm_label, heads, labels, heads_output, labels_output); + + //left padding; only need to add this initially + if (reached_end == 1 || reached_end == 3) { + std::fill_n(ngram.begin(), m_context_left, static_start_head); + std::fill_n(ngram.begin() + m_context_left, m_context_left, static_start_label); + } + size_t left_padding = m_context_left; + size_t left_offset = 0; + size_t right_offset = std::min(heads.size(), m_context_right + 1); + size_t right_padding = m_context_right + 1 - right_offset; + + // construct context of label model and predict label + for (size_t i = 0; i != heads.size(); i++) { + + std::vector::iterator it = ngram.begin(); + + if (left_padding > 0) { + it += left_padding; + } + + it = std::copy(heads.begin()+left_offset, heads.begin()+i, it); + + if (left_padding > 0) { + it += left_padding; + } + + it = std::copy(labels.begin()+left_offset, labels.begin()+i, it); + + it = std::copy(heads.begin()+i+1, heads.begin()+right_offset, it); + + if (right_padding > 0) { + if (reached_end == 2 || reached_end == 3) { + std::fill_n(it, right_padding, static_stop_head); + it += right_padding; + } + else { + std::copy(static_label_null.begin()+offset_up_head-m_context_right-right_padding, static_label_null.begin()-m_context_right+offset_up_head, it); + } + } + + it = std::copy(labels.begin()+i+1, labels.begin()+right_offset, it); + + if (right_padding > 0) { + if (reached_end == 2 || reached_end == 3) { + std::fill_n(it, right_padding, static_stop_label); + it += right_padding; + } + else { + std::copy(static_label_null.begin()+offset_up_head-right_padding, static_label_null.begin()+offset_up_head, it); + } + } + + ngram.back() = labels_output[i]; + + if (ancestor_labels.size() >= m_context_up && !num_virtual) { + score[2] += FloorScore(lm_label->lookup_ngram(EigenMap(ngram.data(), ngram.size()))); + } + else { + boost::hash_combine(boundary_hash, ngram.back()); + score[3] += FloorScore(lm_label->lookup_ngram(EigenMap(ngram.data(), ngram.size()))); + } + + // construct context of head model and predict head + if (heads[i] != static_start_head && heads[i] != static_stop_head && heads[i] != static_dummy_head && heads[i] != static_head_head) { + + ngram.back() = labels[i]; + ngram.push_back(heads_output[i]); + + if (ancestor_labels.size() >= m_context_up && !num_virtual) { + score[0] += FloorScore(lm_head->lookup_ngram(EigenMap(ngram.data(), ngram.size()))); + } + else { + boost::hash_combine(boundary_hash, ngram.back()); + score[1] += FloorScore(lm_head->lookup_ngram(EigenMap(ngram.data(), ngram.size()))); + } + ngram.pop_back(); + } + + // next time, we need to add less start symbol padding + if (left_padding) + left_padding--; + else + left_offset++; + + if (right_offset < heads.size()) + right_offset++; + else + right_padding++; + } + + + if (rescoring_levels == 1) { + ancestor_heads.pop_back(); + ancestor_labels.pop_back(); + return; + } + // recursion + for (std::vector::const_iterator it = root->GetChildren().begin(); it != root->GetChildren().end(); ++it) + { + Score(it->get(), back_pointers, 
score, ancestor_heads, ancestor_labels, boundary_hash, num_virtual, rescoring_levels - 1); + } + ancestor_heads.pop_back(); + ancestor_labels.pop_back(); +} + +InternalTree* RDLM::GetHead(InternalTree* root, const TreePointerMap & back_pointers, std::pair & IDs, InternalTree* head_ptr) const +{ + InternalTree *tree; + + for (std::vector::const_iterator it = root->GetChildren().begin(); it != root->GetChildren().end(); ++it) + { + if ((*it)->IsLeafNT()) { + tree = back_pointers.find(it->get())->second.get(); + } + else { + tree = it->get(); + } + + if (m_binarized && tree->GetLabel()[0] == '^') { + head_ptr = GetHead(tree, back_pointers, IDs, head_ptr); + if (head_ptr != NULL && !m_isPTKVZ) { + return head_ptr; + } + } + + // assumption (only true for dependency parse): each constituent has a preterminal label, and corresponding terminal is head + // if constituent has multiple preterminals, first one is picked; if it has no preterminals, dummy_head is returned + else if (tree->GetLength() == 1 && tree->GetChildren()[0]->IsTerminal() && head_ptr == NULL) { + head_ptr = tree; + if (!m_isPTKVZ) { + GetIDs(head_ptr->GetChildren()[0]->GetLabel(), head_ptr->GetLabel(), IDs); + return head_ptr; + } + } + + // add PTKVZ to lemma of verb + else if (m_isPTKVZ && head_ptr && tree->GetLabel() == "avz") { + InternalTree *tree2; + for (std::vector::const_iterator it2 = tree->GetChildren().begin(); it2 != tree->GetChildren().end(); ++it2) { + if ((*it2)->IsLeafNT()) { + tree2 = back_pointers.find(it2->get())->second.get(); + } + else { + tree2 = it2->get(); + } + if (tree2->GetLabel() == "PTKVZ" && tree2->GetLength() == 1 && tree2->GetChildren()[0]->IsTerminal()) { + std::string verb = tree2->GetChildren()[0]->GetLabel() + head_ptr->GetChildren()[0]->GetLabel(); + GetIDs(verb, head_ptr->GetLabel(), IDs); + return head_ptr; + } + } + } + } + + if (head_ptr != NULL) { + GetIDs(head_ptr->GetChildren()[0]->GetLabel(), head_ptr->GetLabel(), IDs); + } + return head_ptr; +} + + +void RDLM::GetChildHeadsAndLabels(InternalTree *root, const TreePointerMap & back_pointers, int reached_end, const nplm::neuralTM *lm_head, const nplm::neuralTM *lm_label, std::vector & heads, std::vector & labels, std::vector & heads_output, std::vector & labels_output) const +{ + std::pair child_ids; + InternalTree* found; + size_t j = 0; + + // score start label (if enabled) for all nonterminal nodes (but not for terminal or preterminal nodes) + if (m_context_right && (reached_end == 1 || reached_end == 3)) { + heads[j] = static_start_head; + labels[j] = static_start_label; + labels_output[j] = static_start_label_output; + j++; + } + + UnbinarizedChildren real_children(root, back_pointers, m_binarized); + + // extract head words / labels + for (std::vector::const_iterator itx = real_children.begin(); itx != real_children.end(); itx = ++real_children) { + if ((*itx)->IsTerminal()) { + std::cerr << "non-terminal node " << root->GetLabel() << " has a mix of terminal and non-terminal children. This shouldn't happen..." << std::endl; + std::cerr << "children: "; + for (std::vector::const_iterator itx2 = root->GetChildren().begin(); itx2 != root->GetChildren().end(); ++itx2) { + std::cerr << (*itx2)->GetLabel() << " "; + } + std::cerr << std::endl; + // resize vectors (should we throw exception instead?) 
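+      // shrink the pre-sized head/label vectors so no unfilled slot remains for the skipped terminal child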
+      heads.pop_back();
+      labels.pop_back();
+      heads_output.pop_back();
+      labels_output.pop_back();
+      continue;
+    }
+    InternalTree* child = itx->get();
+    // also go through trees or previous hypotheses to rescore nodes for which more context has become available
+    if ((*itx)->IsLeafNT()) {
+      child = back_pointers.find(itx->get())->second.get();
+    }
+
+    // preterminal node
+    if (child->GetLength() == 1 && child->GetChildren()[0]->IsTerminal()) {
+      heads[j] = static_head_head;
+      labels[j] = static_head_label;
+      labels_output[j] = static_head_label_output;
+      j++;
+      continue;
+    }
+
+    found = GetHead(child, back_pointers, child_ids);
+    if (found == NULL) {
+      child_ids = std::make_pair(static_dummy_head, static_dummy_head);
+    }
+
+    labels[j] = lm_head->lookup_input_word(child->GetLabel());
+    labels_output[j] = lm_label->lookup_output_word(child->GetLabel());
+    heads[j] = child_ids.first;
+    heads_output[j] = child_ids.second;
+    j++;
+  }
+
+  // score end label (if enabled) for all nonterminal nodes (but not for terminal or preterminal nodes)
+  if (m_context_left && (reached_end == 2 || reached_end == 3)) {
+    heads[j] = static_stop_head;
+    labels[j] = static_stop_label;
+    labels_output[j] = static_stop_label_output;
+  }
+}
+
+
+void RDLM::GetIDs(const std::string & head, const std::string & preterminal, std::pair<int,int> & IDs) const
+{
+  IDs.first = lm_head_base_instance_->lookup_input_word(head);
+  if (m_isPretermBackoff && IDs.first == 0) {
+    IDs.first = lm_head_base_instance_->lookup_input_word(preterminal);
+  }
+  if (m_sharedVocab) {
+    IDs.second = IDs.first;
+  } else {
+    IDs.second = lm_head_base_instance_->lookup_output_word(head);
+    if (m_isPretermBackoff && IDs.second == 0) {
+      IDs.second = lm_head_base_instance_->lookup_output_word(preterminal);
+    }
+  }
+}
+
+
+void RDLM::PrintInfo(std::vector<int> &ngram, nplm::neuralTM* lm) const
+{
+  for (size_t i = 0; i < ngram.size()-1; i++) {
+    std::cerr << lm->get_input_vocabulary().words()[ngram[i]] << " ";
+  }
+  std::cerr << lm->get_output_vocabulary().words()[ngram.back()] << " ";
+
+  for (size_t i = 0; i < ngram.size(); i++) {
+    std::cerr << ngram[i] << " ";
+  }
+  std::cerr << "score: " << lm->lookup_ngram(ngram) << std::endl;
+}
+
+
+RDLM::TreePointerMap RDLM::AssociateLeafNTs(InternalTree* root, const std::vector<TreePointer> &previous) const
+{
+
+  TreePointerMap ret;
+  std::vector<TreePointer>::iterator it;
+  bool found = false;
+  InternalTree::leafNT next_leafNT(root);
+  for (std::vector<TreePointer>::const_iterator it_prev = previous.begin(); it_prev != previous.end(); ++it_prev) {
+    found = next_leafNT(it);
+    if (found) {
+      ret[it->get()] = *it_prev;
+    } else {
+      std::cerr << "Warning: leaf nonterminal not found in rule; why did this happen?\n";
+    }
+  }
+  return ret;
+}
+
+void RDLM::ScoreFile(std::string &path)
+{
+  InputFileStream inStream(path);
+  std::string line, null;
+  std::vector<int> ancestor_heads(m_context_up, static_root_head);
+  std::vector<int> ancestor_labels(m_context_up, static_root_label);
+  while(getline(inStream, line)) {
+    TreePointerMap back_pointers;
+    boost::array<float, 4> score;
+    score.fill(0);
+    InternalTree* mytree (new InternalTree(line));
+    size_t boundary_hash = 0;
+    Score(mytree, back_pointers, score, ancestor_heads, ancestor_labels, boundary_hash);
+    std::cerr << "head LM: " << score[0] << " label LM: " << score[2] << std::endl;
+  }
+}
+
+
+void RDLM::SetParameter(const std::string& key, const std::string& value)
+{
+  std::cerr << "setting: " << this->GetScoreProducerDescription() << " - " << key << "\n";
+  if (key == "tuneable") {
+    m_tuneable = Scan<bool>(value);
+  } else if (key == "filterable") { //ignore
+  } else if (key == "path_head_lm") {
+    m_path_head_lm = value;
+  } else if (key == "path_label_lm") {
+    m_path_label_lm = value;
+  } else if (key == "ptkvz") {
+    m_isPTKVZ = Scan<bool>(value);
+  } else if (key == "backoff") {
+    m_isPretermBackoff = Scan<bool>(value);
+  } else if (key == "context_up") {
+    m_context_up = Scan<size_t>(value);
+  } else if (key == "context_left") {
+    m_context_left = Scan<size_t>(value);
+  } else if (key == "context_right") {
+    m_context_right = Scan<size_t>(value);
+  } else if (key == "debug_path") {
+    m_debugPath = value;
+  } else if (key == "premultiply") {
+    m_premultiply = Scan<bool>(value);
+  } else if (key == "rerank") {
+    m_rerank = Scan<bool>(value);
+  } else if (key == "normalize_head_lm") {
+    m_normalizeHeadLM = Scan<bool>(value);
+  } else if (key == "normalize_label_lm") {
+    m_normalizeLabelLM = Scan<bool>(value);
+  } else if (key == "binarized") {
+    if (value == "left")
+      m_binarized = 1;
+    else if (value == "right")
+      m_binarized = 2;
+    else if (value == "full")
+      m_binarized = 3;
+    else
+      UTIL_THROW(util::Exception, "Unknown value for argument " << key << "=" << value);
+  } else if (key == "glue_symbol") {
+    m_glueSymbol = value;
+  } else if (key == "cache_size") {
+    m_cacheSize = Scan<int>(value);
+  } else {
+    UTIL_THROW(util::Exception, "Unknown argument " << key << "=" << value);
+  }
+}
+
+
+FFState* RDLM::EvaluateWhenApplied(const ChartHypothesis& cur_hypo
+                                   , int featureID /* used to index the state in the previous hypotheses */
+                                   , ScoreComponentCollection* accumulator) const
+{
+  if (const PhraseProperty *property = cur_hypo.GetCurrTargetPhrase().GetProperty("Tree")) {
+    const std::string *tree = property->GetValueString();
+    TreePointer mytree (boost::make_shared<InternalTree>(*tree));
+
+    //get subtrees (in target order)
+    std::vector<TreePointer> previous_trees;
+    float prev_approx_head = 0, prev_approx_label = 0; //approximated (due to lack of context) LM costs from previous hypos
+    for (size_t pos = 0; pos < cur_hypo.GetCurrTargetPhrase().GetSize(); ++pos) {
+      const Word &word = cur_hypo.GetCurrTargetPhrase().GetWord(pos);
+      if (word.IsNonTerminal()) {
+        size_t nonTermInd = cur_hypo.GetCurrTargetPhrase().GetAlignNonTerm().GetNonTermIndexMap()[pos];
+        const RDLMState* prev = static_cast<const RDLMState*>(cur_hypo.GetPrevHypo(nonTermInd)->GetFFState(featureID));
+        previous_trees.push_back(prev->GetTree());
+        prev_approx_head -= prev->GetApproximateScoreHead();
+        prev_approx_label -= prev->GetApproximateScoreLabel();
+      }
+    }
+    size_t ff_idx = accumulator->GetIndexes(this).first;
+
+    accumulator->PlusEquals(ff_idx, prev_approx_head);
+    accumulator->PlusEquals(ff_idx+1, prev_approx_label);
+
+    bool full_sentence = (mytree->GetChildren().back()->GetLabel() == m_endTag || (mytree->GetChildren().back()->GetLabel() == m_endSymbol && mytree->GetChildren().back()->GetChildren().back()->GetLabel() == m_endTag));
+    std::vector<int> ancestor_heads ((full_sentence ? m_context_up : 0), static_root_head);
+    std::vector<int> ancestor_labels ((full_sentence ? m_context_up : 0), static_root_label);
+    ancestor_heads.reserve(10);
+    ancestor_labels.reserve(10);
+
+    TreePointerMap back_pointers = AssociateLeafNTs(mytree.get(), previous_trees);
+    boost::array<float, 4> score; // score_head, approx_score_head, score_label, approx_score_label
+    score.fill(0);
+    //hash of all boundary symbols (symbols with incomplete context); trees with same hash share state for cube pruning.
+    size_t boundary_hash = 0;
+    if (!m_rerank) {
+      Score(mytree.get(), back_pointers, score, ancestor_heads, ancestor_labels, boundary_hash);
+      accumulator->PlusEquals(ff_idx, score[0] + score[1]);
+      accumulator->PlusEquals(ff_idx+1, score[2] + score[3]);
+    }
+    mytree->Combine(previous_trees);
+    if (m_rerank && full_sentence) {
+      Score(mytree.get(), back_pointers, score, ancestor_heads, ancestor_labels, boundary_hash);
+      accumulator->PlusEquals(ff_idx, score[0] + score[1]);
+      accumulator->PlusEquals(ff_idx+1, score[2] + score[3]);
+    }
+    if (m_binarized && full_sentence) {
+      mytree->Unbinarize();
+    }
+
+    return new RDLMState(mytree, score[1], score[3], boundary_hash);
+  }
+  else {
+    UTIL_THROW2("Error: RDLM active, but no internal tree structure found");
+  }
+
+}
+
+}
diff --git a/moses/LM/RDLM.h b/moses/LM/RDLM.h
new file mode 100644
index 000000000..8ae49ce76
--- /dev/null
+++ b/moses/LM/RDLM.h
@@ -0,0 +1,245 @@
+#include <string>
+#include <map>
+#include "moses/FF/StatefulFeatureFunction.h"
+#include "moses/FF/FFState.h"
+#include "moses/FF/InternalTree.h"
+
+#include <boost/thread/tss.hpp>
+#include <boost/array.hpp>
+
+// relational dependency language model, described in:
+// Sennrich, Rico (2015). Modelling and Optimizing on Syntactic N-Grams for Statistical Machine Translation. Transactions of the Association for Computational Linguistics.
+// see 'scripts/training/rdlm' for training scripts
+
+namespace nplm {
+  class neuralTM;
+}
+
+namespace Moses
+{
+
+class RDLMState : public TreeState
+{
+  float m_approx_head; //score that was approximated due to lack of context
+  float m_approx_label;
+  size_t m_hash;
+public:
+  RDLMState(TreePointer tree, float approx_head, float approx_label, size_t hash)
+    : TreeState(tree)
+    , m_approx_head(approx_head)
+    , m_approx_label(approx_label)
+    , m_hash(hash)
+  {}
+
+  float GetApproximateScoreHead() const {
+    return m_approx_head;
+  }
+
+  float GetApproximateScoreLabel() const {
+    return m_approx_label;
+  }
+
+  size_t GetHash() const {
+    return m_hash;
+  }
+
+  int Compare(const FFState& other) const {
+    if (m_hash == static_cast<const RDLMState*>(&other)->GetHash()) return 0;
+    else if (m_hash > static_cast<const RDLMState*>(&other)->GetHash()) return 1;
+    else return -1;
+  }
+};
+
+class RDLM : public StatefulFeatureFunction
+{
+  typedef std::map<InternalTree*, TreePointer> TreePointerMap;
+
+  nplm::neuralTM* lm_head_base_instance_;
+  mutable boost::thread_specific_ptr<nplm::neuralTM> lm_head_backend_;
+
+  nplm::neuralTM* lm_label_base_instance_;
+  mutable boost::thread_specific_ptr<nplm::neuralTM> lm_label_backend_;
+
+  std::string dummy_head;
+  std::string m_glueSymbol;
+  std::string m_startSymbol;
+  std::string m_endSymbol;
+  std::string m_endTag;
+  std::string m_path_head_lm;
+  std::string m_path_label_lm;
+  bool m_isPTKVZ;
+  bool m_isPretermBackoff;
+  size_t m_context_left;
+  size_t m_context_right;
+  size_t m_context_up;
+  bool m_premultiply;
+  bool m_rerank;
+  bool m_normalizeHeadLM;
+  bool m_normalizeLabelLM;
+  bool m_sharedVocab;
+  std::string m_debugPath; // score all trees in the provided file, then exit
+  int m_binarized;
+  int m_cacheSize;
+
+  size_t offset_up_head;
+  size_t offset_up_label;
+
+  size_t size_head;
+  size_t size_label;
+  std::vector<int> static_label_null;
+  std::vector<int> static_head_null;
+  int static_dummy_head;
+  int static_start_head;
+  int static_start_label;
+  int static_stop_head;
+  int static_stop_label;
+  int static_head_head;
+  int static_head_label;
+  int static_root_head;
+  int static_root_label;
+
+  int static_head_label_output;
+  int static_stop_label_output;
+  int static_start_label_output;
+
+public:
+  RDLM(const std::string &line)
+    : 
StatefulFeatureFunction(2, line)
+    , dummy_head("<dummy_head>")
+    , m_glueSymbol("Q")
+    , m_startSymbol("SSTART")
+    , m_endSymbol("SEND")
+    , m_endTag("</s>")
+    , m_isPTKVZ(false)
+    , m_isPretermBackoff(true)
+    , m_context_left(3)
+    , m_context_right(0)
+    , m_context_up(2)
+    , m_premultiply(true)
+    , m_rerank(false)
+    , m_normalizeHeadLM(false)
+    , m_normalizeLabelLM(false)
+    , m_sharedVocab(false)
+    , m_binarized(0)
+    , m_cacheSize(1000000)
+  {
+    ReadParameters();
+  }
+
+  ~RDLM();
+
+  virtual const FFState* EmptyHypothesisState(const InputType &input) const {
+    return new RDLMState(TreePointer(), 0, 0, 0);
+  }
+
+  void Score(InternalTree* root, const TreePointerMap & back_pointers, boost::array<float, 4> &score, std::vector<int> &ancestor_heads, std::vector<int> &ancestor_labels, size_t &boundary_hash, int num_virtual = 0, int rescoring_levels = 0) const;
+  InternalTree* GetHead(InternalTree* root, const TreePointerMap & back_pointers, std::pair<int,int> & IDs, InternalTree * head_ptr=NULL) const;
+  void GetChildHeadsAndLabels(InternalTree *root, const TreePointerMap & back_pointers, int reached_end, const nplm::neuralTM *lm_head, const nplm::neuralTM *lm_labels, std::vector<int> & heads, std::vector<int> & labels, std::vector<int> & heads_output, std::vector<int> & labels_output) const;
+  void GetIDs(const std::string & head, const std::string & preterminal, std::pair<int,int> & IDs) const;
+  void ScoreFile(std::string &path); //for debugging
+  void PrintInfo(std::vector<int> &ngram, nplm::neuralTM* lm) const; //for debugging
+
+  TreePointerMap AssociateLeafNTs(InternalTree* root, const std::vector<TreePointer> &previous) const;
+
+  bool IsUseable(const FactorMask &mask) const {
+    return true;
+  }
+
+  void SetParameter(const std::string& key, const std::string& value);
+  void EvaluateInIsolation(const Phrase &source
+                           , const TargetPhrase &targetPhrase
+                           , ScoreComponentCollection &scoreBreakdown
+                           , ScoreComponentCollection &estimatedFutureScore) const {};
+  void EvaluateWithSourceContext(const InputType &input
+                                 , const InputPath &inputPath
+                                 , const TargetPhrase &targetPhrase
+                                 , const StackVec *stackVec
+                                 , ScoreComponentCollection &scoreBreakdown
+                                 , ScoreComponentCollection *estimatedFutureScore = NULL) const {};
+  void EvaluateTranslationOptionListWithSourceContext(const InputType &input
+      , const TranslationOptionList &translationOptionList) const {};
+  FFState* EvaluateWhenApplied(
+    const Hypothesis& cur_hypo,
+    const FFState* prev_state,
+    ScoreComponentCollection* accumulator) const {
+    UTIL_THROW(util::Exception, "Not implemented");
+  };
+  FFState* EvaluateWhenApplied(
+    const ChartHypothesis& /* cur_hypo */,
+    int /* featureID - used to index the state in the previous hypotheses */,
+    ScoreComponentCollection* accumulator) const;
+
+  void Load();
+
+  // Iterator-class that yields all children of a node; if child is virtual node of binarized tree, its children are yielded instead.
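+  // Usage (as in Score() and GetChildHeadsAndLabels()): advance with "it = ++children" rather than "++it", so virtual nodes are expanded transparently.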
+  class UnbinarizedChildren
+  {
+  private:
+    std::vector<TreePointer>::const_iterator iter;
+    std::vector<TreePointer>::const_iterator _begin;
+    std::vector<TreePointer>::const_iterator _end;
+    InternalTree* current;
+    const TreePointerMap & back_pointers;
+    bool binarized;
+    std::vector<std::pair<InternalTree*, std::vector<TreePointer>::const_iterator> > stack;
+
+  public:
+    UnbinarizedChildren(InternalTree* root, const TreePointerMap & pointers, bool binary):
+      current(root),
+      back_pointers(pointers),
+      binarized(binary)
+    {
+      stack.reserve(10);
+      _end = current->GetChildren().end();
+      iter = current->GetChildren().begin();
+      // expand virtual node
+      while (binarized && !(*iter)->GetLabel().empty() && (*iter)->GetLabel()[0] == '^') {
+        stack.push_back(std::make_pair(current, iter));
+        // also go through trees or previous hypotheses to rescore nodes for which more context has become available
+        if ((*iter)->IsLeafNT()) {
+          current = back_pointers.find(iter->get())->second.get();
+        } else {
+          current = iter->get();
+        }
+        iter = current->GetChildren().begin();
+      }
+      _begin = iter;
+    }
+
+    std::vector<TreePointer>::const_iterator begin() const {
+      return _begin;
+    }
+    std::vector<TreePointer>::const_iterator end() const {
+      return _end;
+    }
+
+    std::vector<TreePointer>::const_iterator operator++() {
+      iter++;
+      if (iter == current->GetChildren().end()) {
+        while (!stack.empty()) {
+          std::pair<InternalTree*, std::vector<TreePointer>::const_iterator> & active = stack.back();
+          current = active.first;
+          iter = ++active.second;
+          stack.pop_back();
+          if (iter != current->GetChildren().end()) {
+            break;
+          }
+        }
+        if (iter == _end) {
+          return iter;
+        }
+      }
+      // expand virtual node
+      while (binarized && !(*iter)->GetLabel().empty() && (*iter)->GetLabel()[0] == '^') {
+        stack.push_back(std::make_pair(current, iter));
+        // also go through trees or previous hypotheses to rescore nodes for which more context has become available
+        if ((*iter)->IsLeafNT()) {
+          current = back_pointers.find(iter->get())->second.get();
+        } else {
+          current = iter->get();
+        }
+        iter = current->GetChildren().begin();
+      }
+      return iter;
+    }
+  };
+
+};
+
+}
diff --git a/scripts/training/rdlm/README b/scripts/training/rdlm/README
new file mode 100644
index 000000000..347e71f6d
--- /dev/null
+++ b/scripts/training/rdlm/README
@@ -0,0 +1,49 @@
+RDLM: relational dependency language model
+------------------------------------------
+
+This is a language model for the string-to-tree decoder with a dependency grammar.
+It should work with any corpus with projective dependency annotation in CoNLL format,
+converted into the Moses format with the script
+mosesdecoder/scripts/training/wrappers/conll2mosesxml.py.
+It depends on NPLM for neural network training and querying.
+
+Prerequisites
+-------------
+
+Install NPLM and compile Moses with it. See the instructions in the Moses documentation for details:
+
+    http://www.statmt.org/moses/?n=FactoredTraining.BuildingLanguageModel
+
+Training
+--------
+
+RDLM is designed for string-to-tree decoding with dependency annotation on the target side.
+If you have such a system, you can train RDLM on the target side of the same parallel corpus
+that is used for training the translation model.
+
+To train the model on additional monolingual data, or to test it on held-out test/dev data,
+parse and process it in the same way that the parallel corpus has been processed.
+This includes tokenization, parsing, truecasing, compound splitting etc.
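+
+For illustration, the extraction step performed inside the training scripts can also
+be run stand-alone (a sketch; file names are placeholders, the corpus is assumed on
+stdin, and the vocabulary files can be produced with extract_vocab.py):
+
+  ./extract_syntactic_ngrams.py --mode head --vocab vocab.input < corpus.mosesxml > train.head
+  ./extract_syntactic_ngrams.py --mode label --vocab vocab.input --output_vocab vocab.labels < corpus.mosesxml > train.label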
+ +RDLM is split into two neural network models, which can be trained with `train_model_head.sh` and `train_model_label.sh` +set the paths to NPLM, Moses, and the training/test files in the respective files, then execute: + + ./train_model_head.sh rdlm_head.nnlm working_dir_head + ./train_model_label.sh rdlm_label.nnlm working_dir_label + + +Decoding +-------- + +To use RDLM during decoding, add the following line to your moses.ini config: + + [feature] + RDLM path_head_lm=/path/to/rdlm_head.nnlm path_label_lm=/path/to/rdlm_label.nnlm context_up=2 context_left=3 context_right=0 + + [weight] + RDLM 0.1 0.1 + +Reference +--------- + +Sennrich, Rico (2015). Modelling and Optimizing on Syntactic N-Grams for Statistical Machine Translation. + Transactions of the Association for Computational Linguistics. diff --git a/scripts/training/rdlm/average_null_embedding.py b/scripts/training/rdlm/average_null_embedding.py new file mode 100755 index 000000000..cb67c9d75 --- /dev/null +++ b/scripts/training/rdlm/average_null_embedding.py @@ -0,0 +1,45 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# Author: Rico Sennrich + +# average embeddings of special null words for RDLM. +# Usage: average_null_embedding.py NPLM_PATH INPUT_MODEL TRAINING_FILE OUTPUT_MODEL + +import sys +import os +import numpy + +def load_model(model_file): + return nplm.NeuralLM.from_file(model_file) + +def get_weights(path, vocab, len_context): + d = [[0]*vocab for i in range(len_context)] + for line in open(path): + for i, word in enumerate(line.split()[:-1]): + d[i][int(word)] += 1 + return d + +if __name__ == "__main__": + + nplm_path = sys.argv[1] + model_input = sys.argv[2] + training_instances = sys.argv[3] + model_output = sys.argv[4] + + sys.path.append(os.path.join(nplm_path,'python')) + import nplm + + model = load_model(model_input) + + len_context = len(open(training_instances).readline().split())-1 + + sys.stderr.write('reading ngrams...') + weights = numpy.array(get_weights(training_instances, len(model.input_embeddings), len_context)) + sys.stderr.write('done\n') + + for i in range(len_context): + index = model.word_to_index_input[''.format(i)] + model.input_embeddings[index] = numpy.average(numpy.array(model.input_embeddings), weights=weights[i], axis=0) + sys.stderr.write('writing model...') + model.to_file(open(model_output,'w')) + sys.stderr.write('done\n') diff --git a/scripts/training/rdlm/extract_syntactic_ngrams.py b/scripts/training/rdlm/extract_syntactic_ngrams.py new file mode 100755 index 000000000..12d62d1e6 --- /dev/null +++ b/scripts/training/rdlm/extract_syntactic_ngrams.py @@ -0,0 +1,262 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# Author: Rico Sennrich + +# extract syntactic n-grams from dependency treebank in Moses XML format for training RDLM +# expected format can be produced with mosesdecoder/scripts/training/wrapper/conll2mosesxml.py +# OOV terminal symbols are mapped to preterminal; OOV nonterminals are mapped to 0 () + +from __future__ import print_function, unicode_literals, division +import sys +import codecs +import io +import argparse + +try: + from lxml import etree as ET +except ImportError: + from xml.etree import cElementTree as ET + +def parse_arguments(): + parser = argparse.ArgumentParser(description="extract syntactic n-grams from parsed corpus in Moses XML format for training RDLM") + + parser.add_argument('--mode', type=str, help='predict terminals (head) or dependency labels (label)', + choices=['label', 'head'], required=True) + parser.add_argument('--vocab', 
+                        metavar='PATH', type=str, required=True,
+                        help='input layer vocabulary file (one item per line; first line \'<unk>\')')
+    parser.add_argument('--output_vocab', metavar='PATH', type=str,
+                        help='output layer vocabulary file (default: use input layer vocabulary)')
+    parser.add_argument('--left_context', metavar='INT', type=int,
+                        help='size of context vector for left siblings (default: %(default)s)', default=3)
+    parser.add_argument('--right_context', metavar='INT', type=int,
+                        help='size of context vector for right siblings (default: %(default)s)', default=0)
+    parser.add_argument('--up_context', metavar='INT', type=int,
+                        help='size of context vector for ancestors (default: %(default)s)', default=2)
+    parser.add_argument('--glue_symbol', metavar='STR', type=str, default='Q',
+                        help='glue symbol. Will be skipped during extraction (default: %(default)s)')
+    parser.add_argument('--start_symbol', metavar='STR', type=str, default='SSTART',
+                        help='sentence start symbol. Will be skipped during extraction (default: %(default)s)')
+    parser.add_argument('--end_symbol', metavar='STR', type=str, default='SEND',
+                        help='sentence end symbol. Will be skipped during extraction (default: %(default)s)')
+    parser.add_argument('--ptkvz', action='store_true',
+                        help='special rule for German dependency trees: concatenate separable verb prefix and verb')
+    return parser.parse_args()
+
+def escape_text(s):
+
+    s = s.replace('|','&#124;') # factor separator
+    s = s.replace('[','&#91;') # syntax non-terminal
+    s = s.replace(']','&#93;') # syntax non-terminal
+    s = s.replace('\'','&apos;') # xml special character
+    s = s.replace('"','&quot;') # xml special character
+    return s
+
+# deterministic heuristic to get head of subtree
+def get_head(xml, add_ptkvz):
+    head = None
+    preterminal = None
+    for child in xml:
+        if not len(child):
+            if head is not None:
+                continue
+            preterminal = child.get('label')
+            head = escape_text(child.text.strip())
+
+        elif add_ptkvz and head and child.get('label') == 'avz':
+            for grandchild in child:
+                if grandchild.get('label') == 'PTKVZ':
+                    head = escape_text(grandchild.text.strip()) + head
+                    break
+
+    return head, preterminal
+
+def get_syntactic_ngrams(xml, options, vocab, output_vocab, parent_heads=None, parent_labels=None):
+
+    if len(xml):
+
+        # skip glue rules
+        if xml.get('label') == options.glue_symbol or xml.get('label') == options.start_symbol or xml.get('label') == options.end_symbol:
+            for child in xml:
+                get_syntactic_ngrams(child, options, vocab, output_vocab, parent_heads, parent_labels)
+            return
+
+        # skip virtual nodes
+        if xml.get('label') == '' or xml.get('label') == '':
+            return
+
+        if not parent_heads:
+            parent_heads = [vocab.get('', 0)] * options.up_context
+            parent_labels = [vocab.get('', 0)] * options.up_context
+
+        head, preterminal = get_head(xml, options.ptkvz)
+        if not head:
+            head = ''
+            preterminal = head
+        elif head not in vocab:
+            head = preterminal
+
+        label = xml.get('label')
+
+        # syntactic n-gram for root node
+        int_list = []
+        int_list.extend([start_head_idx] * options.left_context)
+        int_list.extend([start_label_idx] * options.left_context)
+        int_list.extend([stop_head_idx] * options.right_context)
+        int_list.extend([stop_label_idx] * options.right_context)
+        int_list.extend(parent_heads)
+        int_list.extend(parent_labels)
+
+        if options.mode == 'label':
+            int_list.append(output_vocab.get(label, 0))
+            sys.stdout.write(' '.join(map(str, int_list)) + '\n')
+        elif options.mode == 'head' and not head == '':
+            int_list.append(vocab.get(label, 0))
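+            # layout of one extracted instance (one integer id per position):
+            # [left heads][left labels][right heads][right labels][ancestor heads][ancestor labels],
+            # then, in head mode, the node's label, followed by the prediction target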
int_list.append(output_vocab.get(head, output_vocab.get(preterminal, 0))) + sys.stdout.write(' '.join(map(str, int_list)) + '\n') + + parent_heads.append(vocab.get(head, 0)) + parent_labels.append(vocab.get(label, 0)) + + # virtual start/end-of-subtree tag + if len(xml) > 0: + if options.right_context: + start = ET.Element('tree') + start2 = ET.Element('tree') + start.set('label','') + start2.set('label','XY') + start2.text = '' + start.append(start2) + xml.insert(0,start) + if options.left_context: + end = ET.Element('tree') + end2 = ET.Element('tree') + end.set('label','') + end2.set('label','XY') + end2.text = '' + end.append(end2) + xml.append(end) + + + heads = [] + preterminals = [] + labels = [] + + for child in xml: + if not len(child): + # mark that the previous sibling is the head of the structure (the head/label are not repeated because they're also head/label of the parent) + head_child = '' + preterminal_child = head_child + child_label = '' + else: + head_child, preterminal_child = get_head(child, options.ptkvz) + child_label = child.get('label') + + if head_child is None: + head_child = '' + + heads.append(head_child) + preterminals.append(preterminal_child) + labels.append(child_label) + + heads_idx = [vocab.get(heads[i], vocab.get(preterminals[i], 0)) for i in range(len(heads))] + labels_idx = [vocab.get(labels[i], 0) for i in range(len(labels))] + + #ancestor context is same for all children + up_heads = parent_heads[-options.up_context:] + up_labels = parent_labels[-options.up_context:] + + for i,child in enumerate(xml): + + # skip some special symbols, but recursively extract n-grams for its children + if options.mode == 'head' and (heads[i] == '' or heads[i] == '' or heads[i] == '' or heads[i] == ''): + parent_heads.append(vocab.get(heads[i], 0)) + parent_labels.append(vocab.get(labels[i], 0)) + get_syntactic_ngrams(child, options, vocab, output_vocab, parent_heads, parent_labels) + parent_heads.pop() + parent_labels.pop() + continue + + previous_heads = heads_idx[max(0,i-options.left_context):i] + previous_labels = labels_idx[max(0,i-options.left_context):i] + + subsequent_heads = heads_idx[i+1:i+options.right_context+1] + subsequent_labels = labels_idx[i+1:i+options.right_context+1] + + if len(previous_heads) < options.left_context: + previous_heads = [start_head_idx] * (options.left_context-len(previous_heads)) + previous_heads + previous_labels = [start_label_idx] * (options.left_context-len(previous_labels)) + previous_labels + + if len(subsequent_heads) < options.right_context: + subsequent_heads = subsequent_heads + [stop_head_idx] * (options.right_context-len(subsequent_heads)) + subsequent_labels = subsequent_labels + [stop_label_idx] * (options.right_context-len(subsequent_labels)) + + int_list = [] + int_list.extend(previous_heads) + int_list.extend(previous_labels) + int_list.extend(subsequent_heads) + int_list.extend(subsequent_labels) + int_list.extend(up_heads) + int_list.extend(up_labels) + if options.mode == 'label': + int_list.append(output_vocab.get(labels[i], 0)) + elif options.mode == 'head': + int_list.append(vocab.get(labels[i], 0)) + int_list.append(output_vocab.get(heads[i], output_vocab.get(preterminals[i], 0))) + + sys.stdout.write(' '.join(map(str, int_list)) + '\n') + + parent_heads.append(vocab.get(heads[i], vocab.get(preterminals[i], 0))) + parent_labels.append(vocab.get(labels[i], 0)) + + get_syntactic_ngrams(child, options, vocab, output_vocab, parent_heads, parent_labels) + + parent_heads.pop() + parent_labels.pop() + + +def 
load_vocab(path):
+    v = {}
+    for i,line in enumerate(io.open(path, encoding="UTF-8")):
+        v[line.strip()] = i
+    return v
+
+if __name__ == '__main__':
+
+    if sys.version_info < (3, 0):
+        sys.stderr = codecs.getwriter('UTF-8')(sys.stderr)
+        sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
+
+    options = parse_arguments()
+
+    vocab = load_vocab(options.vocab)
+
+    if options.output_vocab is None:
+        sys.stderr.write('no output vocabulary specified; using input vocabulary\n')
+        output_vocab = vocab
+    else:
+        output_vocab = load_vocab(options.output_vocab)
+
+    start_head_idx = vocab.get("<start_head>", 0)
+    start_label_idx = vocab.get("<start_label>", 0)
+    stop_head_idx = vocab.get("<stop_head>", 0)
+    stop_label_idx = vocab.get("<stop_label>", 0)
+
+    i = 0
+    for line in sys.stdin:
+        if i and not i % 50000:
+            sys.stderr.write('.')
+        if i and not i % 1000000:
+            sys.stderr.write('{0}\n'.format(i))
+        if sys.version_info < (3, 0):
+            if line == b'\n':
+                continue
+            # hack for older moses versions with inconsistent encoding of "|"
+            line = line.replace(b'&bar;', b'|')
+        else:
+            if line == '\n':
+                continue
+            # hack for older moses versions with inconsistent encoding of "|"
+            line = line.replace('&bar;', '|')
+        xml = ET.fromstring(line)
+        get_syntactic_ngrams(xml, options, vocab, output_vocab)
+        i += 1
diff --git a/scripts/training/rdlm/extract_vocab.py b/scripts/training/rdlm/extract_vocab.py
new file mode 100755
index 000000000..684fdcd32
--- /dev/null
+++ b/scripts/training/rdlm/extract_vocab.py
@@ -0,0 +1,169 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# Author: Rico Sennrich
+
+# extract 5 vocabulary files from parsed corpus in Moses XML format
+
+from __future__ import print_function, unicode_literals, division
+import sys
+import codecs
+import io
+import argparse
+from collections import Counter
+
+try:
+    from lxml import etree as ET
+except ImportError:
+    from xml.etree import cElementTree as ET
+
+def parse_arguments():
+
+    help_text = "generate 5 vocabulary files from parsed corpus in Moses XML format\n"
+    help_text += "  [PREFIX].special: around 40 symbols reserved for RDLM\n"
+    help_text += "  [PREFIX].preterminals: preterminal symbols\n"
+    help_text += "  [PREFIX].nonterminals: nonterminal symbols (which are not preterminal)\n"
+    help_text += "  [PREFIX].terminals: terminal symbols\n"
+    help_text += "  [PREFIX].all: all of the above\n"
+
+    parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, description=help_text)
+
+    parser.add_argument('--input', '-i', type=argparse.FileType('r'), default=sys.stdin, metavar='PATH',
+                        help='input text (default: standard input).')
+    parser.add_argument('--output', '-o', type=str, default='vocab', metavar='PREFIX',
+                        help='output prefix (default: "vocab")')
+    parser.add_argument('--ptkvz', action="store_true",
+                        help='special rule for German dependency trees: attach separable verb prefixes to verb')
+
+    args = parser.parse_args()
+
+    return args
+
+def escape_text(s):
+
+    s = s.replace('|','&#124;') # factor separator
+    s = s.replace('[','&#91;') # syntax non-terminal
+    s = s.replace(']','&#93;') # syntax non-terminal
+    s = s.replace('\'','&apos;') # xml special character
+    s = s.replace('"','&quot;') # xml special character
+    return s
+
+# deterministic heuristic to get head of subtree
+def get_head(xml):
+    head = None
+    preterminal = None
+    for child in xml:
+        if not len(child):
+            if head is not None:
+                continue
+            preterminal = child.get('label')
+            head = escape_text(child.text.strip())
+
+        # hack for split compounds
+        elif child[-1].get('label') == 'SEGMENT':
+            return 
escape_text(child[-1].text.strip()), 'SEGMENT'
+
+        elif args.ptkvz and head and child.get('label') == 'avz':
+            for grandchild in child:
+                if grandchild.get('label') == 'PTKVZ':
+                    head = escape_text(grandchild.text.strip()) + head
+                    break
+
+    return head, preterminal
+
+def get_vocab(xml):
+
+    if len(xml):
+
+        head, preterminal = get_head(xml)
+        if not head:
+            head = ''
+            preterminal = ''
+
+        heads[head] += 1
+        preterminals[preterminal] += 1
+
+        label = xml.get('label')
+
+        nonterminals[label] += 1
+
+        for child in xml:
+            if not len(child):
+                continue
+            get_vocab(child)
+
+
+if __name__ == '__main__':
+
+    if sys.version_info < (3, 0):
+        sys.stderr = codecs.getwriter('UTF-8')(sys.stderr)
+        sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
+        sys.stdin = codecs.getreader('UTF-8')(sys.stdin)
+
+    args = parse_arguments()
+
+    heads = Counter()
+    preterminals = Counter()
+    nonterminals = Counter()
+
+    i = 0
+    for line in args.input:
+        if i and not i % 50000:
+            sys.stderr.write('.')
+        if i and not i % 1000000:
+            sys.stderr.write('{0}\n'.format(i))
+        if line == '\n':
+            continue
+
+        # hack for older moses versions with inconsistent encoding of "|"
+        line = line.replace('&bar;', '|')
+
+        xml = ET.fromstring(line)
+        get_vocab(xml)
+        i += 1
+
+    special_tokens = ['', '', '', '', '', '', '', '', '', '', '', '', '']
+
+    for i in range(30):
+        special_tokens.append('<null_{0}>'.format(i))
+
+    f = io.open(args.output + '.special', 'w', encoding='UTF-8')
+    for item in special_tokens:
+        f.write(item + '\n')
+    f.close()
+
+    f = io.open(args.output + '.preterminals', 'w', encoding='UTF-8')
+    for item in sorted(preterminals, key=preterminals.get, reverse=True):
+        f.write(item + '\n')
+    f.close()
+
+    f = io.open(args.output + '.nonterminals', 'w', encoding='UTF-8')
+    for item in sorted(nonterminals, key=nonterminals.get, reverse=True):
+        f.write(item + '\n')
+    f.close()
+
+    f = io.open(args.output + '.terminals', 'w', encoding='UTF-8')
+    for item in sorted(heads, key=heads.get, reverse=True):
+        f.write(item + '\n')
+    f.close()
+
+    f = io.open(args.output + '.all', 'w', encoding='UTF-8')
+    special_tokens_set = set(special_tokens)
+    for item in sorted(nonterminals, key=nonterminals.get, reverse=True):
+        if item not in special_tokens_set:
+            special_tokens.append(item)
+            special_tokens_set.add(item)
+    for item in sorted(preterminals, key=preterminals.get, reverse=True):
+        if item not in special_tokens_set:
+            special_tokens.append(item)
+            special_tokens_set.add(item)
+    for item in special_tokens:
+        f.write(item + '\n')
+    i = len(special_tokens)
+
+    for item in sorted(heads, key=heads.get, reverse=True):
+        if item in special_tokens_set:
+            continue
+        i += 1
+        f.write(item + '\n')
+    f.close()
diff --git a/scripts/training/rdlm/train_model_head.sh b/scripts/training/rdlm/train_model_head.sh
new file mode 100755
index 000000000..fdead9061
--- /dev/null
+++ b/scripts/training/rdlm/train_model_head.sh
@@ -0,0 +1,65 @@
+#!/bin/bash
+
+if [ $# -eq 2 ]; then
+  OUTFILE=$1
+  WORKDIR=$2
+else
+  echo "usage: $0 <output_file> <working_dir>"
+  exit 1
+fi
+
+NPLM=/path/to/nplm
+MOSES_ROOT=/path/to/mosesdecoder
+
+INFILE=/path/to/file/in/moses/xml/format
+VALIDATIONFILE=/path/to/file/in/moses/xml/format
+#TESTFILE1=/path/to/file/in/moses/xml/format
+#TESTFILE2=/path/to/file/in/moses/xml/format
+PREFIX=$(basename $OUTFILE)
+
+EPOCHS=2
+INPUT_VOCAB_SIZE=500000
+OUTPUT_VOCAB_SIZE=500000
+MINIBATCH_SIZE=1000
+NOISE=100
+HIDDEN=0
+INPUT_EMBEDDING=150
+OUTPUT_EMBEDDING=750
+THREADS=4
+MODE=head
+UP_CONTEXT=2
+LEFT_CONTEXT=3
+RIGHT_CONTEXT=0
+
+
+mkdir -p $WORKDIR
+
+python 
$MOSES_ROOT/scripts/training/rdlm/extract_vocab.py --output $WORKDIR/vocab < $INFILE || exit 1
+
+head -n $INPUT_VOCAB_SIZE $WORKDIR/vocab.all > $WORKDIR/vocab.input
+head -n $OUTPUT_VOCAB_SIZE $WORKDIR/vocab.all > $WORKDIR/vocab.output
+
+python $MOSES_ROOT/scripts/training/rdlm/extract_syntactic_ngrams.py --vocab $WORKDIR/vocab.input --output_vocab $WORKDIR/vocab.output \
+  --mode $MODE --left_context $LEFT_CONTEXT --right_context $RIGHT_CONTEXT --up_context $UP_CONTEXT < $INFILE > $WORKDIR/train.ngrams || exit 1
+python $MOSES_ROOT/scripts/training/rdlm/extract_syntactic_ngrams.py --vocab $WORKDIR/vocab.input --output_vocab $WORKDIR/vocab.output \
+  --mode $MODE --left_context $LEFT_CONTEXT --right_context $RIGHT_CONTEXT --up_context $UP_CONTEXT < $VALIDATIONFILE > $WORKDIR/validation.ngrams || exit 1
+
+$NPLM/src/trainNeuralNetwork --train_file $WORKDIR/train.ngrams --validation_file $WORKDIR/validation.ngrams \
+  --num_epochs $EPOCHS --input_words_file $WORKDIR/vocab.input --output_words_file $WORKDIR/vocab.output --model_prefix $WORKDIR/$PREFIX \
+  --input_vocab_size $INPUT_VOCAB_SIZE --output_vocab_size $OUTPUT_VOCAB_SIZE \
+  --learning_rate 1 --minibatch_size $MINIBATCH_SIZE --num_noise_samples $NOISE --num_hidden $HIDDEN \
+  --input_embedding_dimension $INPUT_EMBEDDING --output_embedding_dimension $OUTPUT_EMBEDDING --num_threads $THREADS || exit 1
+
+python $MOSES_ROOT/scripts/training/rdlm/average_null_embedding.py $NPLM $WORKDIR/$PREFIX.$(($EPOCHS)) $WORKDIR/train.ngrams $OUTFILE || exit 1
+
+if [[ $TESTFILE1 ]]; then
+  python $MOSES_ROOT/scripts/training/rdlm/extract_syntactic_ngrams.py --vocab $WORKDIR/vocab.input --output_vocab $WORKDIR/vocab.output \
+    --mode $MODE --left_context $LEFT_CONTEXT --right_context $RIGHT_CONTEXT --up_context $UP_CONTEXT < $TESTFILE1 > $WORKDIR/test1.ngrams || exit 1
+  $NPLM/src/testNeuralNetwork --test_file $WORKDIR/test1.ngrams --model_file $OUTFILE --minibatch_size $MINIBATCH_SIZE --num_threads $THREADS || exit 1
+fi
+
+if [[ $TESTFILE2 ]]; then
+  python $MOSES_ROOT/scripts/training/rdlm/extract_syntactic_ngrams.py --vocab $WORKDIR/vocab.input --output_vocab $WORKDIR/vocab.output \
+    --mode $MODE --left_context $LEFT_CONTEXT --right_context $RIGHT_CONTEXT --up_context $UP_CONTEXT < $TESTFILE2 > $WORKDIR/test2.ngrams || exit 1
+  $NPLM/src/testNeuralNetwork --test_file $WORKDIR/test2.ngrams --model_file $OUTFILE --minibatch_size $MINIBATCH_SIZE --num_threads $THREADS || exit 1
fi
diff --git a/scripts/training/rdlm/train_model_label.sh b/scripts/training/rdlm/train_model_label.sh
new file mode 100755
index 000000000..371c69a3b
--- /dev/null
+++ b/scripts/training/rdlm/train_model_label.sh
@@ -0,0 +1,72 @@
+#!/bin/bash
+
+if [ $# -eq 2 ]; then
+  OUTFILE=$1
+  WORKDIR=$2
+else
+  echo "usage: $0 <output_file> <working_dir>"
+  exit 1
+fi
+
+NPLM=/path/to/nplm
+MOSES_ROOT=/path/to/mosesdecoder
+
+INFILE=/path/to/file/in/moses/xml/format
+VALIDATIONFILE=/path/to/file/in/moses/xml/format
+#TESTFILE1=/path/to/file/in/moses/xml/format
+#TESTFILE2=/path/to/file/in/moses/xml/format
+PREFIX=$(basename $OUTFILE)
+
+EPOCHS=1
+INPUT_VOCAB_SIZE=500000
+OUTPUT_VOCAB_SIZE=75
+MINIBATCH_SIZE=1000
+NOISE=50
+HIDDEN=0
+INPUT_EMBEDDING=150
+OUTPUT_EMBEDDING=750
+THREADS=4
+MODE=label
+UP_CONTEXT=2
+LEFT_CONTEXT=3
+RIGHT_CONTEXT=0
+
+
+mkdir -p $WORKDIR
+
+python $MOSES_ROOT/scripts/training/rdlm/extract_vocab.py --output $WORKDIR/vocab < $INFILE || exit 1
+
+head -n $INPUT_VOCAB_SIZE $WORKDIR/vocab.all > $WORKDIR/vocab.input
+cat $WORKDIR/vocab.special $WORKDIR/vocab.nonterminals |
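+  # the label model predicts dependency labels, so its output vocabulary is just
+  # the special and nonterminal symbols (cf. OUTPUT_VOCAB_SIZE=75 above), while the
+  # input vocabulary is the same as for the head model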
+ grep -v "^ $WORKDIR/vocab.output + +python $MOSES_ROOT/scripts/training/rdlm/extract_syntactic_ngrams.py --vocab $WORKDIR/vocab.input --output_vocab $WORKDIR/vocab.output \ + --mode $MODE --left_context $LEFT_CONTEXT --right_context $RIGHT_CONTEXT --up_context $UP_CONTEXT < $INFILE > $WORKDIR/train.ngrams || exit 1 +python $MOSES_ROOT/scripts/training/rdlm/extract_syntactic_ngrams.py --vocab $WORKDIR/vocab.input --output_vocab $WORKDIR/vocab.output \ + --mode $MODE --left_context $LEFT_CONTEXT --right_context $RIGHT_CONTEXT --up_context $UP_CONTEXT < $VALIDATIONFILE > $WORKDIR/validation.ngrams || exit 1 + +$NPLM/src/trainNeuralNetwork --train_file $WORKDIR/train.ngrams --validation_file $WORKDIR/validation.ngrams \ + --num_epochs $EPOCHS --input_words_file $WORKDIR/vocab.input --output_words_file $WORKDIR/vocab.output --model_prefix $WORKDIR/$PREFIX \ + --input_vocab_size $INPUT_VOCAB_SIZE --output_vocab_size $OUTPUT_VOCAB_SIZE \ + --learning_rate 1 --minibatch_size $MINIBATCH_SIZE --num_noise_samples $NOISE --num_hidden $HIDDEN \ + --input_embedding_dimension $INPUT_EMBEDDING --output_embedding_dimension $OUTPUT_EMBEDDING --num_threads $THREADS || exit 1 + +python $MOSES_ROOT/scripts/training/rdlm/average_null_embedding.py $NPLM $WORKDIR/$PREFIX.$(($EPOCHS)) $WORKDIR/train.ngrams $OUTFILE || exit 1 + +if [[ $TESTFILE1 ]]; then + python $MOSES_ROOT/scripts/training/rdlm/extract_syntactic_ngrams.py --vocab $WORKDIR/vocab.input --output_vocab $WORKDIR/vocab.output \ + --mode $MODE --left_context $LEFT_CONTEXT --right_context $RIGHT_CONTEXT --up_context $UP_CONTEXT < $TESTFILE1 > $WORKDIR/test1.ngrams || exit 1 + $NPLM/src/testNeuralNetwork --test_file $WORKDIR/test1.ngrams --model_file $OUTFILE --minibatch_size $MINIBATCH_SIZE --num_threads $THREADS || exit 1 +fi + +if [[ $TESTFILE2 ]]; then + python $MOSES_ROOT/scripts/training/rdlm/extract_syntactic_ngrams.py --vocab $WORKDIR/vocab.input --output_vocab $WORKDIR/vocab.output \ + --mode $MODE --left_context $LEFT_CONTEXT --right_context $RIGHT_CONTEXT --up_context $UP_CONTEXT < $TESTFILE2 > $WORKDIR/test2.ngrams || exit 1 + $NPLM/src/testNeuralNetwork --test_file $WORKDIR/test2.ngrams --model_file $OUTFILE --minibatch_size $MINIBATCH_SIZE --num_threads $THREADS || exit 1 +fi \ No newline at end of file
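
A note on the unbinarization step added in this patch series: virtual nodes introduced
by relax-parse --LeftBinarize or --RightBinarize carry labels starting with '^', and
InternalTree::Unbinarize / UnbinarizedChildren splice their children back into the
parent on the fly. A minimal Python sketch of the same idea, using a hypothetical
(label, children) tuple representation rather than the actual InternalTree API:

  def unbinarize(tree):
      # tree is a (label, children) pair; returns an equivalent tree in which
      # every virtual node (label starting with '^') is replaced by its children
      label, children = tree
      flat = []
      for child in children:
          child = unbinarize(child)
          if child[0].startswith('^'):
              flat.extend(child[1])  # splice the virtual node's children into the parent
          else:
              flat.append(child)
      return (label, flat)

  # ('S', [('^S', [('NP', []), ('VP', [])]), ('PUNCT', [])])
  # -> ('S', [('NP', []), ('VP', []), ('PUNCT', [])])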