From ac51e9f0a86314e430e5c1f95e3b8b5b91a2818e Mon Sep 17 00:00:00 2001 From: Phil Williams Date: Wed, 18 Mar 2015 09:56:46 +0000 Subject: [PATCH 1/4] Always use "SyntaxInputWeight0" as name of SyntaxInputWeight feature --- scripts/training/train-model.perl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/training/train-model.perl b/scripts/training/train-model.perl index 265847c3d..f92e545be 100755 --- a/scripts/training/train-model.perl +++ b/scripts/training/train-model.perl @@ -2153,8 +2153,8 @@ sub create_ini { # SyntaxInputWeight FF if ($_USE_SYNTAX_INPUT_WEIGHT_FEATURE) { - $feature_spec .= "SyntaxInputWeight name=SyntaxInputWeight$i\n"; - $weight_spec .= "SyntaxInputWeight$i= 0.1\n"; + $feature_spec .= "SyntaxInputWeight name=SyntaxInputWeight0\n"; + $weight_spec .= "SyntaxInputWeight0= 0.1\n"; } # generation model From fc15e03ebe42d2e15c8be53eeda5b795fffb6e1e Mon Sep 17 00:00:00 2001 From: Phil Williams Date: Wed, 18 Mar 2015 09:57:42 +0000 Subject: [PATCH 2/4] Replace truecase-egret.sh with more general tree-converter-wrapper.perl --- .../ems/support/tree-converter-wrapper.perl | 21 +++++++++++++++++++ scripts/recaser/truecase-egret.sh | 7 ------- 2 files changed, 21 insertions(+), 7 deletions(-) create mode 100755 scripts/ems/support/tree-converter-wrapper.perl delete mode 100755 scripts/recaser/truecase-egret.sh diff --git a/scripts/ems/support/tree-converter-wrapper.perl b/scripts/ems/support/tree-converter-wrapper.perl new file mode 100755 index 000000000..aae55991a --- /dev/null +++ b/scripts/ems/support/tree-converter-wrapper.perl @@ -0,0 +1,21 @@ +#!/usr/bin/env perl + +use warnings; +use strict; +use utf8; +use Getopt::Long "GetOptions"; + +Getopt::Long::config("pass_through"); + +my ($BIN,$MODEL); + +&GetOptions('bin=s' => \$BIN, + 'model=s' => \$MODEL); # À la truecase.perl + +die("ERROR: specify at least --bin BIN!") unless defined($BIN); + +my $cmd = "$BIN"; +$cmd .= " -case true:model=$MODEL" if defined($MODEL); +$cmd .= " " . join(' ', @ARGV) if scalar(@ARGV); # Pass other args to $BIN. 
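+# Example (illustrative): the deleted truecase-egret.sh below can be replicated with
+#   tree-converter-wrapper.perl --bin BIN --model MODEL -input_format egret -output_format egret -no_egret_weight_normalization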
+ +system $cmd; diff --git a/scripts/recaser/truecase-egret.sh b/scripts/recaser/truecase-egret.sh deleted file mode 100755 index 137d27e7d..000000000 --- a/scripts/recaser/truecase-egret.sh +++ /dev/null @@ -1,7 +0,0 @@ -#!/usr/bin/env sh - -$1 \ - -input_format egret \ - -output_format egret \ - -no_egret_weight_normalization \ - -case true:model=$3 From 1568afb73741cf6a464d6efe0b46e4c4c9d9327d Mon Sep 17 00:00:00 2001 From: Rico Sennrich Date: Wed, 18 Mar 2015 17:36:32 +0000 Subject: [PATCH 3/4] on-the-fly unbinarization of internal tree structure (for translation models extracted from binarized treebanks) --- moses/ChartKBestExtractor.cpp | 3 +- moses/FF/InternalTree.cpp | 38 +++++++++ moses/FF/InternalTree.h | 128 +++++++++++++++--------------- moses/FF/TreeStructureFeature.cpp | 18 +++++ moses/FF/TreeStructureFeature.h | 6 +- 5 files changed, 127 insertions(+), 66 deletions(-) diff --git a/moses/ChartKBestExtractor.cpp b/moses/ChartKBestExtractor.cpp index 60e4e7f2b..bd5d7cbcd 100644 --- a/moses/ChartKBestExtractor.cpp +++ b/moses/ChartKBestExtractor.cpp @@ -168,9 +168,10 @@ TreePointer ChartKBestExtractor::GetOutputTree(const Derivation &d) } mytree->Combine(previous_trees); + mytree->Unbinarize(); return mytree; } else { - UTIL_THROW2("Error: TreeStructureFeature active, but no internal tree structure found"); + UTIL_THROW2("Error: k-best tree output active, but no internal tree structure found"); } } diff --git a/moses/FF/InternalTree.cpp b/moses/FF/InternalTree.cpp index 9e974d0cd..95730f018 100644 --- a/moses/FF/InternalTree.cpp +++ b/moses/FF/InternalTree.cpp @@ -115,6 +115,44 @@ void InternalTree::Combine(const std::vector &previous) } } +//take tree with virtual nodes (created with relax-parse --RightBinarize or --LeftBinarize) and reconstruct original tree. 
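+// e.g. "[NP [DT the] [^NP [JJ old] [NN man]]]" becomes "[NP [DT the] [JJ old] [NN man]]" (illustrative example)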
+void InternalTree::Unbinarize()
+{
+
+  // nodes with virtual label cannot be unbinarized
+  if (m_value.empty() || m_value[0] == '^') {
+    return;
+  }
+
+  //if node has child that is virtual node, get unbinarized list of children
+  for (std::vector<TreePointer>::iterator it = m_children.begin(); it != m_children.end(); ++it) {
+    if (!(*it)->IsTerminal() && (*it)->GetLabel()[0] == '^') {
+      std::vector<TreePointer> new_children;
+      GetUnbinarizedChildren(new_children);
+      m_children = new_children;
+      break;
+    }
+  }
+
+  //recursion
+  for (std::vector<TreePointer>::iterator it = m_children.begin(); it != m_children.end(); ++it) {
+    (*it)->Unbinarize();
+  }
+}
+
+//get the children of a node in a binarized tree; if a child is virtual, (transitively) replace it with its children
+void InternalTree::GetUnbinarizedChildren(std::vector<TreePointer> &ret) const
+{
+  for (std::vector<TreePointer>::const_iterator itx = m_children.begin(); itx != m_children.end(); ++itx) {
+    const std::string &label = (*itx)->GetLabel();
+    if (!label.empty() && label[0] == '^') {
+      (*itx)->GetUnbinarizedChildren(ret);
+    } else {
+      ret.push_back(*itx);
+    }
+  }
+}
 
 bool InternalTree::FlatSearch(const std::string & label, std::vector<TreePointer>::const_iterator & it) const
 {
diff --git a/moses/FF/InternalTree.h b/moses/FF/InternalTree.h
index 722c5832f..f9a8ba5d8 100644
--- a/moses/FF/InternalTree.h
+++ b/moses/FF/InternalTree.h
@@ -38,6 +38,8 @@ public:
   std::string GetString(bool start = true) const;
   void Combine(const std::vector<TreePointer> &previous);
+  void Unbinarize();
+  void GetUnbinarizedChildren(std::vector<TreePointer> &children) const;
   const std::string & GetLabel() const {
     return m_value;
   }
@@ -93,6 +95,68 @@ public:
   // if found, 'it' is iterator to first tree node that matches search string, and 'parent' to its parent node
   bool RecursiveSearch(const std::vector<std::string> & labels, std::vector<TreePointer>::const_iterator & it, InternalTree const* &parent) const;
+
+  // Python-like generator that yields next nonterminal leaf on every call
+  $generator(leafNT)
+  {
+    std::vector<TreePointer>::iterator it;
+    InternalTree* tree;
+    leafNT(InternalTree* root = 0): tree(root) {}
+    $emit(std::vector<TreePointer>::iterator)
+    for (it = tree->GetChildren().begin(); it != tree->GetChildren().end(); ++it) {
+      if (!(*it)->IsTerminal() && (*it)->GetLength() == 0) {
+        $yield(it);
+      } else if ((*it)->GetLength() > 0) {
+        if ((*it).get()) { // normal pointer to same object that TreePointer points to
+          $restart(tree = (*it).get());
+        }
+      }
+    }
+    $stop;
+  };
+
+  // Python-like generator that yields the parent of the next nonterminal leaf on every call
+  $generator(leafNTParent)
+  {
+    std::vector<TreePointer>::iterator it;
+    InternalTree* tree;
+    leafNTParent(InternalTree* root = 0): tree(root) {}
+    $emit(InternalTree*)
+    for (it = tree->GetChildren().begin(); it != tree->GetChildren().end(); ++it) {
+      if (!(*it)->IsTerminal() && (*it)->GetLength() == 0) {
+        $yield(tree);
+      } else if ((*it)->GetLength() > 0) {
+        if ((*it).get()) {
+          $restart(tree = (*it).get());
+        }
+      }
+    }
+    $stop;
+  };
+
+  // Python-like generator that yields the next nonterminal leaf on every call, and also stores the path from the root of the tree to the nonterminal
+  $generator(leafNTPath)
+  {
+    std::vector<TreePointer>::iterator it;
+    InternalTree* tree;
+    std::vector<InternalTree*> * path;
+    leafNTPath(InternalTree* root = NULL, std::vector<InternalTree*> * orig = NULL): tree(root), path(orig) {}
+    $emit(std::vector<TreePointer>::iterator)
+    path->push_back(tree);
+    for (it = tree->GetChildren().begin(); it != tree->GetChildren().end(); ++it) {
+      if (!(*it)->IsTerminal() && (*it)->GetLength() == 0) {
+        path->push_back((*it).get());
+        $yield(it);
+        path->pop_back();
+      } else if ((*it)->GetLength() > 0) {
+        if ((*it).get()) {
+          $restart(tree = (*it).get());
+        }
+      }
+    }
+    path->pop_back();
+    $stop;
+  };
 };
@@ -113,68 +177,4 @@ public:
   };
 };
 
-// Python-like generator that yields next nonterminal leaf on every call
-$generator(leafNT)
-{
-  std::vector<TreePointer>::iterator it;
-  InternalTree* tree;
-  leafNT(InternalTree* root = 0): tree(root) {}
-  $emit(std::vector<TreePointer>::iterator)
-  for (it = tree->GetChildren().begin(); it != tree->GetChildren().end(); ++it) {
-    if (!(*it)->IsTerminal() && (*it)->GetLength() == 0) {
-      $yield(it);
-    } else if ((*it)->GetLength() > 0) {
-      if ((*it).get()) { // normal pointer to same object that TreePointer points to
-        $restart(tree = (*it).get());
-      }
-    }
-  }
-  $stop;
-};
-
-// Python-like generator that yields the parent of the next nonterminal leaf on every call
-$generator(leafNTParent)
-{
-  std::vector<TreePointer>::iterator it;
-  InternalTree* tree;
-  leafNTParent(InternalTree* root = 0): tree(root) {}
-  $emit(InternalTree*)
-  for (it = tree->GetChildren().begin(); it != tree->GetChildren().end(); ++it) {
-    if (!(*it)->IsTerminal() && (*it)->GetLength() == 0) {
-      $yield(tree);
-    } else if ((*it)->GetLength() > 0) {
-      if ((*it).get()) {
-        $restart(tree = (*it).get());
-      }
-    }
-  }
-  $stop;
-};
-
-// Python-like generator that yields the next nonterminal leaf on every call, and also stores the path from the root of the tree to the nonterminal
-$generator(leafNTPath)
-{
-  std::vector<TreePointer>::iterator it;
-  InternalTree* tree;
-  std::vector<InternalTree*> * path;
-  leafNTPath(InternalTree* root = NULL, std::vector<InternalTree*> * orig = NULL): tree(root), path(orig) {}
-  $emit(std::vector<TreePointer>::iterator)
-  path->push_back(tree);
-  for (it = tree->GetChildren().begin(); it != tree->GetChildren().end(); ++it) {
-    if (!(*it)->IsTerminal() && (*it)->GetLength() == 0) {
-      path->push_back((*it).get());
-      $yield(it);
-      path->pop_back();
-    } else if ((*it)->GetLength() > 0) {
-      if ((*it).get()) {
-        $restart(tree = (*it).get());
-      }
-    }
-  }
-  path->pop_back();
-  $stop;
-};
-
 }
\ No newline at end of file
diff --git a/moses/FF/TreeStructureFeature.cpp b/moses/FF/TreeStructureFeature.cpp
index e558b06bc..f2988f2b9 100644
--- a/moses/FF/TreeStructureFeature.cpp
+++ b/moses/FF/TreeStructureFeature.cpp
@@ -70,6 +70,11 @@ FFState* TreeStructureFeature::EvaluateWhenApplied(const ChartHypothesis& cur_hy
     }
     mytree->Combine(previous_trees);
+    bool full_sentence = (mytree->GetChildren().back()->GetLabel() == "</s>" || (mytree->GetChildren().back()->GetLabel() == "SEND" && mytree->GetChildren().back()->GetChildren().back()->GetLabel() == "</s>"));
+    if (m_binarized && full_sentence) {
+      mytree->Unbinarize();
+    }
+
     return new TreeState(mytree);
   } else {
     UTIL_THROW2("Error: TreeStructureFeature active, but no internal tree structure found");
@@ -77,4 +82,17 @@ FFState* TreeStructureFeature::EvaluateWhenApplied(const ChartHypothesis& cur_hy
 }
 
+void TreeStructureFeature::SetParameter(const std::string& key, const std::string& value)
+{
+  std::cerr << "setting: " << this->GetScoreProducerDescription() << " - " << key << "\n";
+  if (key == "tuneable") {
+    m_tuneable = Scan<bool>(value);
+  } else if (key == "filterable") { //ignore
+  } else if (key == "binarized") { // if trees have been binarized before learning translation model; output unbinarized trees
+    m_binarized = true;
+  } else {
+    UTIL_THROW(util::Exception, "Unknown argument " << key << "=" << value);
+  }
+}
+
 }
diff --git a/moses/FF/TreeStructureFeature.h b/moses/FF/TreeStructureFeature.h
index d5ec4edda..ecb2ce7cb 100644
--- a/moses/FF/TreeStructureFeature.h
+++ 
b/moses/FF/TreeStructureFeature.h @@ -34,9 +34,11 @@ class TreeStructureFeature : public StatefulFeatureFunction { SyntaxConstraints* m_constraints; LabelSet* m_labelset; + bool m_binarized; public: TreeStructureFeature(const std::string &line) - :StatefulFeatureFunction(0, line) { + :StatefulFeatureFunction(0, line) + , m_binarized(false) { ReadParameters(); } ~TreeStructureFeature() { @@ -53,6 +55,8 @@ public: return true; } + void SetParameter(const std::string& key, const std::string& value); + void EvaluateInIsolation(const Phrase &source , const TargetPhrase &targetPhrase , ScoreComponentCollection &scoreBreakdown From eab513b63527da05eee21a61a0c85c1a218c9e3e Mon Sep 17 00:00:00 2001 From: Rico Sennrich Date: Wed, 18 Mar 2015 17:39:45 +0000 Subject: [PATCH 4/4] relational dependency language model --- moses/FF/Factory.cpp | 2 + moses/LM/Jamfile | 7 +- moses/LM/RDLM.cpp | 832 ++++++++++++++++++ moses/LM/RDLM.h | 245 ++++++ scripts/training/rdlm/README | 49 ++ .../training/rdlm/average_null_embedding.py | 45 + .../training/rdlm/extract_syntactic_ngrams.py | 262 ++++++ scripts/training/rdlm/extract_vocab.py | 169 ++++ scripts/training/rdlm/train_model_head.sh | 65 ++ scripts/training/rdlm/train_model_label.sh | 72 ++ 10 files changed, 1747 insertions(+), 1 deletion(-) create mode 100644 moses/LM/RDLM.cpp create mode 100644 moses/LM/RDLM.h create mode 100644 scripts/training/rdlm/README create mode 100755 scripts/training/rdlm/average_null_embedding.py create mode 100755 scripts/training/rdlm/extract_syntactic_ngrams.py create mode 100755 scripts/training/rdlm/extract_vocab.py create mode 100755 scripts/training/rdlm/train_model_head.sh create mode 100755 scripts/training/rdlm/train_model_label.sh diff --git a/moses/FF/Factory.cpp b/moses/FF/Factory.cpp index 37a818c08..218e458ff 100644 --- a/moses/FF/Factory.cpp +++ b/moses/FF/Factory.cpp @@ -114,6 +114,7 @@ #ifdef LM_NEURAL #include "moses/LM/NeuralLMWrapper.h" +#include "moses/LM/RDLM.h" #include "moses/LM/bilingual-lm/BiLM_NPLM.h" #endif @@ -296,6 +297,7 @@ FeatureRegistry::FeatureRegistry() #endif #ifdef LM_NEURAL MOSES_FNAME2("NeuralLM", NeuralLMWrapper); + MOSES_FNAME(RDLM); MOSES_FNAME2("BilingualNPLM", BilingualLM_NPLM); #endif #ifdef LM_DALM diff --git a/moses/LM/Jamfile b/moses/LM/Jamfile index 44bbecdd1..6dac9179f 100644 --- a/moses/LM/Jamfile +++ b/moses/LM/Jamfile @@ -90,8 +90,13 @@ if $(with-nplm) { lib nplm : : $(with-nplm)/lib $(with-nplm)/lib64 ; obj NeuralLMWrapper.o : NeuralLMWrapper.cpp nplm ..//headers : $(with-nplm)/src $(with-nplm)/3rdparty/eigen ; obj BiLM_NPLM.o : bilingual-lm/BiLM_NPLM.cpp nplm ..//headers : $(with-nplm)/src $(with-nplm)/3rdparty/eigen ; - alias neural : NeuralLMWrapper.o BiLM_NPLM.o nplm : : : -fopenmp -fopenmp LM_NEURAL ; + obj RDLM.o : RDLM.cpp nplm ..//headers : $(with-nplm)/src $(with-nplm)/3rdparty/eigen ; + alias neural : NeuralLMWrapper.o nplm : : : -fopenmp -fopenmp LM_NEURAL ; + alias bilinguallm : BiLM_NPLM.o nplm : : : -fopenmp -fopenmp LM_NEURAL ; + alias rdlm : RDLM.o nplm : : : -fopenmp -fopenmp LM_NEURAL ; dependencies += neural ; + dependencies += bilinguallm ; + dependencies += rdlm ; lmmacros += LM_NEURAL ; } diff --git a/moses/LM/RDLM.cpp b/moses/LM/RDLM.cpp new file mode 100644 index 000000000..f531ade28 --- /dev/null +++ b/moses/LM/RDLM.cpp @@ -0,0 +1,832 @@ +#include "RDLM.h" +#include +#include "moses/StaticData.h" +#include "moses/ScoreComponentCollection.h" +#include "moses/ChartHypothesis.h" +#include "moses/InputFileStream.h" +#include "moses/Util.h" +#include 
"util/exception.hh" +#include "neuralTM.h" + +namespace Moses +{ + +typedef Eigen::Map > EigenMap; + +RDLM::~RDLM() { + delete lm_head_base_instance_; + delete lm_label_base_instance_; +} + +void RDLM::Load() { + + lm_head_base_instance_ = new nplm::neuralTM(); + lm_head_base_instance_->read(m_path_head_lm); + + m_sharedVocab = lm_head_base_instance_->get_input_vocabulary().words() == lm_head_base_instance_->get_output_vocabulary().words(); +// std::cerr << "Does head RDLM share vocabulary for input/output? " << m_sharedVocab << std::endl; + + lm_label_base_instance_ = new nplm::neuralTM(); + lm_label_base_instance_->read(m_path_label_lm); + + if (m_premultiply) { + lm_head_base_instance_->premultiply(); + lm_label_base_instance_->premultiply(); + } + + lm_head_base_instance_->set_cache(m_cacheSize); + lm_label_base_instance_->set_cache(m_cacheSize); + + StaticData &staticData = StaticData::InstanceNonConst(); + if (staticData.GetTreeStructure() == NULL) { + staticData.SetTreeStructure(this); + } + + offset_up_head = 2*m_context_left + 2*m_context_right; + offset_up_label = 2*m_context_left + 2*m_context_right + m_context_up; + + size_head = 2*m_context_left + 2*m_context_right + 2*m_context_up + 2; + size_label = 2*m_context_left + 2*m_context_right + 2*m_context_up + 1; + + UTIL_THROW_IF2(size_head != lm_head_base_instance_->get_order(), + "Error: order of head LM (" << lm_head_base_instance_->get_order() << ") does not match context size specified (left_context=" << m_context_left << " , right_context=" << m_context_right << " , up_context=" << m_context_up << " for a total order of " << size_head); + UTIL_THROW_IF2(size_label != lm_label_base_instance_->get_order(), + "Error: order of label LM (" << lm_label_base_instance_->get_order() << ") does not match context size specified (left_context=" << m_context_left << " , right_context=" << m_context_right << " , up_context=" << m_context_up << " for a total order of " << size_label); + + //get int value of commonly used tokens + static_head_null.resize(size_head); + for (unsigned int i = 0; i < size_head; i++) { + char numstr[20]; + sprintf(numstr, "", i); + static_head_null[i] = lm_head_base_instance_->lookup_input_word(numstr); + } + + static_label_null.resize(size_label); + for (unsigned int i = 0; i < size_label; i++) { + char numstr[20]; + sprintf(numstr, "", i); + static_label_null[i] = lm_label_base_instance_->lookup_input_word(numstr); + } + + static_dummy_head = lm_head_base_instance_->lookup_input_word(dummy_head); + + static_start_head = lm_head_base_instance_->lookup_input_word(""); + static_start_label = lm_head_base_instance_->lookup_input_word(""); + + static_head_head = lm_head_base_instance_->lookup_input_word(""); + static_head_label = lm_head_base_instance_->lookup_input_word(""); + static_head_label_output = lm_label_base_instance_->lookup_output_word(""); + + static_stop_head = lm_head_base_instance_->lookup_input_word(""); + static_stop_label = lm_head_base_instance_->lookup_input_word(""); + static_stop_label_output = lm_label_base_instance_->lookup_output_word(""); + static_start_label_output = lm_label_base_instance_->lookup_output_word(""); + + static_root_head = lm_head_base_instance_->lookup_input_word(""); + static_root_label = lm_head_base_instance_->lookup_input_word(""); + + // just score provided file, then exit. 
+ if (!m_debugPath.empty()) { + ScoreFile(m_debugPath); + exit(1); + } + +// { +// TreePointer mytree (new InternalTree("[vroot [subj [PPER ich]] [VAFIN bin] [pred [det [ART die]] [attr [adv [adv [PTKNEG nicht]] [ADV fast]] [ADJA neue]] [attr [ADJA europäische]] [NN Zeit]]]")); +// TreePointer mytree3 (new InternalTree("[ADJA europäische]")); +// TreePointer mytree4 (new InternalTree("[pred [det [ART die]] [attr [adv [adv [PTKNEG nicht]] [ADV fast]] [ADJA neue]] [attr [ADJA]] [NN Zeit]]]")); +// TreePointer mytree2 (new InternalTree("[vroot [subj [PPER ich]] [VAFIN bin] [pred]]")); +// +// std::vector ancestor_heads; +// std::vector ancestor_labels; +// +// size_t boundary_hash(0); +// boost::array score; +// score.fill(0); +// std::cerr << "scoring: " << mytree3->GetString() << std::endl; +// std::vector previous_trees; +// TreePointerMap back_pointers = AssociateLeafNTs(mytree3.get(), previous_trees); +// Score(mytree3.get(), back_pointers, score, ancestor_heads, ancestor_labels, boundary_hash); +// std::cerr << "head LM: " << score[0] << "label LM: " << score[2] << " approx: " << score[1] << " - " << score[3] << std::endl; +// +// previous_trees.push_back(mytree3); +// back_pointers = AssociateLeafNTs(mytree4.get(), previous_trees); +// std::cerr << "scoring: " << mytree4->GetString() << std::endl; +// Score(mytree4.get(), back_pointers, score, ancestor_heads, ancestor_labels, boundary_hash); +// std::cerr << "head LM: " << score[0] << "label LM: " << score[2] << " approx: " << score[1] << " - " << score[3] << std::endl; +// +// mytree4->Combine(previous_trees); +// previous_trees.clear(); +// previous_trees.push_back(mytree4); +// back_pointers = AssociateLeafNTs(mytree2.get(), previous_trees); +// std::cerr << "scoring: " << mytree2->GetString() << std::endl; +// +// score[1] = 0; +// score[3] = 0; +// Score(mytree2.get(), back_pointers, score, ancestor_heads, ancestor_labels, boundary_hash); +// std::cerr << "head LM: " << score[0] << "label LM: " << score[2] << " approx: " << score[1] << " - " << score[3] << std::endl; +// +// score[0] = 0; +// score[1] = 0; +// score[2] = 0; +// score[3] = 0; +// std::cerr << "scoring: " << mytree->GetString() << std::endl; +// +// Score(mytree.get(), back_pointers, score, ancestor_heads, ancestor_labels, boundary_hash); +// std::cerr << "head LM: " << score[0] << "label LM: " << score[2] << " approx: " << score[1] << " - " << score[3] << std::endl; +// +// } +// UTIL_THROW2("Finished"); +// +// } +// +// { +// std::cerr << "BINARIZED\n\n"; +// TreePointer mytree (new InternalTree("[vroot [subj [PPER ich]] [^vroot [VAFIN bin] [pred [det [ART die]] [^pred [attr [adv [adv [PTKNEG nicht]] [ADV fast]] [ADJA neue]] [^pred [attr [ADJA europäische]] [NN Zeit]]]]]]")); +// TreePointer mytree3 (new InternalTree("[ADJA europäische]")); +// TreePointer mytree4 (new InternalTree("[^pred [attr [adv [adv [PTKNEG nicht]] [ADV fast]] [ADJA neue]] [^pred [attr [ADJA]] [NN Zeit]]]")); +// TreePointer mytree2 (new InternalTree("[vroot [subj [PPER ich]] [^vroot [VAFIN bin] [pred [det [ART die]] [^pred]]]]")); +// +// std::vector ancestor_heads; +// std::vector ancestor_labels; +// +// size_t boundary_hash(0); +// boost::array score; +// score.fill(0); +// std::cerr << "scoring: " << mytree3->GetString() << std::endl; +// std::vector previous_trees; +// TreePointerMap back_pointers = AssociateLeafNTs(mytree3.get(), previous_trees); +// Score(mytree3.get(), back_pointers, score, ancestor_heads, ancestor_labels, boundary_hash); +// std::cerr << "head LM: " << score[0] << 
" label LM: " << score[2] << " approx: " << score[1] << " - " << score[3] << std::endl; +// +// previous_trees.push_back(mytree3); +// back_pointers = AssociateLeafNTs(mytree4.get(), previous_trees); +// std::cerr << "scoring: " << mytree4->GetString() << std::endl; +// Score(mytree4.get(), back_pointers, score, ancestor_heads, ancestor_labels, boundary_hash); +// std::cerr << "head LM: " << score[0] << " label LM: " << score[2] << " approx: " << score[1] << " - " << score[3] << std::endl; +// +// mytree4->Combine(previous_trees); +// previous_trees.clear(); +// previous_trees.push_back(mytree4); +// back_pointers = AssociateLeafNTs(mytree2.get(), previous_trees); +// std::cerr << "scoring: " << mytree2->GetString() << std::endl; +// +// score[1] = 0; +// score[3] = 0; +// Score(mytree2.get(), back_pointers, score, ancestor_heads, ancestor_labels, boundary_hash); +// std::cerr << "head LM: " << score[0] << " label LM: " << score[2] << " approx: " << score[1] << " - " << score[3] << std::endl; +// +// score[0] = 0; +// score[1] = 0; +// score[2] = 0; +// score[3] = 0; +// std::cerr << "scoring: " << mytree->GetString() << std::endl; +// +// Score(mytree.get(), back_pointers, score, ancestor_heads, ancestor_labels, boundary_hash); +// std::cerr << "head LM: " << score[0] << " label LM: " << score[2] << " approx: " << score[1] << " - " << score[3] << std::endl; +// +// } +// UTIL_THROW2("Finished"); + +} + + +void RDLM::Score(InternalTree* root, const TreePointerMap & back_pointers, boost::array &score, std::vector &ancestor_heads, std::vector &ancestor_labels, size_t &boundary_hash, int num_virtual, int rescoring_levels) const +{ + + // ignore terminal nodes + if (root->IsTerminal()) { + return; + } + + // ignore glue rules + if (root->GetLabel() == m_glueSymbol) { + // recursion + for (std::vector::const_iterator it = root->GetChildren().begin(); it != root->GetChildren().end(); ++it) + { + Score(it->get(), back_pointers, score, ancestor_heads, ancestor_labels, boundary_hash, num_virtual, rescoring_levels); + } + return; + } + + // ignore virtual nodes (in binarization; except if it's the root) + if (m_binarized && root->GetLabel()[0] == '^' && !ancestor_heads.empty()) { + // recursion + if (root->IsLeafNT() && m_context_up > 1 && ancestor_heads.size()) { + root = back_pointers.find(root)->second.get(); + rescoring_levels = m_context_up-1; + } + for (std::vector::const_iterator it = root->GetChildren().begin(); it != root->GetChildren().end(); ++it) { + Score(it->get(), back_pointers, score, ancestor_heads, ancestor_labels, boundary_hash, num_virtual, rescoring_levels); + } + return; + } + + // ignore start/end of sentence tags + if (root->GetLabel() == m_startSymbol || root->GetLabel() == m_endSymbol) { + return; + } + + nplm::neuralTM *lm_head = lm_head_backend_.get(); + if (!lm_head) { + lm_head = new nplm::neuralTM(*lm_head_base_instance_); + lm_head->set_normalization(m_normalizeHeadLM); + lm_head->set_cache(m_cacheSize); + lm_head_backend_.reset(lm_head); + } + + // ignore preterminal node (except if we're scoring root nodes) + if (root->GetLength() == 1 && root->GetChildren()[0]->IsTerminal()) { + // root of tree: score without context + if (ancestor_heads.empty() || (ancestor_heads.size() == m_context_up && ancestor_heads.back() == static_root_head)) { + std::vector ngram_head_null (static_head_null); + ngram_head_null.back() = lm_head->lookup_output_word(root->GetChildren()[0]->GetLabel()); + if (m_isPretermBackoff && ngram_head_null.back() == 0) { + ngram_head_null.back() = 
lm_head->lookup_output_word(root->GetLabel()); + } + if (ancestor_heads.size() == m_context_up && ancestor_heads.back() == static_root_head) { + std::vector::iterator it = ngram_head_null.begin(); + std::fill_n(it, m_context_left, static_start_head); + it += m_context_left; + std::fill_n(it, m_context_left, static_start_label); + it += m_context_left; + std::fill_n(it, m_context_right, static_stop_head); + it += m_context_right; + std::fill_n(it, m_context_right, static_stop_label); + it += m_context_right; + size_t context_up_nonempty = std::min(m_context_up, ancestor_heads.size()); + it = std::copy(ancestor_heads.end()-context_up_nonempty, ancestor_heads.end(), it); + it = std::copy(ancestor_labels.end()-context_up_nonempty, ancestor_labels.end(), it); + } + if (ancestor_labels.size() >= m_context_up && !num_virtual) { + score[0] += FloorScore(lm_head->lookup_ngram(EigenMap(ngram_head_null.data(), ngram_head_null.size()))); + } + else { + boost::hash_combine(boundary_hash, ngram_head_null.back()); + score[1] += FloorScore(lm_head->lookup_ngram(EigenMap(ngram_head_null.data(), ngram_head_null.size()))); + } + } + return; + // we only need to re-visit previous hypotheses if we have more context available. + } else if (root->IsLeafNT()) { + if (m_context_up > 1 && ancestor_heads.size()) { + root = back_pointers.find(root)->second.get(); + // ignore preterminal node + if (root->GetLength() == 1 && root->GetChildren()[0]->IsTerminal()) { + return; + } + rescoring_levels = m_context_up-1; + } + else { + return; + } + } + + nplm::neuralTM *lm_label = lm_label_backend_.get(); + if (!lm_label) { + lm_label = new nplm::neuralTM(*lm_label_base_instance_); + lm_label->set_normalization(m_normalizeLabelLM); + lm_label->set_cache(m_cacheSize); + lm_label_backend_.reset(lm_label); + } + + std::pair head_ids; + InternalTree* found = GetHead(root, back_pointers, head_ids); + if (found == NULL) { + head_ids = std::make_pair(static_dummy_head, static_dummy_head); + } + + size_t context_up_nonempty = std::min(m_context_up, ancestor_heads.size()); + const std::string & head_label = root->GetLabel(); + bool virtual_head = false; + int reached_end = 0; + int label_idx, label_idx_out; + if (m_binarized && head_label[0] == '^') { + virtual_head = true; + if (m_binarized == 1 || (m_binarized == 3 && head_label[2] == 'l')) { + reached_end = 1; //indicate that we've seen the first symbol of the RHS + } + else if (m_binarized == 2 || (m_binarized == 3 && head_label[2] == 'r')) { + reached_end = 2; // indicate that we've seen the last symbol of the RHS + } + // with 'full' binarization, direction is encoded in 2nd char + std::string clipped_label = (m_binarized == 3) ? head_label.substr(2,head_label.size()-2) : head_label.substr(1,head_label.size()-1); + label_idx = lm_label->lookup_input_word(clipped_label); + label_idx_out = lm_label->lookup_output_word(clipped_label); + } + else { + reached_end = 3; // indicate that we've seen first and last symbol of the RHS + label_idx = lm_label->lookup_input_word(head_label); + label_idx_out = lm_label->lookup_output_word(head_label); + } + + int head_idx = (virtual_head && head_ids.first == static_dummy_head) ? 
static_label_null[offset_up_head+m_context_up-1] : head_ids.first; + + // root of tree: score without context + if (ancestor_heads.empty() || (ancestor_heads.size() == m_context_up && ancestor_heads.back() == static_root_head)) { + if (head_idx != static_dummy_head && head_idx != static_head_head) { + std::vector ngram_head_null (static_head_null); + *(ngram_head_null.end()-2) = label_idx; + ngram_head_null.back() = head_ids.second; + if (ancestor_heads.size() == m_context_up && ancestor_heads.back() == static_root_head && !num_virtual) { + std::vector::iterator it = ngram_head_null.begin(); + std::fill_n(it, m_context_left, static_start_head); + it += m_context_left; + std::fill_n(it, m_context_left, static_start_label); + it += m_context_left; + std::fill_n(it, m_context_right, static_stop_head); + it += m_context_right; + std::fill_n(it, m_context_right, static_stop_label); + it += m_context_right; + it = std::copy(ancestor_heads.end()-context_up_nonempty, ancestor_heads.end(), it); + it = std::copy(ancestor_labels.end()-context_up_nonempty, ancestor_labels.end(), it); + score[0] += FloorScore(lm_head->lookup_ngram(EigenMap(ngram_head_null.data(), ngram_head_null.size()))); + } + else { + boost::hash_combine(boundary_hash, ngram_head_null.back()); + score[1] += FloorScore(lm_head->lookup_ngram(EigenMap(ngram_head_null.data(), ngram_head_null.size()))); + } + } + std::vector ngram_label_null (static_label_null); + ngram_label_null.back() = label_idx_out; + if (ancestor_heads.size() == m_context_up && ancestor_heads.back() == static_root_head && !num_virtual) { + std::vector::iterator it = ngram_label_null.begin(); + std::fill_n(it, m_context_left, static_start_head); + it += m_context_left; + std::fill_n(it, m_context_left, static_start_label); + it += m_context_left; + std::fill_n(it, m_context_right, static_stop_head); + it += m_context_right; + std::fill_n(it, m_context_right, static_stop_label); + it += m_context_right; + it = std::copy(ancestor_heads.end()-context_up_nonempty, ancestor_heads.end(), it); + it = std::copy(ancestor_labels.end()-context_up_nonempty, ancestor_labels.end(), it); + score[2] += FloorScore(lm_label->lookup_ngram(EigenMap(ngram_label_null.data(), ngram_label_null.size()))); + } + else { + boost::hash_combine(boundary_hash, ngram_label_null.back()); + score[3] += FloorScore(lm_label->lookup_ngram(EigenMap(ngram_label_null.data(), ngram_label_null.size()))); + } + } + + ancestor_heads.push_back(head_idx); + ancestor_labels.push_back(label_idx); + + if (virtual_head) { + num_virtual = m_context_up; + } + else if (num_virtual) { + --num_virtual; + } + + + // fill ancestor context (same for all children) + if (context_up_nonempty < m_context_up) { + ++context_up_nonempty; + } + size_t up_padding = m_context_up - context_up_nonempty; + + std::vector ngram (static_label_null); + + std::vector::iterator it = ngram.begin() + offset_up_head; + if (up_padding > 0) { + it += up_padding; + } + + it = std::copy(ancestor_heads.end() - context_up_nonempty, ancestor_heads.end(), it); + + if (up_padding > 0) { + it += up_padding; + } + + it = std::copy(ancestor_labels.end() - context_up_nonempty, ancestor_labels.end(), it); + + // create vectors of head/label IDs of all children + int num_children = root->GetLength(); + + // get number of children after unbinarization + if (m_binarized) { + num_children = 0; + UnbinarizedChildren real_children(root, back_pointers, m_binarized); + for (std::vector::const_iterator it = real_children.begin(); it != real_children.end(); it = 
++real_children) { + num_children++; + } + } + + if (m_context_right && (reached_end == 1 || reached_end == 3)) num_children++; //also predict start label + if (m_context_left && (reached_end == 2 || reached_end == 3)) num_children++; //also predict end label + + std::vector heads(num_children); + std::vector labels(num_children); + std::vector heads_output(num_children); + std::vector labels_output(num_children); + + GetChildHeadsAndLabels(root, back_pointers, reached_end, lm_head, lm_label, heads, labels, heads_output, labels_output); + + //left padding; only need to add this initially + if (reached_end == 1 || reached_end == 3) { + std::fill_n(ngram.begin(), m_context_left, static_start_head); + std::fill_n(ngram.begin() + m_context_left, m_context_left, static_start_label); + } + size_t left_padding = m_context_left; + size_t left_offset = 0; + size_t right_offset = std::min(heads.size(), m_context_right + 1); + size_t right_padding = m_context_right + 1 - right_offset; + + // construct context of label model and predict label + for (size_t i = 0; i != heads.size(); i++) { + + std::vector::iterator it = ngram.begin(); + + if (left_padding > 0) { + it += left_padding; + } + + it = std::copy(heads.begin()+left_offset, heads.begin()+i, it); + + if (left_padding > 0) { + it += left_padding; + } + + it = std::copy(labels.begin()+left_offset, labels.begin()+i, it); + + it = std::copy(heads.begin()+i+1, heads.begin()+right_offset, it); + + if (right_padding > 0) { + if (reached_end == 2 || reached_end == 3) { + std::fill_n(it, right_padding, static_stop_head); + it += right_padding; + } + else { + std::copy(static_label_null.begin()+offset_up_head-m_context_right-right_padding, static_label_null.begin()-m_context_right+offset_up_head, it); + } + } + + it = std::copy(labels.begin()+i+1, labels.begin()+right_offset, it); + + if (right_padding > 0) { + if (reached_end == 2 || reached_end == 3) { + std::fill_n(it, right_padding, static_stop_label); + it += right_padding; + } + else { + std::copy(static_label_null.begin()+offset_up_head-right_padding, static_label_null.begin()+offset_up_head, it); + } + } + + ngram.back() = labels_output[i]; + + if (ancestor_labels.size() >= m_context_up && !num_virtual) { + score[2] += FloorScore(lm_label->lookup_ngram(EigenMap(ngram.data(), ngram.size()))); + } + else { + boost::hash_combine(boundary_hash, ngram.back()); + score[3] += FloorScore(lm_label->lookup_ngram(EigenMap(ngram.data(), ngram.size()))); + } + + // construct context of head model and predict head + if (heads[i] != static_start_head && heads[i] != static_stop_head && heads[i] != static_dummy_head && heads[i] != static_head_head) { + + ngram.back() = labels[i]; + ngram.push_back(heads_output[i]); + + if (ancestor_labels.size() >= m_context_up && !num_virtual) { + score[0] += FloorScore(lm_head->lookup_ngram(EigenMap(ngram.data(), ngram.size()))); + } + else { + boost::hash_combine(boundary_hash, ngram.back()); + score[1] += FloorScore(lm_head->lookup_ngram(EigenMap(ngram.data(), ngram.size()))); + } + ngram.pop_back(); + } + + // next time, we need to add less start symbol padding + if (left_padding) + left_padding--; + else + left_offset++; + + if (right_offset < heads.size()) + right_offset++; + else + right_padding++; + } + + + if (rescoring_levels == 1) { + ancestor_heads.pop_back(); + ancestor_labels.pop_back(); + return; + } + // recursion + for (std::vector::const_iterator it = root->GetChildren().begin(); it != root->GetChildren().end(); ++it) + { + Score(it->get(), back_pointers, 
score, ancestor_heads, ancestor_labels, boundary_hash, num_virtual, rescoring_levels - 1); + } + ancestor_heads.pop_back(); + ancestor_labels.pop_back(); +} + +InternalTree* RDLM::GetHead(InternalTree* root, const TreePointerMap & back_pointers, std::pair & IDs, InternalTree* head_ptr) const +{ + InternalTree *tree; + + for (std::vector::const_iterator it = root->GetChildren().begin(); it != root->GetChildren().end(); ++it) + { + if ((*it)->IsLeafNT()) { + tree = back_pointers.find(it->get())->second.get(); + } + else { + tree = it->get(); + } + + if (m_binarized && tree->GetLabel()[0] == '^') { + head_ptr = GetHead(tree, back_pointers, IDs, head_ptr); + if (head_ptr != NULL && !m_isPTKVZ) { + return head_ptr; + } + } + + // assumption (only true for dependency parse): each constituent has a preterminal label, and corresponding terminal is head + // if constituent has multiple preterminals, first one is picked; if it has no preterminals, dummy_head is returned + else if (tree->GetLength() == 1 && tree->GetChildren()[0]->IsTerminal() && head_ptr == NULL) { + head_ptr = tree; + if (!m_isPTKVZ) { + GetIDs(head_ptr->GetChildren()[0]->GetLabel(), head_ptr->GetLabel(), IDs); + return head_ptr; + } + } + + // add PTKVZ to lemma of verb + else if (m_isPTKVZ && head_ptr && tree->GetLabel() == "avz") { + InternalTree *tree2; + for (std::vector::const_iterator it2 = tree->GetChildren().begin(); it2 != tree->GetChildren().end(); ++it2) { + if ((*it2)->IsLeafNT()) { + tree2 = back_pointers.find(it2->get())->second.get(); + } + else { + tree2 = it2->get(); + } + if (tree2->GetLabel() == "PTKVZ" && tree2->GetLength() == 1 && tree2->GetChildren()[0]->IsTerminal()) { + std::string verb = tree2->GetChildren()[0]->GetLabel() + head_ptr->GetChildren()[0]->GetLabel(); + GetIDs(verb, head_ptr->GetLabel(), IDs); + return head_ptr; + } + } + } + } + + if (head_ptr != NULL) { + GetIDs(head_ptr->GetChildren()[0]->GetLabel(), head_ptr->GetLabel(), IDs); + } + return head_ptr; +} + + +void RDLM::GetChildHeadsAndLabels(InternalTree *root, const TreePointerMap & back_pointers, int reached_end, const nplm::neuralTM *lm_head, const nplm::neuralTM *lm_label, std::vector & heads, std::vector & labels, std::vector & heads_output, std::vector & labels_output) const +{ + std::pair child_ids; + InternalTree* found; + size_t j = 0; + + // score start label (if enabled) for all nonterminal nodes (but not for terminal or preterminal nodes) + if (m_context_right && (reached_end == 1 || reached_end == 3)) { + heads[j] = static_start_head; + labels[j] = static_start_label; + labels_output[j] = static_start_label_output; + j++; + } + + UnbinarizedChildren real_children(root, back_pointers, m_binarized); + + // extract head words / labels + for (std::vector::const_iterator itx = real_children.begin(); itx != real_children.end(); itx = ++real_children) { + if ((*itx)->IsTerminal()) { + std::cerr << "non-terminal node " << root->GetLabel() << " has a mix of terminal and non-terminal children. This shouldn't happen..." << std::endl; + std::cerr << "children: "; + for (std::vector::const_iterator itx2 = root->GetChildren().begin(); itx2 != root->GetChildren().end(); ++itx2) { + std::cerr << (*itx2)->GetLabel() << " "; + } + std::cerr << std::endl; + // resize vectors (should we throw exception instead?) 
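+      // shrink the pre-sized head/label vectors so no unfilled slot remains for the skipped terminal child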
+      heads.pop_back();
+      labels.pop_back();
+      heads_output.pop_back();
+      labels_output.pop_back();
+      continue;
+    }
+    InternalTree* child = itx->get();
+    // also go through trees or previous hypotheses to rescore nodes for which more context has become available
+    if ((*itx)->IsLeafNT()) {
+      child = back_pointers.find(itx->get())->second.get();
+    }
+
+    // preterminal node
+    if (child->GetLength() == 1 && child->GetChildren()[0]->IsTerminal()) {
+      heads[j] = static_head_head;
+      labels[j] = static_head_label;
+      labels_output[j] = static_head_label_output;
+      j++;
+      continue;
+    }
+
+    found = GetHead(child, back_pointers, child_ids);
+    if (found == NULL) {
+      child_ids = std::make_pair(static_dummy_head, static_dummy_head);
+    }
+
+    labels[j] = lm_head->lookup_input_word(child->GetLabel());
+    labels_output[j] = lm_label->lookup_output_word(child->GetLabel());
+    heads[j] = child_ids.first;
+    heads_output[j] = child_ids.second;
+    j++;
+  }
+
+  // score end label (if enabled) for all nonterminal nodes (but not for terminal or preterminal nodes)
+  if (m_context_left && (reached_end == 2 || reached_end == 3)) {
+    heads[j] = static_stop_head;
+    labels[j] = static_stop_label;
+    labels_output[j] = static_stop_label_output;
+  }
+}
+
+
+void RDLM::GetIDs(const std::string & head, const std::string & preterminal, std::pair<int,int> & IDs) const
+{
+  IDs.first = lm_head_base_instance_->lookup_input_word(head);
+  if (m_isPretermBackoff && IDs.first == 0) {
+    IDs.first = lm_head_base_instance_->lookup_input_word(preterminal);
+  }
+  if (m_sharedVocab) {
+    IDs.second = IDs.first;
+  } else {
+    IDs.second = lm_head_base_instance_->lookup_output_word(head);
+    if (m_isPretermBackoff && IDs.second == 0) {
+      IDs.second = lm_head_base_instance_->lookup_output_word(preterminal);
+    }
+  }
+}
+
+
+void RDLM::PrintInfo(std::vector<int> &ngram, nplm::neuralTM* lm) const
+{
+  for (size_t i = 0; i < ngram.size()-1; i++) {
+    std::cerr << lm->get_input_vocabulary().words()[ngram[i]] << " ";
+  }
+  std::cerr << lm->get_output_vocabulary().words()[ngram.back()] << " ";
+
+  for (size_t i = 0; i < ngram.size(); i++) {
+    std::cerr << ngram[i] << " ";
+  }
+  std::cerr << "score: " << lm->lookup_ngram(ngram) << std::endl;
+}
+
+
+RDLM::TreePointerMap RDLM::AssociateLeafNTs(InternalTree* root, const std::vector<TreePointer> &previous) const
+{
+
+  TreePointerMap ret;
+  std::vector<TreePointer>::iterator it;
+  bool found = false;
+  InternalTree::leafNT next_leafNT(root);
+  for (std::vector<TreePointer>::const_iterator it_prev = previous.begin(); it_prev != previous.end(); ++it_prev) {
+    found = next_leafNT(it);
+    if (found) {
+      ret[it->get()] = *it_prev;
+    } else {
+      std::cerr << "Warning: leaf nonterminal not found in rule; why did this happen?\n";
+    }
+  }
+  return ret;
+}
+
+void RDLM::ScoreFile(std::string &path)
+{
+  InputFileStream inStream(path);
+  std::string line, null;
+  std::vector<int> ancestor_heads(m_context_up, static_root_head);
+  std::vector<int> ancestor_labels(m_context_up, static_root_label);
+  while(getline(inStream, line)) {
+    TreePointerMap back_pointers;
+    boost::array<float, 4> score;
+    score.fill(0);
+    InternalTree* mytree (new InternalTree(line));
+    size_t boundary_hash = 0;
+    Score(mytree, back_pointers, score, ancestor_heads, ancestor_labels, boundary_hash);
+    std::cerr << "head LM: " << score[0] << " label LM: " << score[2] << std::endl;
+  }
+}
+
+
+void RDLM::SetParameter(const std::string& key, const std::string& value)
+{
+  std::cerr << "setting: " << this->GetScoreProducerDescription() << " - " << key << "\n";
+  if (key == "tuneable") {
+    m_tuneable = Scan<bool>(value);
+  } else if (key == "filterable") { //ignore
+  } else if (key == "path_head_lm") {
+    m_path_head_lm = value;
+  } else if (key == "path_label_lm") {
+    m_path_label_lm = value;
+  } else if (key == "ptkvz") {
+    m_isPTKVZ = Scan<bool>(value);
+  } else if (key == "backoff") {
+    m_isPretermBackoff = Scan<bool>(value);
+  } else if (key == "context_up") {
+    m_context_up = Scan<size_t>(value);
+  } else if (key == "context_left") {
+    m_context_left = Scan<size_t>(value);
+  } else if (key == "context_right") {
+    m_context_right = Scan<size_t>(value);
+  } else if (key == "debug_path") {
+    m_debugPath = value;
+  } else if (key == "premultiply") {
+    m_premultiply = Scan<bool>(value);
+  } else if (key == "rerank") {
+    m_rerank = Scan<bool>(value);
+  } else if (key == "normalize_head_lm") {
+    m_normalizeHeadLM = Scan<bool>(value);
+  } else if (key == "normalize_label_lm") {
+    m_normalizeLabelLM = Scan<bool>(value);
+  } else if (key == "binarized") {
+    if (value == "left")
+      m_binarized = 1;
+    else if (value == "right")
+      m_binarized = 2;
+    else if (value == "full")
+      m_binarized = 3;
+    else
+      UTIL_THROW(util::Exception, "Unknown value for argument " << key << "=" << value);
+  } else if (key == "glue_symbol") {
+    m_glueSymbol = value;
+  } else if (key == "cache_size") {
+    m_cacheSize = Scan<int>(value);
+  } else {
+    UTIL_THROW(util::Exception, "Unknown argument " << key << "=" << value);
+  }
+}
+
+
+FFState* RDLM::EvaluateWhenApplied(const ChartHypothesis& cur_hypo
+                                   , int featureID /* used to index the state in the previous hypotheses */
+                                   , ScoreComponentCollection* accumulator) const
+{
+  if (const PhraseProperty *property = cur_hypo.GetCurrTargetPhrase().GetProperty("Tree")) {
+    const std::string *tree = property->GetValueString();
+    TreePointer mytree (boost::make_shared<InternalTree>(*tree));
+
+    //get subtrees (in target order)
+    std::vector<TreePointer> previous_trees;
+    float prev_approx_head = 0, prev_approx_label = 0; //approximated (due to lack of context) LM costs from previous hypos
+    for (size_t pos = 0; pos < cur_hypo.GetCurrTargetPhrase().GetSize(); ++pos) {
+      const Word &word = cur_hypo.GetCurrTargetPhrase().GetWord(pos);
+      if (word.IsNonTerminal()) {
+        size_t nonTermInd = cur_hypo.GetCurrTargetPhrase().GetAlignNonTerm().GetNonTermIndexMap()[pos];
+        const RDLMState* prev = static_cast<const RDLMState*>(cur_hypo.GetPrevHypo(nonTermInd)->GetFFState(featureID));
+        previous_trees.push_back(prev->GetTree());
+        prev_approx_head -= prev->GetApproximateScoreHead();
+        prev_approx_label -= prev->GetApproximateScoreLabel();
+      }
+    }
+    size_t ff_idx = accumulator->GetIndexes(this).first;
+
+    accumulator->PlusEquals(ff_idx, prev_approx_head);
+    accumulator->PlusEquals(ff_idx+1, prev_approx_label);
+
+    bool full_sentence = (mytree->GetChildren().back()->GetLabel() == m_endTag || (mytree->GetChildren().back()->GetLabel() == m_endSymbol && mytree->GetChildren().back()->GetChildren().back()->GetLabel() == m_endTag));
+    std::vector<int> ancestor_heads ((full_sentence ? m_context_up : 0), static_root_head);
+    std::vector<int> ancestor_labels ((full_sentence ? m_context_up : 0), static_root_label);
+    ancestor_heads.reserve(10);
+    ancestor_labels.reserve(10);
+
+    TreePointerMap back_pointers = AssociateLeafNTs(mytree.get(), previous_trees);
+    boost::array<float, 4> score; // score_head, approx_score_head, score_label, approx_score_label
+    score.fill(0);
+    //hash of all boundary symbols (symbols with incomplete context); trees with same hash share state for cube pruning.
+    size_t boundary_hash = 0;
+    if (!m_rerank) {
+      Score(mytree.get(), back_pointers, score, ancestor_heads, ancestor_labels, boundary_hash);
+      accumulator->PlusEquals(ff_idx, score[0] + score[1]);
+      accumulator->PlusEquals(ff_idx+1, score[2] + score[3]);
+    }
+    mytree->Combine(previous_trees);
+    if (m_rerank && full_sentence) {
+      Score(mytree.get(), back_pointers, score, ancestor_heads, ancestor_labels, boundary_hash);
+      accumulator->PlusEquals(ff_idx, score[0] + score[1]);
+      accumulator->PlusEquals(ff_idx+1, score[2] + score[3]);
+    }
+    if (m_binarized && full_sentence) {
+      mytree->Unbinarize();
+    }
+
+    return new RDLMState(mytree, score[1], score[3], boundary_hash);
+  }
+  else {
+    UTIL_THROW2("Error: RDLM active, but no internal tree structure found");
+  }
+
+}
+
+}
diff --git a/moses/LM/RDLM.h b/moses/LM/RDLM.h
new file mode 100644
index 000000000..8ae49ce76
--- /dev/null
+++ b/moses/LM/RDLM.h
@@ -0,0 +1,245 @@
+#include <string>
+#include <map>
+#include "moses/FF/StatefulFeatureFunction.h"
+#include "moses/FF/FFState.h"
+#include "moses/FF/InternalTree.h"
+
+#include <boost/thread/tss.hpp>
+#include <boost/array.hpp>
+
+// relational dependency language model, described in:
+// Sennrich, Rico (2015). Modelling and Optimizing on Syntactic N-Grams for Statistical Machine Translation. Transactions of the Association for Computational Linguistics.
+// see 'scripts/training/rdlm' for training scripts
+
+namespace nplm {
+  class neuralTM;
+}
+
+namespace Moses
+{
+
+class RDLMState : public TreeState
+{
+  float m_approx_head; //score that was approximated due to lack of context
+  float m_approx_label;
+  size_t m_hash;
+public:
+  RDLMState(TreePointer tree, float approx_head, float approx_label, size_t hash)
+    : TreeState(tree)
+    , m_approx_head(approx_head)
+    , m_approx_label(approx_label)
+    , m_hash(hash)
+  {}
+
+  float GetApproximateScoreHead() const {
+    return m_approx_head;
+  }
+
+  float GetApproximateScoreLabel() const {
+    return m_approx_label;
+  }
+
+  size_t GetHash() const {
+    return m_hash;
+  }
+
+  int Compare(const FFState& other) const {
+    if (m_hash == static_cast<const RDLMState*>(&other)->GetHash()) return 0;
+    else if (m_hash > static_cast<const RDLMState*>(&other)->GetHash()) return 1;
+    else return -1;
+  }
+};
+
+class RDLM : public StatefulFeatureFunction
+{
+  typedef std::map<InternalTree*, TreePointer> TreePointerMap;
+
+  nplm::neuralTM* lm_head_base_instance_;
+  mutable boost::thread_specific_ptr<nplm::neuralTM> lm_head_backend_;
+
+  nplm::neuralTM* lm_label_base_instance_;
+  mutable boost::thread_specific_ptr<nplm::neuralTM> lm_label_backend_;
+
+  std::string dummy_head;
+  std::string m_glueSymbol;
+  std::string m_startSymbol;
+  std::string m_endSymbol;
+  std::string m_endTag;
+  std::string m_path_head_lm;
+  std::string m_path_label_lm;
+  bool m_isPTKVZ;
+  bool m_isPretermBackoff;
+  size_t m_context_left;
+  size_t m_context_right;
+  size_t m_context_up;
+  bool m_premultiply;
+  bool m_rerank;
+  bool m_normalizeHeadLM;
+  bool m_normalizeLabelLM;
+  bool m_sharedVocab;
+  std::string m_debugPath; // score all trees in the provided file, then exit
+  int m_binarized;
+  int m_cacheSize;
+
+  size_t offset_up_head;
+  size_t offset_up_label;
+
+  size_t size_head;
+  size_t size_label;
+  std::vector<int> static_label_null;
+  std::vector<int> static_head_null;
+  int static_dummy_head;
+  int static_start_head;
+  int static_start_label;
+  int static_stop_head;
+  int static_stop_label;
+  int static_head_head;
+  int static_head_label;
+  int static_root_head;
+  int static_root_label;
+
+  int static_head_label_output;
+  int static_stop_label_output;
+  int static_start_label_output;
+
+public:
+  RDLM(const std::string &line)
+    : 
StatefulFeatureFunction(2, line)
+    , dummy_head("<dummy_head>")
+    , m_glueSymbol("Q")
+    , m_startSymbol("SSTART")
+    , m_endSymbol("SEND")
+    , m_endTag("</s>")
+    , m_isPTKVZ(false)
+    , m_isPretermBackoff(true)
+    , m_context_left(3)
+    , m_context_right(0)
+    , m_context_up(2)
+    , m_premultiply(true)
+    , m_rerank(false)
+    , m_normalizeHeadLM(false)
+    , m_normalizeLabelLM(false)
+    , m_sharedVocab(false)
+    , m_binarized(0)
+    , m_cacheSize(1000000)
+  {
+    ReadParameters();
+  }
+
+  ~RDLM();
+
+  virtual const FFState* EmptyHypothesisState(const InputType &input) const {
+    return new RDLMState(TreePointer(), 0, 0, 0);
+  }
+
+  void Score(InternalTree* root, const TreePointerMap & back_pointers, boost::array<float, 4> &score, std::vector<int> &ancestor_heads, std::vector<int> &ancestor_labels, size_t &boundary_hash, int num_virtual = 0, int rescoring_levels = 0) const;
+  InternalTree* GetHead(InternalTree* root, const TreePointerMap & back_pointers, std::pair<int,int> & IDs, InternalTree * head_ptr=NULL) const;
+  void GetChildHeadsAndLabels(InternalTree *root, const TreePointerMap & back_pointers, int reached_end, const nplm::neuralTM *lm_head, const nplm::neuralTM *lm_labels, std::vector<int> & heads, std::vector<int> & labels, std::vector<int> & heads_output, std::vector<int> & labels_output) const;
+  void GetIDs(const std::string & head, const std::string & preterminal, std::pair<int,int> & IDs) const;
+  void ScoreFile(std::string &path); //for debugging
+  void PrintInfo(std::vector<int> &ngram, nplm::neuralTM* lm) const; //for debugging
+
+  TreePointerMap AssociateLeafNTs(InternalTree* root, const std::vector<TreePointer> &previous) const;
+
+  bool IsUseable(const FactorMask &mask) const {
+    return true;
+  }
+
+  void SetParameter(const std::string& key, const std::string& value);
+  void EvaluateInIsolation(const Phrase &source
+                           , const TargetPhrase &targetPhrase
+                           , ScoreComponentCollection &scoreBreakdown
+                           , ScoreComponentCollection &estimatedFutureScore) const {};
+  void EvaluateWithSourceContext(const InputType &input
+                                 , const InputPath &inputPath
+                                 , const TargetPhrase &targetPhrase
+                                 , const StackVec *stackVec
+                                 , ScoreComponentCollection &scoreBreakdown
+                                 , ScoreComponentCollection *estimatedFutureScore = NULL) const {};
+  void EvaluateTranslationOptionListWithSourceContext(const InputType &input
+      , const TranslationOptionList &translationOptionList) const {};
+  FFState* EvaluateWhenApplied(
+    const Hypothesis& cur_hypo,
+    const FFState* prev_state,
+    ScoreComponentCollection* accumulator) const {
+    UTIL_THROW(util::Exception, "Not implemented");
+  };
+  FFState* EvaluateWhenApplied(
+    const ChartHypothesis& /* cur_hypo */,
+    int /* featureID - used to index the state in the previous hypotheses */,
+    ScoreComponentCollection* accumulator) const;
+
+  void Load();
+
+  // Iterator-class that yields all children of a node; if child is virtual node of binarized tree, its children are yielded instead.
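+  // Usage (as in Score() and GetChildHeadsAndLabels()): advance with "it = ++children" rather than "++it", so virtual nodes are expanded transparently.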
+  class UnbinarizedChildren
+  {
+  private:
+    std::vector<TreePointer>::const_iterator iter;
+    std::vector<TreePointer>::const_iterator _begin;
+    std::vector<TreePointer>::const_iterator _end;
+    InternalTree* current;
+    const TreePointerMap & back_pointers;
+    bool binarized;
+    std::vector<std::pair<InternalTree*, std::vector<TreePointer>::const_iterator> > stack;
+
+  public:
+    UnbinarizedChildren(InternalTree* root, const TreePointerMap & pointers, bool binary):
+      current(root),
+      back_pointers(pointers),
+      binarized(binary)
+    {
+      stack.reserve(10);
+      _end = current->GetChildren().end();
+      iter = current->GetChildren().begin();
+      // expand virtual node
+      while (binarized && !(*iter)->GetLabel().empty() && (*iter)->GetLabel()[0] == '^') {
+        stack.push_back(std::make_pair(current, iter));
+        // also go through trees or previous hypotheses to rescore nodes for which more context has become available
+        if ((*iter)->IsLeafNT()) {
+          current = back_pointers.find(iter->get())->second.get();
+        } else {
+          current = iter->get();
+        }
+        iter = current->GetChildren().begin();
+      }
+      _begin = iter;
+    }
+
+    std::vector<TreePointer>::const_iterator begin() const {
+      return _begin;
+    }
+    std::vector<TreePointer>::const_iterator end() const {
+      return _end;
+    }
+
+    std::vector<TreePointer>::const_iterator operator++() {
+      iter++;
+      if (iter == current->GetChildren().end()) {
+        while (!stack.empty()) {
+          std::pair<InternalTree*, std::vector<TreePointer>::const_iterator> & active = stack.back();
+          current = active.first;
+          iter = ++active.second;
+          stack.pop_back();
+          if (iter != current->GetChildren().end()) {
+            break;
+          }
+        }
+        if (iter == _end) {
+          return iter;
+        }
+      }
+      // expand virtual node
+      while (binarized && !(*iter)->GetLabel().empty() && (*iter)->GetLabel()[0] == '^') {
+        stack.push_back(std::make_pair(current, iter));
+        // also go through trees or previous hypotheses to rescore nodes for which more context has become available
+        if ((*iter)->IsLeafNT()) {
+          current = back_pointers.find(iter->get())->second.get();
+        } else {
+          current = iter->get();
+        }
+        iter = current->GetChildren().begin();
+      }
+      return iter;
+    }
+  };
+
+};
+
+}
diff --git a/scripts/training/rdlm/README b/scripts/training/rdlm/README
new file mode 100644
index 000000000..347e71f6d
--- /dev/null
+++ b/scripts/training/rdlm/README
@@ -0,0 +1,49 @@
+RDLM: relational dependency language model
+------------------------------------------
+
+This is a language model for the string-to-tree decoder with a dependency grammar.
+It should work with any corpus with projective dependency annotation in CoNLL format,
+converted into the Moses format with the script
+mosesdecoder/scripts/training/wrappers/conll2mosesxml.py.
+It depends on NPLM for neural network training and querying.
+
+Prerequisites
+-------------
+
+Install NPLM and compile Moses with it. See the instructions in the Moses documentation for details:
+
+    http://www.statmt.org/moses/?n=FactoredTraining.BuildingLanguageModel
+
+Training
+--------
+
+RDLM is designed for string-to-tree decoding with dependency annotation on the target side.
+If you have such a system, you can train RDLM on the target side of the same parallel corpus
+that is used for training the translation model.
+
+To train the model on additional monolingual data, or to test it on held-out test/dev data,
+parse and process it in the same way that the parallel corpus has been processed.
+This includes tokenization, parsing, truecasing, compound splitting etc.
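+
+For illustration, the extraction step performed inside the training scripts can also
+be run stand-alone (a sketch; file names are placeholders, the corpus is assumed on
+stdin, and the vocabulary files can be produced with extract_vocab.py):
+
+  ./extract_syntactic_ngrams.py --mode head --vocab vocab.input < corpus.mosesxml > train.head
+  ./extract_syntactic_ngrams.py --mode label --vocab vocab.input --output_vocab vocab.labels < corpus.mosesxml > train.label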
+ +RDLM is split into two neural network models, which can be trained with `train_model_head.sh` and `train_model_label.sh` +set the paths to NPLM, Moses, and the training/test files in the respective files, then execute: + + ./train_model_head.sh rdlm_head.nnlm working_dir_head + ./train_model_label.sh rdlm_label.nnlm working_dir_label + + +Decoding +-------- + +To use RDLM during decoding, add the following line to your moses.ini config: + + [feature] + RDLM path_head_lm=/path/to/rdlm_head.nnlm path_label_lm=/path/to/rdlm_label.nnlm context_up=2 context_left=3 context_right=0 + + [weight] + RDLM 0.1 0.1 + +Reference +--------- + +Sennrich, Rico (2015). Modelling and Optimizing on Syntactic N-Grams for Statistical Machine Translation. + Transactions of the Association for Computational Linguistics. diff --git a/scripts/training/rdlm/average_null_embedding.py b/scripts/training/rdlm/average_null_embedding.py new file mode 100755 index 000000000..cb67c9d75 --- /dev/null +++ b/scripts/training/rdlm/average_null_embedding.py @@ -0,0 +1,45 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# Author: Rico Sennrich + +# average embeddings of special null words for RDLM. +# Usage: average_null_embedding.py NPLM_PATH INPUT_MODEL TRAINING_FILE OUTPUT_MODEL + +import sys +import os +import numpy + +def load_model(model_file): + return nplm.NeuralLM.from_file(model_file) + +def get_weights(path, vocab, len_context): + d = [[0]*vocab for i in range(len_context)] + for line in open(path): + for i, word in enumerate(line.split()[:-1]): + d[i][int(word)] += 1 + return d + +if __name__ == "__main__": + + nplm_path = sys.argv[1] + model_input = sys.argv[2] + training_instances = sys.argv[3] + model_output = sys.argv[4] + + sys.path.append(os.path.join(nplm_path,'python')) + import nplm + + model = load_model(model_input) + + len_context = len(open(training_instances).readline().split())-1 + + sys.stderr.write('reading ngrams...') + weights = numpy.array(get_weights(training_instances, len(model.input_embeddings), len_context)) + sys.stderr.write('done\n') + + for i in range(len_context): + index = model.word_to_index_input[''.format(i)] + model.input_embeddings[index] = numpy.average(numpy.array(model.input_embeddings), weights=weights[i], axis=0) + sys.stderr.write('writing model...') + model.to_file(open(model_output,'w')) + sys.stderr.write('done\n') diff --git a/scripts/training/rdlm/extract_syntactic_ngrams.py b/scripts/training/rdlm/extract_syntactic_ngrams.py new file mode 100755 index 000000000..12d62d1e6 --- /dev/null +++ b/scripts/training/rdlm/extract_syntactic_ngrams.py @@ -0,0 +1,262 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# Author: Rico Sennrich + +# extract syntactic n-grams from dependency treebank in Moses XML format for training RDLM +# expected format can be produced with mosesdecoder/scripts/training/wrapper/conll2mosesxml.py +# OOV terminal symbols are mapped to preterminal; OOV nonterminals are mapped to 0 () + +from __future__ import print_function, unicode_literals, division +import sys +import codecs +import io +import argparse + +try: + from lxml import etree as ET +except ImportError: + from xml.etree import cElementTree as ET + +def parse_arguments(): + parser = argparse.ArgumentParser(description="extract syntactic n-grams from parsed corpus in Moses XML format for training RDLM") + + parser.add_argument('--mode', type=str, help='predict terminals (head) or dependency labels (label)', + choices=['label', 'head'], required=True) + parser.add_argument('--vocab', 
+                        metavar='PATH', type=str, required=True,
+                        help='input layer vocabulary file (one item per line; first line \'<unk>\')')
+    parser.add_argument('--output_vocab', metavar='PATH', type=str,
+                        help='output layer vocabulary file (default: use input layer vocabulary)')
+    parser.add_argument('--left_context', metavar='INT', type=int,
+                        help='size of context vector for left siblings (default: %(default)s)', default=3)
+    parser.add_argument('--right_context', metavar='INT', type=int,
+                        help='size of context vector for right siblings (default: %(default)s)', default=0)
+    parser.add_argument('--up_context', metavar='INT', type=int,
+                        help='size of context vector for ancestors (default: %(default)s)', default=2)
+    parser.add_argument('--glue_symbol', metavar='STR', type=str, default='Q',
+                        help='glue symbol. Will be skipped during extraction (default: %(default)s)')
+    parser.add_argument('--start_symbol', metavar='STR', type=str, default='SSTART',
+                        help='sentence start symbol. Will be skipped during extraction (default: %(default)s)')
+    parser.add_argument('--end_symbol', metavar='STR', type=str, default='SEND',
+                        help='sentence end symbol. Will be skipped during extraction (default: %(default)s)')
+    parser.add_argument('--ptkvz', action='store_true',
+                        help='special rule for German dependency trees: concatenate separable verb prefix and verb')
+    return parser.parse_args()
+
+def escape_text(s):
+
+    s = s.replace('|','&#124;') # factor separator
+    s = s.replace('[','&#91;') # syntax non-terminal
+    s = s.replace(']','&#93;') # syntax non-terminal
+    s = s.replace('\'','&apos;') # xml special character
+    s = s.replace('"','&quot;') # xml special character
+    return s
+
+# deterministic heuristic to get head of subtree
+def get_head(xml, add_ptkvz):
+    head = None
+    preterminal = None
+    for child in xml:
+        if not len(child):
+            if head is not None:
+                continue
+            preterminal = child.get('label')
+            head = escape_text(child.text.strip())
+
+        elif add_ptkvz and head and child.get('label') == 'avz':
+            for grandchild in child:
+                if grandchild.get('label') == 'PTKVZ':
+                    head = escape_text(grandchild.text.strip()) + head
+                    break
+
+    return head, preterminal
+
+def get_syntactic_ngrams(xml, options, vocab, output_vocab, parent_heads=None, parent_labels=None):
+
+    if len(xml):
+
+        # skip glue rules
+        if xml.get('label') == options.glue_symbol or xml.get('label') == options.start_symbol or xml.get('label') == options.end_symbol:
+            for child in xml:
+                get_syntactic_ngrams(child, options, vocab, output_vocab, parent_heads, parent_labels)
+            return
+
+        # skip virtual nodes
+        if xml.get('label') == '' or xml.get('label') == '':
+            return
+
+        if not parent_heads:
+            parent_heads = [vocab.get('', 0)] * options.up_context
+            parent_labels = [vocab.get('', 0)] * options.up_context
+
+        head, preterminal = get_head(xml, options.ptkvz)
+        if not head:
+            head = ''
+            preterminal = head
+        elif head not in vocab:
+            head = preterminal
+
+        label = xml.get('label')
+
+        # syntactic n-gram for root node
+        int_list = []
+        int_list.extend([start_head_idx] * options.left_context)
+        int_list.extend([start_label_idx] * options.left_context)
+        int_list.extend([stop_head_idx] * options.right_context)
+        int_list.extend([stop_label_idx] * options.right_context)
+        int_list.extend(parent_heads)
+        int_list.extend(parent_labels)
+
+        if options.mode == 'label':
+            int_list.append(output_vocab.get(label, 0))
+            sys.stdout.write(' '.join(map(str, int_list)) + '\n')
+        elif options.mode == 'head' and not head == '':
+            int_list.append(vocab.get(label, 0))
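+            # layout of one extracted instance (one integer id per position):
+            # [left heads][left labels][right heads][right labels][ancestor heads][ancestor labels],
+            # then, in head mode, the node's label, followed by the prediction target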
int_list.append(output_vocab.get(head, output_vocab.get(preterminal, 0))) + sys.stdout.write(' '.join(map(str, int_list)) + '\n') + + parent_heads.append(vocab.get(head, 0)) + parent_labels.append(vocab.get(label, 0)) + + # virtual start/end-of-subtree tag + if len(xml) > 0: + if options.right_context: + start = ET.Element('tree') + start2 = ET.Element('tree') + start.set('label','') + start2.set('label','XY') + start2.text = '' + start.append(start2) + xml.insert(0,start) + if options.left_context: + end = ET.Element('tree') + end2 = ET.Element('tree') + end.set('label','') + end2.set('label','XY') + end2.text = '' + end.append(end2) + xml.append(end) + + + heads = [] + preterminals = [] + labels = [] + + for child in xml: + if not len(child): + # mark that the previous sibling is the head of the structure (the head/label are not repeated because they're also head/label of the parent) + head_child = '' + preterminal_child = head_child + child_label = '' + else: + head_child, preterminal_child = get_head(child, options.ptkvz) + child_label = child.get('label') + + if head_child is None: + head_child = '' + + heads.append(head_child) + preterminals.append(preterminal_child) + labels.append(child_label) + + heads_idx = [vocab.get(heads[i], vocab.get(preterminals[i], 0)) for i in range(len(heads))] + labels_idx = [vocab.get(labels[i], 0) for i in range(len(labels))] + + #ancestor context is same for all children + up_heads = parent_heads[-options.up_context:] + up_labels = parent_labels[-options.up_context:] + + for i,child in enumerate(xml): + + # skip some special symbols, but recursively extract n-grams for its children + if options.mode == 'head' and (heads[i] == '' or heads[i] == '' or heads[i] == '' or heads[i] == ''): + parent_heads.append(vocab.get(heads[i], 0)) + parent_labels.append(vocab.get(labels[i], 0)) + get_syntactic_ngrams(child, options, vocab, output_vocab, parent_heads, parent_labels) + parent_heads.pop() + parent_labels.pop() + continue + + previous_heads = heads_idx[max(0,i-options.left_context):i] + previous_labels = labels_idx[max(0,i-options.left_context):i] + + subsequent_heads = heads_idx[i+1:i+options.right_context+1] + subsequent_labels = labels_idx[i+1:i+options.right_context+1] + + if len(previous_heads) < options.left_context: + previous_heads = [start_head_idx] * (options.left_context-len(previous_heads)) + previous_heads + previous_labels = [start_label_idx] * (options.left_context-len(previous_labels)) + previous_labels + + if len(subsequent_heads) < options.right_context: + subsequent_heads = subsequent_heads + [stop_head_idx] * (options.right_context-len(subsequent_heads)) + subsequent_labels = subsequent_labels + [stop_label_idx] * (options.right_context-len(subsequent_labels)) + + int_list = [] + int_list.extend(previous_heads) + int_list.extend(previous_labels) + int_list.extend(subsequent_heads) + int_list.extend(subsequent_labels) + int_list.extend(up_heads) + int_list.extend(up_labels) + if options.mode == 'label': + int_list.append(output_vocab.get(labels[i], 0)) + elif options.mode == 'head': + int_list.append(vocab.get(labels[i], 0)) + int_list.append(output_vocab.get(heads[i], output_vocab.get(preterminals[i], 0))) + + sys.stdout.write(' '.join(map(str, int_list)) + '\n') + + parent_heads.append(vocab.get(heads[i], vocab.get(preterminals[i], 0))) + parent_labels.append(vocab.get(labels[i], 0)) + + get_syntactic_ngrams(child, options, vocab, output_vocab, parent_heads, parent_labels) + + parent_heads.pop() + parent_labels.pop() + + +def 
load_vocab(path):
+    v = {}
+    for i,line in enumerate(io.open(path, encoding="UTF-8")):
+        v[line.strip()] = i
+    return v
+
+if __name__ == '__main__':
+
+    if sys.version_info < (3, 0):
+        sys.stderr = codecs.getwriter('UTF-8')(sys.stderr)
+        sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
+
+    options = parse_arguments()
+
+    vocab = load_vocab(options.vocab)
+
+    if options.output_vocab is None:
+        sys.stderr.write('no output vocabulary specified; using input vocabulary\n')
+        output_vocab = vocab
+    else:
+        output_vocab = load_vocab(options.output_vocab)
+
+    start_head_idx = vocab.get("<start_head>", 0)
+    start_label_idx = vocab.get("<start_label>", 0)
+    stop_head_idx = vocab.get("<stop_head>", 0)
+    stop_label_idx = vocab.get("<stop_label>", 0)
+
+    i = 0
+    for line in sys.stdin:
+        if i and not i % 50000:
+            sys.stderr.write('.')
+        if i and not i % 1000000:
+            sys.stderr.write('{0}\n'.format(i))
+        if sys.version_info < (3, 0):
+            if line == b'\n':
+                continue
+            # hack for older moses versions with inconsistent encoding of "|"
+            line = line.replace(b'&bar;', b'|')
+        else:
+            if line == '\n':
+                continue
+            # hack for older moses versions with inconsistent encoding of "|"
+            line = line.replace('&bar;', '|')
+        xml = ET.fromstring(line)
+        get_syntactic_ngrams(xml, options, vocab, output_vocab)
+        i += 1
diff --git a/scripts/training/rdlm/extract_vocab.py b/scripts/training/rdlm/extract_vocab.py
new file mode 100755
index 000000000..684fdcd32
--- /dev/null
+++ b/scripts/training/rdlm/extract_vocab.py
@@ -0,0 +1,169 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# Author: Rico Sennrich
+
+# extract 5 vocabulary files from parsed corpus in Moses XML format
+
+from __future__ import print_function, unicode_literals, division
+import sys
+import codecs
+import io
+import argparse
+from collections import Counter
+
+try:
+    from lxml import etree as ET
+except ImportError:
+    from xml.etree import cElementTree as ET
+
+def parse_arguments():
+
+    help_text = "generate 5 vocabulary files from parsed corpus in Moses XML format\n"
+    help_text += "  [PREFIX].special: around 40 symbols reserved for RDLM\n"
+    help_text += "  [PREFIX].preterminals: preterminal symbols\n"
+    help_text += "  [PREFIX].nonterminals: nonterminal symbols (which are not preterminal)\n"
+    help_text += "  [PREFIX].terminals: terminal symbols\n"
+    help_text += "  [PREFIX].all: all of the above\n"
+
+    parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, description=help_text)
+
+    parser.add_argument('--input', '-i', type=argparse.FileType('r'), default=sys.stdin, metavar='PATH',
+                        help='input text (default: standard input).')
+    parser.add_argument('--output', '-o', type=str, default='vocab', metavar='PREFIX',
+                        help='output prefix (default: "vocab")')
+    parser.add_argument('--ptkvz', action="store_true",
+                        help='special rule for German dependency trees: attach separable verb prefixes to verb')
+
+    args = parser.parse_args()
+
+    return args
+
+def escape_text(s):
+
+    s = s.replace('|','&#124;') # factor separator
+    s = s.replace('[','&#91;') # syntax non-terminal
+    s = s.replace(']','&#93;') # syntax non-terminal
+    s = s.replace('\'','&apos;') # xml special character
+    s = s.replace('"','&quot;') # xml special character
+    return s
+
+# deterministic heuristic to get head of subtree
+def get_head(xml):
+    head = None
+    preterminal = None
+    for child in xml:
+        if not len(child):
+            if head is not None:
+                continue
+            preterminal = child.get('label')
+            head = escape_text(child.text.strip())
+
+        # hack for split compounds
+        elif child[-1].get('label') == 'SEGMENT':
+            return 
escape_text(child[-1].text.strip()), 'SEGMENT'
+
+        elif args.ptkvz and head and child.get('label') == 'avz':
+            for grandchild in child:
+                if grandchild.get('label') == 'PTKVZ':
+                    head = escape_text(grandchild.text.strip()) + head
+                    break
+
+    return head, preterminal
+
+def get_vocab(xml):
+
+    if len(xml):
+
+        head, preterminal = get_head(xml)
+        if not head:
+            head = ''
+            preterminal = ''
+
+        heads[head] += 1
+        preterminals[preterminal] += 1
+
+        label = xml.get('label')
+
+        nonterminals[label] += 1
+
+        for child in xml:
+            if not len(child):
+                continue
+            get_vocab(child)
+
+
+if __name__ == '__main__':
+
+    if sys.version_info < (3, 0):
+        sys.stderr = codecs.getwriter('UTF-8')(sys.stderr)
+        sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
+        sys.stdin = codecs.getreader('UTF-8')(sys.stdin)
+
+    args = parse_arguments()
+
+    heads = Counter()
+    preterminals = Counter()
+    nonterminals = Counter()
+
+    i = 0
+    for line in args.input:
+        if i and not i % 50000:
+            sys.stderr.write('.')
+        if i and not i % 1000000:
+            sys.stderr.write('{0}\n'.format(i))
+        if line == '\n':
+            continue
+
+        # hack for older moses versions with inconsistent encoding of "|"
+        line = line.replace('&bar;', '|')
+
+        xml = ET.fromstring(line)
+        get_vocab(xml)
+        i += 1
+
+    special_tokens = ['', '', '', '', '', '', '', '', '', '', '', '', '']
+
+    for i in range(30):
+        special_tokens.append('<null_{0}>'.format(i))
+
+    f = io.open(args.output + '.special', 'w', encoding='UTF-8')
+    for item in special_tokens:
+        f.write(item + '\n')
+    f.close()
+
+    f = io.open(args.output + '.preterminals', 'w', encoding='UTF-8')
+    for item in sorted(preterminals, key=preterminals.get, reverse=True):
+        f.write(item + '\n')
+    f.close()
+
+    f = io.open(args.output + '.nonterminals', 'w', encoding='UTF-8')
+    for item in sorted(nonterminals, key=nonterminals.get, reverse=True):
+        f.write(item + '\n')
+    f.close()
+
+    f = io.open(args.output + '.terminals', 'w', encoding='UTF-8')
+    for item in sorted(heads, key=heads.get, reverse=True):
+        f.write(item + '\n')
+    f.close()
+
+    f = io.open(args.output + '.all', 'w', encoding='UTF-8')
+    special_tokens_set = set(special_tokens)
+    for item in sorted(nonterminals, key=nonterminals.get, reverse=True):
+        if item not in special_tokens_set:
+            special_tokens.append(item)
+            special_tokens_set.add(item)
+    for item in sorted(preterminals, key=preterminals.get, reverse=True):
+        if item not in special_tokens_set:
+            special_tokens.append(item)
+            special_tokens_set.add(item)
+    for item in special_tokens:
+        f.write(item + '\n')
+    i = len(special_tokens)
+
+    for item in sorted(heads, key=heads.get, reverse=True):
+        if item in special_tokens_set:
+            continue
+        i += 1
+        f.write(item + '\n')
+    f.close()
diff --git a/scripts/training/rdlm/train_model_head.sh b/scripts/training/rdlm/train_model_head.sh
new file mode 100755
index 000000000..fdead9061
--- /dev/null
+++ b/scripts/training/rdlm/train_model_head.sh
@@ -0,0 +1,65 @@
+#!/bin/bash
+
+if [ $# -eq 2 ]; then
+  OUTFILE=$1
+  WORKDIR=$2
+else
+  echo "usage: $0 <output_file> <working_dir>"
+  exit 1
+fi
+
+NPLM=/path/to/nplm
+MOSES_ROOT=/path/to/mosesdecoder
+
+INFILE=/path/to/file/in/moses/xml/format
+VALIDATIONFILE=/path/to/file/in/moses/xml/format
+#TESTFILE1=/path/to/file/in/moses/xml/format
+#TESTFILE2=/path/to/file/in/moses/xml/format
+PREFIX=$(basename $OUTFILE)
+
+EPOCHS=2
+INPUT_VOCAB_SIZE=500000
+OUTPUT_VOCAB_SIZE=500000
+MINIBATCH_SIZE=1000
+NOISE=100
+HIDDEN=0
+INPUT_EMBEDDING=150
+OUTPUT_EMBEDDING=750
+THREADS=4
+MODE=head
+UP_CONTEXT=2
+LEFT_CONTEXT=3
+RIGHT_CONTEXT=0
+
+
+mkdir -p $WORKDIR
+
+python 
$MOSES_ROOT/scripts/training/rdlm/extract_vocab.py --output $WORKDIR/vocab < $INFILE || exit 1
+
+head -n $INPUT_VOCAB_SIZE $WORKDIR/vocab.all > $WORKDIR/vocab.input
+head -n $OUTPUT_VOCAB_SIZE $WORKDIR/vocab.all > $WORKDIR/vocab.output
+
+python $MOSES_ROOT/scripts/training/rdlm/extract_syntactic_ngrams.py --vocab $WORKDIR/vocab.input --output_vocab $WORKDIR/vocab.output \
+  --mode $MODE --left_context $LEFT_CONTEXT --right_context $RIGHT_CONTEXT --up_context $UP_CONTEXT < $INFILE > $WORKDIR/train.ngrams || exit 1
+python $MOSES_ROOT/scripts/training/rdlm/extract_syntactic_ngrams.py --vocab $WORKDIR/vocab.input --output_vocab $WORKDIR/vocab.output \
+  --mode $MODE --left_context $LEFT_CONTEXT --right_context $RIGHT_CONTEXT --up_context $UP_CONTEXT < $VALIDATIONFILE > $WORKDIR/validation.ngrams || exit 1
+
+$NPLM/src/trainNeuralNetwork --train_file $WORKDIR/train.ngrams --validation_file $WORKDIR/validation.ngrams \
+  --num_epochs $EPOCHS --input_words_file $WORKDIR/vocab.input --output_words_file $WORKDIR/vocab.output --model_prefix $WORKDIR/$PREFIX \
+  --input_vocab_size $INPUT_VOCAB_SIZE --output_vocab_size $OUTPUT_VOCAB_SIZE \
+  --learning_rate 1 --minibatch_size $MINIBATCH_SIZE --num_noise_samples $NOISE --num_hidden $HIDDEN \
+  --input_embedding_dimension $INPUT_EMBEDDING --output_embedding_dimension $OUTPUT_EMBEDDING --num_threads $THREADS || exit 1
+
+python $MOSES_ROOT/scripts/training/rdlm/average_null_embedding.py $NPLM $WORKDIR/$PREFIX.$(($EPOCHS)) $WORKDIR/train.ngrams $OUTFILE || exit 1
+
+if [[ $TESTFILE1 ]]; then
+  python $MOSES_ROOT/scripts/training/rdlm/extract_syntactic_ngrams.py --vocab $WORKDIR/vocab.input --output_vocab $WORKDIR/vocab.output \
+    --mode $MODE --left_context $LEFT_CONTEXT --right_context $RIGHT_CONTEXT --up_context $UP_CONTEXT < $TESTFILE1 > $WORKDIR/test1.ngrams || exit 1
+  $NPLM/src/testNeuralNetwork --test_file $WORKDIR/test1.ngrams --model_file $OUTFILE --minibatch_size $MINIBATCH_SIZE --num_threads $THREADS || exit 1
+fi
+
+if [[ $TESTFILE2 ]]; then
+  python $MOSES_ROOT/scripts/training/rdlm/extract_syntactic_ngrams.py --vocab $WORKDIR/vocab.input --output_vocab $WORKDIR/vocab.output \
+    --mode $MODE --left_context $LEFT_CONTEXT --right_context $RIGHT_CONTEXT --up_context $UP_CONTEXT < $TESTFILE2 > $WORKDIR/test2.ngrams || exit 1
+  $NPLM/src/testNeuralNetwork --test_file $WORKDIR/test2.ngrams --model_file $OUTFILE --minibatch_size $MINIBATCH_SIZE --num_threads $THREADS || exit 1
fi
diff --git a/scripts/training/rdlm/train_model_label.sh b/scripts/training/rdlm/train_model_label.sh
new file mode 100755
index 000000000..371c69a3b
--- /dev/null
+++ b/scripts/training/rdlm/train_model_label.sh
@@ -0,0 +1,72 @@
+#!/bin/bash
+
+if [ $# -eq 2 ]; then
+  OUTFILE=$1
+  WORKDIR=$2
+else
+  echo "usage: $0 <output_file> <working_dir>"
+  exit 1
+fi
+
+NPLM=/path/to/nplm
+MOSES_ROOT=/path/to/mosesdecoder
+
+INFILE=/path/to/file/in/moses/xml/format
+VALIDATIONFILE=/path/to/file/in/moses/xml/format
+#TESTFILE1=/path/to/file/in/moses/xml/format
+#TESTFILE2=/path/to/file/in/moses/xml/format
+PREFIX=$(basename $OUTFILE)
+
+EPOCHS=1
+INPUT_VOCAB_SIZE=500000
+OUTPUT_VOCAB_SIZE=75
+MINIBATCH_SIZE=1000
+NOISE=50
+HIDDEN=0
+INPUT_EMBEDDING=150
+OUTPUT_EMBEDDING=750
+THREADS=4
+MODE=label
+UP_CONTEXT=2
+LEFT_CONTEXT=3
+RIGHT_CONTEXT=0
+
+
+mkdir -p $WORKDIR
+
+python $MOSES_ROOT/scripts/training/rdlm/extract_vocab.py --output $WORKDIR/vocab < $INFILE || exit 1
+
+head -n $INPUT_VOCAB_SIZE $WORKDIR/vocab.all > $WORKDIR/vocab.input
+cat $WORKDIR/vocab.special $WORKDIR/vocab.nonterminals |
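+  # the label model predicts dependency labels, so its output vocabulary is just
+  # the special and nonterminal symbols (cf. OUTPUT_VOCAB_SIZE=75 above), while the
+  # input vocabulary is the same as for the head model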
+ grep -v "^ $WORKDIR/vocab.output + +python $MOSES_ROOT/scripts/training/rdlm/extract_syntactic_ngrams.py --vocab $WORKDIR/vocab.input --output_vocab $WORKDIR/vocab.output \ + --mode $MODE --left_context $LEFT_CONTEXT --right_context $RIGHT_CONTEXT --up_context $UP_CONTEXT < $INFILE > $WORKDIR/train.ngrams || exit 1 +python $MOSES_ROOT/scripts/training/rdlm/extract_syntactic_ngrams.py --vocab $WORKDIR/vocab.input --output_vocab $WORKDIR/vocab.output \ + --mode $MODE --left_context $LEFT_CONTEXT --right_context $RIGHT_CONTEXT --up_context $UP_CONTEXT < $VALIDATIONFILE > $WORKDIR/validation.ngrams || exit 1 + +$NPLM/src/trainNeuralNetwork --train_file $WORKDIR/train.ngrams --validation_file $WORKDIR/validation.ngrams \ + --num_epochs $EPOCHS --input_words_file $WORKDIR/vocab.input --output_words_file $WORKDIR/vocab.output --model_prefix $WORKDIR/$PREFIX \ + --input_vocab_size $INPUT_VOCAB_SIZE --output_vocab_size $OUTPUT_VOCAB_SIZE \ + --learning_rate 1 --minibatch_size $MINIBATCH_SIZE --num_noise_samples $NOISE --num_hidden $HIDDEN \ + --input_embedding_dimension $INPUT_EMBEDDING --output_embedding_dimension $OUTPUT_EMBEDDING --num_threads $THREADS || exit 1 + +python $MOSES_ROOT/scripts/training/rdlm/average_null_embedding.py $NPLM $WORKDIR/$PREFIX.$(($EPOCHS)) $WORKDIR/train.ngrams $OUTFILE || exit 1 + +if [[ $TESTFILE1 ]]; then + python $MOSES_ROOT/scripts/training/rdlm/extract_syntactic_ngrams.py --vocab $WORKDIR/vocab.input --output_vocab $WORKDIR/vocab.output \ + --mode $MODE --left_context $LEFT_CONTEXT --right_context $RIGHT_CONTEXT --up_context $UP_CONTEXT < $TESTFILE1 > $WORKDIR/test1.ngrams || exit 1 + $NPLM/src/testNeuralNetwork --test_file $WORKDIR/test1.ngrams --model_file $OUTFILE --minibatch_size $MINIBATCH_SIZE --num_threads $THREADS || exit 1 +fi + +if [[ $TESTFILE2 ]]; then + python $MOSES_ROOT/scripts/training/rdlm/extract_syntactic_ngrams.py --vocab $WORKDIR/vocab.input --output_vocab $WORKDIR/vocab.output \ + --mode $MODE --left_context $LEFT_CONTEXT --right_context $RIGHT_CONTEXT --up_context $UP_CONTEXT < $TESTFILE2 > $WORKDIR/test2.ngrams || exit 1 + $NPLM/src/testNeuralNetwork --test_file $WORKDIR/test2.ngrams --model_file $OUTFILE --minibatch_size $MINIBATCH_SIZE --num_threads $THREADS || exit 1 +fi \ No newline at end of file
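
A note on the unbinarization step added in this patch series: virtual nodes introduced
by relax-parse --LeftBinarize or --RightBinarize carry labels starting with '^', and
InternalTree::Unbinarize / UnbinarizedChildren splice their children back into the
parent on the fly. A minimal Python sketch of the same idea, using a hypothetical
(label, children) tuple representation rather than the actual InternalTree API:

  def unbinarize(tree):
      # tree is a (label, children) pair; returns an equivalent tree in which
      # every virtual node (label starting with '^') is replaced by its children
      label, children = tree
      flat = []
      for child in children:
          child = unbinarize(child)
          if child[0].startswith('^'):
              flat.extend(child[1])  # splice the virtual node's children into the parent
          else:
              flat.append(child)
      return (label, flat)

  # ('S', [('^S', [('NP', []), ('VP', [])]), ('PUNCT', [])])
  # -> ('S', [('NP', []), ('VP', []), ('PUNCT', [])])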