Merge branch 'master' of https://github.com/moses-smt/mosesdecoder
commit 6f52340c2b
@@ -168,9 +168,10 @@ TreePointer ChartKBestExtractor::GetOutputTree(const Derivation &d)
     }
 
     mytree->Combine(previous_trees);
+    mytree->Unbinarize();
     return mytree;
   } else {
-    UTIL_THROW2("Error: TreeStructureFeature active, but no internal tree structure found");
+    UTIL_THROW2("Error: k-best tree output active, but no internal tree structure found");
   }
 }
 
@@ -114,6 +114,7 @@
 
 #ifdef LM_NEURAL
 #include "moses/LM/NeuralLMWrapper.h"
+#include "moses/LM/RDLM.h"
 #include "moses/LM/bilingual-lm/BiLM_NPLM.h"
 #endif
 
@@ -296,6 +297,7 @@ FeatureRegistry::FeatureRegistry()
 #endif
 #ifdef LM_NEURAL
 MOSES_FNAME2("NeuralLM", NeuralLMWrapper);
+MOSES_FNAME(RDLM);
 MOSES_FNAME2("BilingualNPLM", BilingualLM_NPLM);
 #endif
 #ifdef LM_DALM
@@ -115,6 +115,44 @@ void InternalTree::Combine(const std::vector<TreePointer> &previous)
   }
 }
 
+// take a tree with virtual nodes (created with relax-parse --RightBinarize or --LeftBinarize) and reconstruct the original tree
+void InternalTree::Unbinarize()
+{
+  // nodes with a virtual label cannot be unbinarized
+  if (m_value.empty() || m_value[0] == '^') {
+    return;
+  }
+
+  // if the node has a child that is a virtual node, get the unbinarized list of children
+  for (std::vector<TreePointer>::iterator it = m_children.begin(); it != m_children.end(); ++it) {
+    if (!(*it)->IsTerminal() && (*it)->GetLabel()[0] == '^') {
+      std::vector<TreePointer> new_children;
+      GetUnbinarizedChildren(new_children);
+      m_children = new_children;
+      break;
+    }
+  }
+
+  // recursion
+  for (std::vector<TreePointer>::iterator it = m_children.begin(); it != m_children.end(); ++it) {
+    (*it)->Unbinarize();
+  }
+}
+
+// get the children of a node in a binarized tree; if a child is virtual, (transitively) replace it with its children
+void InternalTree::GetUnbinarizedChildren(std::vector<TreePointer> &ret) const
+{
+  for (std::vector<TreePointer>::const_iterator itx = m_children.begin(); itx != m_children.end(); ++itx) {
+    const std::string &label = (*itx)->GetLabel();
+    if (!label.empty() && label[0] == '^') {
+      (*itx)->GetUnbinarizedChildren(ret);
+    }
+    else {
+      ret.push_back(*itx);
+    }
+  }
+}
+
 bool InternalTree::FlatSearch(const std::string & label, std::vector<TreePointer>::const_iterator & it) const
 {
@@ -38,6 +38,8 @@ public:
 
   std::string GetString(bool start = true) const;
   void Combine(const std::vector<TreePointer> &previous);
+  void Unbinarize();
+  void GetUnbinarizedChildren(std::vector<TreePointer> &children) const;
   const std::string & GetLabel() const {
     return m_value;
   }
@@ -93,6 +95,68 @@ public:
   // if found, 'it' is an iterator to the first tree node that matches the search string, and 'parent' to its parent node
   bool RecursiveSearch(const std::vector<NTLabel> & labels, std::vector<TreePointer>::const_iterator & it, InternalTree const* &parent) const;
 
+  // Python-like generator that yields the next nonterminal leaf on every call
+  $generator(leafNT)
+  {
+    std::vector<TreePointer>::iterator it;
+    InternalTree* tree;
+    leafNT(InternalTree* root = 0): tree(root) {}
+    $emit(std::vector<TreePointer>::iterator)
+    for (it = tree->GetChildren().begin(); it != tree->GetChildren().end(); ++it) {
+      if (!(*it)->IsTerminal() && (*it)->GetLength() == 0) {
+        $yield(it);
+      } else if ((*it)->GetLength() > 0) {
+        if ((*it).get()) { // normal pointer to the same object that the TreePointer points to
+          $restart(tree = (*it).get());
+        }
+      }
+    }
+    $stop;
+  };
+
+
+  // Python-like generator that yields the parent of the next nonterminal leaf on every call
+  $generator(leafNTParent)
+  {
+    std::vector<TreePointer>::iterator it;
+    InternalTree* tree;
+    leafNTParent(InternalTree* root = 0): tree(root) {}
+    $emit(InternalTree*)
+    for (it = tree->GetChildren().begin(); it != tree->GetChildren().end(); ++it) {
+      if (!(*it)->IsTerminal() && (*it)->GetLength() == 0) {
+        $yield(tree);
+      } else if ((*it)->GetLength() > 0) {
+        if ((*it).get()) {
+          $restart(tree = (*it).get());
+        }
+      }
+    }
+    $stop;
+  };
+
+  // Python-like generator that yields the next nonterminal leaf on every call, and also stores the path from the root of the tree to the nonterminal
+  $generator(leafNTPath)
+  {
+    std::vector<TreePointer>::iterator it;
+    InternalTree* tree;
+    std::vector<InternalTree*> * path;
+    leafNTPath(InternalTree* root = NULL, std::vector<InternalTree*> * orig = NULL): tree(root), path(orig) {}
+    $emit(std::vector<TreePointer>::iterator)
+    path->push_back(tree);
+    for (it = tree->GetChildren().begin(); it != tree->GetChildren().end(); ++it) {
+      if (!(*it)->IsTerminal() && (*it)->GetLength() == 0) {
+        path->push_back((*it).get());
+        $yield(it);
+        path->pop_back();
+      } else if ((*it)->GetLength() > 0) {
+        if ((*it).get()) {
+          $restart(tree = (*it).get());
+        }
+      }
+    }
+    path->pop_back();
+    $stop;
+  };
+
 };
 
@@ -113,68 +177,4 @@ public:
 };
 };
 
-// Python-like generator that yields the next nonterminal leaf on every call
-$generator(leafNT)
-{
-  std::vector<TreePointer>::iterator it;
-  InternalTree* tree;
-  leafNT(InternalTree* root = 0): tree(root) {}
-  $emit(std::vector<TreePointer>::iterator)
-  for (it = tree->GetChildren().begin(); it != tree->GetChildren().end(); ++it) {
-    if (!(*it)->IsTerminal() && (*it)->GetLength() == 0) {
-      $yield(it);
-    } else if ((*it)->GetLength() > 0) {
-      if ((*it).get()) { // normal pointer to the same object that the TreePointer points to
-        $restart(tree = (*it).get());
-      }
-    }
-  }
-  $stop;
-};
-
-
-// Python-like generator that yields the parent of the next nonterminal leaf on every call
-$generator(leafNTParent)
-{
-  std::vector<TreePointer>::iterator it;
-  InternalTree* tree;
-  leafNTParent(InternalTree* root = 0): tree(root) {}
-  $emit(InternalTree*)
-  for (it = tree->GetChildren().begin(); it != tree->GetChildren().end(); ++it) {
-    if (!(*it)->IsTerminal() && (*it)->GetLength() == 0) {
-      $yield(tree);
-    } else if ((*it)->GetLength() > 0) {
-      if ((*it).get()) {
-        $restart(tree = (*it).get());
-      }
-    }
-  }
-  $stop;
-};
-
-// Python-like generator that yields the next nonterminal leaf on every call, and also stores the path from the root of the tree to the nonterminal
-$generator(leafNTPath)
-{
-  std::vector<TreePointer>::iterator it;
-  InternalTree* tree;
-  std::vector<InternalTree*> * path;
-  leafNTPath(InternalTree* root = NULL, std::vector<InternalTree*> * orig = NULL): tree(root), path(orig) {}
-  $emit(std::vector<TreePointer>::iterator)
-  path->push_back(tree);
-  for (it = tree->GetChildren().begin(); it != tree->GetChildren().end(); ++it) {
-    if (!(*it)->IsTerminal() && (*it)->GetLength() == 0) {
-      path->push_back((*it).get());
-      $yield(it);
-      path->pop_back();
-    } else if ((*it)->GetLength() > 0) {
-      if ((*it).get()) {
-        $restart(tree = (*it).get());
-      }
-    }
-  }
-  path->pop_back();
-  $stop;
-};
-
-
 }
@@ -70,6 +70,11 @@ FFState* TreeStructureFeature::EvaluateWhenApplied(const ChartHypothesis& cur_hy
     }
     mytree->Combine(previous_trees);
 
+    bool full_sentence = (mytree->GetChildren().back()->GetLabel() == "</s>" || (mytree->GetChildren().back()->GetLabel() == "SEND" && mytree->GetChildren().back()->GetChildren().back()->GetLabel() == "</s>"));
+    if (m_binarized && full_sentence) {
+      mytree->Unbinarize();
+    }
+
     return new TreeState(mytree);
   } else {
     UTIL_THROW2("Error: TreeStructureFeature active, but no internal tree structure found");
@@ -77,4 +82,17 @@ FFState* TreeStructureFeature::EvaluateWhenApplied(const ChartHypothesis& cur_hy
 
 }
 
+void TreeStructureFeature::SetParameter(const std::string& key, const std::string& value)
+{
+  std::cerr << "setting: " << this->GetScoreProducerDescription() << " - " << key << "\n";
+  if (key == "tuneable") {
+    m_tuneable = Scan<bool>(value);
+  } else if (key == "filterable") { //ignore
+  } else if (key == "binarized") { // if trees were binarized before training the translation model, output unbinarized trees
+    m_binarized = true;
+  } else {
+    UTIL_THROW(util::Exception, "Unknown argument " << key << "=" << value);
+  }
+}
+
 }
@@ -34,9 +34,11 @@ class TreeStructureFeature : public StatefulFeatureFunction
 {
   SyntaxConstraints* m_constraints;
   LabelSet* m_labelset;
+  bool m_binarized;
 public:
   TreeStructureFeature(const std::string &line)
-    :StatefulFeatureFunction(0, line) {
+    :StatefulFeatureFunction(0, line)
+    , m_binarized(false) {
     ReadParameters();
   }
   ~TreeStructureFeature() {
@@ -53,6 +55,8 @@ public:
     return true;
   }
 
+  void SetParameter(const std::string& key, const std::string& value);
+
   void EvaluateInIsolation(const Phrase &source
     , const TargetPhrase &targetPhrase
    , ScoreComponentCollection &scoreBreakdown
@@ -90,8 +90,13 @@ if $(with-nplm) {
   lib nplm : : <search>$(with-nplm)/lib <search>$(with-nplm)/lib64 ;
   obj NeuralLMWrapper.o : NeuralLMWrapper.cpp nplm ..//headers : <include>$(with-nplm)/src <include>$(with-nplm)/3rdparty/eigen ;
   obj BiLM_NPLM.o : bilingual-lm/BiLM_NPLM.cpp nplm ..//headers : <include>$(with-nplm)/src <include>$(with-nplm)/3rdparty/eigen ;
-  alias neural : NeuralLMWrapper.o BiLM_NPLM.o nplm : : : <cxxflags>-fopenmp <linkflags>-fopenmp <define>LM_NEURAL ;
+  obj RDLM.o : RDLM.cpp nplm ..//headers : <include>$(with-nplm)/src <include>$(with-nplm)/3rdparty/eigen ;
+  alias neural : NeuralLMWrapper.o nplm : : : <cxxflags>-fopenmp <linkflags>-fopenmp <define>LM_NEURAL ;
+  alias bilinguallm : BiLM_NPLM.o nplm : : : <cxxflags>-fopenmp <linkflags>-fopenmp <define>LM_NEURAL ;
+  alias rdlm : RDLM.o nplm : : : <cxxflags>-fopenmp <linkflags>-fopenmp <define>LM_NEURAL ;
   dependencies += neural ;
+  dependencies += bilinguallm ;
+  dependencies += rdlm ;
   lmmacros += LM_NEURAL ;
 }
 
moses/LM/RDLM.cpp (new file, 832 lines)
@@ -0,0 +1,832 @@
#include "RDLM.h"
#include <vector>
#include "moses/StaticData.h"
#include "moses/ScoreComponentCollection.h"
#include "moses/ChartHypothesis.h"
#include "moses/InputFileStream.h"
#include "moses/Util.h"
#include "util/exception.hh"
#include "neuralTM.h"

namespace Moses
{

typedef Eigen::Map<Eigen::Matrix<int,Eigen::Dynamic,1> > EigenMap;

RDLM::~RDLM() {
  delete lm_head_base_instance_;
  delete lm_label_base_instance_;
}

void RDLM::Load() {

  lm_head_base_instance_ = new nplm::neuralTM();
  lm_head_base_instance_->read(m_path_head_lm);

  m_sharedVocab = lm_head_base_instance_->get_input_vocabulary().words() == lm_head_base_instance_->get_output_vocabulary().words();
  // std::cerr << "Does head RDLM share vocabulary for input/output? " << m_sharedVocab << std::endl;

  lm_label_base_instance_ = new nplm::neuralTM();
  lm_label_base_instance_->read(m_path_label_lm);

  if (m_premultiply) {
    lm_head_base_instance_->premultiply();
    lm_label_base_instance_->premultiply();
  }

  lm_head_base_instance_->set_cache(m_cacheSize);
  lm_label_base_instance_->set_cache(m_cacheSize);

  StaticData &staticData = StaticData::InstanceNonConst();
  if (staticData.GetTreeStructure() == NULL) {
    staticData.SetTreeStructure(this);
  }

  offset_up_head = 2*m_context_left + 2*m_context_right;
  offset_up_label = 2*m_context_left + 2*m_context_right + m_context_up;

  size_head = 2*m_context_left + 2*m_context_right + 2*m_context_up + 2;
  size_label = 2*m_context_left + 2*m_context_right + 2*m_context_up + 1;

  UTIL_THROW_IF2(size_head != lm_head_base_instance_->get_order(),
                 "Error: order of head LM (" << lm_head_base_instance_->get_order() << ") does not match context size specified (left_context=" << m_context_left << " , right_context=" << m_context_right << " , up_context=" << m_context_up << " for a total order of " << size_head);
  UTIL_THROW_IF2(size_label != lm_label_base_instance_->get_order(),
                 "Error: order of label LM (" << lm_label_base_instance_->get_order() << ") does not match context size specified (left_context=" << m_context_left << " , right_context=" << m_context_right << " , up_context=" << m_context_up << " for a total order of " << size_label);

  // get the integer values of commonly used tokens
  static_head_null.resize(size_head);
  for (unsigned int i = 0; i < size_head; i++) {
    char numstr[20];
    sprintf(numstr, "<null_%d>", i);
    static_head_null[i] = lm_head_base_instance_->lookup_input_word(numstr);
  }

  static_label_null.resize(size_label);
  for (unsigned int i = 0; i < size_label; i++) {
    char numstr[20];
    sprintf(numstr, "<null_%d>", i);
    static_label_null[i] = lm_label_base_instance_->lookup_input_word(numstr);
  }

  static_dummy_head = lm_head_base_instance_->lookup_input_word(dummy_head);

  static_start_head = lm_head_base_instance_->lookup_input_word("<start_head>");
  static_start_label = lm_head_base_instance_->lookup_input_word("<start_label>");

  static_head_head = lm_head_base_instance_->lookup_input_word("<head_head>");
  static_head_label = lm_head_base_instance_->lookup_input_word("<head_label>");
  static_head_label_output = lm_label_base_instance_->lookup_output_word("<head_label>");

  static_stop_head = lm_head_base_instance_->lookup_input_word("<stop_head>");
  static_stop_label = lm_head_base_instance_->lookup_input_word("<stop_label>");
  static_stop_label_output = lm_label_base_instance_->lookup_output_word("<stop_label>");
  static_start_label_output = lm_label_base_instance_->lookup_output_word("<start_label>");

  static_root_head = lm_head_base_instance_->lookup_input_word("<root_head>");
  static_root_label = lm_head_base_instance_->lookup_input_word("<root_label>");

  // just score the provided file, then exit
  if (!m_debugPath.empty()) {
    ScoreFile(m_debugPath);
    exit(1);
  }

  // {
  //   TreePointer mytree (new InternalTree("[vroot [subj [PPER ich]] [VAFIN bin] [pred [det [ART die]] [attr [adv [adv [PTKNEG nicht]] [ADV fast]] [ADJA neue]] [attr [ADJA europäische]] [NN Zeit]]]"));
  //   TreePointer mytree3 (new InternalTree("[ADJA europäische]"));
  //   TreePointer mytree4 (new InternalTree("[pred [det [ART die]] [attr [adv [adv [PTKNEG nicht]] [ADV fast]] [ADJA neue]] [attr [ADJA]] [NN Zeit]]]"));
  //   TreePointer mytree2 (new InternalTree("[vroot [subj [PPER ich]] [VAFIN bin] [pred]]"));
  //
  //   std::vector<int> ancestor_heads;
  //   std::vector<int> ancestor_labels;
  //
  //   size_t boundary_hash(0);
  //   boost::array<float, 4> score;
  //   score.fill(0);
  //   std::cerr << "scoring: " << mytree3->GetString() << std::endl;
  //   std::vector<TreePointer> previous_trees;
  //   TreePointerMap back_pointers = AssociateLeafNTs(mytree3.get(), previous_trees);
  //   Score(mytree3.get(), back_pointers, score, ancestor_heads, ancestor_labels, boundary_hash);
  //   std::cerr << "head LM: " << score[0] << "label LM: " << score[2] << " approx: " << score[1] << " - " << score[3] << std::endl;
  //
  //   previous_trees.push_back(mytree3);
  //   back_pointers = AssociateLeafNTs(mytree4.get(), previous_trees);
  //   std::cerr << "scoring: " << mytree4->GetString() << std::endl;
  //   Score(mytree4.get(), back_pointers, score, ancestor_heads, ancestor_labels, boundary_hash);
  //   std::cerr << "head LM: " << score[0] << "label LM: " << score[2] << " approx: " << score[1] << " - " << score[3] << std::endl;
  //
  //   mytree4->Combine(previous_trees);
  //   previous_trees.clear();
  //   previous_trees.push_back(mytree4);
  //   back_pointers = AssociateLeafNTs(mytree2.get(), previous_trees);
  //   std::cerr << "scoring: " << mytree2->GetString() << std::endl;
  //
  //   score[1] = 0;
  //   score[3] = 0;
  //   Score(mytree2.get(), back_pointers, score, ancestor_heads, ancestor_labels, boundary_hash);
  //   std::cerr << "head LM: " << score[0] << "label LM: " << score[2] << " approx: " << score[1] << " - " << score[3] << std::endl;
  //
  //   score[0] = 0;
  //   score[1] = 0;
  //   score[2] = 0;
  //   score[3] = 0;
  //   std::cerr << "scoring: " << mytree->GetString() << std::endl;
  //
  //   Score(mytree.get(), back_pointers, score, ancestor_heads, ancestor_labels, boundary_hash);
  //   std::cerr << "head LM: " << score[0] << "label LM: " << score[2] << " approx: " << score[1] << " - " << score[3] << std::endl;
  //
  // }
  // UTIL_THROW2("Finished");
  //
  // }
  //
  // {
  //   std::cerr << "BINARIZED\n\n";
  //   TreePointer mytree (new InternalTree("[vroot [subj [PPER ich]] [^vroot [VAFIN bin] [pred [det [ART die]] [^pred [attr [adv [adv [PTKNEG nicht]] [ADV fast]] [ADJA neue]] [^pred [attr [ADJA europäische]] [NN Zeit]]]]]]"));
  //   TreePointer mytree3 (new InternalTree("[ADJA europäische]"));
  //   TreePointer mytree4 (new InternalTree("[^pred [attr [adv [adv [PTKNEG nicht]] [ADV fast]] [ADJA neue]] [^pred [attr [ADJA]] [NN Zeit]]]"));
  //   TreePointer mytree2 (new InternalTree("[vroot [subj [PPER ich]] [^vroot [VAFIN bin] [pred [det [ART die]] [^pred]]]]"));
  //
  //   std::vector<int> ancestor_heads;
  //   std::vector<int> ancestor_labels;
  //
  //   size_t boundary_hash(0);
  //   boost::array<float, 4> score;
  //   score.fill(0);
  //   std::cerr << "scoring: " << mytree3->GetString() << std::endl;
  //   std::vector<TreePointer> previous_trees;
  //   TreePointerMap back_pointers = AssociateLeafNTs(mytree3.get(), previous_trees);
  //   Score(mytree3.get(), back_pointers, score, ancestor_heads, ancestor_labels, boundary_hash);
  //   std::cerr << "head LM: " << score[0] << " label LM: " << score[2] << " approx: " << score[1] << " - " << score[3] << std::endl;
  //
  //   previous_trees.push_back(mytree3);
  //   back_pointers = AssociateLeafNTs(mytree4.get(), previous_trees);
  //   std::cerr << "scoring: " << mytree4->GetString() << std::endl;
  //   Score(mytree4.get(), back_pointers, score, ancestor_heads, ancestor_labels, boundary_hash);
  //   std::cerr << "head LM: " << score[0] << " label LM: " << score[2] << " approx: " << score[1] << " - " << score[3] << std::endl;
  //
  //   mytree4->Combine(previous_trees);
  //   previous_trees.clear();
  //   previous_trees.push_back(mytree4);
  //   back_pointers = AssociateLeafNTs(mytree2.get(), previous_trees);
  //   std::cerr << "scoring: " << mytree2->GetString() << std::endl;
  //
  //   score[1] = 0;
  //   score[3] = 0;
  //   Score(mytree2.get(), back_pointers, score, ancestor_heads, ancestor_labels, boundary_hash);
  //   std::cerr << "head LM: " << score[0] << " label LM: " << score[2] << " approx: " << score[1] << " - " << score[3] << std::endl;
  //
  //   score[0] = 0;
  //   score[1] = 0;
  //   score[2] = 0;
  //   score[3] = 0;
  //   std::cerr << "scoring: " << mytree->GetString() << std::endl;
  //
  //   Score(mytree.get(), back_pointers, score, ancestor_heads, ancestor_labels, boundary_hash);
  //   std::cerr << "head LM: " << score[0] << " label LM: " << score[2] << " approx: " << score[1] << " - " << score[3] << std::endl;
  //
  // }
  // UTIL_THROW2("Finished");

}


void RDLM::Score(InternalTree* root, const TreePointerMap & back_pointers, boost::array<float, 4> &score, std::vector<int> &ancestor_heads, std::vector<int> &ancestor_labels, size_t &boundary_hash, int num_virtual, int rescoring_levels) const
{
  // ignore terminal nodes
  if (root->IsTerminal()) {
    return;
  }

  // ignore glue rules
  if (root->GetLabel() == m_glueSymbol) {
    // recursion
    for (std::vector<TreePointer>::const_iterator it = root->GetChildren().begin(); it != root->GetChildren().end(); ++it) {
      Score(it->get(), back_pointers, score, ancestor_heads, ancestor_labels, boundary_hash, num_virtual, rescoring_levels);
    }
    return;
  }

  // ignore virtual nodes (from binarization; except if it's the root)
  if (m_binarized && root->GetLabel()[0] == '^' && !ancestor_heads.empty()) {
    // recursion
    if (root->IsLeafNT() && m_context_up > 1 && ancestor_heads.size()) {
      root = back_pointers.find(root)->second.get();
      rescoring_levels = m_context_up-1;
    }
    for (std::vector<TreePointer>::const_iterator it = root->GetChildren().begin(); it != root->GetChildren().end(); ++it) {
      Score(it->get(), back_pointers, score, ancestor_heads, ancestor_labels, boundary_hash, num_virtual, rescoring_levels);
    }
    return;
  }

  // ignore start/end of sentence tags
  if (root->GetLabel() == m_startSymbol || root->GetLabel() == m_endSymbol) {
    return;
  }

  nplm::neuralTM *lm_head = lm_head_backend_.get();
  if (!lm_head) {
    lm_head = new nplm::neuralTM(*lm_head_base_instance_);
    lm_head->set_normalization(m_normalizeHeadLM);
    lm_head->set_cache(m_cacheSize);
    lm_head_backend_.reset(lm_head);
  }

  // ignore preterminal nodes (except if we're scoring root nodes)
  if (root->GetLength() == 1 && root->GetChildren()[0]->IsTerminal()) {
    // root of tree: score without context
    if (ancestor_heads.empty() || (ancestor_heads.size() == m_context_up && ancestor_heads.back() == static_root_head)) {
      std::vector<int> ngram_head_null (static_head_null);
      ngram_head_null.back() = lm_head->lookup_output_word(root->GetChildren()[0]->GetLabel());
      if (m_isPretermBackoff && ngram_head_null.back() == 0) {
        ngram_head_null.back() = lm_head->lookup_output_word(root->GetLabel());
      }
      if (ancestor_heads.size() == m_context_up && ancestor_heads.back() == static_root_head) {
        std::vector<int>::iterator it = ngram_head_null.begin();
        std::fill_n(it, m_context_left, static_start_head);
        it += m_context_left;
        std::fill_n(it, m_context_left, static_start_label);
        it += m_context_left;
        std::fill_n(it, m_context_right, static_stop_head);
        it += m_context_right;
        std::fill_n(it, m_context_right, static_stop_label);
        it += m_context_right;
        size_t context_up_nonempty = std::min(m_context_up, ancestor_heads.size());
        it = std::copy(ancestor_heads.end()-context_up_nonempty, ancestor_heads.end(), it);
        it = std::copy(ancestor_labels.end()-context_up_nonempty, ancestor_labels.end(), it);
      }
      if (ancestor_labels.size() >= m_context_up && !num_virtual) {
        score[0] += FloorScore(lm_head->lookup_ngram(EigenMap(ngram_head_null.data(), ngram_head_null.size())));
      }
      else {
        boost::hash_combine(boundary_hash, ngram_head_null.back());
        score[1] += FloorScore(lm_head->lookup_ngram(EigenMap(ngram_head_null.data(), ngram_head_null.size())));
      }
    }
    return;
    // we only need to re-visit previous hypotheses if we have more context available
  } else if (root->IsLeafNT()) {
    if (m_context_up > 1 && ancestor_heads.size()) {
      root = back_pointers.find(root)->second.get();
      // ignore preterminal node
      if (root->GetLength() == 1 && root->GetChildren()[0]->IsTerminal()) {
        return;
      }
      rescoring_levels = m_context_up-1;
    }
    else {
      return;
    }
  }

  nplm::neuralTM *lm_label = lm_label_backend_.get();
  if (!lm_label) {
    lm_label = new nplm::neuralTM(*lm_label_base_instance_);
    lm_label->set_normalization(m_normalizeLabelLM);
    lm_label->set_cache(m_cacheSize);
    lm_label_backend_.reset(lm_label);
  }

  std::pair<int,int> head_ids;
  InternalTree* found = GetHead(root, back_pointers, head_ids);
  if (found == NULL) {
    head_ids = std::make_pair(static_dummy_head, static_dummy_head);
  }

  size_t context_up_nonempty = std::min(m_context_up, ancestor_heads.size());
  const std::string & head_label = root->GetLabel();
  bool virtual_head = false;
  int reached_end = 0;
  int label_idx, label_idx_out;
  if (m_binarized && head_label[0] == '^') {
    virtual_head = true;
    if (m_binarized == 1 || (m_binarized == 3 && head_label[2] == 'l')) {
      reached_end = 1; // indicate that we've seen the first symbol of the RHS
    }
    else if (m_binarized == 2 || (m_binarized == 3 && head_label[2] == 'r')) {
      reached_end = 2; // indicate that we've seen the last symbol of the RHS
    }
    // with 'full' binarization, the direction is encoded in the 2nd char
    std::string clipped_label = (m_binarized == 3) ? head_label.substr(2,head_label.size()-2) : head_label.substr(1,head_label.size()-1);
    label_idx = lm_label->lookup_input_word(clipped_label);
    label_idx_out = lm_label->lookup_output_word(clipped_label);
  }
  else {
    reached_end = 3; // indicate that we've seen the first and last symbol of the RHS
    label_idx = lm_label->lookup_input_word(head_label);
    label_idx_out = lm_label->lookup_output_word(head_label);
  }

  int head_idx = (virtual_head && head_ids.first == static_dummy_head) ? static_label_null[offset_up_head+m_context_up-1] : head_ids.first;

  // root of tree: score without context
  if (ancestor_heads.empty() || (ancestor_heads.size() == m_context_up && ancestor_heads.back() == static_root_head)) {
    if (head_idx != static_dummy_head && head_idx != static_head_head) {
      std::vector<int> ngram_head_null (static_head_null);
      *(ngram_head_null.end()-2) = label_idx;
      ngram_head_null.back() = head_ids.second;
      if (ancestor_heads.size() == m_context_up && ancestor_heads.back() == static_root_head && !num_virtual) {
        std::vector<int>::iterator it = ngram_head_null.begin();
        std::fill_n(it, m_context_left, static_start_head);
        it += m_context_left;
        std::fill_n(it, m_context_left, static_start_label);
        it += m_context_left;
        std::fill_n(it, m_context_right, static_stop_head);
        it += m_context_right;
        std::fill_n(it, m_context_right, static_stop_label);
        it += m_context_right;
        it = std::copy(ancestor_heads.end()-context_up_nonempty, ancestor_heads.end(), it);
        it = std::copy(ancestor_labels.end()-context_up_nonempty, ancestor_labels.end(), it);
        score[0] += FloorScore(lm_head->lookup_ngram(EigenMap(ngram_head_null.data(), ngram_head_null.size())));
      }
      else {
        boost::hash_combine(boundary_hash, ngram_head_null.back());
        score[1] += FloorScore(lm_head->lookup_ngram(EigenMap(ngram_head_null.data(), ngram_head_null.size())));
      }
    }
    std::vector<int> ngram_label_null (static_label_null);
    ngram_label_null.back() = label_idx_out;
    if (ancestor_heads.size() == m_context_up && ancestor_heads.back() == static_root_head && !num_virtual) {
      std::vector<int>::iterator it = ngram_label_null.begin();
      std::fill_n(it, m_context_left, static_start_head);
      it += m_context_left;
      std::fill_n(it, m_context_left, static_start_label);
      it += m_context_left;
      std::fill_n(it, m_context_right, static_stop_head);
      it += m_context_right;
      std::fill_n(it, m_context_right, static_stop_label);
      it += m_context_right;
      it = std::copy(ancestor_heads.end()-context_up_nonempty, ancestor_heads.end(), it);
      it = std::copy(ancestor_labels.end()-context_up_nonempty, ancestor_labels.end(), it);
      score[2] += FloorScore(lm_label->lookup_ngram(EigenMap(ngram_label_null.data(), ngram_label_null.size())));
    }
    else {
      boost::hash_combine(boundary_hash, ngram_label_null.back());
      score[3] += FloorScore(lm_label->lookup_ngram(EigenMap(ngram_label_null.data(), ngram_label_null.size())));
    }
  }

  ancestor_heads.push_back(head_idx);
  ancestor_labels.push_back(label_idx);

  if (virtual_head) {
    num_virtual = m_context_up;
  }
  else if (num_virtual) {
    --num_virtual;
  }


  // fill ancestor context (same for all children)
  if (context_up_nonempty < m_context_up) {
    ++context_up_nonempty;
  }
  size_t up_padding = m_context_up - context_up_nonempty;

  std::vector<int> ngram (static_label_null);

  std::vector<int>::iterator it = ngram.begin() + offset_up_head;
  if (up_padding > 0) {
    it += up_padding;
  }

  it = std::copy(ancestor_heads.end() - context_up_nonempty, ancestor_heads.end(), it);

  if (up_padding > 0) {
    it += up_padding;
  }

  it = std::copy(ancestor_labels.end() - context_up_nonempty, ancestor_labels.end(), it);

  // create vectors of head/label IDs of all children
  int num_children = root->GetLength();

  // get the number of children after unbinarization
  if (m_binarized) {
    num_children = 0;
    UnbinarizedChildren real_children(root, back_pointers, m_binarized);
    for (std::vector<TreePointer>::const_iterator it = real_children.begin(); it != real_children.end(); it = ++real_children) {
      num_children++;
    }
  }

  if (m_context_right && (reached_end == 1 || reached_end == 3)) num_children++; // also predict start label
  if (m_context_left && (reached_end == 2 || reached_end == 3)) num_children++; // also predict end label

  std::vector<int> heads(num_children);
  std::vector<int> labels(num_children);
  std::vector<int> heads_output(num_children);
  std::vector<int> labels_output(num_children);

  GetChildHeadsAndLabels(root, back_pointers, reached_end, lm_head, lm_label, heads, labels, heads_output, labels_output);

  // left padding; only need to add this initially
  if (reached_end == 1 || reached_end == 3) {
    std::fill_n(ngram.begin(), m_context_left, static_start_head);
    std::fill_n(ngram.begin() + m_context_left, m_context_left, static_start_label);
  }
  size_t left_padding = m_context_left;
  size_t left_offset = 0;
  size_t right_offset = std::min(heads.size(), m_context_right + 1);
  size_t right_padding = m_context_right + 1 - right_offset;

  // construct the context of the label model and predict the label
  for (size_t i = 0; i != heads.size(); i++) {

    std::vector<int>::iterator it = ngram.begin();

    if (left_padding > 0) {
      it += left_padding;
    }

    it = std::copy(heads.begin()+left_offset, heads.begin()+i, it);

    if (left_padding > 0) {
      it += left_padding;
    }

    it = std::copy(labels.begin()+left_offset, labels.begin()+i, it);

    it = std::copy(heads.begin()+i+1, heads.begin()+right_offset, it);

    if (right_padding > 0) {
      if (reached_end == 2 || reached_end == 3) {
        std::fill_n(it, right_padding, static_stop_head);
        it += right_padding;
      }
      else {
        std::copy(static_label_null.begin()+offset_up_head-m_context_right-right_padding, static_label_null.begin()-m_context_right+offset_up_head, it);
      }
    }

    it = std::copy(labels.begin()+i+1, labels.begin()+right_offset, it);

    if (right_padding > 0) {
      if (reached_end == 2 || reached_end == 3) {
        std::fill_n(it, right_padding, static_stop_label);
        it += right_padding;
      }
      else {
        std::copy(static_label_null.begin()+offset_up_head-right_padding, static_label_null.begin()+offset_up_head, it);
      }
    }

    ngram.back() = labels_output[i];

    if (ancestor_labels.size() >= m_context_up && !num_virtual) {
      score[2] += FloorScore(lm_label->lookup_ngram(EigenMap(ngram.data(), ngram.size())));
    }
    else {
      boost::hash_combine(boundary_hash, ngram.back());
      score[3] += FloorScore(lm_label->lookup_ngram(EigenMap(ngram.data(), ngram.size())));
    }

    // construct the context of the head model and predict the head
    if (heads[i] != static_start_head && heads[i] != static_stop_head && heads[i] != static_dummy_head && heads[i] != static_head_head) {

      ngram.back() = labels[i];
      ngram.push_back(heads_output[i]);

      if (ancestor_labels.size() >= m_context_up && !num_virtual) {
        score[0] += FloorScore(lm_head->lookup_ngram(EigenMap(ngram.data(), ngram.size())));
      }
      else {
        boost::hash_combine(boundary_hash, ngram.back());
        score[1] += FloorScore(lm_head->lookup_ngram(EigenMap(ngram.data(), ngram.size())));
      }
      ngram.pop_back();
    }

    // next time, we need to add less start symbol padding
    if (left_padding)
      left_padding--;
    else
      left_offset++;

    if (right_offset < heads.size())
      right_offset++;
    else
      right_padding++;
  }


  if (rescoring_levels == 1) {
    ancestor_heads.pop_back();
    ancestor_labels.pop_back();
    return;
  }
  // recursion
  for (std::vector<TreePointer>::const_iterator it = root->GetChildren().begin(); it != root->GetChildren().end(); ++it) {
    Score(it->get(), back_pointers, score, ancestor_heads, ancestor_labels, boundary_hash, num_virtual, rescoring_levels - 1);
  }
  ancestor_heads.pop_back();
  ancestor_labels.pop_back();
}

InternalTree* RDLM::GetHead(InternalTree* root, const TreePointerMap & back_pointers, std::pair<int,int> & IDs, InternalTree* head_ptr) const
{
  InternalTree *tree;

  for (std::vector<TreePointer>::const_iterator it = root->GetChildren().begin(); it != root->GetChildren().end(); ++it) {
    if ((*it)->IsLeafNT()) {
      tree = back_pointers.find(it->get())->second.get();
    }
    else {
      tree = it->get();
    }

    if (m_binarized && tree->GetLabel()[0] == '^') {
      head_ptr = GetHead(tree, back_pointers, IDs, head_ptr);
      if (head_ptr != NULL && !m_isPTKVZ) {
        return head_ptr;
      }
    }

    // assumption (only true for a dependency parse): each constituent has a preterminal label, and the corresponding terminal is the head
    // if a constituent has multiple preterminals, the first one is picked; if it has no preterminals, dummy_head is returned
    else if (tree->GetLength() == 1 && tree->GetChildren()[0]->IsTerminal() && head_ptr == NULL) {
      head_ptr = tree;
      if (!m_isPTKVZ) {
        GetIDs(head_ptr->GetChildren()[0]->GetLabel(), head_ptr->GetLabel(), IDs);
        return head_ptr;
      }
    }

    // add PTKVZ to the lemma of the verb
    else if (m_isPTKVZ && head_ptr && tree->GetLabel() == "avz") {
      InternalTree *tree2;
      for (std::vector<TreePointer>::const_iterator it2 = tree->GetChildren().begin(); it2 != tree->GetChildren().end(); ++it2) {
        if ((*it2)->IsLeafNT()) {
          tree2 = back_pointers.find(it2->get())->second.get();
        }
        else {
          tree2 = it2->get();
        }
        if (tree2->GetLabel() == "PTKVZ" && tree2->GetLength() == 1 && tree2->GetChildren()[0]->IsTerminal()) {
          std::string verb = tree2->GetChildren()[0]->GetLabel() + head_ptr->GetChildren()[0]->GetLabel();
          GetIDs(verb, head_ptr->GetLabel(), IDs);
          return head_ptr;
        }
      }
    }
  }

  if (head_ptr != NULL) {
    GetIDs(head_ptr->GetChildren()[0]->GetLabel(), head_ptr->GetLabel(), IDs);
  }
  return head_ptr;
}


void RDLM::GetChildHeadsAndLabels(InternalTree *root, const TreePointerMap & back_pointers, int reached_end, const nplm::neuralTM *lm_head, const nplm::neuralTM *lm_label, std::vector<int> & heads, std::vector<int> & labels, std::vector<int> & heads_output, std::vector<int> & labels_output) const
{
  std::pair<int,int> child_ids;
  InternalTree* found;
  size_t j = 0;

  // score the start label (if enabled) for all nonterminal nodes (but not for terminal or preterminal nodes)
  if (m_context_right && (reached_end == 1 || reached_end == 3)) {
    heads[j] = static_start_head;
    labels[j] = static_start_label;
    labels_output[j] = static_start_label_output;
    j++;
  }

  UnbinarizedChildren real_children(root, back_pointers, m_binarized);

  // extract head words / labels
  for (std::vector<TreePointer>::const_iterator itx = real_children.begin(); itx != real_children.end(); itx = ++real_children) {
    if ((*itx)->IsTerminal()) {
      std::cerr << "non-terminal node " << root->GetLabel() << " has a mix of terminal and non-terminal children. This shouldn't happen..." << std::endl;
      std::cerr << "children: ";
      for (std::vector<TreePointer>::const_iterator itx2 = root->GetChildren().begin(); itx2 != root->GetChildren().end(); ++itx2) {
        std::cerr << (*itx2)->GetLabel() << " ";
      }
      std::cerr << std::endl;
      // resize vectors (should we throw an exception instead?)
      heads.pop_back();
      labels.pop_back();
      heads_output.pop_back();
      labels_output.pop_back();
      continue;
    }
    InternalTree* child = itx->get();
    // also go through trees or previous hypotheses to rescore nodes for which more context has become available
    if ((*itx)->IsLeafNT()) {
      child = back_pointers.find(itx->get())->second.get();
    }

    // preterminal node
    if (child->GetLength() == 1 && child->GetChildren()[0]->IsTerminal()) {
      heads[j] = static_head_head;
      labels[j] = static_head_label;
      labels_output[j] = static_head_label_output;
      j++;
      continue;
    }

    found = GetHead(child, back_pointers, child_ids);
    if (found == NULL) {
      child_ids = std::make_pair(static_dummy_head, static_dummy_head);
    }

    labels[j] = lm_head->lookup_input_word(child->GetLabel());
    labels_output[j] = lm_label->lookup_output_word(child->GetLabel());
    heads[j] = child_ids.first;
    heads_output[j] = child_ids.second;
    j++;
  }

  // score the end label (if enabled) for all nonterminal nodes (but not for terminal or preterminal nodes)
  if (m_context_left && (reached_end == 2 || reached_end == 3)) {
    heads[j] = static_stop_head;
    labels[j] = static_stop_label;
    labels_output[j] = static_stop_label_output;
  }
}


void RDLM::GetIDs(const std::string & head, const std::string & preterminal, std::pair<int,int> & IDs) const
{
  IDs.first = lm_head_base_instance_->lookup_input_word(head);
  if (m_isPretermBackoff && IDs.first == 0) {
    IDs.first = lm_head_base_instance_->lookup_input_word(preterminal);
  }
  if (m_sharedVocab) {
    IDs.second = IDs.first;
  }
  else {
    IDs.second = lm_head_base_instance_->lookup_output_word(head);
    if (m_isPretermBackoff && IDs.second == 0) {
      IDs.second = lm_head_base_instance_->lookup_output_word(preterminal);
    }
  }
}


void RDLM::PrintInfo(std::vector<int> &ngram, nplm::neuralTM* lm) const
{
  for (size_t i = 0; i < ngram.size()-1; i++) {
    std::cerr << lm->get_input_vocabulary().words()[ngram[i]] << " ";
  }
  std::cerr << lm->get_output_vocabulary().words()[ngram.back()] << " ";

  for (size_t i = 0; i < ngram.size(); i++) {
    std::cerr << ngram[i] << " ";
  }
  std::cerr << "score: " << lm->lookup_ngram(ngram) << std::endl;
}


RDLM::TreePointerMap RDLM::AssociateLeafNTs(InternalTree* root, const std::vector<TreePointer> &previous) const
{
  TreePointerMap ret;
  std::vector<TreePointer>::iterator it;
  bool found = false;
  InternalTree::leafNT next_leafNT(root);
  for (std::vector<TreePointer>::const_iterator it_prev = previous.begin(); it_prev != previous.end(); ++it_prev) {
    found = next_leafNT(it);
    if (found) {
      ret[it->get()] = *it_prev;
    }
    else {
      std::cerr << "Warning: leaf nonterminal not found in rule; why did this happen?\n";
    }
  }
  return ret;
}

void RDLM::ScoreFile(std::string &path)
{
  InputFileStream inStream(path);
  std::string line, null;
  std::vector<int> ancestor_heads(m_context_up, static_root_head);
  std::vector<int> ancestor_labels(m_context_up, static_root_label);
  while (getline(inStream, line)) {
    TreePointerMap back_pointers;
    boost::array<float, 4> score;
    score.fill(0);
    InternalTree* mytree (new InternalTree(line));
    size_t boundary_hash = 0;
    Score(mytree, back_pointers, score, ancestor_heads, ancestor_labels, boundary_hash);
    std::cerr << "head LM: " << score[0] << " label LM: " << score[2] << std::endl;
  }
}


void RDLM::SetParameter(const std::string& key, const std::string& value)
{
  std::cerr << "setting: " << this->GetScoreProducerDescription() << " - " << key << "\n";
  if (key == "tuneable") {
    m_tuneable = Scan<bool>(value);
  } else if (key == "filterable") { //ignore
  } else if (key == "path_head_lm") {
    m_path_head_lm = value;
  } else if (key == "path_label_lm") {
    m_path_label_lm = value;
  } else if (key == "ptkvz") {
    m_isPTKVZ = Scan<bool>(value);
  } else if (key == "backoff") {
    m_isPretermBackoff = Scan<bool>(value);
  } else if (key == "context_up") {
    m_context_up = Scan<size_t>(value);
  } else if (key == "context_left") {
    m_context_left = Scan<size_t>(value);
  } else if (key == "context_right") {
    m_context_right = Scan<size_t>(value);
  } else if (key == "debug_path") {
    m_debugPath = value;
  } else if (key == "premultiply") {
    m_premultiply = Scan<bool>(value);
  } else if (key == "rerank") {
    m_rerank = Scan<bool>(value);
  } else if (key == "normalize_head_lm") {
    m_normalizeHeadLM = Scan<bool>(value);
  } else if (key == "normalize_label_lm") {
    m_normalizeLabelLM = Scan<bool>(value);
  } else if (key == "binarized") {
    if (value == "left")
      m_binarized = 1;
    else if (value == "right")
      m_binarized = 2;
    else if (value == "full")
      m_binarized = 3;
    else
      UTIL_THROW(util::Exception, "Unknown value for argument " << key << "=" << value);
  } else if (key == "glue_symbol") {
    m_glueSymbol = value;
  } else if (key == "cache_size") {
    m_cacheSize = Scan<int>(value);
  } else {
    UTIL_THROW(util::Exception, "Unknown argument " << key << "=" << value);
  }
}


FFState* RDLM::EvaluateWhenApplied(const ChartHypothesis& cur_hypo
                                   , int featureID /* used to index the state in the previous hypotheses */
                                   , ScoreComponentCollection* accumulator) const
{
  if (const PhraseProperty *property = cur_hypo.GetCurrTargetPhrase().GetProperty("Tree")) {
    const std::string *tree = property->GetValueString();
    TreePointer mytree (boost::make_shared<InternalTree>(*tree));

    // get subtrees (in target order)
    std::vector<TreePointer> previous_trees;
    float prev_approx_head = 0, prev_approx_label = 0; // approximated (due to lack of context) LM costs from previous hypos
    for (size_t pos = 0; pos < cur_hypo.GetCurrTargetPhrase().GetSize(); ++pos) {
      const Word &word = cur_hypo.GetCurrTargetPhrase().GetWord(pos);
      if (word.IsNonTerminal()) {
        size_t nonTermInd = cur_hypo.GetCurrTargetPhrase().GetAlignNonTerm().GetNonTermIndexMap()[pos];
        const RDLMState* prev = static_cast<const RDLMState*>(cur_hypo.GetPrevHypo(nonTermInd)->GetFFState(featureID));
        previous_trees.push_back(prev->GetTree());
        prev_approx_head -= prev->GetApproximateScoreHead();
        prev_approx_label -= prev->GetApproximateScoreLabel();
      }
    }
    size_t ff_idx = accumulator->GetIndexes(this).first;

    accumulator->PlusEquals(ff_idx, prev_approx_head);
    accumulator->PlusEquals(ff_idx+1, prev_approx_label);

    bool full_sentence = (mytree->GetChildren().back()->GetLabel() == m_endTag || (mytree->GetChildren().back()->GetLabel() == m_endSymbol && mytree->GetChildren().back()->GetChildren().back()->GetLabel() == m_endTag));
    std::vector<int> ancestor_heads ((full_sentence ? m_context_up : 0), static_root_head);
    std::vector<int> ancestor_labels ((full_sentence ? m_context_up : 0), static_root_label);
    ancestor_heads.reserve(10);
    ancestor_labels.reserve(10);

    TreePointerMap back_pointers = AssociateLeafNTs(mytree.get(), previous_trees);
    boost::array<float, 4> score; // score_head, approx_score_head, score_label, approx_score_label
    score.fill(0);
    // hash of all boundary symbols (symbols with incomplete context); trees with the same hash share state for cube pruning
    size_t boundary_hash = 0;
    if (!m_rerank) {
      Score(mytree.get(), back_pointers, score, ancestor_heads, ancestor_labels, boundary_hash);
      accumulator->PlusEquals(ff_idx, score[0] + score[1]);
      accumulator->PlusEquals(ff_idx+1, score[2] + score[3]);
    }
    mytree->Combine(previous_trees);
    if (m_rerank && full_sentence) {
      Score(mytree.get(), back_pointers, score, ancestor_heads, ancestor_labels, boundary_hash);
      accumulator->PlusEquals(ff_idx, score[0] + score[1]);
      accumulator->PlusEquals(ff_idx+1, score[2] + score[3]);
    }
    if (m_binarized && full_sentence) {
      mytree->Unbinarize();
    }

    return new RDLMState(mytree, score[1], score[3], boundary_hash);
  }
  else {
    UTIL_THROW2("Error: RDLM active, but no internal tree structure found");
  }
}

}
245
moses/LM/RDLM.h
Normal file
245
moses/LM/RDLM.h
Normal file
@ -0,0 +1,245 @@
|
||||
#include <string>
|
||||
#include <map>
|
||||
#include "moses/FF/StatefulFeatureFunction.h"
|
||||
#include "moses/FF/FFState.h"
|
||||
#include "moses/FF/InternalTree.h"
|
||||
|
||||
#include <boost/thread/tss.hpp>
|
||||
#include <boost/array.hpp>
|
||||
|
||||
// relational dependency language model, described in:
|
||||
// Sennrich, Rico (2015). Modelling and Optimizing on Syntactic N-Grams for Statistical Machine Translation. Transactions of the Association for Computational Linguistics.
|
||||
// see 'scripts/training/rdlm' for training scripts
|
||||
|
||||
namespace nplm {
|
||||
class neuralTM;
|
||||
}
|
||||
|
||||
namespace Moses
|
||||
{
|
||||
|
||||
class RDLMState : public TreeState
|
||||
{
|
||||
float m_approx_head; //score that was approximated due to lack of context
|
||||
float m_approx_label;
|
||||
size_t m_hash;
|
||||
public:
|
||||
RDLMState(TreePointer tree, float approx_head, float approx_label, size_t hash)
|
||||
: TreeState(tree)
|
||||
, m_approx_head(approx_head)
|
||||
, m_approx_label(approx_label)
|
||||
, m_hash(hash)
|
||||
{}
|
||||
|
||||
float GetApproximateScoreHead() const {
|
||||
return m_approx_head;
|
||||
}
|
||||
|
||||
float GetApproximateScoreLabel() const {
|
||||
return m_approx_label;
|
||||
}
|
||||
|
||||
size_t GetHash() const {
|
||||
return m_hash;
|
||||
}
|
||||
|
||||
int Compare(const FFState& other) const {
|
||||
if (m_hash == static_cast<const RDLMState*>(&other)->GetHash()) return 0;
|
||||
else if (m_hash > static_cast<const RDLMState*>(&other)->GetHash()) return 1;
|
||||
else return -1;
|
||||
}
|
||||
};
|
||||
|
||||
class RDLM : public StatefulFeatureFunction
|
||||
{
|
||||
typedef std::map<InternalTree*,TreePointer> TreePointerMap;
|
||||
|
||||
nplm::neuralTM* lm_head_base_instance_;
|
||||
mutable boost::thread_specific_ptr<nplm::neuralTM> lm_head_backend_;
|
||||
|
||||
nplm::neuralTM* lm_label_base_instance_;
|
||||
mutable boost::thread_specific_ptr<nplm::neuralTM> lm_label_backend_;
|
||||
|
||||
std::string dummy_head;
|
||||
std::string m_glueSymbol;
|
||||
std::string m_startSymbol;
|
||||
std::string m_endSymbol;
|
||||
std::string m_endTag;
|
||||
std::string m_path_head_lm;
|
||||
std::string m_path_label_lm;
|
||||
bool m_isPTKVZ;
|
||||
bool m_isPretermBackoff;
|
||||
size_t m_context_left;
|
||||
size_t m_context_right;
|
||||
size_t m_context_up;
|
||||
bool m_premultiply;
|
||||
bool m_rerank;
|
||||
bool m_normalizeHeadLM;
|
||||
bool m_normalizeLabelLM;
|
||||
bool m_sharedVocab;
|
||||
std::string m_debugPath; // score all trees in the provided file, then exit
|
||||
int m_binarized;
|
||||
int m_cacheSize;
|
||||
|
||||
size_t offset_up_head;
|
||||
size_t offset_up_label;
|
||||
|
||||
size_t size_head;
|
||||
size_t size_label;
|
||||
std::vector<int> static_label_null;
|
||||
std::vector<int> static_head_null;
|
||||
int static_dummy_head;
|
||||
int static_start_head;
|
||||
int static_start_label;
|
||||
int static_stop_head;
|
||||
int static_stop_label;
|
||||
int static_head_head;
|
||||
int static_head_label;
|
||||
int static_root_head;
|
||||
int static_root_label;
|
||||
|
||||
int static_head_label_output;
|
||||
int static_stop_label_output;
|
||||
int static_start_label_output;
|
||||
|
||||
public:
|
||||
RDLM(const std::string &line)
|
||||
: StatefulFeatureFunction(2, line)
|
||||
, dummy_head("<dummy_head>")
|
||||
, m_glueSymbol("Q")
|
||||
, m_startSymbol("SSTART")
|
||||
, m_endSymbol("SEND")
|
||||
, m_endTag("</s>")
|
||||
, m_isPTKVZ(false)
|
||||
, m_isPretermBackoff(true)
|
||||
, m_context_left(3)
|
||||
, m_context_right(0)
|
||||
, m_context_up(2)
|
||||
, m_premultiply(true)
|
||||
, m_rerank(false)
|
||||
, m_normalizeHeadLM(false)
|
||||
, m_normalizeLabelLM(false)
|
||||
, m_sharedVocab(false)
|
||||
, m_binarized(0)
|
||||
, m_cacheSize(1000000)
|
||||
{
|
||||
ReadParameters();
|
||||
}
|
||||
|
||||
~RDLM();
|
||||
|
||||
virtual const FFState* EmptyHypothesisState(const InputType &input) const {
|
||||
return new RDLMState(TreePointer(), 0, 0, 0);
|
||||
}
|
||||
|
||||
void Score(InternalTree* root, const TreePointerMap & back_pointers, boost::array<float,4> &score, std::vector<int> &ancestor_heads, std::vector<int> &ancestor_labels, size_t &boundary_hash, int num_virtual = 0, int rescoring_levels = 0) const;
|
||||
InternalTree* GetHead(InternalTree* root, const TreePointerMap & back_pointers, std::pair<int,int> & IDs, InternalTree * head_ptr=NULL) const;
|
||||
void GetChildHeadsAndLabels(InternalTree *root, const TreePointerMap & back_pointers, int reached_end, const nplm::neuralTM *lm_head, const nplm::neuralTM *lm_labels, std::vector<int> & heads, std::vector<int> & labels, std::vector<int> & heads_output, std::vector<int> & labels_output) const;
|
||||
void GetIDs(const std::string & head, const std::string & preterminal, std::pair<int,int> & IDs) const;
|
||||
void ScoreFile(std::string &path); //for debugging
|
  void PrintInfo(std::vector<int> &ngram, nplm::neuralTM* lm) const; // for debugging

  TreePointerMap AssociateLeafNTs(InternalTree* root, const std::vector<TreePointer> &previous) const;

  bool IsUseable(const FactorMask &mask) const {
    return true;
  }

  void SetParameter(const std::string& key, const std::string& value);
  void EvaluateInIsolation(const Phrase &source
                           , const TargetPhrase &targetPhrase
                           , ScoreComponentCollection &scoreBreakdown
                           , ScoreComponentCollection &estimatedFutureScore) const {}
  void EvaluateWithSourceContext(const InputType &input
                                 , const InputPath &inputPath
                                 , const TargetPhrase &targetPhrase
                                 , const StackVec *stackVec
                                 , ScoreComponentCollection &scoreBreakdown
                                 , ScoreComponentCollection *estimatedFutureScore = NULL) const {}
  void EvaluateTranslationOptionListWithSourceContext(const InputType &input
      , const TranslationOptionList &translationOptionList) const {}
  FFState* EvaluateWhenApplied(
    const Hypothesis& cur_hypo,
    const FFState* prev_state,
    ScoreComponentCollection* accumulator) const {
    UTIL_THROW(util::Exception, "Not implemented");
  }
  FFState* EvaluateWhenApplied(
    const ChartHypothesis& /* cur_hypo */,
    int /* featureID - used to index the state in the previous hypotheses */,
    ScoreComponentCollection* accumulator) const;

  void Load();

  // Iterator class that yields all children of a node; if a child is a virtual node of a binarized tree, its children are yielded instead.
  class UnbinarizedChildren
  {
  private:
    std::vector<TreePointer>::const_iterator iter;
    std::vector<TreePointer>::const_iterator _begin;
    std::vector<TreePointer>::const_iterator _end;
    InternalTree* current;
    const TreePointerMap & back_pointers;
    bool binarized;
    std::vector<std::pair<InternalTree*,std::vector<TreePointer>::const_iterator> > stack;

  public:
    UnbinarizedChildren(InternalTree* root, const TreePointerMap & pointers, bool binary):
      current(root),
      back_pointers(pointers),
      binarized(binary)
    {
      stack.reserve(10);
      _end = current->GetChildren().end();
      iter = current->GetChildren().begin();
      // expand virtual node
      while (binarized && !(*iter)->GetLabel().empty() && (*iter)->GetLabel()[0] == '^') {
        stack.push_back(std::make_pair(current, iter));
        // also go through trees or previous hypotheses to rescore nodes for which more context has become available
        if ((*iter)->IsLeafNT()) {
          current = back_pointers.find(iter->get())->second.get();
        } else {
          current = iter->get();
        }
        iter = current->GetChildren().begin();
      }
      _begin = iter;
    }

    std::vector<TreePointer>::const_iterator begin() const {
      return _begin;
    }
    std::vector<TreePointer>::const_iterator end() const {
      return _end;
    }

    std::vector<TreePointer>::const_iterator operator++() {
      iter++;
      if (iter == current->GetChildren().end()) {
        while (!stack.empty()) {
          std::pair<InternalTree*,std::vector<TreePointer>::const_iterator> & active = stack.back();
          current = active.first;
          iter = ++active.second;
          stack.pop_back();
          if (iter != current->GetChildren().end()) {
            break;
          }
        }
        if (iter == _end) {
          return iter;
        }
      }
      // expand virtual node
      while (binarized && !(*iter)->GetLabel().empty() && (*iter)->GetLabel()[0] == '^') {
        stack.push_back(std::make_pair(current, iter));
        // also go through trees or previous hypotheses to rescore nodes for which more context has become available
        if ((*iter)->IsLeafNT()) {
          current = back_pointers.find(iter->get())->second.get();
        } else {
          current = iter->get();
        }
        iter = current->GetChildren().begin();
      }
      return iter;
    }
  };

};

}
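For intuition, a minimal self-contained Python sketch of the traversal that GetUnbinarizedChildren and the UnbinarizedChildren iterator implement (Tree is a simplified stand-in, not the Moses InternalTree, and the back_pointers lookup for leaf nonterminals is omitted): children whose label starts with '^' are virtual binarization nodes and are transparently replaced by their own children.

# sketch only: simplified stand-in for InternalTree
class Tree:
    def __init__(self, label, children=None):
        self.label = label
        self.children = children or []

def unbinarized_children(node):
    """Yield node's children, transitively expanding virtual ('^'-labelled) nodes."""
    for child in node.children:
        if child.label.startswith('^'):
            for grandchild in unbinarized_children(child):
                yield grandchild
        else:
            yield child

# NP -> (^NP -> DT JJ) NN, i.e. a binarized "DT JJ NN"
np = Tree('NP', [Tree('^NP', [Tree('DT'), Tree('JJ')]), Tree('NN')])
print([c.label for c in unbinarized_children(np)])  # ['DT', 'JJ', 'NN']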
21 scripts/ems/support/tree-converter-wrapper.perl Executable file
@ -0,0 +1,21 @@
#!/usr/bin/env perl

use warnings;
use strict;
use utf8;
use Getopt::Long "GetOptions";

Getopt::Long::config("pass_through");

my ($BIN,$MODEL);

&GetOptions('bin=s' => \$BIN,
            'model=s' => \$MODEL); # as in truecase.perl

die("ERROR: specify at least --bin BIN!") unless defined($BIN);

my $cmd = "$BIN";
$cmd .= " -case true:model=$MODEL" if defined($MODEL);
$cmd .= " " . join(' ', @ARGV) if scalar(@ARGV); # pass remaining args through to $BIN

system $cmd;
@ -1,7 +0,0 @@
#!/usr/bin/env sh

$1 \
  -input_format egret \
  -output_format egret \
  -no_egret_weight_normalization \
  -case true:model=$3
49 scripts/training/rdlm/README Normal file
@ -0,0 +1,49 @@
RDLM: relational dependency language model
------------------------------------------

This is a language model for the string-to-tree decoder with a dependency grammar.
It should work with any corpus with projective dependency annotation in CoNLL format,
converted into the Moses format with the script mosesdecoder/scripts/training/wrappers/conll2mosesxml.py
It depends on NPLM for neural network training and querying.

Prerequisites
-------------

Install NPLM and compile Moses with it. See the instructions in the Moses documentation for details:

http://www.statmt.org/moses/?n=FactoredTraining.BuildingLanguageModel

Training
--------

RDLM is designed for string-to-tree decoding with dependency annotation on the target side.
If you have such a system, you can train RDLM on the target side of the same parallel corpus
that is used for training the translation model.

To train the model on additional monolingual data, or to test it on held-out test/dev data,
parse and process it in the same way that the parallel corpus has been processed.
This includes tokenization, parsing, truecasing, compound splitting etc.

RDLM is split into two neural network models, which can be trained with `train_model_head.sh` and `train_model_label.sh`.
Set the paths to NPLM, Moses, and the training/test files in the respective scripts, then execute:

 ./train_model_head.sh rdlm_head.nnlm working_dir_head
 ./train_model_label.sh rdlm_label.nnlm working_dir_label

Decoding
--------

To use RDLM during decoding, add the following lines to your moses.ini config:

 [feature]
 RDLM path_head_lm=/path/to/rdlm_head.nnlm path_label_lm=/path/to/rdlm_label.nnlm context_up=2 context_left=3 context_right=0

 [weight]
 RDLM 0.1 0.1

Reference
---------

Sennrich, Rico (2015). Modelling and Optimizing on Syntactic N-Grams for Statistical Machine Translation.
Transactions of the Association for Computational Linguistics.
45 scripts/training/rdlm/average_null_embedding.py Executable file
@ -0,0 +1,45 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Author: Rico Sennrich

# average embeddings of special null words for RDLM.
# Usage: average_null_embedding.py NPLM_PATH INPUT_MODEL TRAINING_FILE OUTPUT_MODEL

import sys
import os
import numpy

def load_model(model_file):
    return nplm.NeuralLM.from_file(model_file)

def get_weights(path, vocab, len_context):
    # count how often each vocabulary item occurs at each context position
    d = [[0]*vocab for i in range(len_context)]
    for line in open(path):
        for i, word in enumerate(line.split()[:-1]):
            d[i][int(word)] += 1
    return d

if __name__ == "__main__":

    nplm_path = sys.argv[1]
    model_input = sys.argv[2]
    training_instances = sys.argv[3]
    model_output = sys.argv[4]

    sys.path.append(os.path.join(nplm_path, 'python'))
    import nplm

    model = load_model(model_input)

    # context size = number of integers per training instance, minus the output word
    len_context = len(open(training_instances).readline().split()) - 1

    sys.stderr.write('reading ngrams...')
    weights = numpy.array(get_weights(training_instances, len(model.input_embeddings), len_context))
    sys.stderr.write('done\n')

    # replace each <null_i> embedding by the weighted average of all input embeddings at position i
    for i in range(len_context):
        index = model.word_to_index_input['<null_{0}>'.format(i)]
        model.input_embeddings[index] = numpy.average(numpy.array(model.input_embeddings), weights=weights[i], axis=0)

    sys.stderr.write('writing model...')
    model.to_file(open(model_output, 'w'))
    sys.stderr.write('done\n')
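To illustrate what the averaging loop above computes, a small self-contained demo with toy numbers (not RDLM data): each <null_i> embedding is replaced by the average of all input embedding rows, weighted by how often each word was observed at context position i in the training n-grams.

import numpy

# toy input embeddings: vocabulary of 4 words, embedding dimension 2
embeddings = numpy.array([[0.0, 0.0],
                          [1.0, 0.0],
                          [0.0, 1.0],
                          [2.0, 2.0]])
# occurrence counts of each word at one context position
weights = [1, 3, 0, 1]

null_vector = numpy.average(embeddings, weights=weights, axis=0)
print(null_vector)  # [1.  0.4]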
262 scripts/training/rdlm/extract_syntactic_ngrams.py Executable file
@ -0,0 +1,262 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Author: Rico Sennrich

# extract syntactic n-grams from dependency treebank in Moses XML format for training RDLM
# expected format can be produced with mosesdecoder/scripts/training/wrappers/conll2mosesxml.py
# OOV terminal symbols are mapped to preterminal; OOV nonterminals are mapped to 0 (<unk>)

from __future__ import print_function, unicode_literals, division
import sys
import codecs
import io
import argparse

try:
    from lxml import etree as ET
except ImportError:
    from xml.etree import cElementTree as ET

def parse_arguments():
    parser = argparse.ArgumentParser(description="extract syntactic n-grams from parsed corpus in Moses XML format for training RDLM")

    parser.add_argument('--mode', type=str, help='predict terminals (head) or dependency labels (label)',
                        choices=['label', 'head'], required=True)
    parser.add_argument('--vocab', metavar='PATH', type=str, required=True,
                        help='input layer vocabulary file (one item per line; first line \'<unk>\')')
    parser.add_argument('--output_vocab', metavar='PATH', type=str,
                        help='output layer vocabulary file (default: use input layer vocabulary)')
    parser.add_argument('--left_context', metavar='INT', type=int,
                        help='size of context vector for left siblings (default: %(default)s)', default=3)
    parser.add_argument('--right_context', metavar='INT', type=int,
                        help='size of context vector for right siblings (default: %(default)s)', default=0)
    parser.add_argument('--up_context', metavar='INT', type=int,
                        help='size of context vector for ancestors (default: %(default)s)', default=2)
    parser.add_argument('--glue_symbol', metavar='STR', type=str, default='Q',
                        help='glue symbol. Will be skipped during extraction (default: %(default)s)')
    parser.add_argument('--start_symbol', metavar='STR', type=str, default='SSTART',
                        help='sentence start symbol. Will be skipped during extraction (default: %(default)s)')
    parser.add_argument('--end_symbol', metavar='STR', type=str, default='SEND',
                        help='sentence end symbol. Will be skipped during extraction (default: %(default)s)')
    parser.add_argument('--ptkvz', action='store_true',
                        help='special rule for German dependency trees: concatenate separable verb prefix and verb')
    return parser.parse_args()

def escape_text(s):
    s = s.replace('|', '&#124;')  # factor separator
    s = s.replace('[', '&#91;')   # syntax non-terminal
    s = s.replace(']', '&#93;')   # syntax non-terminal
    s = s.replace('\'', '&apos;') # xml special character
    s = s.replace('"', '&quot;')  # xml special character
    return s

# deterministic heuristic to get head of subtree
def get_head(xml, add_ptkvz):
    head = None
    preterminal = None
    for child in xml:
        if not len(child):
            if head is not None:
                continue
            preterminal = child.get('label')
            head = escape_text(child.text.strip())

        elif add_ptkvz and head and child.get('label') == 'avz':
            for grandchild in child:
                if grandchild.get('label') == 'PTKVZ':
                    head = escape_text(grandchild.text.strip()) + head
                    break

    return head, preterminal

def get_syntactic_ngrams(xml, options, vocab, output_vocab, parent_heads=None, parent_labels=None):

    if len(xml):

        # skip glue rules
        if xml.get('label') == options.glue_symbol or xml.get('label') == options.start_symbol or xml.get('label') == options.end_symbol:
            for child in xml:
                get_syntactic_ngrams(child, options, vocab, output_vocab, parent_heads, parent_labels)
            return

        # skip virtual nodes
        if xml.get('label') == '<stop_label>' or xml.get('label') == '<start_label>':
            return

        if not parent_heads:
            parent_heads = [vocab.get('<root_head>', 0)] * options.up_context
            parent_labels = [vocab.get('<root_label>', 0)] * options.up_context

        head, preterminal = get_head(xml, options.ptkvz)
        if not head:
            head = '<dummy_head>'
            preterminal = head
        elif head not in vocab:
            head = preterminal

        label = xml.get('label')

        # syntactic n-gram for root node
        int_list = []
        int_list.extend([start_head_idx] * options.left_context)
        int_list.extend([start_label_idx] * options.left_context)
        int_list.extend([stop_head_idx] * options.right_context)
        int_list.extend([stop_label_idx] * options.right_context)
        int_list.extend(parent_heads)
        int_list.extend(parent_labels)

        if options.mode == 'label':
            int_list.append(output_vocab.get(label, 0))
            sys.stdout.write(' '.join(map(str, int_list)) + '\n')
        elif options.mode == 'head' and not head == '<dummy_head>':
            int_list.append(vocab.get(label, 0))
            int_list.append(output_vocab.get(head, output_vocab.get(preterminal, 0)))
            sys.stdout.write(' '.join(map(str, int_list)) + '\n')

        parent_heads.append(vocab.get(head, 0))
        parent_labels.append(vocab.get(label, 0))

        # virtual start/end-of-subtree tag
        if len(xml) > 0:
            if options.right_context:
                start = ET.Element('tree')
                start2 = ET.Element('tree')
                start.set('label', '<start_label>')
                start2.set('label', 'XY')
                start2.text = '<start_head>'
                start.append(start2)
                xml.insert(0, start)
            if options.left_context:
                end = ET.Element('tree')
                end2 = ET.Element('tree')
                end.set('label', '<stop_label>')
                end2.set('label', 'XY')
                end2.text = '<stop_head>'
                end.append(end2)
                xml.append(end)

        heads = []
        preterminals = []
        labels = []

        for child in xml:
            if not len(child):
                # mark that the previous sibling is the head of the structure (the head/label are not repeated because they're also head/label of the parent)
                head_child = '<head_head>'
                preterminal_child = head_child
                child_label = '<head_label>'
            else:
                head_child, preterminal_child = get_head(child, options.ptkvz)
                child_label = child.get('label')

            if head_child is None:
                head_child = '<dummy_head>'

            heads.append(head_child)
            preterminals.append(preterminal_child)
            labels.append(child_label)

        heads_idx = [vocab.get(heads[i], vocab.get(preterminals[i], 0)) for i in range(len(heads))]
        labels_idx = [vocab.get(labels[i], 0) for i in range(len(labels))]

        # ancestor context is same for all children
        up_heads = parent_heads[-options.up_context:]
        up_labels = parent_labels[-options.up_context:]

        for i, child in enumerate(xml):

            # skip some special symbols, but recursively extract n-grams for their children
            if options.mode == 'head' and (heads[i] == '<dummy_head>' or heads[i] == '<head_head>' or heads[i] == '<stop_head>' or heads[i] == '<start_head>'):
                parent_heads.append(vocab.get(heads[i], 0))
                parent_labels.append(vocab.get(labels[i], 0))
                get_syntactic_ngrams(child, options, vocab, output_vocab, parent_heads, parent_labels)
                parent_heads.pop()
                parent_labels.pop()
                continue

            previous_heads = heads_idx[max(0, i-options.left_context):i]
            previous_labels = labels_idx[max(0, i-options.left_context):i]

            subsequent_heads = heads_idx[i+1:i+options.right_context+1]
            subsequent_labels = labels_idx[i+1:i+options.right_context+1]

            if len(previous_heads) < options.left_context:
                previous_heads = [start_head_idx] * (options.left_context-len(previous_heads)) + previous_heads
                previous_labels = [start_label_idx] * (options.left_context-len(previous_labels)) + previous_labels

            if len(subsequent_heads) < options.right_context:
                subsequent_heads = subsequent_heads + [stop_head_idx] * (options.right_context-len(subsequent_heads))
                subsequent_labels = subsequent_labels + [stop_label_idx] * (options.right_context-len(subsequent_labels))

            int_list = []
            int_list.extend(previous_heads)
            int_list.extend(previous_labels)
            int_list.extend(subsequent_heads)
            int_list.extend(subsequent_labels)
            int_list.extend(up_heads)
            int_list.extend(up_labels)
            if options.mode == 'label':
                int_list.append(output_vocab.get(labels[i], 0))
            elif options.mode == 'head':
                int_list.append(vocab.get(labels[i], 0))
                int_list.append(output_vocab.get(heads[i], output_vocab.get(preterminals[i], 0)))

            sys.stdout.write(' '.join(map(str, int_list)) + '\n')

            parent_heads.append(vocab.get(heads[i], vocab.get(preterminals[i], 0)))
            parent_labels.append(vocab.get(labels[i], 0))

            get_syntactic_ngrams(child, options, vocab, output_vocab, parent_heads, parent_labels)

            parent_heads.pop()
            parent_labels.pop()


def load_vocab(path):
    v = {}
    for i, line in enumerate(io.open(path, encoding="UTF-8")):
        v[line.strip()] = i
    return v

if __name__ == '__main__':

    if sys.version_info < (3, 0):
        sys.stderr = codecs.getwriter('UTF-8')(sys.stderr)
        sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)

    options = parse_arguments()

    vocab = load_vocab(options.vocab)

    if options.output_vocab is None:
        sys.stderr.write('no output vocabulary specified; using input vocabulary\n')
        output_vocab = vocab
    else:
        output_vocab = load_vocab(options.output_vocab)

    start_head_idx = vocab.get("<start_head>", 0)
    start_label_idx = vocab.get("<start_label>", 0)
    stop_head_idx = vocab.get("<stop_head>", 0)
    stop_label_idx = vocab.get("<stop_label>", 0)

    i = 0
    for line in sys.stdin:
        if i and not i % 50000:
            sys.stderr.write('.')
        if i and not i % 1000000:
            sys.stderr.write('{0}\n'.format(i))
        if sys.version_info < (3, 0):
            if line == b'\n':
                continue
            # hack for older moses versions with inconsistent encoding of "|"
            line = line.replace(b'&bar;', b'&#124;')
        else:
            if line == '\n':
                continue
            # hack for older moses versions with inconsistent encoding of "|"
            line = line.replace('&bar;', '&#124;')
        xml = ET.fromstring(line)
        get_syntactic_ngrams(xml, options, vocab, output_vocab)
        i += 1
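As a sanity check on the output format: the number of integers per line is fixed by the context sizes. The following hypothetical helper (not part of this commit) mirrors the int_list construction in get_syntactic_ngrams:

def ngram_width(left, right, up, mode):
    """Integers per line written by extract_syntactic_ngrams.py."""
    width = 2 * (left + right + up)  # sibling and ancestor heads + labels
    if mode == 'label':
        return width + 1             # + output label
    if mode == 'head':
        return width + 2             # + current label (input) + output head
    raise ValueError(mode)

# with the script's defaults (left=3, right=0, up=2):
print(ngram_width(3, 0, 2, 'head'))   # 12
print(ngram_width(3, 0, 2, 'label'))  # 11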
169 scripts/training/rdlm/extract_vocab.py Executable file
@ -0,0 +1,169 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Author: Rico Sennrich

# extract 5 vocabulary files from parsed corpus in Moses XML format

from __future__ import print_function, unicode_literals, division
import sys
import codecs
import io
import argparse
from collections import Counter

try:
    from lxml import etree as ET
except ImportError:
    from xml.etree import cElementTree as ET

def parse_arguments():

    help_text = "generate 5 vocabulary files from parsed corpus in Moses XML format\n"
    help_text += " [PREFIX].special: around 40 symbols reserved for RDLM\n"
    help_text += " [PREFIX].preterminals: preterminal symbols\n"
    help_text += " [PREFIX].nonterminals: nonterminal symbols (which are not preterminal)\n"
    help_text += " [PREFIX].terminals: terminal symbols\n"
    help_text += " [PREFIX].all: all of the above\n"

    parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, description=help_text)

    parser.add_argument('--input', '-i', type=argparse.FileType('r'), default=sys.stdin, metavar='PATH',
                        help='input text (default: standard input).')
    parser.add_argument('--output', '-o', type=str, default='vocab', metavar='PREFIX',
                        help='output prefix (default: "vocab")')
    parser.add_argument('--ptkvz', action="store_true",
                        help='special rule for German dependency trees: attach separable verb prefixes to verb')

    args = parser.parse_args()

    return args

def escape_text(s):
    s = s.replace('|', '&#124;')  # factor separator
    s = s.replace('[', '&#91;')   # syntax non-terminal
    s = s.replace(']', '&#93;')   # syntax non-terminal
    s = s.replace('\'', '&apos;') # xml special character
    s = s.replace('"', '&quot;')  # xml special character
    return s

# deterministic heuristic to get head of subtree
def get_head(xml):
    head = None
    preterminal = None
    for child in xml:
        if not len(child):
            if head is not None:
                continue
            preterminal = child.get('label')
            head = escape_text(child.text.strip())

        # hack for split compounds
        elif child[-1].get('label') == 'SEGMENT':
            return escape_text(child[-1].text.strip()), 'SEGMENT'

        elif args.ptkvz and head and child.get('label') == 'avz':
            for grandchild in child:
                if grandchild.get('label') == 'PTKVZ':
                    head = escape_text(grandchild.text.strip()) + head
                    break

    return head, preterminal

def get_vocab(xml):

    if len(xml):

        head, preterminal = get_head(xml)
        if not head:
            head = '<null>'
            preterminal = '<null>'

        heads[head] += 1
        preterminals[preterminal] += 1

        label = xml.get('label')

        nonterminals[label] += 1

        for child in xml:
            if not len(child):
                continue
            get_vocab(child)


if __name__ == '__main__':

    if sys.version_info < (3, 0):
        sys.stderr = codecs.getwriter('UTF-8')(sys.stderr)
        sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
        sys.stdin = codecs.getreader('UTF-8')(sys.stdin)

    args = parse_arguments()

    heads = Counter()
    preterminals = Counter()
    nonterminals = Counter()

    i = 0
    for line in args.input:
        if i and not i % 50000:
            sys.stderr.write('.')
        if i and not i % 1000000:
            sys.stderr.write('{0}\n'.format(i))
        if line == '\n':
            continue

        # hack for older moses versions with inconsistent encoding of "|"
        line = line.replace('&bar;', '&#124;')

        xml = ET.fromstring(line)
        get_vocab(xml)
        i += 1

    special_tokens = ['<unk>', '<null>', '<null_label>', '<null_head>', '<head_label>', '<root_label>', '<start_label>', '<stop_label>', '<head_head>', '<root_head>', '<start_head>', '<dummy_head>', '<stop_head>']

    for i in range(30):
        special_tokens.append('<null_{0}>'.format(i))

    f = io.open(args.output + '.special', 'w', encoding='UTF-8')
    for item in special_tokens:
        f.write(item + '\n')
    f.close()

    f = io.open(args.output + '.preterminals', 'w', encoding='UTF-8')
    for item in sorted(preterminals, key=preterminals.get, reverse=True):
        f.write(item + '\n')
    f.close()

    f = io.open(args.output + '.nonterminals', 'w', encoding='UTF-8')
    for item in sorted(nonterminals, key=nonterminals.get, reverse=True):
        f.write(item + '\n')
    f.close()

    f = io.open(args.output + '.terminals', 'w', encoding='UTF-8')
    for item in sorted(heads, key=heads.get, reverse=True):
        f.write(item + '\n')
    f.close()

    # vocab.all: special tokens first, then nonterminals and preterminals by frequency, then terminals
    f = io.open(args.output + '.all', 'w', encoding='UTF-8')
    special_tokens_set = set(special_tokens)
    for item in sorted(nonterminals, key=nonterminals.get, reverse=True):
        if item not in special_tokens_set:
            special_tokens.append(item)
            special_tokens_set.add(item)
    for item in sorted(preterminals, key=preterminals.get, reverse=True):
        if item not in special_tokens_set:
            special_tokens.append(item)
            special_tokens_set.add(item)
    for item in special_tokens:
        f.write(item + '\n')

    for item in sorted(heads, key=heads.get, reverse=True):
        if item in special_tokens_set:
            continue
        f.write(item + '\n')
    f.close()
65 scripts/training/rdlm/train_model_head.sh Executable file
@ -0,0 +1,65 @@
#!/bin/bash

if [ $# -eq 2 ]; then
  OUTFILE=$1
  WORKDIR=$2
else
  echo "usage: $0 <outfile> <working_directory>"
  exit 1
fi

NPLM=/path/to/nplm
MOSES_ROOT=/path/to/mosesdecoder

INFILE=/path/to/file/in/moses/xml/format
VALIDATIONFILE=/path/to/file/in/moses/xml/format
#TESTFILE1=/path/to/file/in/moses/xml/format
#TESTFILE2=/path/to/file/in/moses/xml/format
PREFIX=$(basename $OUTFILE)

EPOCHS=2
INPUT_VOCAB_SIZE=500000
OUTPUT_VOCAB_SIZE=500000
MINIBATCH_SIZE=1000
NOISE=100
HIDDEN=0
INPUT_EMBEDDING=150
OUTPUT_EMBEDDING=750
THREADS=4
MODE=head
UP_CONTEXT=2
LEFT_CONTEXT=3
RIGHT_CONTEXT=0

mkdir -p $WORKDIR

python $MOSES_ROOT/scripts/training/rdlm/extract_vocab.py --output $WORKDIR/vocab < $INFILE || exit 1

head -n $INPUT_VOCAB_SIZE $WORKDIR/vocab.all > $WORKDIR/vocab.input
head -n $OUTPUT_VOCAB_SIZE $WORKDIR/vocab.all > $WORKDIR/vocab.output

python $MOSES_ROOT/scripts/training/rdlm/extract_syntactic_ngrams.py --vocab $WORKDIR/vocab.input --output_vocab $WORKDIR/vocab.output \
  --mode $MODE --left_context $LEFT_CONTEXT --right_context $RIGHT_CONTEXT --up_context $UP_CONTEXT < $INFILE > $WORKDIR/train.ngrams || exit 1
python $MOSES_ROOT/scripts/training/rdlm/extract_syntactic_ngrams.py --vocab $WORKDIR/vocab.input --output_vocab $WORKDIR/vocab.output \
  --mode $MODE --left_context $LEFT_CONTEXT --right_context $RIGHT_CONTEXT --up_context $UP_CONTEXT < $VALIDATIONFILE > $WORKDIR/validation.ngrams || exit 1

$NPLM/src/trainNeuralNetwork --train_file $WORKDIR/train.ngrams --validation_file $WORKDIR/validation.ngrams \
  --num_epochs $EPOCHS --input_words_file $WORKDIR/vocab.input --output_words_file $WORKDIR/vocab.output --model_prefix $WORKDIR/$PREFIX \
  --input_vocab_size $INPUT_VOCAB_SIZE --output_vocab_size $OUTPUT_VOCAB_SIZE \
  --learning_rate 1 --minibatch_size $MINIBATCH_SIZE --num_noise_samples $NOISE --num_hidden $HIDDEN \
  --input_embedding_dimension $INPUT_EMBEDDING --output_embedding_dimension $OUTPUT_EMBEDDING --num_threads $THREADS || exit 1

python $MOSES_ROOT/scripts/training/rdlm/average_null_embedding.py $NPLM $WORKDIR/$PREFIX.$(($EPOCHS)) $WORKDIR/train.ngrams $OUTFILE || exit 1

if [[ $TESTFILE1 ]]; then
  python $MOSES_ROOT/scripts/training/rdlm/extract_syntactic_ngrams.py --vocab $WORKDIR/vocab.input --output_vocab $WORKDIR/vocab.output \
    --mode $MODE --left_context $LEFT_CONTEXT --right_context $RIGHT_CONTEXT --up_context $UP_CONTEXT < $TESTFILE1 > $WORKDIR/test1.ngrams || exit 1
  $NPLM/src/testNeuralNetwork --test_file $WORKDIR/test1.ngrams --model_file $OUTFILE --minibatch_size $MINIBATCH_SIZE --num_threads $THREADS || exit 1
fi

if [[ $TESTFILE2 ]]; then
  python $MOSES_ROOT/scripts/training/rdlm/extract_syntactic_ngrams.py --vocab $WORKDIR/vocab.input --output_vocab $WORKDIR/vocab.output \
    --mode $MODE --left_context $LEFT_CONTEXT --right_context $RIGHT_CONTEXT --up_context $UP_CONTEXT < $TESTFILE2 > $WORKDIR/test2.ngrams || exit 1
  $NPLM/src/testNeuralNetwork --test_file $WORKDIR/test2.ngrams --model_file $OUTFILE --minibatch_size $MINIBATCH_SIZE --num_threads $THREADS || exit 1
fi
72 scripts/training/rdlm/train_model_label.sh Executable file
@ -0,0 +1,72 @@
#!/bin/bash

if [ $# -eq 2 ]; then
  OUTFILE=$1
  WORKDIR=$2
else
  echo "usage: $0 <outfile> <working_directory>"
  exit 1
fi

NPLM=/path/to/nplm
MOSES_ROOT=/path/to/mosesdecoder

INFILE=/path/to/file/in/moses/xml/format
VALIDATIONFILE=/path/to/file/in/moses/xml/format
#TESTFILE1=/path/to/file/in/moses/xml/format
#TESTFILE2=/path/to/file/in/moses/xml/format
PREFIX=$(basename $OUTFILE)

EPOCHS=1
INPUT_VOCAB_SIZE=500000
OUTPUT_VOCAB_SIZE=75
MINIBATCH_SIZE=1000
NOISE=50
HIDDEN=0
INPUT_EMBEDDING=150
OUTPUT_EMBEDDING=750
THREADS=4
MODE=label
UP_CONTEXT=2
LEFT_CONTEXT=3
RIGHT_CONTEXT=0

mkdir -p $WORKDIR

python $MOSES_ROOT/scripts/training/rdlm/extract_vocab.py --output $WORKDIR/vocab < $INFILE || exit 1

head -n $INPUT_VOCAB_SIZE $WORKDIR/vocab.all > $WORKDIR/vocab.input
# output layer predicts dependency labels only: keep special tokens and
# nonterminals, and drop the symbols that only occur in the input context
cat $WORKDIR/vocab.special $WORKDIR/vocab.nonterminals |
  grep -v "^<null" |
  grep -v "^<root" |
  grep -v "^<start_head" |
  grep -v "^<dummy" |
  grep -v "^<head_head" |
  grep -v "^<stop_head" |
  head -n $OUTPUT_VOCAB_SIZE > $WORKDIR/vocab.output

python $MOSES_ROOT/scripts/training/rdlm/extract_syntactic_ngrams.py --vocab $WORKDIR/vocab.input --output_vocab $WORKDIR/vocab.output \
  --mode $MODE --left_context $LEFT_CONTEXT --right_context $RIGHT_CONTEXT --up_context $UP_CONTEXT < $INFILE > $WORKDIR/train.ngrams || exit 1
python $MOSES_ROOT/scripts/training/rdlm/extract_syntactic_ngrams.py --vocab $WORKDIR/vocab.input --output_vocab $WORKDIR/vocab.output \
  --mode $MODE --left_context $LEFT_CONTEXT --right_context $RIGHT_CONTEXT --up_context $UP_CONTEXT < $VALIDATIONFILE > $WORKDIR/validation.ngrams || exit 1

$NPLM/src/trainNeuralNetwork --train_file $WORKDIR/train.ngrams --validation_file $WORKDIR/validation.ngrams \
  --num_epochs $EPOCHS --input_words_file $WORKDIR/vocab.input --output_words_file $WORKDIR/vocab.output --model_prefix $WORKDIR/$PREFIX \
  --input_vocab_size $INPUT_VOCAB_SIZE --output_vocab_size $OUTPUT_VOCAB_SIZE \
  --learning_rate 1 --minibatch_size $MINIBATCH_SIZE --num_noise_samples $NOISE --num_hidden $HIDDEN \
  --input_embedding_dimension $INPUT_EMBEDDING --output_embedding_dimension $OUTPUT_EMBEDDING --num_threads $THREADS || exit 1

python $MOSES_ROOT/scripts/training/rdlm/average_null_embedding.py $NPLM $WORKDIR/$PREFIX.$(($EPOCHS)) $WORKDIR/train.ngrams $OUTFILE || exit 1

if [[ $TESTFILE1 ]]; then
  python $MOSES_ROOT/scripts/training/rdlm/extract_syntactic_ngrams.py --vocab $WORKDIR/vocab.input --output_vocab $WORKDIR/vocab.output \
    --mode $MODE --left_context $LEFT_CONTEXT --right_context $RIGHT_CONTEXT --up_context $UP_CONTEXT < $TESTFILE1 > $WORKDIR/test1.ngrams || exit 1
  $NPLM/src/testNeuralNetwork --test_file $WORKDIR/test1.ngrams --model_file $OUTFILE --minibatch_size $MINIBATCH_SIZE --num_threads $THREADS || exit 1
fi

if [[ $TESTFILE2 ]]; then
  python $MOSES_ROOT/scripts/training/rdlm/extract_syntactic_ngrams.py --vocab $WORKDIR/vocab.input --output_vocab $WORKDIR/vocab.output \
    --mode $MODE --left_context $LEFT_CONTEXT --right_context $RIGHT_CONTEXT --up_context $UP_CONTEXT < $TESTFILE2 > $WORKDIR/test2.ngrams || exit 1
  $NPLM/src/testNeuralNetwork --test_file $WORKDIR/test2.ngrams --model_file $OUTFILE --minibatch_size $MINIBATCH_SIZE --num_threads $THREADS || exit 1
fi
@ -2153,8 +2153,8 @@ sub create_ini {

  # SyntaxInputWeight FF
  if ($_USE_SYNTAX_INPUT_WEIGHT_FEATURE) {
    $feature_spec .= "SyntaxInputWeight name=SyntaxInputWeight$i\n";
    $weight_spec .= "SyntaxInputWeight$i= 0.1\n";
    $feature_spec .= "SyntaxInputWeight name=SyntaxInputWeight0\n";
    $weight_spec .= "SyntaxInputWeight0= 0.1\n";
  }

  # generation model