This commit is contained in:
Ulrich Germann 2015-03-18 21:04:18 +00:00
commit 6f52340c2b
18 changed files with 1897 additions and 76 deletions

View File

@@ -168,9 +168,10 @@ TreePointer ChartKBestExtractor::GetOutputTree(const Derivation &d)
}
mytree->Combine(previous_trees);
mytree->Unbinarize();
return mytree;
} else {
UTIL_THROW2("Error: TreeStructureFeature active, but no internal tree structure found");
UTIL_THROW2("Error: k-best tree output active, but no internal tree structure found");
}
}

View File

@@ -114,6 +114,7 @@
#ifdef LM_NEURAL
#include "moses/LM/NeuralLMWrapper.h"
#include "moses/LM/RDLM.h"
#include "moses/LM/bilingual-lm/BiLM_NPLM.h"
#endif
@@ -296,6 +297,7 @@ FeatureRegistry::FeatureRegistry()
#endif
#ifdef LM_NEURAL
MOSES_FNAME2("NeuralLM", NeuralLMWrapper);
MOSES_FNAME(RDLM);
MOSES_FNAME2("BilingualNPLM", BilingualLM_NPLM);
#endif
#ifdef LM_DALM

View File

@@ -115,6 +115,44 @@ void InternalTree::Combine(const std::vector<TreePointer> &previous)
}
}
// take a tree with virtual nodes (created with relax-parse --RightBinarize or --LeftBinarize) and reconstruct the original tree
void InternalTree::Unbinarize()
{
// nodes with virtual label cannot be unbinarized
if (m_value.empty() || m_value[0] == '^') {
return;
}
//if node has child that is virtual node, get unbinarized list of children
for (std::vector<TreePointer>::iterator it = m_children.begin(); it != m_children.end(); ++it) {
if (!(*it)->IsTerminal() && (*it)->GetLabel()[0] == '^') {
std::vector<TreePointer> new_children;
GetUnbinarizedChildren(new_children);
m_children = new_children;
break;
}
}
//recursion
for (std::vector<TreePointer>::iterator it = m_children.begin(); it != m_children.end(); ++it) {
(*it)->Unbinarize();
}
}
//get the children of a node in a binarized tree; if a child is virtual, (transitively) replace it with its children
void InternalTree::GetUnbinarizedChildren(std::vector<TreePointer> &ret) const
{
for (std::vector<TreePointer>::const_iterator itx = m_children.begin(); itx != m_children.end(); ++itx) {
const std::string &label = (*itx)->GetLabel();
if (!label.empty() && label[0] == '^') {
(*itx)->GetUnbinarizedChildren(ret);
}
else {
ret.push_back(*itx);
}
}
}
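For intuition, here is a minimal driver sketch of what Unbinarize does to a right-binarized tree. This is hypothetical illustration code, not part of the commit; it assumes InternalTree's bracketed-string constructor, which the commented-out tests in RDLM.cpp later in this commit also use:

#include "moses/FF/InternalTree.h"
#include <iostream>

int main()
{
  using namespace Moses;
  // the '^pred' nodes are virtual nodes created by right binarization
  TreePointer t(new InternalTree("[pred [det [ART die]] [^pred [attr [ADJA neue]] [^pred [NN Zeit]]]]"));
  t->Unbinarize();
  // the virtual nodes are gone; 'pred' now directly dominates det, attr and NN,
  // i.e. roughly [pred [det [ART die]] [attr [ADJA neue]] [NN Zeit]]
  std::cout << t->GetString() << std::endl;
  return 0;
}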
bool InternalTree::FlatSearch(const std::string & label, std::vector<TreePointer>::const_iterator & it) const
{

View File

@@ -38,6 +38,8 @@ public:
std::string GetString(bool start = true) const;
void Combine(const std::vector<TreePointer> &previous);
void Unbinarize();
void GetUnbinarizedChildren(std::vector<TreePointer> &children) const;
const std::string & GetLabel() const {
return m_value;
}
@@ -93,6 +95,68 @@ public:
// if found, 'it' is iterator to first tree node that matches search string, and 'parent' to its parent node
bool RecursiveSearch(const std::vector<NTLabel> & labels, std::vector<TreePointer>::const_iterator & it, InternalTree const* &parent) const;
// Python-like generator that yields next nonterminal leaf on every call
$generator(leafNT)
{
std::vector<TreePointer>::iterator it;
InternalTree* tree;
leafNT(InternalTree* root = 0): tree(root) {}
$emit(std::vector<TreePointer>::iterator)
for (it = tree->GetChildren().begin(); it !=tree->GetChildren().end(); ++it) {
if (!(*it)->IsTerminal() && (*it)->GetLength() == 0) {
$yield(it);
} else if ((*it)->GetLength() > 0) {
if ((*it).get()) { // normal pointer to same object that TreePointer points to
$restart(tree = (*it).get());
}
}
}
$stop;
};
// Python-like generator that yields the parent of the next nonterminal leaf on every call
$generator(leafNTParent)
{
std::vector<TreePointer>::iterator it;
InternalTree* tree;
leafNTParent(InternalTree* root = 0): tree(root) {}
$emit(InternalTree*)
for (it = tree->GetChildren().begin(); it !=tree->GetChildren().end(); ++it) {
if (!(*it)->IsTerminal() && (*it)->GetLength() == 0) {
$yield(tree);
} else if ((*it)->GetLength() > 0) {
if ((*it).get()) {
$restart(tree = (*it).get());
}
}
}
$stop;
};
// Python-like generator that yields the next nonterminal leaf on every call, and also stores the path from the root of the tree to the nonterminal
$generator(leafNTPath)
{
std::vector<TreePointer>::iterator it;
InternalTree* tree;
std::vector<InternalTree*> * path;
leafNTPath(InternalTree* root = NULL, std::vector<InternalTree*> * orig = NULL): tree(root), path(orig) {}
$emit(std::vector<TreePointer>::iterator)
path->push_back(tree);
for (it = tree->GetChildren().begin(); it !=tree->GetChildren().end(); ++it) {
if (!(*it)->IsTerminal() && (*it)->GetLength() == 0) {
path->push_back((*it).get());
$yield(it);
path->pop_back();
} else if ((*it)->GetLength() > 0) {
if ((*it).get()) {
$restart(tree = (*it).get());
}
}
}
path->pop_back();
$stop;
};
};
@@ -113,68 +177,4 @@ public:
};
};
// Python-like generator that yields next nonterminal leaf on every call
$generator(leafNT)
{
std::vector<TreePointer>::iterator it;
InternalTree* tree;
leafNT(InternalTree* root = 0): tree(root) {}
$emit(std::vector<TreePointer>::iterator)
for (it = tree->GetChildren().begin(); it !=tree->GetChildren().end(); ++it) {
if (!(*it)->IsTerminal() && (*it)->GetLength() == 0) {
$yield(it);
} else if ((*it)->GetLength() > 0) {
if ((*it).get()) { // normal pointer to same object that TreePointer points to
$restart(tree = (*it).get());
}
}
}
$stop;
};
// Python-like generator that yields the parent of the next nonterminal leaf on every call
$generator(leafNTParent)
{
std::vector<TreePointer>::iterator it;
InternalTree* tree;
leafNTParent(InternalTree* root = 0): tree(root) {}
$emit(InternalTree*)
for (it = tree->GetChildren().begin(); it !=tree->GetChildren().end(); ++it) {
if (!(*it)->IsTerminal() && (*it)->GetLength() == 0) {
$yield(tree);
} else if ((*it)->GetLength() > 0) {
if ((*it).get()) {
$restart(tree = (*it).get());
}
}
}
$stop;
};
// Python-like generator that yields the next nonterminal leaf on every call, and also stores the path from the root of the tree to the nonterminal
$generator(leafNTPath)
{
std::vector<TreePointer>::iterator it;
InternalTree* tree;
std::vector<InternalTree*> * path;
leafNTPath(InternalTree* root = NULL, std::vector<InternalTree*> * orig = NULL): tree(root), path(orig) {}
$emit(std::vector<TreePointer>::iterator)
path->push_back(tree);
for (it = tree->GetChildren().begin(); it !=tree->GetChildren().end(); ++it) {
if (!(*it)->IsTerminal() && (*it)->GetLength() == 0) {
path->push_back((*it).get());
$yield(it);
path->pop_back();
} else if ((*it)->GetLength() > 0) {
if ((*it).get()) {
$restart(tree = (*it).get());
}
}
}
path->pop_back();
$stop;
};
}
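These coroutine-style macros give the generators a simple call protocol: construct one on a root node, then invoke it repeatedly until it returns false. A minimal usage sketch, mirroring RDLM::AssociateLeafNTs later in this commit:

std::vector<TreePointer>::iterator it;
InternalTree::leafNT next_leafNT(root); // root is an InternalTree*
while (next_leafNT(it)) {
  // '*it' is the next nonterminal leaf, visited left to right
  std::cerr << (*it)->GetLabel() << " ";
}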

View File

@@ -70,6 +70,11 @@ FFState* TreeStructureFeature::EvaluateWhenApplied(const ChartHypothesis& cur_hy
}
mytree->Combine(previous_trees);
bool full_sentence = (mytree->GetChildren().back()->GetLabel() == "</s>" || (mytree->GetChildren().back()->GetLabel() == "SEND" && mytree->GetChildren().back()->GetChildren().back()->GetLabel() == "</s>"));
if (m_binarized && full_sentence) {
mytree->Unbinarize();
}
return new TreeState(mytree);
} else {
UTIL_THROW2("Error: TreeStructureFeature active, but no internal tree structure found");
@@ -77,4 +82,17 @@ FFState* TreeStructureFeature::EvaluateWhenApplied(const ChartHypothesis& cur_hy
}
void TreeStructureFeature::SetParameter(const std::string& key, const std::string& value)
{
std::cerr << "setting: " << this->GetScoreProducerDescription() << " - " << key << "\n";
if (key == "tuneable") {
m_tuneable = Scan<bool>(value);
} else if (key == "filterable") { //ignore
} else if (key == "binarized") { // if trees have been binarized before learning translation model; output unbinarized trees
m_binarized = true;
} else {
UTIL_THROW(util::Exception, "Unknown argument " << key << "=" << value);
}
}
}
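For reference, a hypothetical moses.ini excerpt that enables the new option (per the code above, the value after binarized= is not inspected; its presence switches unbinarized output on):

[feature]
TreeStructureFeature binarized=1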

View File

@@ -34,9 +34,11 @@ class TreeStructureFeature : public StatefulFeatureFunction
{
SyntaxConstraints* m_constraints;
LabelSet* m_labelset;
bool m_binarized;
public:
TreeStructureFeature(const std::string &line)
:StatefulFeatureFunction(0, line) {
:StatefulFeatureFunction(0, line)
, m_binarized(false) {
ReadParameters();
}
~TreeStructureFeature() {
@@ -53,6 +55,8 @@ public:
return true;
}
void SetParameter(const std::string& key, const std::string& value);
void EvaluateInIsolation(const Phrase &source
, const TargetPhrase &targetPhrase
, ScoreComponentCollection &scoreBreakdown

View File

@@ -90,8 +90,13 @@ if $(with-nplm) {
lib nplm : : <search>$(with-nplm)/lib <search>$(with-nplm)/lib64 ;
obj NeuralLMWrapper.o : NeuralLMWrapper.cpp nplm ..//headers : <include>$(with-nplm)/src <include>$(with-nplm)/3rdparty/eigen ;
obj BiLM_NPLM.o : bilingual-lm/BiLM_NPLM.cpp nplm ..//headers : <include>$(with-nplm)/src <include>$(with-nplm)/3rdparty/eigen ;
alias neural : NeuralLMWrapper.o BiLM_NPLM.o nplm : : : <cxxflags>-fopenmp <linkflags>-fopenmp <define>LM_NEURAL ;
obj RDLM.o : RDLM.cpp nplm ..//headers : <include>$(with-nplm)/src <include>$(with-nplm)/3rdparty/eigen ;
alias neural : NeuralLMWrapper.o nplm : : : <cxxflags>-fopenmp <linkflags>-fopenmp <define>LM_NEURAL ;
alias bilinguallm : BiLM_NPLM.o nplm : : : <cxxflags>-fopenmp <linkflags>-fopenmp <define>LM_NEURAL ;
alias rdlm : RDLM.o nplm : : : <cxxflags>-fopenmp <linkflags>-fopenmp <define>LM_NEURAL ;
dependencies += neural ;
dependencies += bilinguallm ;
dependencies += rdlm ;
lmmacros += LM_NEURAL ;
}
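Usage note: assuming the standard Moses build procedure, these targets are enabled by pointing the build at an NPLM checkout, along the lines of:

./bjam --with-nplm=/path/to/nplm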

832
moses/LM/RDLM.cpp Normal file
View File

@@ -0,0 +1,832 @@
#include "RDLM.h"
#include <vector>
#include "moses/StaticData.h"
#include "moses/ScoreComponentCollection.h"
#include "moses/ChartHypothesis.h"
#include "moses/InputFileStream.h"
#include "moses/Util.h"
#include "util/exception.hh"
#include "neuralTM.h"
namespace Moses
{
typedef Eigen::Map<Eigen::Matrix<int,Eigen::Dynamic,1> > EigenMap;
RDLM::~RDLM() {
delete lm_head_base_instance_;
delete lm_label_base_instance_;
}
void RDLM::Load() {
lm_head_base_instance_ = new nplm::neuralTM();
lm_head_base_instance_->read(m_path_head_lm);
m_sharedVocab = lm_head_base_instance_->get_input_vocabulary().words() == lm_head_base_instance_->get_output_vocabulary().words();
// std::cerr << "Does head RDLM share vocabulary for input/output? " << m_sharedVocab << std::endl;
lm_label_base_instance_ = new nplm::neuralTM();
lm_label_base_instance_->read(m_path_label_lm);
if (m_premultiply) {
lm_head_base_instance_->premultiply();
lm_label_base_instance_->premultiply();
}
lm_head_base_instance_->set_cache(m_cacheSize);
lm_label_base_instance_->set_cache(m_cacheSize);
StaticData &staticData = StaticData::InstanceNonConst();
if (staticData.GetTreeStructure() == NULL) {
staticData.SetTreeStructure(this);
}
offset_up_head = 2*m_context_left + 2*m_context_right;
offset_up_label = 2*m_context_left + 2*m_context_right + m_context_up;
size_head = 2*m_context_left + 2*m_context_right + 2*m_context_up + 2;
size_label = 2*m_context_left + 2*m_context_right + 2*m_context_up + 1;
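// Worked example with the defaults (m_context_left=3, m_context_right=0, m_context_up=2):
// offset_up_head = 2*3 + 2*0 = 6, offset_up_label = 6 + 2 = 8,
// size_head = 6 + 0 + 2*2 + 2 = 12, size_label = 11;
// i.e. the head model must be trained as a 12-gram and the label model as an 11-gram.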
UTIL_THROW_IF2(size_head != lm_head_base_instance_->get_order(),
"Error: order of head LM (" << lm_head_base_instance_->get_order() << ") does not match context size specified (left_context=" << m_context_left << ", right_context=" << m_context_right << ", up_context=" << m_context_up << " for a total order of " << size_head << ")");
UTIL_THROW_IF2(size_label != lm_label_base_instance_->get_order(),
"Error: order of label LM (" << lm_label_base_instance_->get_order() << ") does not match context size specified (left_context=" << m_context_left << ", right_context=" << m_context_right << ", up_context=" << m_context_up << " for a total order of " << size_label << ")");
//get int value of commonly used tokens
static_head_null.resize(size_head);
for (unsigned int i = 0; i < size_head; i++) {
char numstr[20];
sprintf(numstr, "<null_%d>", i);
static_head_null[i] = lm_head_base_instance_->lookup_input_word(numstr);
}
static_label_null.resize(size_label);
for (unsigned int i = 0; i < size_label; i++) {
char numstr[20];
sprintf(numstr, "<null_%d>", i);
static_label_null[i] = lm_label_base_instance_->lookup_input_word(numstr);
}
static_dummy_head = lm_head_base_instance_->lookup_input_word(dummy_head);
static_start_head = lm_head_base_instance_->lookup_input_word("<start_head>");
static_start_label = lm_head_base_instance_->lookup_input_word("<start_label>");
static_head_head = lm_head_base_instance_->lookup_input_word("<head_head>");
static_head_label = lm_head_base_instance_->lookup_input_word("<head_label>");
static_head_label_output = lm_label_base_instance_->lookup_output_word("<head_label>");
static_stop_head = lm_head_base_instance_->lookup_input_word("<stop_head>");
static_stop_label = lm_head_base_instance_->lookup_input_word("<stop_label>");
static_stop_label_output = lm_label_base_instance_->lookup_output_word("<stop_label>");
static_start_label_output = lm_label_base_instance_->lookup_output_word("<start_label>");
static_root_head = lm_head_base_instance_->lookup_input_word("<root_head>");
static_root_label = lm_head_base_instance_->lookup_input_word("<root_label>");
// just score provided file, then exit.
if (!m_debugPath.empty()) {
ScoreFile(m_debugPath);
exit(1);
}
// {
// TreePointer mytree (new InternalTree("[vroot [subj [PPER ich]] [VAFIN bin] [pred [det [ART die]] [attr [adv [adv [PTKNEG nicht]] [ADV fast]] [ADJA neue]] [attr [ADJA europäische]] [NN Zeit]]]"));
// TreePointer mytree3 (new InternalTree("[ADJA europäische]"));
// TreePointer mytree4 (new InternalTree("[pred [det [ART die]] [attr [adv [adv [PTKNEG nicht]] [ADV fast]] [ADJA neue]] [attr [ADJA]] [NN Zeit]]]"));
// TreePointer mytree2 (new InternalTree("[vroot [subj [PPER ich]] [VAFIN bin] [pred]]"));
//
// std::vector<int> ancestor_heads;
// std::vector<int> ancestor_labels;
//
// size_t boundary_hash(0);
// boost::array<float, 4> score;
// score.fill(0);
// std::cerr << "scoring: " << mytree3->GetString() << std::endl;
// std::vector<TreePointer> previous_trees;
// TreePointerMap back_pointers = AssociateLeafNTs(mytree3.get(), previous_trees);
// Score(mytree3.get(), back_pointers, score, ancestor_heads, ancestor_labels, boundary_hash);
// std::cerr << "head LM: " << score[0] << "label LM: " << score[2] << " approx: " << score[1] << " - " << score[3] << std::endl;
//
// previous_trees.push_back(mytree3);
// back_pointers = AssociateLeafNTs(mytree4.get(), previous_trees);
// std::cerr << "scoring: " << mytree4->GetString() << std::endl;
// Score(mytree4.get(), back_pointers, score, ancestor_heads, ancestor_labels, boundary_hash);
// std::cerr << "head LM: " << score[0] << "label LM: " << score[2] << " approx: " << score[1] << " - " << score[3] << std::endl;
//
// mytree4->Combine(previous_trees);
// previous_trees.clear();
// previous_trees.push_back(mytree4);
// back_pointers = AssociateLeafNTs(mytree2.get(), previous_trees);
// std::cerr << "scoring: " << mytree2->GetString() << std::endl;
//
// score[1] = 0;
// score[3] = 0;
// Score(mytree2.get(), back_pointers, score, ancestor_heads, ancestor_labels, boundary_hash);
// std::cerr << "head LM: " << score[0] << "label LM: " << score[2] << " approx: " << score[1] << " - " << score[3] << std::endl;
//
// score[0] = 0;
// score[1] = 0;
// score[2] = 0;
// score[3] = 0;
// std::cerr << "scoring: " << mytree->GetString() << std::endl;
//
// Score(mytree.get(), back_pointers, score, ancestor_heads, ancestor_labels, boundary_hash);
// std::cerr << "head LM: " << score[0] << "label LM: " << score[2] << " approx: " << score[1] << " - " << score[3] << std::endl;
//
// }
// UTIL_THROW2("Finished");
//
// }
//
// {
// std::cerr << "BINARIZED\n\n";
// TreePointer mytree (new InternalTree("[vroot [subj [PPER ich]] [^vroot [VAFIN bin] [pred [det [ART die]] [^pred [attr [adv [adv [PTKNEG nicht]] [ADV fast]] [ADJA neue]] [^pred [attr [ADJA europäische]] [NN Zeit]]]]]]"));
// TreePointer mytree3 (new InternalTree("[ADJA europäische]"));
// TreePointer mytree4 (new InternalTree("[^pred [attr [adv [adv [PTKNEG nicht]] [ADV fast]] [ADJA neue]] [^pred [attr [ADJA]] [NN Zeit]]]"));
// TreePointer mytree2 (new InternalTree("[vroot [subj [PPER ich]] [^vroot [VAFIN bin] [pred [det [ART die]] [^pred]]]]"));
//
// std::vector<int> ancestor_heads;
// std::vector<int> ancestor_labels;
//
// size_t boundary_hash(0);
// boost::array<float, 4> score;
// score.fill(0);
// std::cerr << "scoring: " << mytree3->GetString() << std::endl;
// std::vector<TreePointer> previous_trees;
// TreePointerMap back_pointers = AssociateLeafNTs(mytree3.get(), previous_trees);
// Score(mytree3.get(), back_pointers, score, ancestor_heads, ancestor_labels, boundary_hash);
// std::cerr << "head LM: " << score[0] << " label LM: " << score[2] << " approx: " << score[1] << " - " << score[3] << std::endl;
//
// previous_trees.push_back(mytree3);
// back_pointers = AssociateLeafNTs(mytree4.get(), previous_trees);
// std::cerr << "scoring: " << mytree4->GetString() << std::endl;
// Score(mytree4.get(), back_pointers, score, ancestor_heads, ancestor_labels, boundary_hash);
// std::cerr << "head LM: " << score[0] << " label LM: " << score[2] << " approx: " << score[1] << " - " << score[3] << std::endl;
//
// mytree4->Combine(previous_trees);
// previous_trees.clear();
// previous_trees.push_back(mytree4);
// back_pointers = AssociateLeafNTs(mytree2.get(), previous_trees);
// std::cerr << "scoring: " << mytree2->GetString() << std::endl;
//
// score[1] = 0;
// score[3] = 0;
// Score(mytree2.get(), back_pointers, score, ancestor_heads, ancestor_labels, boundary_hash);
// std::cerr << "head LM: " << score[0] << " label LM: " << score[2] << " approx: " << score[1] << " - " << score[3] << std::endl;
//
// score[0] = 0;
// score[1] = 0;
// score[2] = 0;
// score[3] = 0;
// std::cerr << "scoring: " << mytree->GetString() << std::endl;
//
// Score(mytree.get(), back_pointers, score, ancestor_heads, ancestor_labels, boundary_hash);
// std::cerr << "head LM: " << score[0] << " label LM: " << score[2] << " approx: " << score[1] << " - " << score[3] << std::endl;
//
// }
// UTIL_THROW2("Finished");
}
void RDLM::Score(InternalTree* root, const TreePointerMap & back_pointers, boost::array<float, 4> &score, std::vector<int> &ancestor_heads, std::vector<int> &ancestor_labels, size_t &boundary_hash, int num_virtual, int rescoring_levels) const
{
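// score[0]: head LM log-probs computed with full context;
// score[1]: approximate head LM log-probs for nodes with incomplete context
//           (subtracted and re-computed in later hypotheses; cf. prev_approx_head
//           in EvaluateWhenApplied below);
// score[2]/score[3]: the same split for the label LM.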
// ignore terminal nodes
if (root->IsTerminal()) {
return;
}
// ignore glue rules
if (root->GetLabel() == m_glueSymbol) {
// recursion
for (std::vector<TreePointer>::const_iterator it = root->GetChildren().begin(); it != root->GetChildren().end(); ++it)
{
Score(it->get(), back_pointers, score, ancestor_heads, ancestor_labels, boundary_hash, num_virtual, rescoring_levels);
}
return;
}
// ignore virtual nodes (in binarization; except if it's the root)
if (m_binarized && root->GetLabel()[0] == '^' && !ancestor_heads.empty()) {
// recursion
if (root->IsLeafNT() && m_context_up > 1 && ancestor_heads.size()) {
root = back_pointers.find(root)->second.get();
rescoring_levels = m_context_up-1;
}
for (std::vector<TreePointer>::const_iterator it = root->GetChildren().begin(); it != root->GetChildren().end(); ++it) {
Score(it->get(), back_pointers, score, ancestor_heads, ancestor_labels, boundary_hash, num_virtual, rescoring_levels);
}
return;
}
// ignore start/end of sentence tags
if (root->GetLabel() == m_startSymbol || root->GetLabel() == m_endSymbol) {
return;
}
nplm::neuralTM *lm_head = lm_head_backend_.get();
if (!lm_head) {
lm_head = new nplm::neuralTM(*lm_head_base_instance_);
lm_head->set_normalization(m_normalizeHeadLM);
lm_head->set_cache(m_cacheSize);
lm_head_backend_.reset(lm_head);
}
// ignore preterminal node (except if we're scoring root nodes)
if (root->GetLength() == 1 && root->GetChildren()[0]->IsTerminal()) {
// root of tree: score without context
if (ancestor_heads.empty() || (ancestor_heads.size() == m_context_up && ancestor_heads.back() == static_root_head)) {
std::vector<int> ngram_head_null (static_head_null);
ngram_head_null.back() = lm_head->lookup_output_word(root->GetChildren()[0]->GetLabel());
if (m_isPretermBackoff && ngram_head_null.back() == 0) {
ngram_head_null.back() = lm_head->lookup_output_word(root->GetLabel());
}
if (ancestor_heads.size() == m_context_up && ancestor_heads.back() == static_root_head) {
std::vector<int>::iterator it = ngram_head_null.begin();
std::fill_n(it, m_context_left, static_start_head);
it += m_context_left;
std::fill_n(it, m_context_left, static_start_label);
it += m_context_left;
std::fill_n(it, m_context_right, static_stop_head);
it += m_context_right;
std::fill_n(it, m_context_right, static_stop_label);
it += m_context_right;
size_t context_up_nonempty = std::min(m_context_up, ancestor_heads.size());
it = std::copy(ancestor_heads.end()-context_up_nonempty, ancestor_heads.end(), it);
it = std::copy(ancestor_labels.end()-context_up_nonempty, ancestor_labels.end(), it);
}
if (ancestor_labels.size() >= m_context_up && !num_virtual) {
score[0] += FloorScore(lm_head->lookup_ngram(EigenMap(ngram_head_null.data(), ngram_head_null.size())));
}
else {
boost::hash_combine(boundary_hash, ngram_head_null.back());
score[1] += FloorScore(lm_head->lookup_ngram(EigenMap(ngram_head_null.data(), ngram_head_null.size())));
}
}
return;
// we only need to re-visit previous hypotheses if we have more context available.
} else if (root->IsLeafNT()) {
if (m_context_up > 1 && ancestor_heads.size()) {
root = back_pointers.find(root)->second.get();
// ignore preterminal node
if (root->GetLength() == 1 && root->GetChildren()[0]->IsTerminal()) {
return;
}
rescoring_levels = m_context_up-1;
}
else {
return;
}
}
nplm::neuralTM *lm_label = lm_label_backend_.get();
if (!lm_label) {
lm_label = new nplm::neuralTM(*lm_label_base_instance_);
lm_label->set_normalization(m_normalizeLabelLM);
lm_label->set_cache(m_cacheSize);
lm_label_backend_.reset(lm_label);
}
std::pair<int,int> head_ids;
InternalTree* found = GetHead(root, back_pointers, head_ids);
if (found == NULL) {
head_ids = std::make_pair(static_dummy_head, static_dummy_head);
}
size_t context_up_nonempty = std::min(m_context_up, ancestor_heads.size());
const std::string & head_label = root->GetLabel();
bool virtual_head = false;
int reached_end = 0;
int label_idx, label_idx_out;
if (m_binarized && head_label[0] == '^') {
virtual_head = true;
if (m_binarized == 1 || (m_binarized == 3 && head_label[2] == 'l')) {
reached_end = 1; //indicate that we've seen the first symbol of the RHS
}
else if (m_binarized == 2 || (m_binarized == 3 && head_label[2] == 'r')) {
reached_end = 2; // indicate that we've seen the last symbol of the RHS
}
// with 'full' binarization, direction is encoded in 2nd char
std::string clipped_label = (m_binarized == 3) ? head_label.substr(2,head_label.size()-2) : head_label.substr(1,head_label.size()-1);
label_idx = lm_label->lookup_input_word(clipped_label);
label_idx_out = lm_label->lookup_output_word(clipped_label);
}
else {
reached_end = 3; // indicate that we've seen first and last symbol of the RHS
label_idx = lm_label->lookup_input_word(head_label);
label_idx_out = lm_label->lookup_output_word(head_label);
}
int head_idx = (virtual_head && head_ids.first == static_dummy_head) ? static_label_null[offset_up_head+m_context_up-1] : head_ids.first;
// root of tree: score without context
if (ancestor_heads.empty() || (ancestor_heads.size() == m_context_up && ancestor_heads.back() == static_root_head)) {
if (head_idx != static_dummy_head && head_idx != static_head_head) {
std::vector<int> ngram_head_null (static_head_null);
*(ngram_head_null.end()-2) = label_idx;
ngram_head_null.back() = head_ids.second;
if (ancestor_heads.size() == m_context_up && ancestor_heads.back() == static_root_head && !num_virtual) {
std::vector<int>::iterator it = ngram_head_null.begin();
std::fill_n(it, m_context_left, static_start_head);
it += m_context_left;
std::fill_n(it, m_context_left, static_start_label);
it += m_context_left;
std::fill_n(it, m_context_right, static_stop_head);
it += m_context_right;
std::fill_n(it, m_context_right, static_stop_label);
it += m_context_right;
it = std::copy(ancestor_heads.end()-context_up_nonempty, ancestor_heads.end(), it);
it = std::copy(ancestor_labels.end()-context_up_nonempty, ancestor_labels.end(), it);
score[0] += FloorScore(lm_head->lookup_ngram(EigenMap(ngram_head_null.data(), ngram_head_null.size())));
}
else {
boost::hash_combine(boundary_hash, ngram_head_null.back());
score[1] += FloorScore(lm_head->lookup_ngram(EigenMap(ngram_head_null.data(), ngram_head_null.size())));
}
}
std::vector<int> ngram_label_null (static_label_null);
ngram_label_null.back() = label_idx_out;
if (ancestor_heads.size() == m_context_up && ancestor_heads.back() == static_root_head && !num_virtual) {
std::vector<int>::iterator it = ngram_label_null.begin();
std::fill_n(it, m_context_left, static_start_head);
it += m_context_left;
std::fill_n(it, m_context_left, static_start_label);
it += m_context_left;
std::fill_n(it, m_context_right, static_stop_head);
it += m_context_right;
std::fill_n(it, m_context_right, static_stop_label);
it += m_context_right;
it = std::copy(ancestor_heads.end()-context_up_nonempty, ancestor_heads.end(), it);
it = std::copy(ancestor_labels.end()-context_up_nonempty, ancestor_labels.end(), it);
score[2] += FloorScore(lm_label->lookup_ngram(EigenMap(ngram_label_null.data(), ngram_label_null.size())));
}
else {
boost::hash_combine(boundary_hash, ngram_label_null.back());
score[3] += FloorScore(lm_label->lookup_ngram(EigenMap(ngram_label_null.data(), ngram_label_null.size())));
}
}
ancestor_heads.push_back(head_idx);
ancestor_labels.push_back(label_idx);
if (virtual_head) {
num_virtual = m_context_up;
}
else if (num_virtual) {
--num_virtual;
}
// fill ancestor context (same for all children)
if (context_up_nonempty < m_context_up) {
++context_up_nonempty;
}
size_t up_padding = m_context_up - context_up_nonempty;
std::vector<int> ngram (static_label_null);
std::vector<int>::iterator it = ngram.begin() + offset_up_head;
if (up_padding > 0) {
it += up_padding;
}
it = std::copy(ancestor_heads.end() - context_up_nonempty, ancestor_heads.end(), it);
if (up_padding > 0) {
it += up_padding;
}
it = std::copy(ancestor_labels.end() - context_up_nonempty, ancestor_labels.end(), it);
// create vectors of head/label IDs of all children
int num_children = root->GetLength();
// get number of children after unbinarization
if (m_binarized) {
num_children = 0;
UnbinarizedChildren real_children(root, back_pointers, m_binarized);
for (std::vector<TreePointer>::const_iterator it = real_children.begin(); it != real_children.end(); it = ++real_children) {
num_children++;
}
}
if (m_context_right && (reached_end == 1 || reached_end == 3)) num_children++; //also predict start label
if (m_context_left && (reached_end == 2 || reached_end == 3)) num_children++; //also predict end label
std::vector<int> heads(num_children);
std::vector<int> labels(num_children);
std::vector<int> heads_output(num_children);
std::vector<int> labels_output(num_children);
GetChildHeadsAndLabels(root, back_pointers, reached_end, lm_head, lm_label, heads, labels, heads_output, labels_output);
//left padding; only need to add this initially
if (reached_end == 1 || reached_end == 3) {
std::fill_n(ngram.begin(), m_context_left, static_start_head);
std::fill_n(ngram.begin() + m_context_left, m_context_left, static_start_label);
}
size_t left_padding = m_context_left;
size_t left_offset = 0;
size_t right_offset = std::min(heads.size(), m_context_right + 1);
size_t right_padding = m_context_right + 1 - right_offset;
// construct context of label model and predict label
for (size_t i = 0; i != heads.size(); i++) {
std::vector<int>::iterator it = ngram.begin();
if (left_padding > 0) {
it += left_padding;
}
it = std::copy(heads.begin()+left_offset, heads.begin()+i, it);
if (left_padding > 0) {
it += left_padding;
}
it = std::copy(labels.begin()+left_offset, labels.begin()+i, it);
it = std::copy(heads.begin()+i+1, heads.begin()+right_offset, it);
if (right_padding > 0) {
if (reached_end == 2 || reached_end == 3) {
std::fill_n(it, right_padding, static_stop_head);
it += right_padding;
}
else {
std::copy(static_label_null.begin()+offset_up_head-m_context_right-right_padding, static_label_null.begin()-m_context_right+offset_up_head, it);
}
}
it = std::copy(labels.begin()+i+1, labels.begin()+right_offset, it);
if (right_padding > 0) {
if (reached_end == 2 || reached_end == 3) {
std::fill_n(it, right_padding, static_stop_label);
it += right_padding;
}
else {
std::copy(static_label_null.begin()+offset_up_head-right_padding, static_label_null.begin()+offset_up_head, it);
}
}
ngram.back() = labels_output[i];
if (ancestor_labels.size() >= m_context_up && !num_virtual) {
score[2] += FloorScore(lm_label->lookup_ngram(EigenMap(ngram.data(), ngram.size())));
}
else {
boost::hash_combine(boundary_hash, ngram.back());
score[3] += FloorScore(lm_label->lookup_ngram(EigenMap(ngram.data(), ngram.size())));
}
// construct context of head model and predict head
if (heads[i] != static_start_head && heads[i] != static_stop_head && heads[i] != static_dummy_head && heads[i] != static_head_head) {
ngram.back() = labels[i];
ngram.push_back(heads_output[i]);
if (ancestor_labels.size() >= m_context_up && !num_virtual) {
score[0] += FloorScore(lm_head->lookup_ngram(EigenMap(ngram.data(), ngram.size())));
}
else {
boost::hash_combine(boundary_hash, ngram.back());
score[1] += FloorScore(lm_head->lookup_ngram(EigenMap(ngram.data(), ngram.size())));
}
ngram.pop_back();
}
// next time, we need to add less start symbol padding
if (left_padding)
left_padding--;
else
left_offset++;
if (right_offset < heads.size())
right_offset++;
else
right_padding++;
}
if (rescoring_levels == 1) {
ancestor_heads.pop_back();
ancestor_labels.pop_back();
return;
}
// recursion
for (std::vector<TreePointer>::const_iterator it = root->GetChildren().begin(); it != root->GetChildren().end(); ++it)
{
Score(it->get(), back_pointers, score, ancestor_heads, ancestor_labels, boundary_hash, num_virtual, rescoring_levels - 1);
}
ancestor_heads.pop_back();
ancestor_labels.pop_back();
}
InternalTree* RDLM::GetHead(InternalTree* root, const TreePointerMap & back_pointers, std::pair<int,int> & IDs, InternalTree* head_ptr) const
{
InternalTree *tree;
for (std::vector<TreePointer>::const_iterator it = root->GetChildren().begin(); it != root->GetChildren().end(); ++it)
{
if ((*it)->IsLeafNT()) {
tree = back_pointers.find(it->get())->second.get();
}
else {
tree = it->get();
}
if (m_binarized && tree->GetLabel()[0] == '^') {
head_ptr = GetHead(tree, back_pointers, IDs, head_ptr);
if (head_ptr != NULL && !m_isPTKVZ) {
return head_ptr;
}
}
// assumption (only true for dependency parses): each constituent has a preterminal child, and the corresponding terminal is the head
// if a constituent has multiple preterminals, the first one is picked; if it has no preterminals, dummy_head is returned
else if (tree->GetLength() == 1 && tree->GetChildren()[0]->IsTerminal() && head_ptr == NULL) {
head_ptr = tree;
if (!m_isPTKVZ) {
GetIDs(head_ptr->GetChildren()[0]->GetLabel(), head_ptr->GetLabel(), IDs);
return head_ptr;
}
}
// add PTKVZ to lemma of verb
else if (m_isPTKVZ && head_ptr && tree->GetLabel() == "avz") {
InternalTree *tree2;
for (std::vector<TreePointer>::const_iterator it2 = tree->GetChildren().begin(); it2 != tree->GetChildren().end(); ++it2) {
if ((*it2)->IsLeafNT()) {
tree2 = back_pointers.find(it2->get())->second.get();
}
else {
tree2 = it2->get();
}
if (tree2->GetLabel() == "PTKVZ" && tree2->GetLength() == 1 && tree2->GetChildren()[0]->IsTerminal()) {
std::string verb = tree2->GetChildren()[0]->GetLabel() + head_ptr->GetChildren()[0]->GetLabel();
GetIDs(verb, head_ptr->GetLabel(), IDs);
return head_ptr;
}
}
}
}
if (head_ptr != NULL) {
GetIDs(head_ptr->GetChildren()[0]->GetLabel(), head_ptr->GetLabel(), IDs);
}
return head_ptr;
}
void RDLM::GetChildHeadsAndLabels(InternalTree *root, const TreePointerMap & back_pointers, int reached_end, const nplm::neuralTM *lm_head, const nplm::neuralTM *lm_label, std::vector<int> & heads, std::vector<int> & labels, std::vector<int> & heads_output, std::vector<int> & labels_output) const
{
std::pair<int,int> child_ids;
InternalTree* found;
size_t j = 0;
// score start label (if enabled) for all nonterminal nodes (but not for terminal or preterminal nodes)
if (m_context_right && (reached_end == 1 || reached_end == 3)) {
heads[j] = static_start_head;
labels[j] = static_start_label;
labels_output[j] = static_start_label_output;
j++;
}
UnbinarizedChildren real_children(root, back_pointers, m_binarized);
// extract head words / labels
for (std::vector<TreePointer>::const_iterator itx = real_children.begin(); itx != real_children.end(); itx = ++real_children) {
if ((*itx)->IsTerminal()) {
std::cerr << "non-terminal node " << root->GetLabel() << " has a mix of terminal and non-terminal children. This shouldn't happen..." << std::endl;
std::cerr << "children: ";
for (std::vector<TreePointer>::const_iterator itx2 = root->GetChildren().begin(); itx2 != root->GetChildren().end(); ++itx2) {
std::cerr << (*itx2)->GetLabel() << " ";
}
std::cerr << std::endl;
// resize vectors (should we throw exception instead?)
heads.pop_back();
labels.pop_back();
heads_output.pop_back();
labels_output.pop_back();
continue;
}
InternalTree* child = itx->get();
// also go through trees or previous hypotheses to rescore nodes for which more context has become available
if ((*itx)->IsLeafNT()) {
child = back_pointers.find(itx->get())->second.get();
}
// preterminal node
if (child->GetLength() == 1 && child->GetChildren()[0]->IsTerminal()) {
heads[j] = static_head_head;
labels[j] = static_head_label;
labels_output[j] = static_head_label_output;
j++;
continue;
}
found = GetHead(child, back_pointers, child_ids);
if (found == NULL) {
child_ids = std::make_pair(static_dummy_head, static_dummy_head);
}
labels[j] = lm_head->lookup_input_word(child->GetLabel());
labels_output[j] = lm_label->lookup_output_word(child->GetLabel());
heads[j] = child_ids.first;
heads_output[j] = child_ids.second;
j++;
}
// score end label (if enabled) for all nonterminal nodes (but not for terminal or preterminal nodes)
if (m_context_left && (reached_end == 2 || reached_end == 3)) {
heads[j] = static_stop_head;
labels[j] = static_stop_label;
labels_output[j] = static_stop_label_output;
}
}
void RDLM::GetIDs(const std::string & head, const std::string & preterminal, std::pair<int,int> & IDs) const
{
IDs.first = lm_head_base_instance_->lookup_input_word(head);
if (m_isPretermBackoff && IDs.first == 0) {
IDs.first = lm_head_base_instance_->lookup_input_word(preterminal);
}
if (m_sharedVocab) {
IDs.second = IDs.first;
}
else {
IDs.second = lm_head_base_instance_->lookup_output_word(head);
if (m_isPretermBackoff && IDs.second == 0) {
IDs.second = lm_head_base_instance_->lookup_output_word(preterminal);
}
}
}
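// Example of the preterminal backoff above: for an OOV head (say, a rare word for
// which lookup_input_word returns 0, i.e. <unk>), the preterminal tag (e.g. "NN")
// is looked up instead; extract_syntactic_ngrams.py applies the same mapping at
// training time ("OOV terminal symbols are mapped to preterminal").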
void RDLM::PrintInfo(std::vector<int> &ngram, nplm::neuralTM* lm) const
{
for (size_t i = 0; i < ngram.size()-1; i++) {
std::cerr << lm->get_input_vocabulary().words()[ngram[i]] << " ";
}
std::cerr << lm->get_output_vocabulary().words()[ngram.back()] << " ";
for (size_t i = 0; i < ngram.size(); i++) {
std::cerr << ngram[i] << " ";
}
std::cerr << "score: " << lm->lookup_ngram(ngram) << std::endl;
}
RDLM::TreePointerMap RDLM::AssociateLeafNTs(InternalTree* root, const std::vector<TreePointer> &previous) const
{
TreePointerMap ret;
std::vector<TreePointer>::iterator it;
bool found = false;
InternalTree::leafNT next_leafNT(root);
for (std::vector<TreePointer>::const_iterator it_prev = previous.begin(); it_prev != previous.end(); ++it_prev) {
found = next_leafNT(it);
if (found) {
ret[it->get()] = *it_prev;
}
else {
std::cerr << "Warning: leaf nonterminal not found in rule; why did this happen?\n";
}
}
return ret;
}
void RDLM::ScoreFile(std::string &path)
{
InputFileStream inStream(path);
std::string line, null;
std::vector<int> ancestor_heads(m_context_up, static_root_head);
std::vector<int> ancestor_labels(m_context_up, static_root_label);
while(getline(inStream, line)) {
TreePointerMap back_pointers;
boost::array<float, 4> score;
score.fill(0);
InternalTree* mytree (new InternalTree(line));
size_t boundary_hash = 0;
Score(mytree, back_pointers, score, ancestor_heads, ancestor_labels, boundary_hash);
std::cerr << "head LM: " << score[0] << " label LM: " << score[2] << std::endl;
}
}
void RDLM::SetParameter(const std::string& key, const std::string& value)
{
std::cerr << "setting: " << this->GetScoreProducerDescription() << " - " << key << "\n";
if (key == "tuneable") {
m_tuneable = Scan<bool>(value);
} else if (key == "filterable") { //ignore
} else if (key == "path_head_lm") {
m_path_head_lm = value;
} else if (key == "path_label_lm") {
m_path_label_lm = value;
} else if (key == "ptkvz") {
m_isPTKVZ = Scan<bool>(value);
} else if (key == "backoff") {
m_isPretermBackoff = Scan<bool>(value);
} else if (key == "context_up") {
m_context_up = Scan<size_t>(value);
} else if (key == "context_left") {
m_context_left = Scan<size_t>(value);
} else if (key == "context_right") {
m_context_right = Scan<size_t>(value);
} else if (key == "debug_path") {
m_debugPath = value;
} else if (key == "premultiply") {
m_premultiply = Scan<bool>(value);
} else if (key == "rerank") {
m_rerank = Scan<bool>(value);
} else if (key == "normalize_head_lm") {
m_normalizeHeadLM = Scan<bool>(value);
} else if (key == "normalize_label_lm") {
m_normalizeLabelLM = Scan<bool>(value);
} else if (key == "binarized") {
if (value == "left")
m_binarized = 1;
else if (value == "right")
m_binarized = 2;
else if (value == "full")
m_binarized = 3;
else
UTIL_THROW(util::Exception, "Unknown value for argument " << key << "=" << value);
} else if (key == "glue_symbol") {
m_glueSymbol = value;
} else if (key == "cache_size") {
m_cacheSize = Scan<int>(value);
} else {
UTIL_THROW(util::Exception, "Unknown argument " << key << "=" << value);
}
}
FFState* RDLM::EvaluateWhenApplied(const ChartHypothesis& cur_hypo
, int featureID /* used to index the state in the previous hypotheses */
, ScoreComponentCollection* accumulator) const
{
if (const PhraseProperty *property = cur_hypo.GetCurrTargetPhrase().GetProperty("Tree")) {
const std::string *tree = property->GetValueString();
TreePointer mytree (boost::make_shared<InternalTree>(*tree));
//get subtrees (in target order)
std::vector<TreePointer> previous_trees;
float prev_approx_head = 0, prev_approx_label = 0; //approximated (due to lack of context) LM costs from previous hypos
for (size_t pos = 0; pos < cur_hypo.GetCurrTargetPhrase().GetSize(); ++pos) {
const Word &word = cur_hypo.GetCurrTargetPhrase().GetWord(pos);
if (word.IsNonTerminal()) {
size_t nonTermInd = cur_hypo.GetCurrTargetPhrase().GetAlignNonTerm().GetNonTermIndexMap()[pos];
const RDLMState* prev = static_cast<const RDLMState*>(cur_hypo.GetPrevHypo(nonTermInd)->GetFFState(featureID));
previous_trees.push_back(prev->GetTree());
prev_approx_head -= prev->GetApproximateScoreHead();
prev_approx_label -= prev->GetApproximateScoreLabel();
}
}
size_t ff_idx = accumulator->GetIndexes(this).first;
accumulator->PlusEquals(ff_idx, prev_approx_head);
accumulator->PlusEquals(ff_idx+1, prev_approx_label);
bool full_sentence = (mytree->GetChildren().back()->GetLabel() == m_endTag || (mytree->GetChildren().back()->GetLabel() == m_endSymbol && mytree->GetChildren().back()->GetChildren().back()->GetLabel() == m_endTag));
std::vector<int> ancestor_heads ((full_sentence ? m_context_up : 0), static_root_head);
std::vector<int> ancestor_labels ((full_sentence ? m_context_up : 0), static_root_label);
ancestor_heads.reserve(10);
ancestor_labels.reserve(10);
TreePointerMap back_pointers = AssociateLeafNTs(mytree.get(), previous_trees);
boost::array<float, 4> score; // score_head, approx_score_head, score_label, approx_score_label
score.fill(0);
//hash of all boundary symbols (symbols with incomplete context); trees with same hash share state for cube pruning.
size_t boundary_hash = 0;
if (!m_rerank) {
Score(mytree.get(), back_pointers, score, ancestor_heads, ancestor_labels, boundary_hash);
accumulator->PlusEquals(ff_idx, score[0] + score[1]);
accumulator->PlusEquals(ff_idx+1, score[2] + score[3]);
}
mytree->Combine(previous_trees);
if (m_rerank && full_sentence) {
Score(mytree.get(), back_pointers, score, ancestor_heads, ancestor_labels, boundary_hash);
accumulator->PlusEquals(ff_idx, score[0] + score[1]);
accumulator->PlusEquals(ff_idx+1, score[2] + score[3]);
}
if (m_binarized && full_sentence) {
mytree->Unbinarize();
}
return new RDLMState(mytree, score[1], score[3], boundary_hash);
}
else {
UTIL_THROW2("Error: RDLM active, but no internal tree structure found");
}
}
}

245
moses/LM/RDLM.h Normal file
View File

@@ -0,0 +1,245 @@
#include <string>
#include <map>
#include "moses/FF/StatefulFeatureFunction.h"
#include "moses/FF/FFState.h"
#include "moses/FF/InternalTree.h"
#include <boost/thread/tss.hpp>
#include <boost/array.hpp>
// relational dependency language model, described in:
// Sennrich, Rico (2015). Modelling and Optimizing on Syntactic N-Grams for Statistical Machine Translation. Transactions of the Association for Computational Linguistics.
// see 'scripts/training/rdlm' for training scripts
namespace nplm {
class neuralTM;
}
namespace Moses
{
class RDLMState : public TreeState
{
float m_approx_head; //score that was approximated due to lack of context
float m_approx_label;
size_t m_hash;
public:
RDLMState(TreePointer tree, float approx_head, float approx_label, size_t hash)
: TreeState(tree)
, m_approx_head(approx_head)
, m_approx_label(approx_label)
, m_hash(hash)
{}
float GetApproximateScoreHead() const {
return m_approx_head;
}
float GetApproximateScoreLabel() const {
return m_approx_label;
}
size_t GetHash() const {
return m_hash;
}
int Compare(const FFState& other) const {
if (m_hash == static_cast<const RDLMState*>(&other)->GetHash()) return 0;
else if (m_hash > static_cast<const RDLMState*>(&other)->GetHash()) return 1;
else return -1;
}
};
class RDLM : public StatefulFeatureFunction
{
typedef std::map<InternalTree*,TreePointer> TreePointerMap;
nplm::neuralTM* lm_head_base_instance_;
mutable boost::thread_specific_ptr<nplm::neuralTM> lm_head_backend_;
nplm::neuralTM* lm_label_base_instance_;
mutable boost::thread_specific_ptr<nplm::neuralTM> lm_label_backend_;
std::string dummy_head;
std::string m_glueSymbol;
std::string m_startSymbol;
std::string m_endSymbol;
std::string m_endTag;
std::string m_path_head_lm;
std::string m_path_label_lm;
bool m_isPTKVZ;
bool m_isPretermBackoff;
size_t m_context_left;
size_t m_context_right;
size_t m_context_up;
bool m_premultiply;
bool m_rerank;
bool m_normalizeHeadLM;
bool m_normalizeLabelLM;
bool m_sharedVocab;
std::string m_debugPath; // score all trees in the provided file, then exit
int m_binarized;
int m_cacheSize;
size_t offset_up_head;
size_t offset_up_label;
size_t size_head;
size_t size_label;
std::vector<int> static_label_null;
std::vector<int> static_head_null;
int static_dummy_head;
int static_start_head;
int static_start_label;
int static_stop_head;
int static_stop_label;
int static_head_head;
int static_head_label;
int static_root_head;
int static_root_label;
int static_head_label_output;
int static_stop_label_output;
int static_start_label_output;
public:
RDLM(const std::string &line)
: StatefulFeatureFunction(2, line)
, dummy_head("<dummy_head>")
, m_glueSymbol("Q")
, m_startSymbol("SSTART")
, m_endSymbol("SEND")
, m_endTag("</s>")
, m_isPTKVZ(false)
, m_isPretermBackoff(true)
, m_context_left(3)
, m_context_right(0)
, m_context_up(2)
, m_premultiply(true)
, m_rerank(false)
, m_normalizeHeadLM(false)
, m_normalizeLabelLM(false)
, m_sharedVocab(false)
, m_binarized(0)
, m_cacheSize(1000000)
{
ReadParameters();
}
~RDLM();
virtual const FFState* EmptyHypothesisState(const InputType &input) const {
return new RDLMState(TreePointer(), 0, 0, 0);
}
void Score(InternalTree* root, const TreePointerMap & back_pointers, boost::array<float,4> &score, std::vector<int> &ancestor_heads, std::vector<int> &ancestor_labels, size_t &boundary_hash, int num_virtual = 0, int rescoring_levels = 0) const;
InternalTree* GetHead(InternalTree* root, const TreePointerMap & back_pointers, std::pair<int,int> & IDs, InternalTree * head_ptr=NULL) const;
void GetChildHeadsAndLabels(InternalTree *root, const TreePointerMap & back_pointers, int reached_end, const nplm::neuralTM *lm_head, const nplm::neuralTM *lm_labels, std::vector<int> & heads, std::vector<int> & labels, std::vector<int> & heads_output, std::vector<int> & labels_output) const;
void GetIDs(const std::string & head, const std::string & preterminal, std::pair<int,int> & IDs) const;
void ScoreFile(std::string &path); //for debugging
void PrintInfo(std::vector<int> &ngram, nplm::neuralTM* lm) const; //for debugging
TreePointerMap AssociateLeafNTs(InternalTree* root, const std::vector<TreePointer> &previous) const;
bool IsUseable(const FactorMask &mask) const {
return true;
}
void SetParameter(const std::string& key, const std::string& value);
void EvaluateInIsolation(const Phrase &source
, const TargetPhrase &targetPhrase
, ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection &estimatedFutureScore) const {};
void EvaluateWithSourceContext(const InputType &input
, const InputPath &inputPath
, const TargetPhrase &targetPhrase
, const StackVec *stackVec
, ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection *estimatedFutureScore = NULL) const {};
void EvaluateTranslationOptionListWithSourceContext(const InputType &input
, const TranslationOptionList &translationOptionList) const {};
FFState* EvaluateWhenApplied(
const Hypothesis& cur_hypo,
const FFState* prev_state,
ScoreComponentCollection* accumulator) const {UTIL_THROW(util::Exception, "Not implemented");};
FFState* EvaluateWhenApplied(
const ChartHypothesis& /* cur_hypo */,
int /* featureID - used to index the state in the previous hypotheses */,
ScoreComponentCollection* accumulator) const;
void Load();
// Iterator class that yields all children of a node; if a child is a virtual node of a binarized tree, its children are yielded instead.
class UnbinarizedChildren
{
private:
std::vector<TreePointer>::const_iterator iter;
std::vector<TreePointer>::const_iterator _begin;
std::vector<TreePointer>::const_iterator _end;
InternalTree* current;
const TreePointerMap & back_pointers;
bool binarized;
std::vector<std::pair<InternalTree*,std::vector<TreePointer>::const_iterator> > stack;
public:
UnbinarizedChildren(InternalTree* root, const TreePointerMap & pointers, bool binary):
current(root),
back_pointers(pointers),
binarized(binary)
{
stack.reserve(10);
_end = current->GetChildren().end();
iter = current->GetChildren().begin();
// expand virtual node
while (binarized && !(*iter)->GetLabel().empty() && (*iter)->GetLabel()[0] == '^') {
stack.push_back(std::make_pair(current, iter));
// also go through trees or previous hypotheses to rescore nodes for which more context has become available
if ((*iter)->IsLeafNT()) {
current = back_pointers.find(iter->get())->second.get();
}
else {
current = iter->get();
}
iter = current->GetChildren().begin();
}
_begin = iter;
}
std::vector<TreePointer>::const_iterator begin() const { return _begin; }
std::vector<TreePointer>::const_iterator end() const { return _end; }
std::vector<TreePointer>::const_iterator operator++() {
iter++;
if (iter == current->GetChildren().end()) {
while (!stack.empty()) {
std::pair<InternalTree*,std::vector<TreePointer>::const_iterator> & active = stack.back();
current = active.first;
iter = ++active.second;
stack.pop_back();
if (iter != current->GetChildren().end()) {
break;
}
}
if (iter == _end) {
return iter;
}
}
// expand virtual node
while (binarized && !(*iter)->GetLabel().empty() && (*iter)->GetLabel()[0] == '^') {
stack.push_back(std::make_pair(current, iter));
// also go through trees or previous hypotheses to rescore nodes for which more context has become available
if ((*iter)->IsLeafNT()) {
current = back_pointers.find(iter->get())->second.get();
}
else {
current = iter->get();
}
iter = current->GetChildren().begin();
}
return iter;
}
};
};
}
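The increment operator deliberately returns the updated const_iterator, which yields the slightly unusual loop idiom already used in RDLM.cpp above; for reference, a sketch:

UnbinarizedChildren real_children(root, back_pointers, m_binarized);
for (std::vector<TreePointer>::const_iterator it = real_children.begin();
     it != real_children.end(); it = ++real_children) {
  // (*it) is a real (non-virtual) child of root; '^' nodes have been expanded
}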

View File

@@ -0,0 +1,21 @@
#!/usr/bin/env perl
use warnings;
use strict;
use utf8;
use Getopt::Long "GetOptions";
Getopt::Long::config("pass_through");
my ($BIN,$MODEL);
&GetOptions('bin=s' => \$BIN,
'model=s' => \$MODEL); # À la truecase.perl
die("ERROR: specify at least --bin BIN!") unless defined($BIN);
my $cmd = "$BIN";
$cmd .= " -case true:model=$MODEL" if defined($MODEL);
$cmd .= " " . join(' ', @ARGV) if scalar(@ARGV); # Pass other args to $BIN.
system $cmd;

View File

@@ -1,7 +0,0 @@
#!/usr/bin/env sh
$1 \
-input_format egret \
-output_format egret \
-no_egret_weight_normalization \
-case true:model=$3

View File

@@ -0,0 +1,49 @@
RDLM: relational dependency language model
------------------------------------------
This is a language model for the string-to-tree decoder with a dependency grammar.
It should work with any corpus with projective dependency annotation in CoNLL format,
converted into the Moses format with the script mosesdecoder/scripts/training/wrappers/conll2mosesxml.py.
It depends on NPLM for neural network training and querying.
Prerequisites
-------------
Install NPLM and compile Moses with it. See the instructions in the Moses documentation for details:
http://www.statmt.org/moses/?n=FactoredTraining.BuildingLanguageModel
Training
--------
RDLM is designed for string-to-tree decoding with dependency annotation on the target side.
If you have such a system, you can train RDLM on the target side of the same parallel corpus
that is used for training the translation model.
To train the model on additional monolingual data, or to test it on held-out test/dev data,
parse and process it in the same way that the parallel corpus was processed.
This includes tokenization, parsing, truecasing, compound splitting, etc.
RDLM is split into two neural network models, which can be trained with `train_model_head.sh` and `train_model_label.sh`.
Set the paths to NPLM, Moses, and the training/test files in the respective scripts, then execute:
./train_model_head.sh rdlm_head.nnlm working_dir_head
./train_model_label.sh rdlm_label.nnlm working_dir_label
Decoding
--------
To use RDLM during decoding, add the following line to your moses.ini config:
[feature]
RDLM path_head_lm=/path/to/rdlm_head.nnlm path_label_lm=/path/to/rdlm_label.nnlm context_up=2 context_left=3 context_right=0
[weight]
RDLM 0.1 0.1
Reference
---------
Sennrich, Rico (2015). Modelling and Optimizing on Syntactic N-Grams for Statistical Machine Translation.
Transactions of the Association for Computational Linguistics.

View File

@@ -0,0 +1,45 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Author: Rico Sennrich
# average embeddings of special null words for RDLM.
# Usage: average_null_embedding.py NPLM_PATH INPUT_MODEL TRAINING_FILE OUTPUT_MODEL
import sys
import os
import numpy
def load_model(model_file):
return nplm.NeuralLM.from_file(model_file)
def get_weights(path, vocab, len_context):
d = [[0]*vocab for i in range(len_context)]
for line in open(path):
for i, word in enumerate(line.split()[:-1]):
d[i][int(word)] += 1
return d
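# get_weights tallies, for each of the len_context input positions, how often each
# vocabulary id occurs at that position in the training n-grams (the last token of
# each line is the output word and is skipped); the __main__ block below uses these
# counts as weights when averaging the input embeddings that replace each <null_i>.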
if __name__ == "__main__":
nplm_path = sys.argv[1]
model_input = sys.argv[2]
training_instances = sys.argv[3]
model_output = sys.argv[4]
sys.path.append(os.path.join(nplm_path,'python'))
import nplm
model = load_model(model_input)
len_context = len(open(training_instances).readline().split())-1
sys.stderr.write('reading ngrams...')
weights = numpy.array(get_weights(training_instances, len(model.input_embeddings), len_context))
sys.stderr.write('done\n')
for i in range(len_context):
index = model.word_to_index_input['<null_{0}>'.format(i)]
model.input_embeddings[index] = numpy.average(numpy.array(model.input_embeddings), weights=weights[i], axis=0)
sys.stderr.write('writing model...')
model.to_file(open(model_output,'w'))
sys.stderr.write('done\n')

View File

@@ -0,0 +1,262 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Author: Rico Sennrich
# extract syntactic n-grams from dependency treebank in Moses XML format for training RDLM
# the expected format can be produced with mosesdecoder/scripts/training/wrappers/conll2mosesxml.py
# OOV terminal symbols are mapped to preterminal; OOV nonterminals are mapped to 0 (<unk>)
from __future__ import print_function, unicode_literals, division
import sys
import codecs
import io
import argparse
try:
from lxml import etree as ET
except ImportError:
from xml.etree import cElementTree as ET
def parse_arguments():
parser = argparse.ArgumentParser(description="extract syntactic n-grams from parsed corpus in Moses XML format for training RDLM")
parser.add_argument('--mode', type=str, help='predict terminals (head) or dependency labels (label)',
choices=['label', 'head'], required=True)
parser.add_argument('--vocab', metavar='PATH', type=str, required=True,
help='input layer vocabulary file (one item per line; first line \'<unk>\')')
parser.add_argument('--output_vocab', metavar='PATH', type=str,
help='output layer vocabulary file (default: use input layer vocabulary)')
parser.add_argument('--left_context', metavar='INT', type=int,
help='size of context vector for left siblings (default: %(default)s)', default=3)
parser.add_argument('--right_context', metavar='INT', type=int,
help='size of context vector for right siblings (default: %(default)s)', default=0)
parser.add_argument('--up_context', metavar='INT', type=int,
help='size of context vector for ancestors (default: %(default)s)', default=2)
parser.add_argument('--glue_symbol', metavar='STR', type=str, default='Q',
help='glue symbol. Will be skipped during extraction (default: %(default)s)')
parser.add_argument('--start_symbol', metavar='STR', type=str, default='SSTART',
help='sentence start symbol. Will be skipped during extraction (default: %(default)s)')
parser.add_argument('--end_symbol', metavar='STR', type=str, default='SEND',
help='sentence end symbol. Will be skipped during extraction (default: %(default)s)')
parser.add_argument('--ptkvz', action='store_true',
help='special rule for German dependency trees: concatenate separable verb prefix and verb')
return parser.parse_args()
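# Example invocation (hypothetical file names; the parsed corpus is read from stdin,
# one tree per line, and n-grams are written to stdout as space-separated vocabulary
# indices):
#   extract_syntactic_ngrams.py --mode head --vocab vocab < corpus.xml > train.head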
def escape_text(s):
s = s.replace('|','&#124;') # factor separator
s = s.replace('[','&#91;') # syntax non-terminal
s = s.replace(']','&#93;') # syntax non-terminal
s = s.replace('\'','&apos;') # xml special character
s = s.replace('"','&quot;') # xml special character
return s
# deterministic heuristic to get head of subtree
def get_head(xml, add_ptkvz):
head = None
preterminal = None
for child in xml:
if not len(child):
if head is not None:
continue
preterminal = child.get('label')
head = escape_text(child.text.strip())
elif add_ptkvz and head and child.get('label') == 'avz':
for grandchild in child:
if grandchild.get('label') == 'PTKVZ':
head = escape_text(grandchild.text.strip()) + head
break
return head, preterminal
def get_syntactic_ngrams(xml, options, vocab, output_vocab, parent_heads=None, parent_labels=None):
if len(xml):
# skip glue rules
if xml.get('label') == options.glue_symbol or xml.get('label') == options.start_symbol or xml.get('label') == options.end_symbol:
for child in xml:
get_syntactic_ngrams(child, options, vocab, output_vocab, parent_heads, parent_labels)
return
# skip virtual nodes
if xml.get('label') == '<stop_label>' or xml.get('label') == '<start_label>':
return
if not parent_heads:
parent_heads = [vocab.get('<root_head>', 0)] * options.up_context
parent_labels = [vocab.get('<root_label>', 0)] * options.up_context
head, preterminal = get_head(xml, options.ptkvz)
if not head:
head = '<dummy_head>'
preterminal = head
elif head not in vocab:
head = preterminal
label = xml.get('label')
# syntactic n-gram for root node
int_list = []
int_list.extend([start_head_idx] * options.left_context)
int_list.extend([start_label_idx] * options.left_context)
int_list.extend([stop_head_idx] * options.right_context)
int_list.extend([stop_label_idx] * options.right_context)
int_list.extend(parent_heads)
int_list.extend(parent_labels)
if options.mode == 'label':
int_list.append(output_vocab.get(label, 0))
sys.stdout.write(' '.join(map(str, int_list)) + '\n')
elif options.mode == 'head' and not head == '<dummy_head>':
int_list.append(vocab.get(label, 0))
int_list.append(output_vocab.get(head, output_vocab.get(preterminal, 0)))
sys.stdout.write(' '.join(map(str, int_list)) + '\n')
parent_heads.append(vocab.get(head, 0))
parent_labels.append(vocab.get(label, 0))
# virtual start/end-of-subtree tag
if len(xml) > 0:
if options.right_context:
start = ET.Element('tree')
start2 = ET.Element('tree')
start.set('label','<start_label>')
start2.set('label','XY')
start2.text = '<start_head>'
start.append(start2)
xml.insert(0,start)
if options.left_context:
end = ET.Element('tree')
end2 = ET.Element('tree')
end.set('label','<stop_label>')
end2.set('label','XY')
end2.text = '<stop_head>'
end.append(end2)
xml.append(end)
heads = []
preterminals = []
labels = []
for child in xml:
if not len(child):
# mark that the previous sibling is the head of the structure (the head/label are not repeated because they're also head/label of the parent)
head_child = '<head_head>'
preterminal_child = head_child
child_label = '<head_label>'
else:
head_child, preterminal_child = get_head(child, options.ptkvz)
child_label = child.get('label')
if head_child is None:
head_child = '<dummy_head>'
heads.append(head_child)
preterminals.append(preterminal_child)
labels.append(child_label)
heads_idx = [vocab.get(heads[i], vocab.get(preterminals[i], 0)) for i in range(len(heads))]
labels_idx = [vocab.get(labels[i], 0) for i in range(len(labels))]
# ancestor context is the same for all children
up_heads = parent_heads[-options.up_context:]
up_labels = parent_labels[-options.up_context:]
for i,child in enumerate(xml):
# skip some special symbols, but recursively extract n-grams for their children
if options.mode == 'head' and heads[i] in ('<dummy_head>', '<head_head>', '<stop_head>', '<start_head>'):
parent_heads.append(vocab.get(heads[i], 0))
parent_labels.append(vocab.get(labels[i], 0))
get_syntactic_ngrams(child, options, vocab, output_vocab, parent_heads, parent_labels)
parent_heads.pop()
parent_labels.pop()
continue
previous_heads = heads_idx[max(0,i-options.left_context):i]
previous_labels = labels_idx[max(0,i-options.left_context):i]
subsequent_heads = heads_idx[i+1:i+options.right_context+1]
subsequent_labels = labels_idx[i+1:i+options.right_context+1]
if len(previous_heads) < options.left_context:
previous_heads = [start_head_idx] * (options.left_context-len(previous_heads)) + previous_heads
previous_labels = [start_label_idx] * (options.left_context-len(previous_labels)) + previous_labels
if len(subsequent_heads) < options.right_context:
subsequent_heads = subsequent_heads + [stop_head_idx] * (options.right_context-len(subsequent_heads))
subsequent_labels = subsequent_labels + [stop_label_idx] * (options.right_context-len(subsequent_labels))
int_list = []
int_list.extend(previous_heads)
int_list.extend(previous_labels)
int_list.extend(subsequent_heads)
int_list.extend(subsequent_labels)
int_list.extend(up_heads)
int_list.extend(up_labels)
if options.mode == 'label':
int_list.append(output_vocab.get(labels[i], 0))
elif options.mode == 'head':
int_list.append(vocab.get(labels[i], 0))
int_list.append(output_vocab.get(heads[i], output_vocab.get(preterminals[i], 0)))
sys.stdout.write(' '.join(map(str, int_list)) + '\n')
parent_heads.append(vocab.get(heads[i], vocab.get(preterminals[i], 0)))
parent_labels.append(vocab.get(labels[i], 0))
get_syntactic_ngrams(child, options, vocab, output_vocab, parent_heads, parent_labels)
parent_heads.pop()
parent_labels.pop()
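# shape of each printed n-gram, as a hedged worked example (assuming the context
# sizes used by the training scripts below: left_context=3, right_context=0,
# up_context=2): 3 left-sibling head IDs, 3 left-sibling label IDs, 2 ancestor
# head IDs, 2 ancestor label IDs, then either the current label ID plus the
# head's output ID (mode 'head': 12 integers) or only the label's output ID
# (mode 'label': 11 integers); the last integer is the prediction target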
def load_vocab(path):
v = {}
for i,line in enumerate(io.open(path, encoding="UTF-8")):
v[line.strip()] = i
return v
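# hedged example (hypothetical file): a vocab file containing the lines
# <unk>, <null>, the  ->  {'<unk>': 0, '<null>': 1, 'the': 2}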
if __name__ == '__main__':
if sys.version_info < (3, 0):
sys.stderr = codecs.getwriter('UTF-8')(sys.stderr)
sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
options = parse_arguments()
vocab = load_vocab(options.vocab)
if options.output_vocab is None:
sys.stderr.write('no output vocabulary specified; using input vocabulary\n')
output_vocab = vocab
else:
output_vocab = load_vocab(options.output_vocab)
start_head_idx = vocab.get("<start_head>", 0)
start_label_idx = vocab.get("<start_label>", 0)
stop_head_idx = vocab.get("<stop_head>", 0)
stop_label_idx = vocab.get("<stop_label>", 0)
i = 0
for line in sys.stdin:
if i and not i % 50000:
sys.stderr.write('.')
if i and not i % 1000000:
sys.stderr.write('{0}\n'.format(i))
if sys.version_info < (3, 0):
if line == b'\n':
continue
# hack for older moses versions with inconsistent encoding of "|"
line = line.replace(b'&bar;', b'&#124;')
else:
if line == '\n':
continue
# hack for older moses versions with inconsistent encoding of "|"
line = line.replace('&bar;', '&#124;')
xml = ET.fromstring(line)
get_syntactic_ngrams(xml, options, vocab, output_vocab)
i += 1
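# hedged usage sketch (paths are placeholders; cf. the training scripts below):
#   extract_syntactic_ngrams.py --vocab vocab.input --output_vocab vocab.output \
#     --mode head --left_context 3 --right_context 0 --up_context 2 \
#     < corpus.xml > train.ngrams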

View File

@@ -0,0 +1,169 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Author: Rico Sennrich
# extract 5 vocabulary files from a parsed corpus in Moses XML format
from __future__ import print_function, unicode_literals, division
import sys
import codecs
import io
import argparse
from collections import Counter
try:
from lxml import etree as ET
except ImportError:
from xml.etree import cElementTree as ET
def parse_arguments():
help_text = "generate 5 vocabulary files from parsed corpus in moses XML format\n"
help_text += " [PREFIX].special: around 40 symbols reserved for RDLM\n";
help_text += " [PREFIX].preterminals: preterminal symbols\n";
help_text += " [PREFIX].nonterminals: nonterminal symbols (which are not preterminal)\n";
help_text += " [PREFIX].terminals: terminal symbols\n";
help_text += " [PREFIX].all: all of the above\n"
parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, description=help_text)
parser.add_argument('--input', '-i', type=argparse.FileType('r'), default=sys.stdin, metavar='PATH',
help='input text (default: standard input).')
parser.add_argument('--output', '-o', type=str, default='vocab', metavar='PREFIX',
help='output prefix (default: "vocab")')
parser.add_argument('--ptkvz', action="store_true",
help='special rule for German dependency trees: attach separable verb prefixes to verb')
args = parser.parse_args()
return args
def escape_text(s):
s = s.replace('|','&#124;') # factor separator
s = s.replace('[','&#91;') # syntax non-terminal
s = s.replace(']','&#93;') # syntax non-terminal
s = s.replace('\'','&apos;') # xml special character
s = s.replace('"','&quot;') # xml special character
return s
# deterministic heuristic to get head of subtree
def get_head(xml):
head = None
preterminal = None
for child in xml:
if not len(child):
if head is not None:
continue
preterminal = child.get('label')
head = escape_text(child.text.strip())
# hack for split compounds
elif child[-1].get('label') == 'SEGMENT':
return escape_text(child[-1].text.strip()), 'SEGMENT'
elif args.ptkvz and head and child.get('label') == 'avz':
for grandchild in child:
if grandchild.get('label') == 'PTKVZ':
head = escape_text(grandchild.text.strip()) + head
break
return head, preterminal
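# hedged example of the split-compound hack above (hypothetical tree): a child
# <tree label="comp"><tree label="SEGMENT">Bahn</tree><tree label="SEGMENT">hof</tree></tree>
# short-circuits the search, returning ('hof', 'SEGMENT') as head of the subtree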
def get_vocab(xml):
if len(xml):
head, preterminal = get_head(xml)
if not head:
head = '<null>'
preterminal = '<null>'
heads[head] += 1
preterminals[preterminal] += 1
label = xml.get('label')
nonterminals[label] += 1
for child in xml:
if not len(child):
continue
get_vocab(child)
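# hedged counting example (hypothetical tree): for
# <tree label="NP"><tree label="DET">the</tree><tree label="NN">cat</tree></tree>
# get_vocab counts NP as a nonterminal, DET as a preterminal, and 'the' as a head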
if __name__ == '__main__':
if sys.version_info < (3, 0):
sys.stderr = codecs.getwriter('UTF-8')(sys.stderr)
sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
sys.stdin = codecs.getreader('UTF-8')(sys.stdin)
args = parse_arguments()
heads = Counter()
preterminals = Counter()
nonterminals = Counter()
i = 0
for line in args.input:
if i and not i % 50000:
sys.stderr.write('.')
if i and not i % 1000000:
sys.stderr.write('{0}\n'.format(i))
if line == '\n':
continue
# hack for older moses versions with inconsistent encoding of "|"
line = line.replace('&bar;', '&#124;')
xml = ET.fromstring(line)
get_vocab(xml)
i += 1
special_tokens = ['<unk>', '<null>', '<null_label>', '<null_head>', '<head_label>', '<root_label>', '<start_label>', '<stop_label>', '<head_head>', '<root_head>', '<start_head>', '<dummy_head>', '<stop_head>']
for i in range(30):
special_tokens.append('<null_{0}>'.format(i))
f = io.open(args.output + '.special', 'w', encoding='UTF-8')
for item in special_tokens:
f.write(item + '\n')
f.close()
f = io.open(args.output + '.preterminals', 'w', encoding='UTF-8')
for item in sorted(preterminals, key=preterminals.get, reverse=True):
f.write(item + '\n')
f.close()
f = io.open(args.output + '.nonterminals', 'w', encoding='UTF-8')
for item in sorted(nonterminals, key=nonterminals.get, reverse=True):
f.write(item + '\n')
f.close()
f = io.open(args.output + '.terminals', 'w', encoding='UTF-8')
for item in sorted(heads, key=heads.get, reverse=True):
f.write(item + '\n')
f.close()
f = io.open(args.output + '.all', 'w', encoding='UTF-8')
special_tokens_set = set(special_tokens)
for item in sorted(nonterminals, key=nonterminals.get, reverse=True):
if item not in special_tokens_set:
special_tokens.append(item)
special_tokens_set.add(item)
for item in sorted(preterminals, key=preterminals.get, reverse=True):
if item not in special_tokens_set:
special_tokens.append(item)
special_tokens_set.add(item)
for item in special_tokens:
f.write(item + '\n')
i = len(special_tokens)
for item in sorted(heads, key=heads.get, reverse=True):
if item in special_tokens_set:
continue
i += 1
f.write(item + '\n')
f.close()
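# hedged usage sketch (paths are placeholders; cf. the training scripts below):
#   extract_vocab.py --output workdir/vocab < corpus.xml
# writes workdir/vocab.special, .preterminals, .nonterminals, .terminals and .all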

View File

@@ -0,0 +1,65 @@
#!/bin/bash
if [ $# -eq 2 ]; then
OUTFILE=$1
WORKDIR=$2
else
echo "usage: $0 <outfile> <working_directory>"
exit 1
fi
NPLM=/path/to/nplm
MOSES_ROOT=/path/to/mosesdecoder
INFILE=/path/to/file/in/moses/xml/format
VALIDATIONFILE=/path/to/file/in/moses/xml/format
#TESTFILE1=/path/to/file/in/moses/xml/format
#TESTFILE2=/path/to/file/in/moses/xml/format
PREFIX=$(basename $OUTFILE)
EPOCHS=2
INPUT_VOCAB_SIZE=500000
OUTPUT_VOCAB_SIZE=500000
MINIBATCH_SIZE=1000
NOISE=100
HIDDEN=0
INPUT_EMBEDDING=150
OUTPUT_EMBEDDING=750
THREADS=4
MODE=head
UP_CONTEXT=2
LEFT_CONTEXT=3
RIGHT_CONTEXT=0
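# with MODE=head and these context sizes, each line in train.ngrams should be
# 12 integers (3+3 sibling IDs, 2+2 ancestor IDs, a label ID, and the head ID
# as prediction target); a hedged reading of extract_syntactic_ngrams.py above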
mkdir -p $WORKDIR
python $MOSES_ROOT/scripts/training/rdlm/extract_vocab.py --output $WORKDIR/vocab < $INFILE || exit 1
head -n $INPUT_VOCAB_SIZE $WORKDIR/vocab.all > $WORKDIR/vocab.input
head -n $OUTPUT_VOCAB_SIZE $WORKDIR/vocab.all > $WORKDIR/vocab.output
python $MOSES_ROOT/scripts/training/rdlm/extract_syntactic_ngrams.py --vocab $WORKDIR/vocab.input --output_vocab $WORKDIR/vocab.output \
--mode $MODE --left_context $LEFT_CONTEXT --right_context $RIGHT_CONTEXT --up_context $UP_CONTEXT < $INFILE > $WORKDIR/train.ngrams || exit 1
python $MOSES_ROOT/scripts/training/rdlm/extract_syntactic_ngrams.py --vocab $WORKDIR/vocab.input --output_vocab $WORKDIR/vocab.output \
--mode $MODE --left_context $LEFT_CONTEXT --right_context $RIGHT_CONTEXT --up_context $UP_CONTEXT < $VALIDATIONFILE > $WORKDIR/validation.ngrams || exit 1
$NPLM/src/trainNeuralNetwork --train_file $WORKDIR/train.ngrams --validation_file $WORKDIR/validation.ngrams \
--num_epochs $EPOCHS --input_words_file $WORKDIR/vocab.input --output_words_file $WORKDIR/vocab.output --model_prefix $WORKDIR/$PREFIX \
--input_vocab_size $INPUT_VOCAB_SIZE --output_vocab_size $OUTPUT_VOCAB_SIZE \
--learning_rate 1 --minibatch_size $MINIBATCH_SIZE --num_noise_samples $NOISE --num_hidden $HIDDEN \
--input_embedding_dimension $INPUT_EMBEDDING --output_embedding_dimension $OUTPUT_EMBEDDING --num_threads $THREADS || exit 1
python $MOSES_ROOT/scripts/training/rdlm/average_null_embedding.py $NPLM $WORKDIR/$PREFIX.$(($EPOCHS)) $WORKDIR/train.ngrams $OUTFILE || exit 1
if [[ $TESTFILE1 ]]; then
python $MOSES_ROOT/scripts/training/rdlm/extract_syntactic_ngrams.py --vocab $WORKDIR/vocab.input --output_vocab $WORKDIR/vocab.output \
--mode $MODE --left_context $LEFT_CONTEXT --right_context $RIGHT_CONTEXT --up_context $UP_CONTEXT < $TESTFILE1 > $WORKDIR/test1.ngrams || exit 1
$NPLM/src/testNeuralNetwork --test_file $WORKDIR/test1.ngrams --model_file $OUTFILE --minibatch_size $MINIBATCH_SIZE --num_threads $THREADS || exit 1
fi
if [[ $TESTFILE2 ]]; then
python $MOSES_ROOT/scripts/training/rdlm/extract_syntactic_ngrams.py --vocab $WORKDIR/vocab.input --output_vocab $WORKDIR/vocab.output \
--mode $MODE --left_context $LEFT_CONTEXT --right_context $RIGHT_CONTEXT --up_context $UP_CONTEXT < $TESTFILE2 > $WORKDIR/test2.ngrams || exit 1
$NPLM/src/testNeuralNetwork --test_file $WORKDIR/test2.ngrams --model_file $OUTFILE --minibatch_size $MINIBATCH_SIZE --num_threads $THREADS || exit 1
fi
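# hedged usage sketch (script name is a placeholder; see the usage message above):
#   ./train_rdlm_head.sh rdlm_head.model workdir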

View File

@@ -0,0 +1,72 @@
#!/bin/bash
if [ $# -eq 2 ]; then
OUTFILE=$1
WORKDIR=$2
else
echo "usage: $0 <outfile> <working_directory>"
exit 1
fi
NPLM=/path/to/nplm
MOSES_ROOT=/path/to/mosesdecoder
INFILE=/path/to/file/in/moses/xml/format
VALIDATIONFILE=/path/to/file/in/moses/xml/format
#TESTFILE1=/path/to/file/in/moses/xml/format
#TESTFILE2=/path/to/file/in/moses/xml/format
PREFIX=$(basename $OUTFILE)
EPOCHS=1
INPUT_VOCAB_SIZE=500000
OUTPUT_VOCAB_SIZE=75
MINIBATCH_SIZE=1000
NOISE=50
HIDDEN=0
INPUT_EMBEDDING=150
OUTPUT_EMBEDDING=750
THREADS=4
MODE=label
UP_CONTEXT=2
LEFT_CONTEXT=3
RIGHT_CONTEXT=0
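# with MODE=label and these context sizes, each line in train.ngrams should be
# 11 integers (3+3 sibling IDs, 2+2 ancestor IDs, and the label ID as
# prediction target); a hedged reading of extract_syntactic_ngrams.py above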
mkdir -p $WORKDIR
python $MOSES_ROOT/scripts/training/rdlm/extract_vocab.py --output $WORKDIR/vocab < $INFILE || exit 1
head -n $INPUT_VOCAB_SIZE $WORKDIR/vocab.all > $WORKDIR/vocab.input
cat $WORKDIR/vocab.special $WORKDIR/vocab.nonterminals |
grep -v "^<null" |
grep -v "^<root" |
grep -v "^<start_head" |
grep -v "^<dummy" |
grep -v "^<head_head" |
grep -v "^<stop_head" |
head -n $OUTPUT_VOCAB_SIZE > $WORKDIR/vocab.output
python $MOSES_ROOT/scripts/training/rdlm/extract_syntactic_ngrams.py --vocab $WORKDIR/vocab.input --output_vocab $WORKDIR/vocab.output \
--mode $MODE --left_context $LEFT_CONTEXT --right_context $RIGHT_CONTEXT --up_context $UP_CONTEXT < $INFILE > $WORKDIR/train.ngrams || exit 1
python $MOSES_ROOT/scripts/training/rdlm/extract_syntactic_ngrams.py --vocab $WORKDIR/vocab.input --output_vocab $WORKDIR/vocab.output \
--mode $MODE --left_context $LEFT_CONTEXT --right_context $RIGHT_CONTEXT --up_context $UP_CONTEXT < $VALIDATIONFILE > $WORKDIR/validation.ngrams || exit 1
$NPLM/src/trainNeuralNetwork --train_file $WORKDIR/train.ngrams --validation_file $WORKDIR/validation.ngrams \
--num_epochs $EPOCHS --input_words_file $WORKDIR/vocab.input --output_words_file $WORKDIR/vocab.output --model_prefix $WORKDIR/$PREFIX \
--input_vocab_size $INPUT_VOCAB_SIZE --output_vocab_size $OUTPUT_VOCAB_SIZE \
--learning_rate 1 --minibatch_size $MINIBATCH_SIZE --num_noise_samples $NOISE --num_hidden $HIDDEN \
--input_embedding_dimension $INPUT_EMBEDDING --output_embedding_dimension $OUTPUT_EMBEDDING --num_threads $THREADS || exit 1
python $MOSES_ROOT/scripts/training/rdlm/average_null_embedding.py $NPLM $WORKDIR/$PREFIX.$(($EPOCHS)) $WORKDIR/train.ngrams $OUTFILE || exit 1
if [[ $TESTFILE1 ]]; then
python $MOSES_ROOT/scripts/training/rdlm/extract_syntactic_ngrams.py --vocab $WORKDIR/vocab.input --output_vocab $WORKDIR/vocab.output \
--mode $MODE --left_context $LEFT_CONTEXT --right_context $RIGHT_CONTEXT --up_context $UP_CONTEXT < $TESTFILE1 > $WORKDIR/test1.ngrams || exit 1
$NPLM/src/testNeuralNetwork --test_file $WORKDIR/test1.ngrams --model_file $OUTFILE --minibatch_size $MINIBATCH_SIZE --num_threads $THREADS || exit 1
fi
if [[ $TESTFILE2 ]]; then
python $MOSES_ROOT/scripts/training/rdlm/extract_syntactic_ngrams.py --vocab $WORKDIR/vocab.input --output_vocab $WORKDIR/vocab.output \
--mode $MODE --left_context $LEFT_CONTEXT --right_context $RIGHT_CONTEXT --up_context $UP_CONTEXT < $TESTFILE2 > $WORKDIR/test2.ngrams || exit 1
$NPLM/src/testNeuralNetwork --test_file $WORKDIR/test2.ngrams --model_file $OUTFILE --minibatch_size $MINIBATCH_SIZE --num_threads $THREADS || exit 1
fi
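# hedged usage sketch (script name is a placeholder; see the usage message above):
#   ./train_rdlm_label.sh rdlm_label.model workdir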

View File

@@ -2153,8 +2153,8 @@ sub create_ini {
# SyntaxInputWeight FF
if ($_USE_SYNTAX_INPUT_WEIGHT_FEATURE) {
$feature_spec .= "SyntaxInputWeight name=SyntaxInputWeight$i\n";
$weight_spec .= "SyntaxInputWeight$i= 0.1\n";
$feature_spec .= "SyntaxInputWeight name=SyntaxInputWeight0\n";
$weight_spec .= "SyntaxInputWeight0= 0.1\n";
}
# generation model