#include "RDLM.h"
|
|
#include <vector>
|
|
#include "moses/StaticData.h"
|
|
#include "moses/ScoreComponentCollection.h"
|
|
#include "moses/ChartHypothesis.h"
|
|
#include "moses/InputFileStream.h"
|
|
#include "moses/Util.h"
|
|
#include "util/exception.hh"
|
|
#include "neuralTM.h"
|
|
|
|
namespace Moses
|
|
{
|
|
|
|
namespace rdlm
|
|
{
|
|
ThreadLocal::ThreadLocal(nplm::neuralTM *lm_head_base_instance_, nplm::neuralTM *lm_label_base_instance_, bool normalizeHeadLM, bool normalizeLabelLM, int cacheSize)
{
  lm_head = new nplm::neuralTM(*lm_head_base_instance_);
  lm_label = new nplm::neuralTM(*lm_label_base_instance_);
  lm_head->set_normalization(normalizeHeadLM);
  lm_label->set_normalization(normalizeLabelLM);
  lm_head->set_cache(cacheSize);
  lm_label->set_cache(cacheSize);
}

ThreadLocal::~ThreadLocal()
{
  delete lm_head;
  delete lm_label;
}

}
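// Wraps a contiguous std::vector<int> buffer as an Eigen column vector
// without copying, in the form nplm::neuralTM::lookup_ngram expects.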
typedef Eigen::Map<Eigen::Matrix<int,Eigen::Dynamic,1> > EigenMap;

RDLM::~RDLM()
{
  delete lm_head_base_instance_;
  delete lm_label_base_instance_;
}
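// Loads the head and label NPLM models, registers this feature as the
// decoder's tree-structure feature, and caches the integer IDs of the
// special tokens (<null_N>, <start_*>, <stop_*>, <head_*>, <root_*>)
// used to build n-gram contexts during scoring.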
void RDLM::Load(AllOptions::ptr const& opts)
{

  lm_head_base_instance_ = new nplm::neuralTM();
  lm_head_base_instance_->read(m_path_head_lm);

  m_sharedVocab = lm_head_base_instance_->get_input_vocabulary().words() == lm_head_base_instance_->get_output_vocabulary().words();
  //  std::cerr << "Does head RDLM share vocabulary for input/output? " << m_sharedVocab << std::endl;

  lm_label_base_instance_ = new nplm::neuralTM();
  lm_label_base_instance_->read(m_path_label_lm);

  if (m_premultiply) {
    lm_head_base_instance_->premultiply();
    lm_label_base_instance_->premultiply();
  }

  lm_head_base_instance_->set_cache(m_cacheSize);
  lm_label_base_instance_->set_cache(m_cacheSize);

  StaticData &staticData = StaticData::InstanceNonConst();
  if (staticData.GetTreeStructure() == NULL) {
    staticData.SetTreeStructure(this);
  }
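  // Layout of the context vectors built during scoring:
  // [left sibling heads][left sibling labels][right sibling heads]
  // [right sibling labels][ancestor heads][ancestor labels][prediction].
  // The head model conditions on one extra symbol (the label of the node
  // whose head is predicted), hence size_head == size_label + 1.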
  offset_up_head = 2*m_context_left + 2*m_context_right;
  offset_up_label = 2*m_context_left + 2*m_context_right + m_context_up;

  size_head = 2*m_context_left + 2*m_context_right + 2*m_context_up + 2;
  size_label = 2*m_context_left + 2*m_context_right + 2*m_context_up + 1;

  UTIL_THROW_IF2(size_head != lm_head_base_instance_->get_order(),
                 "Error: order of head LM (" << lm_head_base_instance_->get_order() << ") does not match context size specified (left_context=" << m_context_left << ", right_context=" << m_context_right << ", up_context=" << m_context_up << " for a total order of " << size_head << ")");
  UTIL_THROW_IF2(size_label != lm_label_base_instance_->get_order(),
                 "Error: order of label LM (" << lm_label_base_instance_->get_order() << ") does not match context size specified (left_context=" << m_context_left << ", right_context=" << m_context_right << ", up_context=" << m_context_up << " for a total order of " << size_label << ")");
  //get int value of commonly used tokens
  static_head_null.resize(size_head);
  for (unsigned int i = 0; i < size_head; i++) {
    char numstr[20];
    sprintf(numstr, "<null_%u>", i); // %u: i is unsigned
    static_head_null[i] = lm_head_base_instance_->lookup_input_word(numstr);
  }

  static_label_null.resize(size_label);
  for (unsigned int i = 0; i < size_label; i++) {
    char numstr[20];
    sprintf(numstr, "<null_%u>", i);
    static_label_null[i] = lm_label_base_instance_->lookup_input_word(numstr);
  }

  static_dummy_head = lm_head_base_instance_->lookup_input_word(dummy_head.GetString(0).as_string());

  static_start_head = lm_head_base_instance_->lookup_input_word("<start_head>");
  static_start_label = lm_head_base_instance_->lookup_input_word("<start_label>");

  static_head_head = lm_head_base_instance_->lookup_input_word("<head_head>");
  static_head_label = lm_head_base_instance_->lookup_input_word("<head_label>");
  static_head_label_output = lm_label_base_instance_->lookup_output_word("<head_label>");

  static_stop_head = lm_head_base_instance_->lookup_input_word("<stop_head>");
  static_stop_label = lm_head_base_instance_->lookup_input_word("<stop_label>");
  static_stop_label_output = lm_label_base_instance_->lookup_output_word("<stop_label>");
  static_start_label_output = lm_label_base_instance_->lookup_output_word("<start_label>");

  static_root_head = lm_head_base_instance_->lookup_input_word("<root_head>");
  static_root_label = lm_head_base_instance_->lookup_input_word("<root_label>");

  // just score provided file, then exit.
  if (!m_debugPath.empty()) {
    ScoreFile(m_debugPath);
    exit(1);
  }
  // {
  //   TreePointer mytree (new InternalTree("[vroot [subj [PPER ich]] [VAFIN bin] [pred [det [ART die]] [attr [adv [adv [PTKNEG nicht]] [ADV fast]] [ADJA neue]] [attr [ADJA europäische]] [NN Zeit]]]"));
  //   TreePointer mytree3 (new InternalTree("[ADJA europäische]"));
  //   TreePointer mytree4 (new InternalTree("[pred [det [ART die]] [attr [adv [adv [PTKNEG nicht]] [ADV fast]] [ADJA neue]] [attr [ADJA]] [NN Zeit]]]"));
  //   TreePointer mytree2 (new InternalTree("[vroot [subj [PPER ich]] [VAFIN bin] [pred]]"));
  //
  //   rdlm::ThreadLocal *thread_objects = thread_objects_backend_.get();
  //   if (!thread_objects) {
  //     thread_objects = new rdlm::ThreadLocal(lm_head_base_instance_, lm_label_base_instance_, m_normalizeHeadLM, m_normalizeLabelLM, m_cacheSize);
  //     thread_objects_backend_.reset(thread_objects);
  //   }
  //
  // #ifdef WITH_THREADS
  //   //read-lock for cache; cache resizes are so rare that we want to minimize number of calls, not scope
  //   m_accessLock.lock_shared();
  // #endif
  //
  //   size_t boundary_hash(0);
  //   boost::array<float, 4> score;
  //   score.fill(0);
  //   std::cerr << "scoring: " << mytree3->GetString() << std::endl;
  //   std::vector<TreePointer> previous_trees;
  //   TreePointerMap back_pointers = AssociateLeafNTs(mytree3.get(), previous_trees);
  //   Score(mytree3.get(), back_pointers, score, boundary_hash, *thread_objects);
  //   std::cerr << "head LM: " << score[0] << " label LM: " << score[2] << " approx: " << score[1] << " - " << score[3] << std::endl;
  //
  //   previous_trees.push_back(mytree3);
  //   back_pointers = AssociateLeafNTs(mytree4.get(), previous_trees);
  //   std::cerr << "scoring: " << mytree4->GetString() << std::endl;
  //   Score(mytree4.get(), back_pointers, score, boundary_hash, *thread_objects);
  //   std::cerr << "head LM: " << score[0] << " label LM: " << score[2] << " approx: " << score[1] << " - " << score[3] << std::endl;
  //
  //   mytree4->Combine(previous_trees);
  //   previous_trees.clear();
  //   previous_trees.push_back(mytree4);
  //   back_pointers = AssociateLeafNTs(mytree2.get(), previous_trees);
  //   std::cerr << "scoring: " << mytree2->GetString() << std::endl;
  //
  //   score[1] = 0;
  //   score[3] = 0;
  //   Score(mytree2.get(), back_pointers, score, boundary_hash, *thread_objects);
  //   std::cerr << "head LM: " << score[0] << " label LM: " << score[2] << " approx: " << score[1] << " - " << score[3] << std::endl;
  //
  //   score[0] = 0;
  //   score[1] = 0;
  //   score[2] = 0;
  //   score[3] = 0;
  //   std::cerr << "scoring: " << mytree->GetString() << std::endl;
  //
  //   Score(mytree.get(), back_pointers, score, boundary_hash, *thread_objects);
  //   std::cerr << "head LM: " << score[0] << " label LM: " << score[2] << " approx: " << score[1] << " - " << score[3] << std::endl;
  //
  // #ifdef WITH_THREADS
  //   m_accessLock.unlock_shared();
  // #endif
  // }
  //
  // {
  //   std::cerr << "BINARIZED\n\n";
  //   TreePointer mytree (new InternalTree("[vroot [subj [PPER ich]] [^vroot [VAFIN bin] [pred [det [ART die]] [^pred [attr [adv [adv [PTKNEG nicht]] [ADV fast]] [ADJA neue]] [^pred [attr [ADJA europäische]] [NN Zeit]]]]]]"));
  //   TreePointer mytree3 (new InternalTree("[ADJA europäische]"));
  //   TreePointer mytree4 (new InternalTree("[^pred [attr [adv [adv [PTKNEG nicht]] [ADV fast]] [ADJA neue]] [^pred [attr [ADJA]] [NN Zeit]]]"));
  //   TreePointer mytree2 (new InternalTree("[vroot [subj [PPER ich]] [^vroot [VAFIN bin] [pred [det [ART die]] [^pred]]]]"));
  //
  //   rdlm::ThreadLocal *thread_objects = thread_objects_backend_.get();
  //   if (!thread_objects) {
  //     thread_objects = new rdlm::ThreadLocal(lm_head_base_instance_, lm_label_base_instance_, m_normalizeHeadLM, m_normalizeLabelLM, m_cacheSize);
  //     thread_objects_backend_.reset(thread_objects);
  //   }
  //
  // #ifdef WITH_THREADS
  //   //read-lock for cache; cache resizes are so rare that we want to minimize number of calls, not scope
  //   m_accessLock.lock_shared();
  // #endif
  //
  //   size_t boundary_hash(0);
  //   boost::array<float, 4> score;
  //   score.fill(0);
  //   std::cerr << "scoring: " << mytree3->GetString() << std::endl;
  //   std::vector<TreePointer> previous_trees;
  //   TreePointerMap back_pointers = AssociateLeafNTs(mytree3.get(), previous_trees);
  //   Score(mytree3.get(), back_pointers, score, boundary_hash, *thread_objects);
  //   std::cerr << "head LM: " << score[0] << " label LM: " << score[2] << " approx: " << score[1] << " - " << score[3] << std::endl;
  //
  //   previous_trees.push_back(mytree3);
  //   back_pointers = AssociateLeafNTs(mytree4.get(), previous_trees);
  //   std::cerr << "scoring: " << mytree4->GetString() << std::endl;
  //   Score(mytree4.get(), back_pointers, score, boundary_hash, *thread_objects);
  //   std::cerr << "head LM: " << score[0] << " label LM: " << score[2] << " approx: " << score[1] << " - " << score[3] << std::endl;
  //
  //   mytree4->Combine(previous_trees);
  //   previous_trees.clear();
  //   previous_trees.push_back(mytree4);
  //   back_pointers = AssociateLeafNTs(mytree2.get(), previous_trees);
  //   std::cerr << "scoring: " << mytree2->GetString() << std::endl;
  //
  //   score[1] = 0;
  //   score[3] = 0;
  //   Score(mytree2.get(), back_pointers, score, boundary_hash, *thread_objects);
  //   std::cerr << "head LM: " << score[0] << " label LM: " << score[2] << " approx: " << score[1] << " - " << score[3] << std::endl;
  //
  //   score[0] = 0;
  //   score[1] = 0;
  //   score[2] = 0;
  //   score[3] = 0;
  //   std::cerr << "scoring: " << mytree->GetString() << std::endl;
  //
  //   Score(mytree.get(), back_pointers, score, boundary_hash, *thread_objects);
  //   std::cerr << "head LM: " << score[0] << " label LM: " << score[2] << " approx: " << score[1] << " - " << score[3] << std::endl;
  //
  // #ifdef WITH_THREADS
  //   m_accessLock.unlock_shared();
  // #endif
  //
  // }
  // UTIL_THROW2("Finished");

}
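// Recursively scores all nodes of a tree fragment with the head and label
// models. score[0]/score[2] collect head/label LM costs that are final;
// score[1]/score[3] collect approximate costs for nodes whose ancestor
// context is still incomplete, and these are recomputed once a later
// hypothesis supplies the missing context. boundary_hash fingerprints all
// symbols with incomplete context so that equivalent hypotheses can share
// state in cube pruning.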
void RDLM::Score(InternalTree* root, const TreePointerMap & back_pointers, boost::array<float, 4> &score, size_t &boundary_hash, rdlm::ThreadLocal &thread_objects, int num_virtual, int rescoring_levels) const
{

  // ignore terminal nodes
  if (root->IsTerminal()) {
    return;
  }

  // ignore glue rules
  if (root->GetLabel() == m_glueSymbol) {
    // recursion
    for (std::vector<TreePointer>::const_iterator it = root->GetChildren().begin(); it != root->GetChildren().end(); ++it) {
      Score(it->get(), back_pointers, score, boundary_hash, thread_objects, num_virtual, rescoring_levels);
    }
    return;
  }

  std::vector<int> &ancestor_heads = thread_objects.ancestor_heads;
  std::vector<int> &ancestor_labels = thread_objects.ancestor_labels;

  // ignore virtual nodes (in binarization; except if it's the root)
  if (m_binarized && root->GetLabel().GetString(0).as_string()[0] == '^' && !ancestor_heads.empty()) {
    // recursion
    if (root->IsLeafNT() && m_context_up > 1 && ancestor_heads.size()) {
      root = back_pointers.find(root)->second.get();
      rescoring_levels = m_context_up-1;
    }
    for (std::vector<TreePointer>::const_iterator it = root->GetChildren().begin(); it != root->GetChildren().end(); ++it) {
      Score(it->get(), back_pointers, score, boundary_hash, thread_objects, num_virtual, rescoring_levels);
    }
    return;
  }

  // ignore start/end of sentence tags
  if (root->GetLabel() == m_startSymbol || root->GetLabel() == m_endSymbol) {
    return;
  }

  // ignore preterminal node (except if we're scoring root nodes)
  if (root->GetLength() == 1 && root->GetChildren()[0]->IsTerminal()) {
    // root of tree: score without context
    if (ancestor_heads.empty() || (ancestor_heads.size() == m_context_up && ancestor_heads.back() == static_root_head)) {
      std::vector<int> & ngram = thread_objects.ngram;
      ngram = static_head_null;
      ngram.back() = Factor2ID(root->GetChildren()[0]->GetLabel()[m_factorType], HEAD_OUTPUT);
      if (m_isPretermBackoff && ngram.back() == 0) {
        ngram.back() = Factor2ID(root->GetLabel()[m_factorType], HEAD_OUTPUT);
      }
      if (ancestor_heads.size() == m_context_up && ancestor_heads.back() == static_root_head) {
        std::vector<int>::iterator it = ngram.begin();
        std::fill_n(it, m_context_left, static_start_head);
        it += m_context_left;
        std::fill_n(it, m_context_left, static_start_label);
        it += m_context_left;
        std::fill_n(it, m_context_right, static_stop_head);
        it += m_context_right;
        std::fill_n(it, m_context_right, static_stop_label);
        it += m_context_right;
        size_t context_up_nonempty = std::min(m_context_up, ancestor_heads.size());
        it = std::copy(ancestor_heads.end()-context_up_nonempty, ancestor_heads.end(), it);
        it = std::copy(ancestor_labels.end()-context_up_nonempty, ancestor_labels.end(), it);
      }
      if (ancestor_labels.size() >= m_context_up && !num_virtual) {
        score[0] += FloorScore(thread_objects.lm_head->lookup_ngram(EigenMap(ngram.data(), ngram.size())));
      } else {
        boost::hash_combine(boundary_hash, ngram.back());
        score[1] += FloorScore(thread_objects.lm_head->lookup_ngram(EigenMap(ngram.data(), ngram.size())));
      }
    }
    return;
    // we only need to re-visit previous hypotheses if we have more context available.
  } else if (root->IsLeafNT()) {
    if (m_context_up > 1 && ancestor_heads.size()) {
      root = back_pointers.find(root)->second.get();
      // ignore preterminal node
      if (root->GetLength() == 1 && root->GetChildren()[0]->IsTerminal()) {
        return;
      }
      rescoring_levels = m_context_up-1;
    } else {
      return;
    }
  }

  std::pair<int,int> head_ids;
  bool found = GetHead(root, back_pointers, head_ids);
  if (!found) {
    head_ids = std::make_pair(static_dummy_head, static_dummy_head);
  }

  size_t context_up_nonempty = std::min(m_context_up, ancestor_heads.size());
  const StringPiece & head_label = root->GetLabel().GetString(0);
  bool virtual_head = false;
  int reached_end = 0;
  int label_idx, label_idx_out;
  if (m_binarized && head_label[0] == '^') {
    virtual_head = true;
    if (m_binarized == 1 || (m_binarized == 3 && head_label[2] == 'l')) {
      reached_end = 1; //indicate that we've seen the first symbol of the RHS
    } else if (m_binarized == 2 || (m_binarized == 3 && head_label[2] == 'r')) {
      reached_end = 2; // indicate that we've seen the last symbol of the RHS
    }
    // with 'full' binarization, direction is encoded in 2nd char
    StringPiece clipped_label = (m_binarized == 3) ? head_label.substr(2,head_label.size()-2) : head_label.substr(1,head_label.size()-1);
    label_idx = lm_label_base_instance_->lookup_input_word(clipped_label.as_string());
    label_idx_out = lm_label_base_instance_->lookup_output_word(clipped_label.as_string());
  } else {
    reached_end = 3; // indicate that we've seen first and last symbol of the RHS
    label_idx = Factor2ID(root->GetLabel()[0], LABEL_INPUT);
    label_idx_out = Factor2ID(root->GetLabel()[0], LABEL_OUTPUT);
  }

  int head_idx = (virtual_head && head_ids.first == static_dummy_head) ? static_label_null[offset_up_head+m_context_up-1] : head_ids.first;

  // root of tree: score without context
  if (ancestor_heads.empty() || (ancestor_heads.size() == m_context_up && ancestor_heads.back() == static_root_head)) {
    std::vector<int> & ngram = thread_objects.ngram;
    ngram = static_label_null;
    ngram.back() = label_idx_out;
    if (ancestor_heads.size() == m_context_up && ancestor_heads.back() == static_root_head && !num_virtual) {
      std::vector<int>::iterator it = ngram.begin();
      std::fill_n(it, m_context_left, static_start_head);
      it += m_context_left;
      std::fill_n(it, m_context_left, static_start_label);
      it += m_context_left;
      std::fill_n(it, m_context_right, static_stop_head);
      it += m_context_right;
      std::fill_n(it, m_context_right, static_stop_label);
      it += m_context_right;
      it = std::copy(ancestor_heads.end()-context_up_nonempty, ancestor_heads.end(), it);
      it = std::copy(ancestor_labels.end()-context_up_nonempty, ancestor_labels.end(), it);
      score[2] += FloorScore(thread_objects.lm_label->lookup_ngram(EigenMap(ngram.data(), ngram.size())));
    } else {
      boost::hash_combine(boundary_hash, ngram.back());
      score[3] += FloorScore(thread_objects.lm_label->lookup_ngram(EigenMap(ngram.data(), ngram.size())));
    }
    if (head_idx != static_dummy_head && head_idx != static_head_head) {
      ngram.push_back(head_ids.second);
      *(ngram.end()-2) = label_idx;
      if (ancestor_heads.size() == m_context_up && ancestor_heads.back() == static_root_head && !num_virtual) {
        score[0] += FloorScore(thread_objects.lm_head->lookup_ngram(EigenMap(ngram.data(), ngram.size())));
      } else {
        boost::hash_combine(boundary_hash, ngram.back());
        score[1] += FloorScore(thread_objects.lm_head->lookup_ngram(EigenMap(ngram.data(), ngram.size())));
      }
    }
  }

  ancestor_heads.push_back(head_idx);
  ancestor_labels.push_back(label_idx);

  if (virtual_head) {
    num_virtual = m_context_up;
  } else if (num_virtual) {
    --num_virtual;
  }

  // fill ancestor context (same for all children)
  if (context_up_nonempty < m_context_up) {
    ++context_up_nonempty;
  }
  size_t up_padding = m_context_up - context_up_nonempty;

  std::vector<int> & ngram = thread_objects.ngram;
  ngram = static_label_null;

  std::vector<int>::iterator it = ngram.begin() + offset_up_head;
  if (up_padding > 0) {
    it += up_padding;
  }

  it = std::copy(ancestor_heads.end() - context_up_nonempty, ancestor_heads.end(), it);

  if (up_padding > 0) {
    it += up_padding;
  }

  it = std::copy(ancestor_labels.end() - context_up_nonempty, ancestor_labels.end(), it);

  // create vectors of head/label IDs of all children
  int num_children = root->GetLength();

  // get number of children after unbinarization
  if (m_binarized) {
    num_children = 0;
    UnbinarizedChildren real_children(root, back_pointers, m_binarized, thread_objects.stack);
    for (std::vector<TreePointer>::const_iterator it = real_children.begin(); !real_children.ended(); it = ++real_children) {
      num_children++;
    }
  }

  if (m_context_right && (reached_end == 1 || reached_end == 3)) num_children++; //also predict start label
  if (m_context_left && (reached_end == 2 || reached_end == 3)) num_children++; //also predict end label

  std::vector<int> & heads = thread_objects.heads;
  std::vector<int> & labels = thread_objects.labels;
  std::vector<int> & heads_output = thread_objects.heads_output;
  std::vector<int> & labels_output = thread_objects.labels_output;

  heads.resize(num_children);
  labels.resize(num_children);
  heads_output.resize(num_children);
  labels_output.resize(num_children);

  GetChildHeadsAndLabels(root, back_pointers, reached_end, thread_objects);

  //left padding; only need to add this initially
  if (reached_end == 1 || reached_end == 3) {
    std::fill_n(ngram.begin(), m_context_left, static_start_head);
    std::fill_n(ngram.begin() + m_context_left, m_context_left, static_start_label);
  }
  size_t left_padding = m_context_left;
  size_t left_offset = 0;
  size_t right_offset = std::min(heads.size(), m_context_right + 1);
  size_t right_padding = m_context_right + 1 - right_offset;
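  // For each child i, slide a window over the child sequence: the
  // start-symbol padding on the left (left_padding) shrinks by one per step,
  // and the right edge is padded with stop symbols or null symbols
  // (right_padding), depending on whether this rule contains the end of the
  // RHS.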
  // construct context of label model and predict label
  for (size_t i = 0; i != heads.size(); i++) {

    std::vector<int>::iterator it = ngram.begin();

    if (left_padding > 0) {
      it += left_padding;
    }

    it = std::copy(heads.begin()+left_offset, heads.begin()+i, it);

    if (left_padding > 0) {
      it += left_padding;
    }

    it = std::copy(labels.begin()+left_offset, labels.begin()+i, it);

    it = std::copy(heads.begin()+i+1, heads.begin()+right_offset, it);

    if (right_padding > 0) {
      if (reached_end == 2 || reached_end == 3) {
        std::fill_n(it, right_padding, static_stop_head);
        it += right_padding;
      } else {
        it = std::copy(static_label_null.begin()+offset_up_head-m_context_right-right_padding, static_label_null.begin()-m_context_right+offset_up_head, it);
      }
    }

    it = std::copy(labels.begin()+i+1, labels.begin()+right_offset, it);

    if (right_padding > 0) {
      if (reached_end == 2 || reached_end == 3) {
        std::fill_n(it, right_padding, static_stop_label);
        it += right_padding;
      } else {
        std::copy(static_label_null.begin()+offset_up_head-right_padding, static_label_null.begin()+offset_up_head, it);
      }
    }

    ngram.back() = labels_output[i];

    if (ancestor_labels.size() >= m_context_up && !num_virtual) {
      score[2] += FloorScore(thread_objects.lm_label->lookup_ngram(EigenMap(ngram.data(), ngram.size())));
    } else {
      boost::hash_combine(boundary_hash, ngram.back());
      score[3] += FloorScore(thread_objects.lm_label->lookup_ngram(EigenMap(ngram.data(), ngram.size())));
    }

    // construct context of head model and predict head
    if (heads[i] != static_start_head && heads[i] != static_stop_head && heads[i] != static_dummy_head && heads[i] != static_head_head) {

      ngram.back() = labels[i];
      ngram.push_back(heads_output[i]);

      if (ancestor_labels.size() >= m_context_up && !num_virtual) {
        score[0] += FloorScore(thread_objects.lm_head->lookup_ngram(EigenMap(ngram.data(), ngram.size())));
      } else {
        boost::hash_combine(boundary_hash, ngram.back());
        score[1] += FloorScore(thread_objects.lm_head->lookup_ngram(EigenMap(ngram.data(), ngram.size())));
      }
      ngram.pop_back();
    }

    // next time, we need to add less start symbol padding
    if (left_padding)
      left_padding--;
    else
      left_offset++;

    if (right_offset < heads.size())
      right_offset++;
    else
      right_padding++;
  }

  if (rescoring_levels == 1) {
    ancestor_heads.pop_back();
    ancestor_labels.pop_back();
    return;
  }
  // recursion
  for (std::vector<TreePointer>::const_iterator it = root->GetChildren().begin(); it != root->GetChildren().end(); ++it) {
    Score(it->get(), back_pointers, score, boundary_hash, thread_objects, num_virtual, rescoring_levels - 1);
  }
  ancestor_heads.pop_back();
  ancestor_labels.pop_back();
}
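// Finds the head word of a constituent and writes its input/output
// vocabulary IDs into IDs. Returns false if no head could be found
// (i.e. no preterminal child).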
bool RDLM::GetHead(InternalTree* root, const TreePointerMap & back_pointers, std::pair<int,int> & IDs) const
{
  InternalTree *tree;

  for (std::vector<TreePointer>::const_iterator it = root->GetChildren().begin(); it != root->GetChildren().end(); ++it) {
    if ((*it)->IsLeafNT()) {
      tree = back_pointers.find(it->get())->second.get();
    } else {
      tree = it->get();
    }

    if (m_binarized && tree->GetLabel().GetString(0).as_string()[0] == '^') {
      bool found = GetHead(tree, back_pointers, IDs);
      if (found) {
        return true;
      }
    }

    // assumption (only true for dependency parse): each constituent has a preterminal label, and corresponding terminal is head
    // if constituent has multiple preterminals, first one is picked; if it has no preterminals, dummy_head is returned
    else if (tree->GetLength() == 1 && tree->GetChildren()[0]->IsTerminal()) {
      GetIDs(tree->GetChildren()[0]->GetLabel(), tree->GetLabel(), IDs);
      return true;
    }
  }

  return false;
}
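// Collects the head and label IDs of all (unbinarized) children of root into
// the thread-local vectors, optionally framed by virtual start/stop children.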
void RDLM::GetChildHeadsAndLabels(InternalTree *root, const TreePointerMap & back_pointers, int reached_end, rdlm::ThreadLocal &thread_objects) const
{
  std::pair<int,int> child_ids;
  size_t j = 0;

  std::vector<int> & heads = thread_objects.heads;
  std::vector<int> & labels = thread_objects.labels;
  std::vector<int> & heads_output = thread_objects.heads_output;
  std::vector<int> & labels_output = thread_objects.labels_output;

  // score start label (if enabled) for all nonterminal nodes (but not for terminal or preterminal nodes)
  if (m_context_right && (reached_end == 1 || reached_end == 3)) {
    heads[j] = static_start_head;
    labels[j] = static_start_label;
    labels_output[j] = static_start_label_output;
    j++;
  }

  UnbinarizedChildren real_children(root, back_pointers, m_binarized, thread_objects.stack);

  // extract head words / labels
  for (std::vector<TreePointer>::const_iterator itx = real_children.begin(); !real_children.ended(); itx = ++real_children) {
    if ((*itx)->IsTerminal()) {
      std::cerr << "non-terminal node " << root->GetLabel() << " has a mix of terminal and non-terminal children. This shouldn't happen..." << std::endl;
      std::cerr << "children: ";
      for (std::vector<TreePointer>::const_iterator itx2 = root->GetChildren().begin(); itx2 != root->GetChildren().end(); ++itx2) {
        std::cerr << (*itx2)->GetLabel() << " ";
      }
      std::cerr << std::endl;
      // resize vectors (should we throw exception instead?)
      heads.pop_back();
      labels.pop_back();
      heads_output.pop_back();
      labels_output.pop_back();
      continue;
    }
    InternalTree* child = itx->get();
    // also go through trees or previous hypotheses to rescore nodes for which more context has become available
    if ((*itx)->IsLeafNT()) {
      child = back_pointers.find(itx->get())->second.get();
    }

    // preterminal node
    if (child->GetLength() == 1 && child->GetChildren()[0]->IsTerminal()) {
      heads[j] = static_head_head;
      labels[j] = static_head_label;
      labels_output[j] = static_head_label_output;
      j++;
      continue;
    }

    bool found = GetHead(child, back_pointers, child_ids);
    if (!found) {
      child_ids = std::make_pair(static_dummy_head, static_dummy_head);
    }

    labels[j] = Factor2ID(child->GetLabel()[0], LABEL_INPUT);
    labels_output[j] = Factor2ID(child->GetLabel()[0], LABEL_OUTPUT);
    heads[j] = child_ids.first;
    heads_output[j] = child_ids.second;
    j++;
  }

  // score end label (if enabled) for all nonterminal nodes (but not for terminal or preterminal nodes)
  if (m_context_left && (reached_end == 2 || reached_end == 3)) {
    heads[j] = static_stop_head;
    labels[j] = static_stop_label;
    labels_output[j] = static_stop_label_output;
  }
}
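// Looks up the input/output IDs of a head word, backing off to the
// preterminal label if the word is unknown and backoff is enabled.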
void RDLM::GetIDs(const Word & head, const Word & preterminal, std::pair<int,int> & IDs) const
{
  IDs.first = Factor2ID(head[m_factorType], HEAD_INPUT);
  if (m_isPretermBackoff && IDs.first == 0) {
    IDs.first = Factor2ID(preterminal[0], HEAD_INPUT);
  }
  if (m_sharedVocab) {
    IDs.second = IDs.first;
  } else {
    IDs.second = Factor2ID(head[m_factorType], HEAD_OUTPUT);
    if (m_isPretermBackoff && IDs.second == 0) {
      IDs.second = Factor2ID(preterminal[0], HEAD_OUTPUT);
    }
  }
}
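// Note: callers are expected to hold m_accessLock in shared mode; the cache
// is grown under a temporary exclusive lock when a factor ID exceeds its size.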
// map from moses factor to NPLM ID; use vectors as cache to avoid hash table lookups
int RDLM::Factor2ID(const Factor * const factor, int model_type) const
{
  size_t ID = factor->GetId();
  int ret;

  std::vector<int>* cache = NULL;
  switch(model_type) {
  case LABEL_INPUT:
    cache = &factor2id_label_input;
    break;
  case LABEL_OUTPUT:
    cache = &factor2id_label_output;
    break;
  case HEAD_INPUT:
    cache = &factor2id_head_input;
    break;
  case HEAD_OUTPUT:
    cache = &factor2id_head_output;
    break;
  }

  try {
    ret = cache->at(ID);
  } catch (const std::out_of_range& oor) {
#ifdef WITH_THREADS //need to resize cache; write lock
    m_accessLock.unlock_shared();
    m_accessLock.lock();
#endif
    cache->resize(ID*2+1, -1); // +1 so that index ID is valid even for ID == 0
#ifdef WITH_THREADS //go back to read lock
    m_accessLock.unlock();
    m_accessLock.lock_shared();
#endif
    ret = -1;
  }
  if (ret == -1) {
    switch(model_type) {
    case LABEL_INPUT:
      ret = lm_label_base_instance_->lookup_input_word(factor->GetString().as_string());
      break;
    case LABEL_OUTPUT:
      ret = lm_label_base_instance_->lookup_output_word(factor->GetString().as_string());
      break;
    case HEAD_INPUT:
      ret = lm_head_base_instance_->lookup_input_word(factor->GetString().as_string());
      break;
    case HEAD_OUTPUT:
      ret = lm_head_base_instance_->lookup_output_word(factor->GetString().as_string());
      break;
    }
    (*cache)[ID] = ret;
  }

  return ret;
}
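// Debugging helper: prints the words of an n-gram, their integer IDs, and
// the model score to stderr.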
void RDLM::PrintInfo(std::vector<int> &ngram, nplm::neuralTM* lm) const
{
  for (size_t i = 0; i < ngram.size()-1; i++) {
    std::cerr << lm->get_input_vocabulary().words()[ngram[i]] << " ";
  }
  std::cerr << lm->get_output_vocabulary().words()[ngram.back()] << " ";

  for (size_t i = 0; i < ngram.size(); i++) {
    std::cerr << ngram[i] << " ";
  }
  std::cerr << "score: " << lm->lookup_ngram(ngram) << std::endl;
}
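// Maps each leaf nonterminal of root to the tree state of the previous
// hypothesis it corresponds to (previous hypotheses are given in target order).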
RDLM::TreePointerMap RDLM::AssociateLeafNTs(InternalTree* root, const std::vector<TreePointer> &previous) const
{

  TreePointerMap ret;
  std::vector<TreePointer>::iterator it;
  bool found = false;
  InternalTree::leafNT next_leafNT(root);
  for (std::vector<TreePointer>::const_iterator it_prev = previous.begin(); it_prev != previous.end(); ++it_prev) {
    found = next_leafNT(it);
    if (found) {
      ret[it->get()] = *it_prev;
    } else {
      std::cerr << "Warning: leaf nonterminal not found in rule; why did this happen?\n";
    }
  }
  return ret;
}
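// Scores one tree per input line (used with the debug_path option); writes
// head and label LM scores to stderr.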
void RDLM::ScoreFile(std::string &path)
{
  InputFileStream inStream(path);
  rdlm::ThreadLocal *thread_objects = thread_objects_backend_.get();
  if (!thread_objects) {
    thread_objects = new rdlm::ThreadLocal(lm_head_base_instance_, lm_label_base_instance_, m_normalizeHeadLM, m_normalizeLabelLM, m_cacheSize);
    thread_objects_backend_.reset(thread_objects);
  }
  std::string line, null;
  thread_objects->ancestor_heads.resize(0);
  thread_objects->ancestor_labels.resize(0);
  thread_objects->ancestor_heads.resize(m_context_up, static_root_head);
  thread_objects->ancestor_labels.resize(m_context_up, static_root_label);
#ifdef WITH_THREADS
  //read-lock for cache; cache resizes are so rare that we want to minimize number of calls, not scope
  m_accessLock.lock_shared();
#endif
  while(getline(inStream, line)) {
    TreePointerMap back_pointers;
    boost::array<float, 4> score;
    score.fill(0);
    TreePointer mytree (new InternalTree(line)); // shared_ptr: avoid leaking one tree per input line
    size_t boundary_hash = 0;
    Score(mytree.get(), back_pointers, score, boundary_hash, *thread_objects);
    std::cerr << "head LM: " << score[0] << " label LM: " << score[2] << std::endl;
  }
#ifdef WITH_THREADS
  m_accessLock.unlock_shared();
#endif
}
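// Illustrative feature line in moses.ini (file names are placeholders):
//   RDLM path_head_lm=rdlm_head.model path_label_lm=rdlm_label.model context_up=2 context_left=3 context_right=0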
void RDLM::SetParameter(const std::string& key, const std::string& value)
{
  std::cerr << "setting: " << this->GetScoreProducerDescription() << " - " << key << "\n";
  if (key == "tuneable") {
    m_tuneable = Scan<bool>(value);
  } else if (key == "filterable") { //ignore
  } else if (key == "path_head_lm") {
    m_path_head_lm = value;
  } else if (key == "path_label_lm") {
    m_path_label_lm = value;
  } else if (key == "backoff") {
    m_isPretermBackoff = Scan<bool>(value);
  } else if (key == "context_up") {
    m_context_up = Scan<size_t>(value);
  } else if (key == "context_left") {
    m_context_left = Scan<size_t>(value);
  } else if (key == "context_right") {
    m_context_right = Scan<size_t>(value);
  } else if (key == "debug_path") {
    m_debugPath = value;
  } else if (key == "premultiply") {
    m_premultiply = Scan<bool>(value);
  } else if (key == "rerank") {
    m_rerank = Scan<bool>(value);
  } else if (key == "normalize_head_lm") {
    m_normalizeHeadLM = Scan<bool>(value);
  } else if (key == "normalize_label_lm") {
    m_normalizeLabelLM = Scan<bool>(value);
  } else if (key == "binarized") {
    if (value == "left")
      m_binarized = 1;
    else if (value == "right")
      m_binarized = 2;
    else if (value == "full")
      m_binarized = 3;
    else
      UTIL_THROW(util::Exception, "Unknown value for argument " << key << "=" << value);
  } else if (key == "glue_symbol") {
    m_glueSymbolString = value;
  } else if (key == "factor") {
    m_factorType = Scan<FactorType>(value);
  } else if (key == "cache_size") {
    m_cacheSize = Scan<int>(value);
  } else {
    UTIL_THROW(util::Exception, "Unknown argument " << key << "=" << value);
  }
}
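// Chart feature-function hook: scores the tree fragment introduced by the
// current hypothesis. Approximate costs accumulated by previous hypotheses
// (nodes that lacked full ancestor context) are subtracted and re-estimated
// now that more context is available; with rerank enabled, scoring is
// deferred until a full sentence is assembled.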
FFState* RDLM::EvaluateWhenApplied(const ChartHypothesis& cur_hypo
                                   , int featureID /* used to index the state in the previous hypotheses */
                                   , ScoreComponentCollection* accumulator) const
{
  if (const PhraseProperty *property = cur_hypo.GetCurrTargetPhrase().GetProperty("Tree")) {
    const std::string *tree = property->GetValueString();
    TreePointer mytree (boost::make_shared<InternalTree>(*tree));

    //get subtrees (in target order)
    std::vector<TreePointer> previous_trees;
    float prev_approx_head = 0, prev_approx_label = 0; //approximated (due to lack of context) LM costs from previous hypos
    for (size_t pos = 0; pos < cur_hypo.GetCurrTargetPhrase().GetSize(); ++pos) {
      const Word &word = cur_hypo.GetCurrTargetPhrase().GetWord(pos);
      if (word.IsNonTerminal()) {
        size_t nonTermInd = cur_hypo.GetCurrTargetPhrase().GetAlignNonTerm().GetNonTermIndexMap()[pos];
        const RDLMState* prev = static_cast<const RDLMState*>(cur_hypo.GetPrevHypo(nonTermInd)->GetFFState(featureID));
        previous_trees.push_back(prev->GetTree());
        prev_approx_head -= prev->GetApproximateScoreHead();
        prev_approx_label -= prev->GetApproximateScoreLabel();
      }
    }
    size_t ff_idx = m_index; // accumulator->GetIndexes(this).first;

    accumulator->PlusEquals(ff_idx, prev_approx_head);
    accumulator->PlusEquals(ff_idx+1, prev_approx_label);

    bool full_sentence = (mytree->GetChildren().back()->GetLabel() == m_endTag || (mytree->GetChildren().back()->GetLabel() == m_endSymbol && mytree->GetChildren().back()->GetChildren().back()->GetLabel() == m_endTag));

    TreePointerMap back_pointers = AssociateLeafNTs(mytree.get(), previous_trees);
    boost::array<float, 4> score; // score_head, approx_score_head, score_label, approx_score_label
    score.fill(0);
    //hash of all boundary symbols (symbols with incomplete context); trees with same hash share state for cube pruning.
    size_t boundary_hash = 0;
    if (!m_rerank) {
#ifdef WITH_THREADS
      //read-lock for cache; cache resizes are so rare that we want to minimize number of calls, not scope
      m_accessLock.lock_shared();
#endif
      rdlm::ThreadLocal *thread_objects = thread_objects_backend_.get();
      if (!thread_objects) {
        thread_objects = new rdlm::ThreadLocal(lm_head_base_instance_, lm_label_base_instance_, m_normalizeHeadLM, m_normalizeLabelLM, m_cacheSize);
        thread_objects_backend_.reset(thread_objects);
      }
      thread_objects->ancestor_heads.resize(0);
      thread_objects->ancestor_labels.resize(0);
      thread_objects->ancestor_heads.resize((full_sentence ? m_context_up : 0), static_root_head);
      thread_objects->ancestor_labels.resize((full_sentence ? m_context_up : 0), static_root_label);
      Score(mytree.get(), back_pointers, score, boundary_hash, *thread_objects);
#ifdef WITH_THREADS
      m_accessLock.unlock_shared();
#endif
      accumulator->PlusEquals(ff_idx, score[0] + score[1]);
      accumulator->PlusEquals(ff_idx+1, score[2] + score[3]);
    }
    mytree->Combine(previous_trees);
    if (m_rerank && full_sentence) {
#ifdef WITH_THREADS
      //read-lock for cache; cache resizes are so rare that we want to minimize number of calls, not scope
      m_accessLock.lock_shared();
#endif
      rdlm::ThreadLocal *thread_objects = thread_objects_backend_.get();
      if (!thread_objects) {
        thread_objects = new rdlm::ThreadLocal(lm_head_base_instance_, lm_label_base_instance_, m_normalizeHeadLM, m_normalizeLabelLM, m_cacheSize);
        thread_objects_backend_.reset(thread_objects);
      }
      thread_objects->ancestor_heads.resize(0);
      thread_objects->ancestor_labels.resize(0);
      thread_objects->ancestor_heads.resize((full_sentence ? m_context_up : 0), static_root_head);
      thread_objects->ancestor_labels.resize((full_sentence ? m_context_up : 0), static_root_label);
      Score(mytree.get(), back_pointers, score, boundary_hash, *thread_objects);
#ifdef WITH_THREADS
      m_accessLock.unlock_shared();
#endif
      accumulator->PlusEquals(ff_idx, score[0] + score[1]);
      accumulator->PlusEquals(ff_idx+1, score[2] + score[3]);
    }
    if (m_binarized && full_sentence) {
      mytree->Unbinarize();
    }

    return new RDLMState(mytree, score[1], score[3], boundary_hash);
  } else {
    UTIL_THROW2("Error: RDLM active, but no internal tree structure found");
  }

}

}