mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-12-25 21:03:22 +03:00
Update incremental search
This commit is contained in:
parent
877ade5327
commit
9340f9f465
@ -23,7 +23,7 @@
|
||||
#include "Word.h"
|
||||
#include "WordsRange.h"
|
||||
|
||||
namespace search { class Vertex; class VertexGenerator; }
|
||||
namespace search { class Vertex; }
|
||||
|
||||
namespace Moses
|
||||
{
|
||||
@ -41,7 +41,7 @@ class ChartCellLabel
|
||||
union Stack {
|
||||
const HypoList *cube; // cube pruning
|
||||
const search::Vertex *incr; // incremental search after filling.
|
||||
search::VertexGenerator *incr_generator; // incremental search during filling.
|
||||
void *incr_generator; // incremental search during filling.
|
||||
};
|
||||
|
||||
|
||||
|
@ -1,143 +0,0 @@
|
||||
#include "Fill.h"
|
||||
|
||||
#include "moses/ChartCellLabel.h"
|
||||
#include "moses/ChartCellLabelSet.h"
|
||||
#include "moses/TargetPhraseCollection.h"
|
||||
#include "moses/TargetPhrase.h"
|
||||
#include "moses/Word.h"
|
||||
|
||||
#include "lm/model.hh"
|
||||
#include "search/context.hh"
|
||||
#include "search/note.hh"
|
||||
#include "search/rule.hh"
|
||||
#include "search/vertex.hh"
|
||||
#include "search/vertex_generator.hh"
|
||||
|
||||
#include <math.h>
|
||||
|
||||
namespace Moses {
|
||||
namespace Incremental {
|
||||
|
||||
template <class Model> Fill<Model>::Fill(search::Context<Model> &context, const std::vector<lm::WordIndex> &vocab_mapping)
|
||||
: context_(context), vocab_mapping_(vocab_mapping) {}
|
||||
|
||||
template <class Model> void Fill<Model>::Add(const TargetPhraseCollection &targets, const StackVec &nts, const WordsRange &) {
|
||||
std::vector<search::PartialVertex> vertices;
|
||||
vertices.reserve(nts.size());
|
||||
float below_score = 0.0;
|
||||
for (StackVec::const_iterator i(nts.begin()); i != nts.end(); ++i) {
|
||||
vertices.push_back((*i)->GetStack().incr->RootPartial());
|
||||
if (vertices.back().Empty()) return;
|
||||
below_score += vertices.back().Bound();
|
||||
}
|
||||
|
||||
std::vector<lm::WordIndex> words;
|
||||
for (TargetPhraseCollection::const_iterator p(targets.begin()); p != targets.end(); ++p) {
|
||||
words.clear();
|
||||
const TargetPhrase &phrase = **p;
|
||||
const AlignmentInfo::NonTermIndexMap &align = phrase.GetAlignNonTerm().GetNonTermIndexMap();
|
||||
search::PartialEdge edge(edges_.AllocateEdge(nts.size()));
|
||||
|
||||
size_t i = 0;
|
||||
bool bos = false;
|
||||
search::PartialVertex *nt = edge.NT();
|
||||
if (phrase.GetSize() && !phrase.GetWord(0).IsNonTerminal()) {
|
||||
lm::WordIndex index = Convert(phrase.GetWord(0));
|
||||
if (context_.LanguageModel().GetVocabulary().BeginSentence() == index) {
|
||||
bos = true;
|
||||
} else {
|
||||
words.push_back(index);
|
||||
}
|
||||
i = 1;
|
||||
}
|
||||
for (; i < phrase.GetSize(); ++i) {
|
||||
const Word &word = phrase.GetWord(i);
|
||||
if (word.IsNonTerminal()) {
|
||||
*(nt++) = vertices[align[i]];
|
||||
words.push_back(search::kNonTerminal);
|
||||
} else {
|
||||
words.push_back(Convert(word));
|
||||
}
|
||||
}
|
||||
|
||||
edge.SetScore(phrase.GetFutureScore() + below_score);
|
||||
search::ScoreRule(context_, words, bos, edge.Between());
|
||||
|
||||
search::Note note;
|
||||
note.vp = &phrase;
|
||||
edge.SetNote(note);
|
||||
|
||||
edges_.AddEdge(edge);
|
||||
}
|
||||
}
|
||||
|
||||
template <class Model> void Fill<Model>::AddPhraseOOV(TargetPhrase &phrase, std::list<TargetPhraseCollection*> &, const WordsRange &) {
|
||||
std::vector<lm::WordIndex> words;
|
||||
CHECK(phrase.GetSize() <= 1);
|
||||
if (phrase.GetSize())
|
||||
words.push_back(Convert(phrase.GetWord(0)));
|
||||
|
||||
search::PartialEdge edge(edges_.AllocateEdge(0));
|
||||
// Appears to be a bug that FutureScore does not already include language model.
|
||||
edge.SetScore(phrase.GetFutureScore() + search::ScoreRule(context_, words, false, edge.Between()));
|
||||
|
||||
search::Note note;
|
||||
note.vp = &phrase;
|
||||
edge.SetNote(note);
|
||||
|
||||
edges_.AddEdge(edge);
|
||||
}
|
||||
|
||||
namespace {
|
||||
// Route hypotheses to separate vertices for each left hand side label, populating ChartCellLabelSet out.
|
||||
class HypothesisCallback {
|
||||
public:
|
||||
HypothesisCallback(search::ContextBase &context, ChartCellLabelSet &out, boost::object_pool<search::Vertex> &vertex_pool)
|
||||
: context_(context), out_(out), vertex_pool_(vertex_pool) {}
|
||||
|
||||
void NewHypothesis(search::PartialEdge partial) {
|
||||
search::VertexGenerator *&entry = out_.FindOrInsert(static_cast<const TargetPhrase *>(partial.GetNote().vp)->GetTargetLHS()).incr_generator;
|
||||
if (!entry) {
|
||||
entry = generator_pool_.construct(context_, *vertex_pool_.construct());
|
||||
}
|
||||
entry->NewHypothesis(partial);
|
||||
}
|
||||
|
||||
void FinishedSearch() {
|
||||
for (ChartCellLabelSet::iterator i(out_.mutable_begin()); i != out_.mutable_end(); ++i) {
|
||||
ChartCellLabel::Stack &stack = i->second.MutableStack();
|
||||
stack.incr_generator->FinishedSearch();
|
||||
stack.incr = &stack.incr_generator->Generating();
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
search::ContextBase &context_;
|
||||
|
||||
ChartCellLabelSet &out_;
|
||||
|
||||
boost::object_pool<search::Vertex> &vertex_pool_;
|
||||
boost::object_pool<search::VertexGenerator> generator_pool_;
|
||||
};
|
||||
} // namespace
|
||||
|
||||
template <class Model> void Fill<Model>::Search(ChartCellLabelSet &out, boost::object_pool<search::Vertex> &vertex_pool) {
|
||||
HypothesisCallback callback(context_, out, vertex_pool);
|
||||
edges_.Search(context_, callback);
|
||||
}
|
||||
|
||||
// TODO: factors (but chart doesn't seem to support factors anyway).
|
||||
template <class Model> lm::WordIndex Fill<Model>::Convert(const Word &word) const {
|
||||
std::size_t factor = word.GetFactor(0)->GetId();
|
||||
return (factor >= vocab_mapping_.size() ? 0 : vocab_mapping_[factor]);
|
||||
}
|
||||
|
||||
template class Fill<lm::ngram::ProbingModel>;
|
||||
template class Fill<lm::ngram::RestProbingModel>;
|
||||
template class Fill<lm::ngram::TrieModel>;
|
||||
template class Fill<lm::ngram::QuantTrieModel>;
|
||||
template class Fill<lm::ngram::ArrayTrieModel>;
|
||||
template class Fill<lm::ngram::QuantArrayTrieModel>;
|
||||
|
||||
} // namespace Incremental
|
||||
} // namespace Moses
|
@ -1,54 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
#include "moses/ChartParserCallback.h"
|
||||
#include "moses/StackVec.h"
|
||||
|
||||
#include "lm/word_index.hh"
|
||||
#include "search/edge_generator.hh"
|
||||
|
||||
#include <boost/pool/object_pool.hpp>
|
||||
|
||||
#include <list>
|
||||
#include <vector>
|
||||
|
||||
namespace search {
|
||||
template <class Model> class Context;
|
||||
class Vertex;
|
||||
} // namespace search
|
||||
|
||||
namespace Moses {
|
||||
class Word;
|
||||
class WordsRange;
|
||||
class TargetPhraseCollection;
|
||||
class WordsRange;
|
||||
class ChartCellLabelSet;
|
||||
class TargetPhrase;
|
||||
|
||||
namespace Incremental {
|
||||
|
||||
// Replacement for ChartTranslationOptionList
|
||||
// TODO: implement count and score thresholding.
|
||||
template <class Model> class Fill : public ChartParserCallback {
|
||||
public:
|
||||
Fill(search::Context<Model> &context, const std::vector<lm::WordIndex> &vocab_mapping);
|
||||
|
||||
void Add(const TargetPhraseCollection &targets, const StackVec &nts, const WordsRange &ignored);
|
||||
|
||||
void AddPhraseOOV(TargetPhrase &phrase, std::list<TargetPhraseCollection*> &waste_memory, const WordsRange &range);
|
||||
|
||||
bool Empty() const { return edges_.Empty(); }
|
||||
|
||||
void Search(ChartCellLabelSet &out, boost::object_pool<search::Vertex> &vertex_pool);
|
||||
|
||||
private:
|
||||
lm::WordIndex Convert(const Word &word) const ;
|
||||
|
||||
search::Context<Model> &context_;
|
||||
|
||||
const std::vector<lm::WordIndex> &vocab_mapping_;
|
||||
|
||||
search::EdgeGenerator edges_;
|
||||
};
|
||||
|
||||
} // namespace Incremental
|
||||
} // namespace Moses
|
@ -1,35 +1,172 @@
|
||||
#include "Manager.h"
|
||||
|
||||
#include "Fill.h"
|
||||
|
||||
#include "moses/ChartCell.h"
|
||||
#include "moses/ChartParserCallback.h"
|
||||
#include "moses/TranslationSystem.h"
|
||||
#include "moses/StaticData.h"
|
||||
|
||||
#include "search/context.hh"
|
||||
#include "lm/model.hh"
|
||||
#include "search/applied.hh"
|
||||
#include "search/config.hh"
|
||||
#include "search/weights.hh"
|
||||
#include "search/context.hh"
|
||||
#include "search/edge_generator.hh"
|
||||
#include "search/rule.hh"
|
||||
#include "search/vertex_generator.hh"
|
||||
|
||||
#include <boost/lexical_cast.hpp>
|
||||
|
||||
namespace Moses {
|
||||
namespace Incremental {
|
||||
|
||||
namespace {
|
||||
|
||||
// This is called by EdgeGenerator. Route hypotheses to separate vertices for
|
||||
// each left hand side label, populating ChartCellLabelSet out.
|
||||
template <class Best> class HypothesisCallback {
|
||||
private:
|
||||
typedef search::VertexGenerator<Best> Gen;
|
||||
public:
|
||||
HypothesisCallback(search::ContextBase &context, Best &best, ChartCellLabelSet &out, boost::object_pool<search::Vertex> &vertex_pool)
|
||||
: context_(context), best_(best), out_(out), vertex_pool_(vertex_pool) {}
|
||||
|
||||
void NewHypothesis(search::PartialEdge partial) {
|
||||
// Get the LHS, look it up in the output ChartCellLabel, and upcast it.
|
||||
// It's not part of the union because it would have been ugly to expose template types in ChartCellLabel.
|
||||
ChartCellLabel::Stack &stack = out_.FindOrInsert(static_cast<const TargetPhrase *>(partial.GetNote().vp)->GetTargetLHS());
|
||||
Gen *entry = static_cast<Gen*>(stack.incr_generator);
|
||||
if (!entry) {
|
||||
entry = generator_pool_.construct(context_, *vertex_pool_.construct(), best_);
|
||||
stack.incr_generator = entry;
|
||||
}
|
||||
entry->NewHypothesis(partial);
|
||||
}
|
||||
|
||||
void FinishedSearch() {
|
||||
for (ChartCellLabelSet::iterator i(out_.mutable_begin()); i != out_.mutable_end(); ++i) {
|
||||
ChartCellLabel::Stack &stack = i->second.MutableStack();
|
||||
Gen *gen = static_cast<Gen*>(stack.incr_generator);
|
||||
gen->FinishedSearch();
|
||||
stack.incr = &gen->Generating();
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
search::ContextBase &context_;
|
||||
|
||||
Best &best_;
|
||||
|
||||
ChartCellLabelSet &out_;
|
||||
|
||||
boost::object_pool<search::Vertex> &vertex_pool_;
|
||||
boost::object_pool<Gen> generator_pool_;
|
||||
};
|
||||
|
||||
// This is called by the moses parser to collect hypotheses. It converts to my
|
||||
// edges (search::PartialEdge).
|
||||
template <class Model> class Fill : public ChartParserCallback {
|
||||
public:
|
||||
Fill(search::Context<Model> &context, const std::vector<lm::WordIndex> &vocab_mapping, search::Score oov_weight)
|
||||
: context_(context), vocab_mapping_(vocab_mapping), oov_weight_(oov_weight) {}
|
||||
|
||||
void Add(const TargetPhraseCollection &targets, const StackVec &nts, const WordsRange &ignored);
|
||||
|
||||
void AddPhraseOOV(TargetPhrase &phrase, std::list<TargetPhraseCollection*> &waste_memory, const WordsRange &range);
|
||||
|
||||
bool Empty() const { return edges_.Empty(); }
|
||||
|
||||
template <class Best> void Search(Best &best, ChartCellLabelSet &out, boost::object_pool<search::Vertex> &vertex_pool) {
|
||||
HypothesisCallback<Best> callback(context_, best, out, vertex_pool);
|
||||
edges_.Search(context_, callback);
|
||||
}
|
||||
|
||||
private:
|
||||
lm::WordIndex Convert(const Word &word) const;
|
||||
|
||||
search::Context<Model> &context_;
|
||||
|
||||
const std::vector<lm::WordIndex> &vocab_mapping_;
|
||||
|
||||
search::EdgeGenerator edges_;
|
||||
|
||||
const search::Score oov_weight_;
|
||||
};
|
||||
|
||||
template <class Model> void Fill<Model>::Add(const TargetPhraseCollection &targets, const StackVec &nts, const WordsRange &) {
|
||||
std::vector<search::PartialVertex> vertices;
|
||||
vertices.reserve(nts.size());
|
||||
float below_score = 0.0;
|
||||
for (StackVec::const_iterator i(nts.begin()); i != nts.end(); ++i) {
|
||||
vertices.push_back((*i)->GetStack().incr->RootPartial());
|
||||
if (vertices.back().Empty()) return;
|
||||
below_score += vertices.back().Bound();
|
||||
}
|
||||
|
||||
std::vector<lm::WordIndex> words;
|
||||
for (TargetPhraseCollection::const_iterator p(targets.begin()); p != targets.end(); ++p) {
|
||||
words.clear();
|
||||
const TargetPhrase &phrase = **p;
|
||||
const AlignmentInfo::NonTermIndexMap &align = phrase.GetAlignNonTerm().GetNonTermIndexMap();
|
||||
search::PartialEdge edge(edges_.AllocateEdge(nts.size()));
|
||||
|
||||
search::PartialVertex *nt = edge.NT();
|
||||
for (size_t i = 0; i < phrase.GetSize(); ++i) {
|
||||
const Word &word = phrase.GetWord(i);
|
||||
if (word.IsNonTerminal()) {
|
||||
*(nt++) = vertices[align[i]];
|
||||
words.push_back(search::kNonTerminal);
|
||||
} else {
|
||||
words.push_back(Convert(word));
|
||||
}
|
||||
}
|
||||
|
||||
edge.SetScore(phrase.GetFutureScore() + below_score);
|
||||
// prob and oov were already accounted for.
|
||||
search::ScoreRule(context_.LanguageModel(), words, edge.Between());
|
||||
|
||||
search::Note note;
|
||||
note.vp = &phrase;
|
||||
edge.SetNote(note);
|
||||
|
||||
edges_.AddEdge(edge);
|
||||
}
|
||||
}
|
||||
|
||||
template <class Model> void Fill<Model>::AddPhraseOOV(TargetPhrase &phrase, std::list<TargetPhraseCollection*> &, const WordsRange &) {
|
||||
std::vector<lm::WordIndex> words;
|
||||
CHECK(phrase.GetSize() <= 1);
|
||||
if (phrase.GetSize())
|
||||
words.push_back(Convert(phrase.GetWord(0)));
|
||||
|
||||
search::PartialEdge edge(edges_.AllocateEdge(0));
|
||||
// Appears to be a bug that FutureScore does not already include language model.
|
||||
search::ScoreRuleRet scored(search::ScoreRule(context_.LanguageModel(), words, edge.Between()));
|
||||
edge.SetScore(phrase.GetFutureScore() + scored.prob * context_.LMWeight() + static_cast<search::Score>(scored.oov) * oov_weight_);
|
||||
|
||||
search::Note note;
|
||||
note.vp = &phrase;
|
||||
edge.SetNote(note);
|
||||
|
||||
edges_.AddEdge(edge);
|
||||
}
|
||||
|
||||
// TODO: factors (but chart doesn't seem to support factors anyway).
|
||||
template <class Model> lm::WordIndex Fill<Model>::Convert(const Word &word) const {
|
||||
std::size_t factor = word.GetFactor(0)->GetId();
|
||||
return (factor >= vocab_mapping_.size() ? 0 : vocab_mapping_[factor]);
|
||||
}
|
||||
|
||||
struct ChartCellBaseFactory {
|
||||
ChartCellBase *operator()(size_t startPos, size_t endPos) const {
|
||||
return new ChartCellBase(startPos, endPos);
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace
|
||||
|
||||
Manager::Manager(const InputType &source, const TranslationSystem &system) :
|
||||
source_(source),
|
||||
system_(system),
|
||||
cells_(source, ChartCellBaseFactory()),
|
||||
parser_(source, system, cells_) {
|
||||
|
||||
}
|
||||
parser_(source, system, cells_) {}
|
||||
|
||||
Manager::~Manager() {
|
||||
system_.CleanUpAfterSentenceProcessing(source_);
|
||||
@ -37,7 +174,7 @@ Manager::~Manager() {
|
||||
|
||||
namespace {
|
||||
|
||||
void ConstructString(const search::Final final, std::ostringstream &stream) {
|
||||
void ConstructString(const search::Applied final, std::ostringstream &stream) {
|
||||
assert(final.Valid());
|
||||
const TargetPhrase &phrase = *static_cast<const TargetPhrase*>(final.GetNote().vp);
|
||||
size_t child = 0;
|
||||
@ -53,9 +190,9 @@ void ConstructString(const search::Final final, std::ostringstream &stream) {
|
||||
}
|
||||
|
||||
void BestString(const ChartCellLabelSet &labels, std::string &out) {
|
||||
search::Final best;
|
||||
search::Applied best;
|
||||
for (ChartCellLabelSet::const_iterator i = labels.begin(); i != labels.end(); ++i) {
|
||||
const search::Final child(i->second.GetStack().incr->BestChild());
|
||||
const search::Applied child(i->second.GetStack().incr->BestChild());
|
||||
if (child.Valid() && (!best.Valid() || (child.GetScore() > best.GetScore()))) {
|
||||
best = child;
|
||||
}
|
||||
@ -82,24 +219,24 @@ void BestString(const ChartCellLabelSet &labels, std::string &out) {
|
||||
|
||||
template <class Model> void Manager::LMCallback(const Model &model, const std::vector<lm::WordIndex> &words) {
|
||||
const LanguageModel &abstract = **system_.GetLanguageModels().begin();
|
||||
search::Weights weights(
|
||||
abstract.GetWeight(),
|
||||
abstract.OOVFeatureEnabled() ? abstract.GetOOVWeight() : 0.0,
|
||||
system_.GetWeightWordPenalty());
|
||||
search::Config config(weights, StaticData::Instance().GetCubePruningPopLimit());
|
||||
const float oov_weight = abstract.OOVFeatureEnabled() ? abstract.GetOOVWeight() : 0.0;
|
||||
const StaticData &data = StaticData::Instance();
|
||||
search::Config config(abstract.GetWeight(), data.GetCubePruningPopLimit(), search::NBestConfig(data.GetNBestSize()));
|
||||
search::Context<Model> context(config, model);
|
||||
|
||||
size_t size = source_.GetSize();
|
||||
search::SingleBest best;
|
||||
|
||||
size_t size = source_.GetSize();
|
||||
boost::object_pool<search::Vertex> vertex_pool(std::max<size_t>(size * size / 2, 32));
|
||||
|
||||
|
||||
for (size_t width = 1; width <= size; ++width) {
|
||||
for (size_t startPos = 0; startPos <= size-width; ++startPos) {
|
||||
size_t endPos = startPos + width - 1;
|
||||
WordsRange range(startPos, endPos);
|
||||
Fill<Model> filler(context, words);
|
||||
Fill<Model> filler(context, words, oov_weight);
|
||||
parser_.Create(range, filler);
|
||||
filler.Search(cells_.MutableBase(range).MutableTargetLabelSet(), vertex_pool);
|
||||
filler.Search(best, cells_.MutableBase(range).MutableTargetLabelSet(), vertex_pool);
|
||||
}
|
||||
}
|
||||
BestString(cells_.GetBase(WordsRange(0, source_.GetSize() - 1)).GetTargetLabelSet(), output_);
|
||||
|
@ -1,5 +1 @@
|
||||
fakelib search : weights.cc vertex.cc vertex_generator.cc edge_generator.cc rule.cc ../lm//kenlm ../util//kenutil /top//boost_system : : : <include>.. ;
|
||||
|
||||
import testing ;
|
||||
|
||||
unit-test weights_test : weights_test.cc search /top//boost_unit_test_framework ;
|
||||
fakelib search : edge_generator.cc nbest.cc rule.cc vertex.cc vertex_generator.cc ../lm//kenlm ../util//kenutil /top//boost_system : : : <include>.. ;
|
||||
|
86
search/applied.hh
Normal file
86
search/applied.hh
Normal file
@ -0,0 +1,86 @@
|
||||
#ifndef SEARCH_APPLIED__
|
||||
#define SEARCH_APPLIED__
|
||||
|
||||
#include "search/edge.hh"
|
||||
#include "search/header.hh"
|
||||
#include "util/pool.hh"
|
||||
|
||||
#include <math.h>
|
||||
|
||||
namespace search {
|
||||
|
||||
// A full hypothesis: a score, arity of the rule, a pointer to the decoder's rule (Note), and pointers to non-terminals that were substituted.
|
||||
template <class Below> class GenericApplied : public Header {
|
||||
public:
|
||||
GenericApplied() {}
|
||||
|
||||
GenericApplied(void *location, PartialEdge partial)
|
||||
: Header(location) {
|
||||
memcpy(Base(), partial.Base(), kHeaderSize);
|
||||
Below *child_out = Children();
|
||||
const PartialVertex *part = partial.NT();
|
||||
const PartialVertex *const part_end_loop = part + partial.GetArity();
|
||||
for (; part != part_end_loop; ++part, ++child_out)
|
||||
*child_out = Below(part->End());
|
||||
}
|
||||
|
||||
GenericApplied(void *location, Score score, Arity arity, Note note) : Header(location, arity) {
|
||||
SetScore(score);
|
||||
SetNote(note);
|
||||
}
|
||||
|
||||
explicit GenericApplied(History from) : Header(from) {}
|
||||
|
||||
|
||||
// These are arrays of length GetArity().
|
||||
Below *Children() {
|
||||
return reinterpret_cast<Below*>(After());
|
||||
}
|
||||
const Below *Children() const {
|
||||
return reinterpret_cast<const Below*>(After());
|
||||
}
|
||||
|
||||
static std::size_t Size(Arity arity) {
|
||||
return kHeaderSize + arity * sizeof(const Below);
|
||||
}
|
||||
};
|
||||
|
||||
// Applied rule that references itself.
|
||||
class Applied : public GenericApplied<Applied> {
|
||||
private:
|
||||
typedef GenericApplied<Applied> P;
|
||||
|
||||
public:
|
||||
Applied() {}
|
||||
Applied(void *location, PartialEdge partial) : P(location, partial) {}
|
||||
Applied(History from) : P(from) {}
|
||||
};
|
||||
|
||||
// How to build single-best hypotheses.
|
||||
class SingleBest {
|
||||
public:
|
||||
typedef PartialEdge Combine;
|
||||
|
||||
void Add(PartialEdge &existing, PartialEdge add) const {
|
||||
if (!existing.Valid() || existing.GetScore() < add.GetScore())
|
||||
existing = add;
|
||||
}
|
||||
|
||||
NBestComplete Complete(PartialEdge partial) {
|
||||
if (!partial.Valid())
|
||||
return NBestComplete(NULL, lm::ngram::ChartState(), -INFINITY);
|
||||
void *place_final = pool_.Allocate(Applied::Size(partial.GetArity()));
|
||||
Applied(place_final, partial);
|
||||
return NBestComplete(
|
||||
place_final,
|
||||
partial.CompletedState(),
|
||||
partial.GetScore());
|
||||
}
|
||||
|
||||
private:
|
||||
util::Pool pool_;
|
||||
};
|
||||
|
||||
} // namespace search
|
||||
|
||||
#endif // SEARCH_APPLIED__
|
@ -1,23 +1,36 @@
|
||||
#ifndef SEARCH_CONFIG__
|
||||
#define SEARCH_CONFIG__
|
||||
|
||||
#include "search/weights.hh"
|
||||
#include "util/string_piece.hh"
|
||||
#include "search/types.hh"
|
||||
|
||||
namespace search {
|
||||
|
||||
struct NBestConfig {
|
||||
explicit NBestConfig(unsigned int in_size) {
|
||||
keep = in_size;
|
||||
size = in_size;
|
||||
}
|
||||
|
||||
unsigned int keep, size;
|
||||
};
|
||||
|
||||
class Config {
|
||||
public:
|
||||
Config(const Weights &weights, unsigned int pop_limit) :
|
||||
weights_(weights), pop_limit_(pop_limit) {}
|
||||
Config(Score lm_weight, unsigned int pop_limit, const NBestConfig &nbest) :
|
||||
lm_weight_(lm_weight), pop_limit_(pop_limit), nbest_(nbest) {}
|
||||
|
||||
const Weights &GetWeights() const { return weights_; }
|
||||
Score LMWeight() const { return lm_weight_; }
|
||||
|
||||
unsigned int PopLimit() const { return pop_limit_; }
|
||||
|
||||
const NBestConfig &GetNBest() const { return nbest_; }
|
||||
|
||||
private:
|
||||
Weights weights_;
|
||||
Score lm_weight_;
|
||||
|
||||
unsigned int pop_limit_;
|
||||
|
||||
NBestConfig nbest_;
|
||||
};
|
||||
|
||||
} // namespace search
|
||||
|
@ -1,30 +1,16 @@
|
||||
#ifndef SEARCH_CONTEXT__
|
||||
#define SEARCH_CONTEXT__
|
||||
|
||||
#include "lm/model.hh"
|
||||
#include "search/config.hh"
|
||||
#include "search/final.hh"
|
||||
#include "search/types.hh"
|
||||
#include "search/vertex.hh"
|
||||
#include "util/exception.hh"
|
||||
#include "util/pool.hh"
|
||||
|
||||
#include <boost/pool/object_pool.hpp>
|
||||
#include <boost/ptr_container/ptr_vector.hpp>
|
||||
|
||||
#include <vector>
|
||||
|
||||
namespace search {
|
||||
|
||||
class Weights;
|
||||
|
||||
class ContextBase {
|
||||
public:
|
||||
explicit ContextBase(const Config &config) : pop_limit_(config.PopLimit()), weights_(config.GetWeights()) {}
|
||||
|
||||
util::Pool &FinalPool() {
|
||||
return final_pool_;
|
||||
}
|
||||
explicit ContextBase(const Config &config) : config_(config) {}
|
||||
|
||||
VertexNode *NewVertexNode() {
|
||||
VertexNode *ret = vertex_node_pool_.construct();
|
||||
@ -36,18 +22,16 @@ class ContextBase {
|
||||
vertex_node_pool_.destroy(node);
|
||||
}
|
||||
|
||||
unsigned int PopLimit() const { return pop_limit_; }
|
||||
unsigned int PopLimit() const { return config_.PopLimit(); }
|
||||
|
||||
const Weights &GetWeights() const { return weights_; }
|
||||
Score LMWeight() const { return config_.LMWeight(); }
|
||||
|
||||
const Config &GetConfig() const { return config_; }
|
||||
|
||||
private:
|
||||
util::Pool final_pool_;
|
||||
|
||||
boost::object_pool<VertexNode> vertex_node_pool_;
|
||||
|
||||
unsigned int pop_limit_;
|
||||
|
||||
const Weights &weights_;
|
||||
Config config_;
|
||||
};
|
||||
|
||||
template <class Model> class Context : public ContextBase {
|
||||
|
@ -1,6 +1,7 @@
|
||||
#include "search/edge_generator.hh"
|
||||
|
||||
#include "lm/left.hh"
|
||||
#include "lm/model.hh"
|
||||
#include "lm/partial.hh"
|
||||
#include "search/context.hh"
|
||||
#include "search/vertex.hh"
|
||||
@ -38,7 +39,7 @@ template <class Model> void FastScore(const Context<Model> &context, Arity victi
|
||||
*cover = *(cover + 1);
|
||||
}
|
||||
}
|
||||
update.SetScore(update.GetScore() + adjustment * context.GetWeights().LM());
|
||||
update.SetScore(update.GetScore() + adjustment * context.LMWeight());
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
@ -2,7 +2,6 @@
|
||||
#define SEARCH_EDGE_GENERATOR__
|
||||
|
||||
#include "search/edge.hh"
|
||||
#include "search/note.hh"
|
||||
#include "search/types.hh"
|
||||
|
||||
#include <queue>
|
||||
|
@ -1,36 +0,0 @@
|
||||
#ifndef SEARCH_FINAL__
|
||||
#define SEARCH_FINAL__
|
||||
|
||||
#include "search/header.hh"
|
||||
#include "util/pool.hh"
|
||||
|
||||
namespace search {
|
||||
|
||||
// A full hypothesis with pointers to children.
|
||||
class Final : public Header {
|
||||
public:
|
||||
Final() {}
|
||||
|
||||
Final(util::Pool &pool, Score score, Arity arity, Note note)
|
||||
: Header(pool.Allocate(Size(arity)), arity) {
|
||||
SetScore(score);
|
||||
SetNote(note);
|
||||
}
|
||||
|
||||
// These are arrays of length GetArity().
|
||||
Final *Children() {
|
||||
return reinterpret_cast<Final*>(After());
|
||||
}
|
||||
const Final *Children() const {
|
||||
return reinterpret_cast<const Final*>(After());
|
||||
}
|
||||
|
||||
private:
|
||||
static std::size_t Size(Arity arity) {
|
||||
return kHeaderSize + arity * sizeof(const Final);
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace search
|
||||
|
||||
#endif // SEARCH_FINAL__
|
@ -3,7 +3,6 @@
|
||||
|
||||
// Header consisting of Score, Arity, and Note
|
||||
|
||||
#include "search/note.hh"
|
||||
#include "search/types.hh"
|
||||
|
||||
#include <stdint.h>
|
||||
@ -24,6 +23,9 @@ class Header {
|
||||
bool operator<(const Header &other) const {
|
||||
return GetScore() < other.GetScore();
|
||||
}
|
||||
bool operator>(const Header &other) const {
|
||||
return GetScore() > other.GetScore();
|
||||
}
|
||||
|
||||
Arity GetArity() const {
|
||||
return *reinterpret_cast<const Arity*>(base_ + sizeof(Score));
|
||||
@ -36,9 +38,14 @@ class Header {
|
||||
*reinterpret_cast<Note*>(base_ + sizeof(Score) + sizeof(Arity)) = to;
|
||||
}
|
||||
|
||||
uint8_t *Base() { return base_; }
|
||||
const uint8_t *Base() const { return base_; }
|
||||
|
||||
protected:
|
||||
Header() : base_(NULL) {}
|
||||
|
||||
explicit Header(void *base) : base_(static_cast<uint8_t*>(base)) {}
|
||||
|
||||
Header(void *base, Arity arity) : base_(static_cast<uint8_t*>(base)) {
|
||||
*reinterpret_cast<Arity*>(base_ + sizeof(Score)) = arity;
|
||||
}
|
||||
|
106
search/nbest.cc
Normal file
106
search/nbest.cc
Normal file
@ -0,0 +1,106 @@
|
||||
#include "search/nbest.hh"
|
||||
|
||||
#include "util/pool.hh"
|
||||
|
||||
#include <algorithm>
|
||||
#include <functional>
|
||||
#include <queue>
|
||||
|
||||
#include <assert.h>
|
||||
#include <math.h>
|
||||
|
||||
namespace search {
|
||||
|
||||
NBestList::NBestList(std::vector<PartialEdge> &partials, util::Pool &entry_pool, std::size_t keep) {
|
||||
assert(!partials.empty());
|
||||
std::vector<PartialEdge>::iterator end;
|
||||
if (partials.size() > keep) {
|
||||
end = partials.begin() + keep;
|
||||
std::nth_element(partials.begin(), end, partials.end(), std::greater<PartialEdge>());
|
||||
} else {
|
||||
end = partials.end();
|
||||
}
|
||||
for (std::vector<PartialEdge>::const_iterator i(partials.begin()); i != end; ++i) {
|
||||
queue_.push(QueueEntry(entry_pool.Allocate(QueueEntry::Size(i->GetArity())), *i));
|
||||
}
|
||||
}
|
||||
|
||||
Score NBestList::TopAfterConstructor() const {
|
||||
assert(revealed_.empty());
|
||||
return queue_.top().GetScore();
|
||||
}
|
||||
|
||||
const std::vector<Applied> &NBestList::Extract(util::Pool &pool, std::size_t n) {
|
||||
while (revealed_.size() < n && !queue_.empty()) {
|
||||
MoveTop(pool);
|
||||
}
|
||||
return revealed_;
|
||||
}
|
||||
|
||||
Score NBestList::Visit(util::Pool &pool, std::size_t index) {
|
||||
if (index + 1 < revealed_.size())
|
||||
return revealed_[index + 1].GetScore() - revealed_[index].GetScore();
|
||||
if (queue_.empty())
|
||||
return -INFINITY;
|
||||
if (index + 1 == revealed_.size())
|
||||
return queue_.top().GetScore() - revealed_[index].GetScore();
|
||||
assert(index == revealed_.size());
|
||||
|
||||
MoveTop(pool);
|
||||
|
||||
if (queue_.empty()) return -INFINITY;
|
||||
return queue_.top().GetScore() - revealed_[index].GetScore();
|
||||
}
|
||||
|
||||
Applied NBestList::Get(util::Pool &pool, std::size_t index) {
|
||||
assert(index <= revealed_.size());
|
||||
if (index == revealed_.size()) MoveTop(pool);
|
||||
return revealed_[index];
|
||||
}
|
||||
|
||||
void NBestList::MoveTop(util::Pool &pool) {
|
||||
assert(!queue_.empty());
|
||||
QueueEntry entry(queue_.top());
|
||||
queue_.pop();
|
||||
RevealedRef *const children_begin = entry.Children();
|
||||
RevealedRef *const children_end = children_begin + entry.GetArity();
|
||||
Score basis = entry.GetScore();
|
||||
for (RevealedRef *child = children_begin; child != children_end; ++child) {
|
||||
Score change = child->in_->Visit(pool, child->index_);
|
||||
if (change != -INFINITY) {
|
||||
assert(change < 0.001);
|
||||
QueueEntry new_entry(pool.Allocate(QueueEntry::Size(entry.GetArity())), basis + change, entry.GetArity(), entry.GetNote());
|
||||
std::copy(children_begin, child, new_entry.Children());
|
||||
RevealedRef *update = new_entry.Children() + (child - children_begin);
|
||||
update->in_ = child->in_;
|
||||
update->index_ = child->index_ + 1;
|
||||
std::copy(child + 1, children_end, update + 1);
|
||||
queue_.push(new_entry);
|
||||
}
|
||||
// Gesmundo, A. and Henderson, J. Faster Cube Pruning, IWSLT 2010.
|
||||
if (child->index_) break;
|
||||
}
|
||||
|
||||
// Convert QueueEntry to Applied. This leaves some unused memory.
|
||||
void *overwrite = entry.Children();
|
||||
for (unsigned int i = 0; i < entry.GetArity(); ++i) {
|
||||
RevealedRef from(*(static_cast<const RevealedRef*>(overwrite) + i));
|
||||
*(static_cast<Applied*>(overwrite) + i) = from.in_->Get(pool, from.index_);
|
||||
}
|
||||
revealed_.push_back(Applied(entry.Base()));
|
||||
}
|
||||
|
||||
NBestComplete NBest::Complete(std::vector<PartialEdge> &partials) {
|
||||
assert(!partials.empty());
|
||||
NBestList *list = list_pool_.construct(partials, entry_pool_, config_.keep);
|
||||
return NBestComplete(
|
||||
list,
|
||||
partials.front().CompletedState(), // All partials have the same state
|
||||
list->TopAfterConstructor());
|
||||
}
|
||||
|
||||
const std::vector<Applied> &NBest::Extract(History history) {
|
||||
return static_cast<NBestList*>(history)->Extract(entry_pool_, config_.size);
|
||||
}
|
||||
|
||||
} // namespace search
|
81
search/nbest.hh
Normal file
81
search/nbest.hh
Normal file
@ -0,0 +1,81 @@
|
||||
#ifndef SEARCH_NBEST__
|
||||
#define SEARCH_NBEST__
|
||||
|
||||
#include "search/applied.hh"
|
||||
#include "search/config.hh"
|
||||
#include "search/edge.hh"
|
||||
|
||||
#include <boost/pool/object_pool.hpp>
|
||||
|
||||
#include <cstddef>
|
||||
#include <queue>
|
||||
#include <vector>
|
||||
|
||||
#include <assert.h>
|
||||
|
||||
namespace search {
|
||||
|
||||
class NBestList;
|
||||
|
||||
class NBestList {
|
||||
private:
|
||||
class RevealedRef {
|
||||
public:
|
||||
explicit RevealedRef(History history)
|
||||
: in_(static_cast<NBestList*>(history)), index_(0) {}
|
||||
|
||||
private:
|
||||
friend class NBestList;
|
||||
|
||||
NBestList *in_;
|
||||
std::size_t index_;
|
||||
};
|
||||
|
||||
typedef GenericApplied<RevealedRef> QueueEntry;
|
||||
|
||||
public:
|
||||
NBestList(std::vector<PartialEdge> &existing, util::Pool &entry_pool, std::size_t keep);
|
||||
|
||||
Score TopAfterConstructor() const;
|
||||
|
||||
const std::vector<Applied> &Extract(util::Pool &pool, std::size_t n);
|
||||
|
||||
private:
|
||||
Score Visit(util::Pool &pool, std::size_t index);
|
||||
|
||||
Applied Get(util::Pool &pool, std::size_t index);
|
||||
|
||||
void MoveTop(util::Pool &pool);
|
||||
|
||||
typedef std::vector<Applied> Revealed;
|
||||
Revealed revealed_;
|
||||
|
||||
typedef std::priority_queue<QueueEntry> Queue;
|
||||
Queue queue_;
|
||||
};
|
||||
|
||||
class NBest {
|
||||
public:
|
||||
typedef std::vector<PartialEdge> Combine;
|
||||
|
||||
explicit NBest(const NBestConfig &config) : config_(config) {}
|
||||
|
||||
void Add(std::vector<PartialEdge> &existing, PartialEdge addition) const {
|
||||
existing.push_back(addition);
|
||||
}
|
||||
|
||||
NBestComplete Complete(std::vector<PartialEdge> &partials);
|
||||
|
||||
const std::vector<Applied> &Extract(History root);
|
||||
|
||||
private:
|
||||
const NBestConfig config_;
|
||||
|
||||
boost::object_pool<NBestList> list_pool_;
|
||||
|
||||
util::Pool entry_pool_;
|
||||
};
|
||||
|
||||
} // namespace search
|
||||
|
||||
#endif // SEARCH_NBEST__
|
@ -1,12 +0,0 @@
|
||||
#ifndef SEARCH_NOTE__
|
||||
#define SEARCH_NOTE__
|
||||
|
||||
namespace search {
|
||||
|
||||
union Note {
|
||||
const void *vp;
|
||||
};
|
||||
|
||||
} // namespace search
|
||||
|
||||
#endif // SEARCH_NOTE__
|
@ -1,7 +1,7 @@
|
||||
#include "search/rule.hh"
|
||||
|
||||
#include "lm/model.hh"
|
||||
#include "search/context.hh"
|
||||
#include "search/final.hh"
|
||||
|
||||
#include <ostream>
|
||||
|
||||
@ -9,35 +9,35 @@
|
||||
|
||||
namespace search {
|
||||
|
||||
template <class Model> float ScoreRule(const Context<Model> &context, const std::vector<lm::WordIndex> &words, bool prepend_bos, lm::ngram::ChartState *writing) {
|
||||
unsigned int oov_count = 0;
|
||||
float prob = 0.0;
|
||||
const Model &model = context.LanguageModel();
|
||||
const lm::WordIndex oov = model.GetVocabulary().NotFound();
|
||||
for (std::vector<lm::WordIndex>::const_iterator word = words.begin(); ; ++word) {
|
||||
lm::ngram::RuleScore<Model> scorer(model, *(writing++));
|
||||
// TODO: optimize
|
||||
if (prepend_bos && (word == words.begin())) {
|
||||
scorer.BeginSentence();
|
||||
}
|
||||
for (; ; ++word) {
|
||||
if (word == words.end()) {
|
||||
prob += scorer.Finish();
|
||||
return static_cast<float>(oov_count) * context.GetWeights().OOV() + prob * context.GetWeights().LM();
|
||||
}
|
||||
if (*word == kNonTerminal) break;
|
||||
if (*word == oov) ++oov_count;
|
||||
template <class Model> ScoreRuleRet ScoreRule(const Model &model, const std::vector<lm::WordIndex> &words, lm::ngram::ChartState *writing) {
|
||||
ScoreRuleRet ret;
|
||||
ret.prob = 0.0;
|
||||
ret.oov = 0;
|
||||
const lm::WordIndex oov = model.GetVocabulary().NotFound(), bos = model.GetVocabulary().BeginSentence();
|
||||
lm::ngram::RuleScore<Model> scorer(model, *(writing++));
|
||||
std::vector<lm::WordIndex>::const_iterator word = words.begin();
|
||||
if (word != words.end() && *word == bos) {
|
||||
scorer.BeginSentence();
|
||||
++word;
|
||||
}
|
||||
for (; word != words.end(); ++word) {
|
||||
if (*word == kNonTerminal) {
|
||||
ret.prob += scorer.Finish();
|
||||
scorer.Reset(*(writing++));
|
||||
} else {
|
||||
if (*word == oov) ++ret.oov;
|
||||
scorer.Terminal(*word);
|
||||
}
|
||||
prob += scorer.Finish();
|
||||
}
|
||||
ret.prob += scorer.Finish();
|
||||
return ret;
|
||||
}
|
||||
|
||||
template float ScoreRule(const Context<lm::ngram::RestProbingModel> &model, const std::vector<lm::WordIndex> &words, bool prepend_bos, lm::ngram::ChartState *writing);
|
||||
template float ScoreRule(const Context<lm::ngram::ProbingModel> &model, const std::vector<lm::WordIndex> &words, bool prepend_bos, lm::ngram::ChartState *writing);
|
||||
template float ScoreRule(const Context<lm::ngram::TrieModel> &model, const std::vector<lm::WordIndex> &words, bool prepend_bos, lm::ngram::ChartState *writing);
|
||||
template float ScoreRule(const Context<lm::ngram::QuantTrieModel> &model, const std::vector<lm::WordIndex> &words, bool prepend_bos, lm::ngram::ChartState *writing);
|
||||
template float ScoreRule(const Context<lm::ngram::ArrayTrieModel> &model, const std::vector<lm::WordIndex> &words, bool prepend_bos, lm::ngram::ChartState *writing);
|
||||
template float ScoreRule(const Context<lm::ngram::QuantArrayTrieModel> &model, const std::vector<lm::WordIndex> &words, bool prepend_bos, lm::ngram::ChartState *writing);
|
||||
template ScoreRuleRet ScoreRule(const lm::ngram::RestProbingModel &model, const std::vector<lm::WordIndex> &words, lm::ngram::ChartState *writing);
|
||||
template ScoreRuleRet ScoreRule(const lm::ngram::ProbingModel &model, const std::vector<lm::WordIndex> &words, lm::ngram::ChartState *writing);
|
||||
template ScoreRuleRet ScoreRule(const lm::ngram::TrieModel &model, const std::vector<lm::WordIndex> &words, lm::ngram::ChartState *writing);
|
||||
template ScoreRuleRet ScoreRule(const lm::ngram::QuantTrieModel &model, const std::vector<lm::WordIndex> &words, lm::ngram::ChartState *writing);
|
||||
template ScoreRuleRet ScoreRule(const lm::ngram::ArrayTrieModel &model, const std::vector<lm::WordIndex> &words, lm::ngram::ChartState *writing);
|
||||
template ScoreRuleRet ScoreRule(const lm::ngram::QuantArrayTrieModel &model, const std::vector<lm::WordIndex> &words, lm::ngram::ChartState *writing);
|
||||
|
||||
} // namespace search
|
||||
|
@ -9,11 +9,16 @@
|
||||
|
||||
namespace search {
|
||||
|
||||
template <class Model> class Context;
|
||||
|
||||
const lm::WordIndex kNonTerminal = lm::kMaxWordIndex;
|
||||
|
||||
template <class Model> float ScoreRule(const Context<Model> &context, const std::vector<lm::WordIndex> &words, bool prepend_bos, lm::ngram::ChartState *state_out);
|
||||
struct ScoreRuleRet {
|
||||
Score prob;
|
||||
unsigned int oov;
|
||||
};
|
||||
|
||||
// Pass <s> and </s> normally.
|
||||
// Indicate non-terminals with kNonTerminal.
|
||||
template <class Model> ScoreRuleRet ScoreRule(const Model &model, const std::vector<lm::WordIndex> &words, lm::ngram::ChartState *state_out);
|
||||
|
||||
} // namespace search
|
||||
|
||||
|
@ -3,12 +3,29 @@
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
namespace lm { namespace ngram { class ChartState; } }
|
||||
|
||||
namespace search {
|
||||
|
||||
typedef float Score;
|
||||
|
||||
typedef uint32_t Arity;
|
||||
|
||||
union Note {
|
||||
const void *vp;
|
||||
};
|
||||
|
||||
typedef void *History;
|
||||
|
||||
struct NBestComplete {
|
||||
NBestComplete(History in_history, const lm::ngram::ChartState &in_state, Score in_score)
|
||||
: history(in_history), state(&in_state), score(in_score) {}
|
||||
|
||||
History history;
|
||||
const lm::ngram::ChartState *state;
|
||||
Score score;
|
||||
};
|
||||
|
||||
} // namespace search
|
||||
|
||||
#endif // SEARCH_TYPES__
|
||||
|
@ -19,21 +19,34 @@ struct GreaterByBound : public std::binary_function<const VertexNode *, const Ve
|
||||
|
||||
} // namespace
|
||||
|
||||
void VertexNode::SortAndSet(ContextBase &context, VertexNode **parent_ptr) {
|
||||
void VertexNode::RecursiveSortAndSet(ContextBase &context, VertexNode *&parent_ptr) {
|
||||
if (Complete()) {
|
||||
assert(end_.Valid());
|
||||
assert(end_);
|
||||
assert(extend_.empty());
|
||||
bound_ = end_.GetScore();
|
||||
return;
|
||||
}
|
||||
if (extend_.size() == 1 && parent_ptr) {
|
||||
*parent_ptr = extend_[0];
|
||||
extend_[0]->SortAndSet(context, parent_ptr);
|
||||
if (extend_.size() == 1) {
|
||||
parent_ptr = extend_[0];
|
||||
extend_[0]->RecursiveSortAndSet(context, parent_ptr);
|
||||
context.DeleteVertexNode(this);
|
||||
return;
|
||||
}
|
||||
for (std::vector<VertexNode*>::iterator i = extend_.begin(); i != extend_.end(); ++i) {
|
||||
(*i)->SortAndSet(context, &*i);
|
||||
(*i)->RecursiveSortAndSet(context, *i);
|
||||
}
|
||||
std::sort(extend_.begin(), extend_.end(), GreaterByBound());
|
||||
bound_ = extend_.front()->Bound();
|
||||
}
|
||||
|
||||
void VertexNode::SortAndSet(ContextBase &context) {
|
||||
// This is the root. The root might be empty.
|
||||
if (extend_.empty()) {
|
||||
bound_ = -INFINITY;
|
||||
return;
|
||||
}
|
||||
// The root cannot be replaced. There's always one transition.
|
||||
for (std::vector<VertexNode*>::iterator i = extend_.begin(); i != extend_.end(); ++i) {
|
||||
(*i)->RecursiveSortAndSet(context, *i);
|
||||
}
|
||||
std::sort(extend_.begin(), extend_.end(), GreaterByBound());
|
||||
bound_ = extend_.front()->Bound();
|
||||
|
@ -2,7 +2,6 @@
|
||||
#define SEARCH_VERTEX__
|
||||
|
||||
#include "lm/left.hh"
|
||||
#include "search/final.hh"
|
||||
#include "search/types.hh"
|
||||
|
||||
#include <boost/unordered_set.hpp>
|
||||
@ -10,6 +9,7 @@
|
||||
#include <queue>
|
||||
#include <vector>
|
||||
|
||||
#include <math.h>
|
||||
#include <stdint.h>
|
||||
|
||||
namespace search {
|
||||
@ -18,7 +18,7 @@ class ContextBase;
|
||||
|
||||
class VertexNode {
|
||||
public:
|
||||
VertexNode() {}
|
||||
VertexNode() : end_() {}
|
||||
|
||||
void InitRoot() {
|
||||
extend_.clear();
|
||||
@ -26,7 +26,7 @@ class VertexNode {
|
||||
state_.left.length = 0;
|
||||
state_.right.length = 0;
|
||||
right_full_ = false;
|
||||
end_ = Final();
|
||||
end_ = History();
|
||||
}
|
||||
|
||||
lm::ngram::ChartState &MutableState() { return state_; }
|
||||
@ -36,20 +36,21 @@ class VertexNode {
|
||||
extend_.push_back(next);
|
||||
}
|
||||
|
||||
void SetEnd(Final end) {
|
||||
assert(!end_.Valid());
|
||||
void SetEnd(History end, Score score) {
|
||||
assert(!end_);
|
||||
end_ = end;
|
||||
bound_ = score;
|
||||
}
|
||||
|
||||
void SortAndSet(ContextBase &context, VertexNode **parent_pointer);
|
||||
void SortAndSet(ContextBase &context);
|
||||
|
||||
// Should only happen to a root node when the entire vertex is empty.
|
||||
bool Empty() const {
|
||||
return !end_.Valid() && extend_.empty();
|
||||
return !end_ && extend_.empty();
|
||||
}
|
||||
|
||||
bool Complete() const {
|
||||
return end_.Valid();
|
||||
return end_;
|
||||
}
|
||||
|
||||
const lm::ngram::ChartState &State() const { return state_; }
|
||||
@ -64,7 +65,7 @@ class VertexNode {
|
||||
}
|
||||
|
||||
// Will be invalid unless this is a leaf.
|
||||
const Final End() const { return end_; }
|
||||
const History End() const { return end_; }
|
||||
|
||||
const VertexNode &operator[](size_t index) const {
|
||||
return *extend_[index];
|
||||
@ -75,13 +76,15 @@ class VertexNode {
|
||||
}
|
||||
|
||||
private:
|
||||
void RecursiveSortAndSet(ContextBase &context, VertexNode *&parent);
|
||||
|
||||
std::vector<VertexNode*> extend_;
|
||||
|
||||
lm::ngram::ChartState state_;
|
||||
bool right_full_;
|
||||
|
||||
Score bound_;
|
||||
Final end_;
|
||||
History end_;
|
||||
};
|
||||
|
||||
class PartialVertex {
|
||||
@ -97,7 +100,7 @@ class PartialVertex {
|
||||
const lm::ngram::ChartState &State() const { return back_->State(); }
|
||||
bool RightFull() const { return back_->RightFull(); }
|
||||
|
||||
Score Bound() const { return Complete() ? back_->End().GetScore() : (*back_)[index_].Bound(); }
|
||||
Score Bound() const { return Complete() ? back_->Bound() : (*back_)[index_].Bound(); }
|
||||
|
||||
unsigned char Length() const { return back_->Length(); }
|
||||
|
||||
@ -121,7 +124,7 @@ class PartialVertex {
|
||||
return ret;
|
||||
}
|
||||
|
||||
const Final End() const {
|
||||
const History End() const {
|
||||
return back_->End();
|
||||
}
|
||||
|
||||
@ -130,16 +133,18 @@ class PartialVertex {
|
||||
unsigned int index_;
|
||||
};
|
||||
|
||||
template <class Output> class VertexGenerator;
|
||||
|
||||
class Vertex {
|
||||
public:
|
||||
Vertex() {}
|
||||
|
||||
PartialVertex RootPartial() const { return PartialVertex(root_); }
|
||||
|
||||
const Final BestChild() const {
|
||||
const History BestChild() const {
|
||||
PartialVertex top(RootPartial());
|
||||
if (top.Empty()) {
|
||||
return Final();
|
||||
return History();
|
||||
} else {
|
||||
PartialVertex continuation;
|
||||
while (!top.Complete()) {
|
||||
@ -150,8 +155,8 @@ class Vertex {
|
||||
}
|
||||
|
||||
private:
|
||||
friend class VertexGenerator;
|
||||
|
||||
template <class Output> friend class VertexGenerator;
|
||||
template <class Output> friend class RootVertexGenerator;
|
||||
VertexNode root_;
|
||||
};
|
||||
|
||||
|
@ -11,23 +11,11 @@
|
||||
|
||||
namespace search {
|
||||
|
||||
VertexGenerator::VertexGenerator(ContextBase &context, Vertex &gen) : context_(context), gen_(gen) {
|
||||
gen.root_.InitRoot();
|
||||
}
|
||||
|
||||
#if BOOST_VERSION > 104200
|
||||
namespace {
|
||||
|
||||
const uint64_t kCompleteAdd = static_cast<uint64_t>(-1);
|
||||
|
||||
// Parallel structure to VertexNode.
|
||||
struct Trie {
|
||||
Trie() : under(NULL) {}
|
||||
|
||||
VertexNode *under;
|
||||
boost::unordered_map<uint64_t, Trie> extend;
|
||||
};
|
||||
|
||||
Trie &FindOrInsert(ContextBase &context, Trie &node, uint64_t added, const lm::ngram::ChartState &state, unsigned char left, bool left_full, unsigned char right, bool right_full) {
|
||||
Trie &next = node.extend[added];
|
||||
if (!next.under) {
|
||||
@ -43,19 +31,10 @@ Trie &FindOrInsert(ContextBase &context, Trie &node, uint64_t added, const lm::n
|
||||
return next;
|
||||
}
|
||||
|
||||
void CompleteTransition(ContextBase &context, Trie &starter, PartialEdge partial) {
|
||||
Final final(context.FinalPool(), partial.GetScore(), partial.GetArity(), partial.GetNote());
|
||||
Final *child_out = final.Children();
|
||||
const PartialVertex *part = partial.NT();
|
||||
const PartialVertex *const part_end_loop = part + partial.GetArity();
|
||||
for (; part != part_end_loop; ++part, ++child_out)
|
||||
*child_out = part->End();
|
||||
} // namespace
|
||||
|
||||
starter.under->SetEnd(final);
|
||||
}
|
||||
|
||||
void AddHypothesis(ContextBase &context, Trie &root, PartialEdge partial) {
|
||||
const lm::ngram::ChartState &state = partial.CompletedState();
|
||||
void AddHypothesis(ContextBase &context, Trie &root, const NBestComplete &end) {
|
||||
const lm::ngram::ChartState &state = *end.state;
|
||||
|
||||
unsigned char left = 0, right = 0;
|
||||
Trie *node = &root;
|
||||
@ -81,30 +60,9 @@ void AddHypothesis(ContextBase &context, Trie &root, PartialEdge partial) {
|
||||
}
|
||||
|
||||
node = &FindOrInsert(context, *node, kCompleteAdd - state.left.full, state, state.left.length, true, state.right.length, true);
|
||||
CompleteTransition(context, *node, partial);
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
#else // BOOST_VERSION
|
||||
|
||||
struct Trie {
|
||||
VertexNode *under;
|
||||
};
|
||||
|
||||
void AddHypothesis(ContextBase &context, Trie &root, PartialEdge partial) {
|
||||
UTIL_THROW(util::Exception, "Upgrade Boost to >= 1.42.0 to use incremental search.");
|
||||
node->under->SetEnd(end.history, end.score);
|
||||
}
|
||||
|
||||
#endif // BOOST_VERSION
|
||||
|
||||
void VertexGenerator::FinishedSearch() {
|
||||
Trie root;
|
||||
root.under = &gen_.root_;
|
||||
for (Existing::const_iterator i(existing_.begin()); i != existing_.end(); ++i) {
|
||||
AddHypothesis(context_, root, i->second);
|
||||
}
|
||||
root.under->SortAndSet(context_, NULL);
|
||||
}
|
||||
|
||||
} // namespace search
|
||||
|
@ -2,9 +2,11 @@
|
||||
#define SEARCH_VERTEX_GENERATOR__
|
||||
|
||||
#include "search/edge.hh"
|
||||
#include "search/types.hh"
|
||||
#include "search/vertex.hh"
|
||||
|
||||
#include <boost/unordered_map.hpp>
|
||||
#include <boost/version.hpp>
|
||||
|
||||
namespace lm {
|
||||
namespace ngram {
|
||||
@ -15,21 +17,44 @@ class ChartState;
|
||||
namespace search {
|
||||
|
||||
class ContextBase;
|
||||
class Final;
|
||||
|
||||
class VertexGenerator {
|
||||
#if BOOST_VERSION > 104200
|
||||
// Parallel structure to VertexNode.
|
||||
struct Trie {
|
||||
Trie() : under(NULL) {}
|
||||
|
||||
VertexNode *under;
|
||||
boost::unordered_map<uint64_t, Trie> extend;
|
||||
};
|
||||
|
||||
void AddHypothesis(ContextBase &context, Trie &root, const NBestComplete &end);
|
||||
|
||||
#endif // BOOST_VERSION
|
||||
|
||||
// Output makes the single-best or n-best list.
|
||||
template <class Output> class VertexGenerator {
|
||||
public:
|
||||
VertexGenerator(ContextBase &context, Vertex &gen);
|
||||
|
||||
void NewHypothesis(PartialEdge partial) {
|
||||
const lm::ngram::ChartState &state = partial.CompletedState();
|
||||
std::pair<Existing::iterator, bool> ret(existing_.insert(std::make_pair(hash_value(state), partial)));
|
||||
if (!ret.second && ret.first->second < partial) {
|
||||
ret.first->second = partial;
|
||||
}
|
||||
VertexGenerator(ContextBase &context, Vertex &gen, Output &nbest) : context_(context), gen_(gen), nbest_(nbest) {
|
||||
gen.root_.InitRoot();
|
||||
}
|
||||
|
||||
void FinishedSearch();
|
||||
void NewHypothesis(PartialEdge partial) {
|
||||
nbest_.Add(existing_[hash_value(partial.CompletedState())], partial);
|
||||
}
|
||||
|
||||
void FinishedSearch() {
|
||||
#if BOOST_VERSION > 104200
|
||||
Trie root;
|
||||
root.under = &gen_.root_;
|
||||
for (typename Existing::iterator i(existing_.begin()); i != existing_.end(); ++i) {
|
||||
AddHypothesis(context_, root, nbest_.Complete(i->second));
|
||||
}
|
||||
existing_.clear();
|
||||
root.under->SortAndSet(context_);
|
||||
#else
|
||||
UTIL_THROW(util::Exception, "Upgrade Boost to >= 1.42.0 to use incremental search.");
|
||||
#endif
|
||||
}
|
||||
|
||||
const Vertex &Generating() const { return gen_; }
|
||||
|
||||
@ -38,8 +63,35 @@ class VertexGenerator {
|
||||
|
||||
Vertex &gen_;
|
||||
|
||||
typedef boost::unordered_map<uint64_t, PartialEdge> Existing;
|
||||
typedef boost::unordered_map<uint64_t, typename Output::Combine> Existing;
|
||||
Existing existing_;
|
||||
|
||||
Output &nbest_;
|
||||
};
|
||||
|
||||
// Special case for root vertex: everything should come together into the root
|
||||
// node. In theory, this should happen naturally due to state collapsing with
|
||||
// <s> and </s>. If that's the case, VertexGenerator is fine, though it will
|
||||
// make one connection.
|
||||
template <class Output> class RootVertexGenerator {
|
||||
public:
|
||||
RootVertexGenerator(Vertex &gen, Output &out) : gen_(gen), out_(out) {}
|
||||
|
||||
void NewHypothesis(PartialEdge partial) {
|
||||
out_.Add(combine_, partial);
|
||||
}
|
||||
|
||||
void FinishedSearch() {
|
||||
gen_.root_.InitRoot();
|
||||
NBestComplete completed(out_.Complete(combine_));
|
||||
gen_.root_.SetEnd(completed.history, completed.score);
|
||||
}
|
||||
|
||||
private:
|
||||
Vertex &gen_;
|
||||
|
||||
typename Output::Combine combine_;
|
||||
Output &out_;
|
||||
};
|
||||
|
||||
} // namespace search
|
||||
|
@ -1,71 +0,0 @@
|
||||
#include "search/weights.hh"
|
||||
#include "util/tokenize_piece.hh"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
namespace search {
|
||||
|
||||
namespace {
|
||||
struct Insert {
|
||||
void operator()(boost::unordered_map<std::string, search::Score> &map, StringPiece name, search::Score score) const {
|
||||
std::string copy(name.data(), name.size());
|
||||
map[copy] = score;
|
||||
}
|
||||
};
|
||||
|
||||
struct DotProduct {
|
||||
search::Score total;
|
||||
DotProduct() : total(0.0) {}
|
||||
|
||||
void operator()(const boost::unordered_map<std::string, search::Score> &map, StringPiece name, search::Score score) {
|
||||
boost::unordered_map<std::string, search::Score>::const_iterator i(FindStringPiece(map, name));
|
||||
if (i != map.end())
|
||||
total += score * i->second;
|
||||
}
|
||||
};
|
||||
|
||||
template <class Map, class Op> void Parse(StringPiece text, Map &map, Op &op) {
|
||||
for (util::TokenIter<util::SingleCharacter, true> spaces(text, ' '); spaces; ++spaces) {
|
||||
util::TokenIter<util::SingleCharacter> equals(*spaces, '=');
|
||||
UTIL_THROW_IF(!equals, WeightParseException, "Bad weight token " << *spaces);
|
||||
StringPiece name(*equals);
|
||||
UTIL_THROW_IF(!++equals, WeightParseException, "Bad weight token " << *spaces);
|
||||
char *end;
|
||||
// Assumes proper termination.
|
||||
double value = std::strtod(equals->data(), &end);
|
||||
UTIL_THROW_IF(end != equals->data() + equals->size(), WeightParseException, "Failed to parse weight" << *equals);
|
||||
UTIL_THROW_IF(++equals, WeightParseException, "Too many equals in " << *spaces);
|
||||
op(map, name, value);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
Weights::Weights(StringPiece text) {
|
||||
Insert op;
|
||||
Parse<Map, Insert>(text, map_, op);
|
||||
lm_ = Steal("LanguageModel");
|
||||
oov_ = Steal("OOV");
|
||||
word_penalty_ = Steal("WordPenalty");
|
||||
}
|
||||
|
||||
Weights::Weights(Score lm, Score oov, Score word_penalty) : lm_(lm), oov_(oov), word_penalty_(word_penalty) {}
|
||||
|
||||
search::Score Weights::DotNoLM(StringPiece text) const {
|
||||
DotProduct dot;
|
||||
Parse<const Map, DotProduct>(text, map_, dot);
|
||||
return dot.total;
|
||||
}
|
||||
|
||||
float Weights::Steal(const std::string &str) {
|
||||
Map::iterator i(map_.find(str));
|
||||
if (i == map_.end()) {
|
||||
return 0.0;
|
||||
} else {
|
||||
float ret = i->second;
|
||||
map_.erase(i);
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace search
|
@ -1,52 +0,0 @@
|
||||
// For now, the individual features are not kept.
|
||||
#ifndef SEARCH_WEIGHTS__
|
||||
#define SEARCH_WEIGHTS__
|
||||
|
||||
#include "search/types.hh"
|
||||
#include "util/exception.hh"
|
||||
#include "util/string_piece.hh"
|
||||
|
||||
#include <boost/unordered_map.hpp>
|
||||
|
||||
#include <string>
|
||||
|
||||
namespace search {
|
||||
|
||||
class WeightParseException : public util::Exception {
|
||||
public:
|
||||
WeightParseException() {}
|
||||
~WeightParseException() throw() {}
|
||||
};
|
||||
|
||||
class Weights {
|
||||
public:
|
||||
// Parses weights, sets lm_weight_, removes it from map_.
|
||||
explicit Weights(StringPiece text);
|
||||
|
||||
// Just the three scores we care about adding.
|
||||
Weights(Score lm, Score oov, Score word_penalty);
|
||||
|
||||
Score DotNoLM(StringPiece text) const;
|
||||
|
||||
Score LM() const { return lm_; }
|
||||
|
||||
Score OOV() const { return oov_; }
|
||||
|
||||
Score WordPenalty() const { return word_penalty_; }
|
||||
|
||||
// Mostly for testing.
|
||||
const boost::unordered_map<std::string, Score> &GetMap() const { return map_; }
|
||||
|
||||
private:
|
||||
float Steal(const std::string &str);
|
||||
|
||||
typedef boost::unordered_map<std::string, Score> Map;
|
||||
|
||||
Map map_;
|
||||
|
||||
Score lm_, oov_, word_penalty_;
|
||||
};
|
||||
|
||||
} // namespace search
|
||||
|
||||
#endif // SEARCH_WEIGHTS__
|
@ -1,38 +0,0 @@
|
||||
#include "search/weights.hh"
|
||||
|
||||
#define BOOST_TEST_MODULE WeightTest
|
||||
#include <boost/test/unit_test.hpp>
|
||||
#include <boost/test/floating_point_comparison.hpp>
|
||||
|
||||
namespace search {
|
||||
namespace {
|
||||
|
||||
#define CHECK_WEIGHT(value, string) \
|
||||
i = parsed.find(string); \
|
||||
BOOST_REQUIRE(i != parsed.end()); \
|
||||
BOOST_CHECK_CLOSE((value), i->second, 0.001);
|
||||
|
||||
BOOST_AUTO_TEST_CASE(parse) {
|
||||
// These are not real feature weights.
|
||||
Weights w("rarity=0 phrase-SGT=0 phrase-TGS=9.45117 lhsGrhs=0 lexical-SGT=2.33833 lexical-TGS=-28.3317 abstract?=0 LanguageModel=3 lexical?=1 glue?=5");
|
||||
const boost::unordered_map<std::string, search::Score> &parsed = w.GetMap();
|
||||
boost::unordered_map<std::string, search::Score>::const_iterator i;
|
||||
CHECK_WEIGHT(0.0, "rarity");
|
||||
CHECK_WEIGHT(0.0, "phrase-SGT");
|
||||
CHECK_WEIGHT(9.45117, "phrase-TGS");
|
||||
CHECK_WEIGHT(2.33833, "lexical-SGT");
|
||||
BOOST_CHECK(parsed.end() == parsed.find("lm"));
|
||||
BOOST_CHECK_CLOSE(3.0, w.LM(), 0.001);
|
||||
CHECK_WEIGHT(-28.3317, "lexical-TGS");
|
||||
CHECK_WEIGHT(5.0, "glue?");
|
||||
}
|
||||
|
||||
BOOST_AUTO_TEST_CASE(dot) {
|
||||
Weights w("rarity=0 phrase-SGT=0 phrase-TGS=9.45117 lhsGrhs=0 lexical-SGT=2.33833 lexical-TGS=-28.3317 abstract?=0 LanguageModel=3 lexical?=1 glue?=5");
|
||||
BOOST_CHECK_CLOSE(9.45117 * 3.0, w.DotNoLM("phrase-TGS=3.0"), 0.001);
|
||||
BOOST_CHECK_CLOSE(9.45117 * 3.0, w.DotNoLM("phrase-TGS=3.0 LanguageModel=10"), 0.001);
|
||||
BOOST_CHECK_CLOSE(9.45117 * 3.0 + 28.3317 * 17.4, w.DotNoLM("rarity=5 phrase-TGS=3.0 LanguageModel=10 lexical-TGS=-17.4"), 0.001);
|
||||
}
|
||||
|
||||
} // namespace
|
||||
} // namespace search
|
Loading…
Reference in New Issue
Block a user