moses/src for DPR_reordering model

Add files:
DPR_reordering.h/cpp
Modified files:
StaticData.h/cpp
Parameter.h/cpp

git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/branches/ni_DPR_reordering_model@2973 1f5c12ca-751b-0410-a591-d2e778427230
This commit is contained in:
skyload 2010-03-14 21:55:52 +00:00
parent 88ead91273
commit 392aa0a89f
184 changed files with 26761 additions and 0 deletions

499
src/BitmapContainer.cpp Normal file
View File

@ -0,0 +1,499 @@
// $Id: BitmapContainer.cpp 2477 2009-08-07 16:47:54Z bhaddow $
// vim:tabstop=2
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2006 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#include <algorithm>
#include <limits>
#include <utility>
#include "BitmapContainer.h"
#include "HypothesisStackCubePruning.h"
#include "DummyScoreProducers.h"
#include "TranslationOptionList.h"
namespace Moses
{
// Strict weak ordering on hypothesis pointers: higher raw score sorts
// first; exact ties fall back to pointer identity so the order stays
// deterministic within a run.
class HypothesisScoreOrdererNoDistortion
{
public:
  bool operator()(const Hypothesis* hypoA, const Hypothesis* hypoB) const
  {
    const float scoreA = hypoA->GetScore();
    const float scoreB = hypoB->GetScore();
    if (scoreA < scoreB)
    {
      return false;
    }
    if (scoreB < scoreA)
    {
      return true;
    }
    return hypoA < hypoB;
  }
};
class HypothesisScoreOrdererWithDistortion
{
public:
HypothesisScoreOrdererWithDistortion(const WordsRange* transOptRange) :
m_transOptRange(transOptRange) {}
const WordsRange* m_transOptRange;
bool operator()(const Hypothesis* hypoA, const Hypothesis* hypoB) const
{
assert (m_transOptRange != NULL);
const float weightDistortion = StaticData::Instance().GetWeightDistortion();
const DistortionScoreProducer *dsp = StaticData::Instance().GetDistortionScoreProducer();
const float distortionScoreA = dsp->CalculateDistortionScore(
*hypoA,
hypoA->GetCurrSourceWordsRange(),
*m_transOptRange,
hypoA->GetWordsBitmap().GetFirstGapPos()
);
const float distortionScoreB = dsp->CalculateDistortionScore(
*hypoB,
hypoB->GetCurrSourceWordsRange(),
*m_transOptRange,
hypoB->GetWordsBitmap().GetFirstGapPos()
);
const float scoreA = hypoA->GetScore() + distortionScoreA * weightDistortion;
const float scoreB = hypoB->GetScore() + distortionScoreB * weightDistortion;
if (scoreA > scoreB)
{
return true;
}
else if (scoreA < scoreB)
{
return false;
}
else
{
return hypoA < hypoB;
}
}
};
////////////////////////////////////////////////////////////////////////////////
// BackwardsEdge Code
////////////////////////////////////////////////////////////////////////////////
// Build an edge for cube pruning: collect the hypotheses of the previous
// container that may legally be extended by this edge's translation options
// (w.r.t. the distortion limit) and order them best-first under the
// distortion-aware comparison.
BackwardsEdge::BackwardsEdge(const BitmapContainer &prevBitmapContainer
  , BitmapContainer &parent
  , const TranslationOptionList &translations
  , const SquareMatrix &futureScore,
  const InputType& itype)
  : m_initialized(false)
  , m_prevBitmapContainer(prevBitmapContainer)
  , m_parent(parent)
  , m_translations(translations)
  , m_futurescore(futureScore)
  , m_seenPosition()
{
  // If either dimension is empty, we haven't got anything to do.
  if(m_prevBitmapContainer.GetHypotheses().size() == 0 || m_translations.size() == 0) {
    VERBOSE(3, "Empty cube on BackwardsEdge" << std::endl);
    return;
  }
  // Fetch the things we need for distortion cost computation.
  int maxDistortion = StaticData::Instance().GetMaxDistortion();
  // -1 means "no distortion limit": every hypothesis qualifies, so copy
  // them in their existing order and skip the filtering/re-sorting below.
  if (maxDistortion == -1) {
    for (HypothesisSet::const_iterator iter = m_prevBitmapContainer.GetHypotheses().begin(); iter != m_prevBitmapContainer.GetHypotheses().end(); ++iter)
    {
      m_hypotheses.push_back(*iter);
    }
    return;
  }
  // All options of this edge share one source span, so the distortion test
  // can use the first option's range for every candidate.
  const WordsRange &transOptRange = translations.Get(0)->GetSourceWordsRange();
  HypothesisSet::const_iterator iterHypo = m_prevBitmapContainer.GetHypotheses().begin();
  HypothesisSet::const_iterator iterEnd = m_prevBitmapContainer.GetHypotheses().end();
  while (iterHypo != iterEnd)
  {
    const Hypothesis &hypo = **iterHypo;
    // Special case: If this is the first hypothesis used to seed the search,
    // it doesn't have a valid range, and we create the hypothesis, if the
    // initial position is not further into the sentence than the distortion limit.
    if (hypo.GetWordsBitmap().GetNumWordsCovered() == 0)
    {
      if (transOptRange.GetStartPos() <= maxDistortion)
        m_hypotheses.push_back(&hypo);
    }
    else
    {
      // Only keep hypotheses whose jump to this edge's span respects the
      // distortion limit.
      int distortionDistance = itype.ComputeDistortionDistance(hypo.GetCurrSourceWordsRange()
        , transOptRange);
      if (distortionDistance <= maxDistortion)
        m_hypotheses.push_back(&hypo);
    }
    ++iterHypo;
  }
  // Debug-build sanity checks: both inputs are expected to arrive sorted
  // best-first (translations by future score, hypotheses by total score).
  if (m_translations.size() > 1)
  {
    assert(m_translations.Get(0)->GetFutureScore() >= m_translations.Get(1)->GetFutureScore());
  }
  if (m_hypotheses.size() > 1)
  {
    assert(m_hypotheses[0]->GetTotalScore() >= m_hypotheses[1]->GetTotalScore());
  }
  // Re-rank the surviving hypotheses with the distortion cost of jumping to
  // this edge's span folded in, so the cube expands them best-first under
  // the distortion-aware ordering.
  HypothesisScoreOrdererWithDistortion orderer (&transOptRange);
  std::sort(m_hypotheses.begin(), m_hypotheses.end(), orderer);
  // std::sort(m_hypotheses.begin(), m_hypotheses.end(), HypothesisScoreOrdererNoDistortion());
}
BackwardsEdge::~BackwardsEdge()
{
  // The edge does not own the hypotheses it references; only the
  // bookkeeping containers are emptied here.
  m_hypotheses.clear();
  m_seenPosition.clear();
}
void
BackwardsEdge::Initialize()
{
if(m_hypotheses.size() == 0 || m_translations.size() == 0)
{
m_initialized = true;
return;
}
Hypothesis *expanded = CreateHypothesis(*m_hypotheses[0], *m_translations.Get(0));
m_parent.Enqueue(0, 0, expanded, this);
SetSeenPosition(0, 0);
m_initialized = true;
}
// Build the successor of `hypothesis` extended by `transOpt`, chaining on
// any translation options linked to transOpt.  Returns NULL (after deleting
// the partial hypothesis) when a linked option's span is already covered,
// because a hypothesis must apply all of a linked set or none of it.
// Callers must handle the NULL return.
Hypothesis *BackwardsEdge::CreateHypothesis(const Hypothesis &hypothesis, const TranslationOption &transOpt)
{
  // create hypothesis and calculate all its scores
  Hypothesis *newHypo = hypothesis.CreateNext(transOpt, NULL); // TODO FIXME This is absolutely broken - don't pass null here
  // expand hypothesis further if transOpt was linked
  std::vector<TranslationOption*>::const_iterator iterLinked = transOpt.GetLinkedTransOpts().begin();
  std::vector<TranslationOption*>::const_iterator iterEnd = transOpt.GetLinkedTransOpts().end();
  while (iterLinked != iterEnd)
  {
    const WordsBitmap hypoBitmap = newHypo->GetWordsBitmap();
    if (hypoBitmap.Overlap((**iterLinked).GetSourceWordsRange())) {
      // don't want to add a hypothesis that has some but not all of a linked TO set, so return
      delete newHypo;
      return NULL;
    }
    else
    {
      // Score the intermediate hypothesis before chaining the next linked
      // option onto it.
      // NOTE(review): the intermediate newHypo is not deleted after
      // CreateNext replaces it -- presumably the chain keeps a back
      // pointer that is cleaned up elsewhere; confirm.
      newHypo->CalcScore(m_futurescore);
      newHypo = newHypo->CreateNext(**iterLinked, NULL); // TODO FIXME This is absolutely broken - don't pass null here
    }
    ++iterLinked;
  }
  newHypo->CalcScore(m_futurescore);
  return newHypo;
}
// Has the cube cell (x, y) already been visited?  Coordinates are packed
// into a single integer key with x in the high bits.
bool
BackwardsEdge::SeenPosition(const size_t x, const size_t y)
{
  return m_seenPosition.count((x<<16) + y) > 0;
}
// Mark the cube cell (x, y) as visited.  The coordinate is packed into one
// int key as (x<<16)+y, so each component must fit in 16 bits.  The previous
// bound of (1<<17) admitted values whose packed keys collide (y >= 2^16
// bleeds into the x bits) and whose shifted x overflows a signed int.
void
BackwardsEdge::SetSeenPosition(const size_t x, const size_t y)
{
  assert(x < (1<<16));
  assert(y < (1<<16));
  m_seenPosition.insert((x<<16) + y);
}
// Whether Initialize() has already seeded this edge.
bool BackwardsEdge::GetInitialized()
{
  return m_initialized;
}
// The container whose hypotheses this edge expands from.
const BitmapContainer& BackwardsEdge::GetBitmapContainer() const
{
  return m_prevBitmapContainer;
}
void
BackwardsEdge::PushSuccessors(const size_t x, const size_t y)
{
Hypothesis *newHypo;
if(y + 1 < m_translations.size() && !SeenPosition(x, y + 1)) {
SetSeenPosition(x, y + 1);
newHypo = CreateHypothesis(*m_hypotheses[x], *m_translations.Get(y + 1));
if(newHypo != NULL)
{
m_parent.Enqueue(x, y + 1, newHypo, (BackwardsEdge*)this);
}
}
if(x + 1 < m_hypotheses.size() && !SeenPosition(x + 1, y)) {
SetSeenPosition(x + 1, y);
newHypo = CreateHypothesis(*m_hypotheses[x + 1], *m_translations.Get(y));
if(newHypo != NULL)
{
m_parent.Enqueue(x + 1, y, newHypo, (BackwardsEdge*)this);
}
}
}
////////////////////////////////////////////////////////////////////////////////
// BitmapContainer Code
////////////////////////////////////////////////////////////////////////////////
// Create a container for the given coverage bitmap, attached to the stack
// that will receive its hypotheses.
BitmapContainer::BitmapContainer(const WordsBitmap &bitmap
  , HypothesisStackCubePruning &stack)
  : m_bitmap(bitmap)
  , m_stack(stack)
  , m_numStackInsertions(0)
{
  // m_hypotheses, m_edges and m_queue are default-constructed empty; the
  // previous explicit re-assignments of fresh temporaries were redundant.
}
// Tear down the container: free every expansion still waiting in the queue
// (both the wrapper item and its hypothesis, which were never handed to the
// stack), then delete all backwards edges.
BitmapContainer::~BitmapContainer()
{
  // As we have created the square position objects we clean up now.
  HypothesisQueueItem *item = NULL;
  while (!m_queue.empty())
  {
    item = m_queue.top();
    FREEHYPO(item->GetHypothesis());
    delete item;
    m_queue.pop();
  }
  // Delete all edges.
  RemoveAllInColl(m_edges);
  // NOTE(review): entries of m_hypotheses are only cleared, not freed --
  // presumably owned by the hypothesis stack after AddPrune; confirm.
  m_hypotheses.clear();
  m_edges.clear();
}
void
BitmapContainer::Enqueue(int hypothesis_pos
, int translation_pos
, Hypothesis *hypothesis
, BackwardsEdge *edge)
{
HypothesisQueueItem *item = new HypothesisQueueItem(hypothesis_pos
, translation_pos
, hypothesis
, edge);
m_queue.push(item);
}
// Return the best queued item, or NULL when the queue is empty.  With
// keepValue == true the item is only peeked at and remains queued.
HypothesisQueueItem*
BitmapContainer::Dequeue(bool keepValue)
{
  if (m_queue.empty())
  {
    return NULL;
  }
  HypothesisQueueItem *best = m_queue.top();
  if (!keepValue)
  {
    m_queue.pop();
  }
  return best;
}
// Peek at the best queued item; the queue must not be empty.
HypothesisQueueItem* BitmapContainer::Top() const
{
  return m_queue.top();
}
// Number of expansions currently waiting in the queue.
size_t BitmapContainer::Size()
{
  return m_queue.size();
}
// Whether the expansion queue holds no items.
bool BitmapContainer::Empty() const
{
  return m_queue.empty();
}
// The source-coverage bitmap this container represents.
const WordsBitmap& BitmapContainer::GetWordsBitmap()
{
  return m_bitmap;
}
// Read-only access to the stored hypotheses.
const HypothesisSet& BitmapContainer::GetHypotheses() const
{
  return m_hypotheses;
}
// Number of hypotheses stored in this container.
size_t BitmapContainer::GetHypothesesSize() const
{
  return m_hypotheses.size();
}
// Read-only access to the set of incoming backwards edges.
const BackwardsEdgeSet& BitmapContainer::GetBackwardsEdges()
{
  return m_edges;
}
// Store a hypothesis in this container.  In debug builds, assert that it
// is not already present; the previous hand-rolled existence loop executed
// unconditionally even in release builds where the assert is compiled out,
// so the duplicate scan now lives inside the assert itself.
void
BitmapContainer::AddHypothesis(Hypothesis *hypothesis)
{
  // cfedermann: do we actually need this check?
  assert(std::find(m_hypotheses.begin(), m_hypotheses.end(), hypothesis)
         == m_hypotheses.end());
  m_hypotheses.push_back(hypothesis);
}
// Register an incoming edge; the container takes ownership and deletes
// all registered edges in its destructor.
void BitmapContainer::AddBackwardsEdge(BackwardsEdge *edge)
{
  m_edges.insert(edge);
}
void
BitmapContainer::InitializeEdges()
{
BackwardsEdgeSet::iterator iter = m_edges.begin();
BackwardsEdgeSet::iterator iterEnd = m_edges.end();
while (iter != iterEnd)
{
BackwardsEdge *edge = *iter;
edge->Initialize();
++iter;
}
}
// Pop best expansions until this container has contributed at least
// minNumHyps new entries to the stack, or its queue runs dry.
void BitmapContainer::EnsureMinStackHyps(const size_t minNumHyps)
{
  while (m_numStackInsertions < minNumHyps && !Empty())
  {
    ProcessBestHypothesis();
  }
}
// Pop the best queued expansion, hand its hypothesis to the stack, and
// enqueue the expansion's two cube-pruning successors.  No-op when the
// queue is empty.
void
BitmapContainer::ProcessBestHypothesis()
{
  if (m_queue.empty())
  {
    return;
  }
  // Get the currently best hypothesis from the queue.
  HypothesisQueueItem *item = Dequeue();
  // If the priority queue is exhausted, we are done and should have exited
  assert(item != NULL);
  // check we are pulling things off of priority queue in right order
  if (!Empty())
  {
    // Dequeue(true) only peeks: the runner-up stays in the queue.
    HypothesisQueueItem *check = Dequeue(true);
    assert(item->GetHypothesis()->GetTotalScore() >= check->GetHypothesis()->GetTotalScore());
  }
  // Logging for the criminally insane
  IFVERBOSE(3) {
    // const StaticData &staticData = StaticData::Instance();
    item->GetHypothesis()->PrintHypothesis();
  }
  // Add best hypothesis to hypothesis stack.  Only count insertions that
  // actually created a new stack entry (AddPrune may recombine/discard).
  const bool newstackentry = m_stack.AddPrune(item->GetHypothesis());
  if (newstackentry)
    m_numStackInsertions++;
  IFVERBOSE(3) {
    TRACE_ERR("new stack entry flag is " << newstackentry << std::endl);
  }
  // Create new hypotheses for the two successors of the hypothesis just added.
  item->GetBackwardsEdge()->PushSuccessors(item->GetHypothesisPos(), item->GetTranslationPos());
  // We are done with the queue item, we delete it.
  delete item;
}
void
BitmapContainer::SortHypotheses()
{
std::sort(m_hypotheses.begin(), m_hypotheses.end(), HypothesisScoreOrderer());
}
}

249
src/BitmapContainer.h Normal file
View File

@ -0,0 +1,249 @@
// $Id: BitmapContainer.h 2939 2010-02-24 11:15:44Z jfouet $
// vim:tabstop=2
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2006 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#ifndef moses_BitmapContainer_h
#define moses_BitmapContainer_h
#include <set>
#include <vector>
#include "Hypothesis.h"
#include "HypothesisStackCubePruning.h"
#include "SquareMatrix.h"
#include "TranslationOption.h"
#include "TypeDef.h"
#include "WordsBitmap.h"
namespace Moses
{
class BitmapContainer;
class BackwardsEdge;
class Hypothesis;
class HypothesisStackCubePruning;
class HypothesisQueueItem;
class QueueItemOrderer;
typedef std::vector< Hypothesis* > HypothesisSet;
typedef std::set< BackwardsEdge* > BackwardsEdgeSet;
typedef std::priority_queue< HypothesisQueueItem*, std::vector< HypothesisQueueItem* >, QueueItemOrderer> HypothesisQueue;
////////////////////////////////////////////////////////////////////////////////
// Hypothesis Priority Queue Code
////////////////////////////////////////////////////////////////////////////////
// One cell of the cube-pruning grid: an expanded hypothesis together with
// the (hypothesis index, translation index) coordinates it was created from
// and the edge that produced it.
class HypothesisQueueItem
{
private:
  size_t m_hypothesis_pos, m_translation_pos;
  Hypothesis *m_hypothesis;
  BackwardsEdge *m_edge;
  // An item without coordinates is meaningless, so no default construction.
  HypothesisQueueItem();
public:
  HypothesisQueueItem(const size_t hypothesis_pos
    , const size_t translation_pos
    , Hypothesis *hypothesis
    , BackwardsEdge *edge)
    : m_hypothesis_pos(hypothesis_pos)
    , m_translation_pos(translation_pos)
    , m_hypothesis(hypothesis)
    , m_edge(edge)
  {
  }
  ~HypothesisQueueItem()
  {
  }
  // Return size_t (the stored type); the previous int return type silently
  // narrowed the position on 64-bit builds.
  size_t GetHypothesisPos()
  {
    return m_hypothesis_pos;
  }
  size_t GetTranslationPos()
  {
    return m_translation_pos;
  }
  Hypothesis *GetHypothesis()
  {
    return m_hypothesis;
  }
  BackwardsEdge *GetBackwardsEdge()
  {
    return m_edge;
  }
};
// Allows to compare two HypothesisQueueItem objects by the corresponding scores.
// Compares two queue items by the total score of their hypotheses so that
// std::priority_queue surfaces the highest-scoring item first.  Items with
// equal scores compare as equivalent (no identity tie-break), which is
// sufficient for a priority queue.  The dead commented-out three-way
// comparison has been removed.
class QueueItemOrderer
{
public:
  bool operator()(HypothesisQueueItem* itemA, HypothesisQueueItem* itemB) const
  {
    const float scoreA = itemA->GetHypothesis()->GetTotalScore();
    const float scoreB = itemB->GetHypothesis()->GetTotalScore();
    return (scoreA < scoreB);
  }
};
////////////////////////////////////////////////////////////////////////////////
// Hypothesis Orderer Code
////////////////////////////////////////////////////////////////////////////////
// Allows to compare two Hypothesis objects by the corresponding scores.
////////////////////////////////////////////////////////////////////////////////
// Orders hypotheses by descending total score (for sorting containers);
// equal-scored hypotheses compare as equivalent.  The dead commented-out
// three-way comparison has been removed.
class HypothesisScoreOrderer
{
public:
  bool operator()(const Hypothesis* hypoA, const Hypothesis* hypoB) const
  {
    const float scoreA = hypoA->GetTotalScore();
    const float scoreB = hypoB->GetTotalScore();
    return (scoreA > scoreB);
  }
};
////////////////////////////////////////////////////////////////////////////////
// Backwards Edge Code
////////////////////////////////////////////////////////////////////////////////
// Encodes an edge pointing to a BitmapContainer.
////////////////////////////////////////////////////////////////////////////////
// Encodes an edge pointing to a BitmapContainer: the Cartesian grid of
// (previous hypotheses x translation options) that cube pruning explores.
class BackwardsEdge
{
private:
  friend class BitmapContainer;
  bool m_initialized;                             // true once Initialize() has seeded the queue
  const BitmapContainer &m_prevBitmapContainer;   // container supplying expandable hypotheses
  BitmapContainer &m_parent;                      // container receiving the expansions
  const TranslationOptionList &m_translations;    // translation options applied along this edge
  const SquareMatrix &m_futurescore;              // future-cost estimates used when scoring
  std::vector< const Hypothesis* > m_hypotheses;  // distortion-filtered, best-first candidates
  std::set< int > m_seenPosition;                 // visited grid cells, packed as (x<<16)+y
  // We don't want to instantiate "empty" objects.
  BackwardsEdge();
  Hypothesis *CreateHypothesis(const Hypothesis &hypothesis, const TranslationOption &transOpt);
  bool SeenPosition(const size_t x, const size_t y);
  void SetSeenPosition(const size_t x, const size_t y);
protected:
  void Initialize();
public:
  BackwardsEdge(const BitmapContainer &prevBitmapContainer
    , BitmapContainer &parent
    , const TranslationOptionList &translations
    , const SquareMatrix &futureScore,
    const InputType& source);
  ~BackwardsEdge();
  bool GetInitialized();
  const BitmapContainer &GetBitmapContainer() const;
  // NOTE(review): declared but no definition appears in BitmapContainer.cpp;
  // any call would fail at link time -- confirm before relying on it.
  int GetDistortionPenalty();
  void PushSuccessors(const size_t x, const size_t y);
};
////////////////////////////////////////////////////////////////////////////////
// Bitmap Container Code
////////////////////////////////////////////////////////////////////////////////
// A BitmapContainer encodes an ordered set of hypotheses and a set of edges
// pointing to the "generating" BitmapContainers. It also stores a priority
// queue that contains expanded hypotheses from the connected edges.
////////////////////////////////////////////////////////////////////////////////
// A BitmapContainer encodes an ordered set of hypotheses sharing one
// coverage bitmap, the set of backwards edges that generate them, and a
// priority queue of expansions produced by those edges.
class BitmapContainer
{
private:
  WordsBitmap m_bitmap;                  // the coverage this container stands for
  HypothesisStackCubePruning &m_stack;   // stack receiving processed hypotheses
  HypothesisSet m_hypotheses;            // hypotheses stored for this coverage
  BackwardsEdgeSet m_edges;              // owned; deleted in the destructor
  HypothesisQueue m_queue;               // pending expansions, best-first
  size_t m_numStackInsertions;           // how many new stack entries we produced
  // We always require a corresponding bitmap to be supplied.
  BitmapContainer();
  BitmapContainer(const BitmapContainer &);
public:
  BitmapContainer(const WordsBitmap &bitmap
    , HypothesisStackCubePruning &stack);
  // The destructor will also delete all the edges that are
  // connected to this BitmapContainer.
  ~BitmapContainer();
  void Enqueue(int hypothesis_pos, int translation_pos, Hypothesis *hypothesis, BackwardsEdge *edge);
  // Pop (or, with keepValue, peek at) the best queued item; NULL if empty.
  HypothesisQueueItem *Dequeue(bool keepValue=false);
  HypothesisQueueItem *Top() const;
  size_t Size();
  bool Empty() const;
  const WordsBitmap &GetWordsBitmap();
  const HypothesisSet &GetHypotheses() const;
  size_t GetHypothesesSize() const;
  const BackwardsEdgeSet &GetBackwardsEdges();
  // Seed the queue from every edge's best corner cell.
  void InitializeEdges();
  // Pop the best expansion, add it to the stack, enqueue its successors.
  void ProcessBestHypothesis();
  // Keep processing until minNumHyps stack insertions were made (or empty).
  void EnsureMinStackHyps(const size_t minNumHyps);
  void AddHypothesis(Hypothesis *hypothesis);
  void AddBackwardsEdge(BackwardsEdge *edge);
  void SortHypotheses();
};
}
#endif

245
src/ConfusionNet.cpp Normal file
View File

@ -0,0 +1,245 @@
// $Id: ConfusionNet.cpp 2935 2010-02-24 10:30:24Z jfouet $
#include "ConfusionNet.h"
#include <sstream>
#include "FactorCollection.h"
#include "Util.h"
#include "PhraseDictionaryTreeAdaptor.h"
#include "TranslationOptionCollectionConfusionNet.h"
#include "StaticData.h"
#include "Sentence.h"
#include "UserMessage.h"
namespace Moses
{
// Process-wide bookkeeping for confusion-net usage; the collected counters
// are printed to stderr from the destructor, i.e. at program exit.
struct CNStats {
  size_t created,destr,read,colls,words;
  CNStats() : created(0),destr(0),read(0),colls(0),words(0) {}
  ~CNStats() {print(std::cerr);}
  // Count one ConfusionNet construction / destruction.
  void createOne() {++created;}
  void destroyOne() {++destr;}
  // Record one successfully read net: its column count and total word count.
  void collect(const ConfusionNet& cn)
  {
    ++read;
    colls+=cn.GetSize();
    for(size_t i=0;i<cn.GetSize();++i)
      words+=cn[i].size();
  }
  void print(std::ostream& out) const
  {
    // NOTE(review): guarded only by created>0; colls or read may still be 0
    // here, which makes the average lines print inf/nan.
    if(created>0)
    {
      out<<"confusion net statistics:\n"
        " created:\t"<<created<<"\n"
        " destroyed:\t"<<destr<<"\n"
        " succ. read:\t"<<read<<"\n"
        " columns:\t"<<colls<<"\n"
        " words:\t"<<words<<"\n"
        " avg. word/column:\t"<<words/(1.0*colls)<<"\n"
        " avg. cols/sent:\t"<<colls/(1.0*read)<<"\n"
        "\n\n";
    }
  }
};
// NOTE(review): this global has external linkage; a `stats` symbol in any
// other translation unit would clash -- consider internal linkage.
CNStats stats;
// Plain confusion nets advance exactly one column per covered position,
// so the increment is constant regardless of the cell addressed.
size_t ConfusionNet::GetColumnIncrement(size_t i, size_t j) const
{
  (void) i;
  (void) j;
  return 1;
}
// Default-construct an empty net and register the creation with the
// global statistics collector.
ConfusionNet::ConfusionNet() : InputType()
{
  stats.createOne();
}
// Register the destruction with the global statistics collector.
ConfusionNet::~ConfusionNet()
{
  stats.destroyOne();
}
// Build a degenerate confusion net from a plain sentence: one column per
// word, each holding that single word with log-probability 0.0.
ConfusionNet::ConfusionNet(Sentence const& s)
{
  data.resize(s.GetSize());
  for(size_t i=0;i<s.GetSize();++i)
    data[i].push_back(std::make_pair(s.GetWord(i),0.0));
  // Keep the global statistics consistent: the destructor unconditionally
  // counts a destruction, so this constructor must count the creation too
  // (previously only the default constructor did, skewing created/destr).
  stats.createOne();
}
// Dispatch to the reader for the given on-disk `format` (0 or 1); reports
// a user-level error and returns false for unknown formats.
bool ConfusionNet::ReadF(std::istream& in,
  const std::vector<FactorType>& factorOrder,
  int format)
{
  VERBOSE(1, "read confusion net with format "<<format<<"\n");
  switch(format)
  {
  case 0: return ReadFormat0(in,factorOrder);
  case 1: return ReadFormat1(in,factorOrder);
  default:
    // NOTE(review): unqualified `stringstream` relies on a using-directive
    // pulled in through the included headers.
    stringstream strme;
    strme << "ERROR: unknown format '"<<format
      <<"' in ConfusionNet::Read";
    UserMessage::Add(strme.str());
  }
  return false;
}
// Read one confusion net in the default format (0) and, on success, fold
// it into the global statistics.  Returns the (boolean) parse result.
int ConfusionNet::Read(std::istream& in,
  const std::vector<FactorType>& factorOrder)
{
  const int result = ReadF(in, factorOrder, 0);
  if (result)
  {
    stats.collect(*this);
  }
  return result;
}
// Parse a surface token of the form "f0|f1|..." into a Word, registering
// each factor string with the global FactorCollection.
// NOTE(review): assumes the token carries at least factorOrder.size()
// fields; a shorter token indexes factorStrVector out of range -- confirm
// inputs are validated upstream.
void ConfusionNet::String2Word(const std::string& s,Word& w,
  const std::vector<FactorType>& factorOrder)
{
  std::vector<std::string> factorStrVector = Tokenize(s, "|");
  for(size_t i=0;i<factorOrder.size();++i)
    w.SetFactor(factorOrder[i],
      FactorCollection::Instance().AddFactor(Input,factorOrder[i],
        factorStrVector[i]));
}
// Parse format 0: one line per column, each line a sequence of
// "word p_1 .. p_k" tuples with k == numLinkParams.  An empty line ends the
// net.  Returns true iff at least one column was read.
bool ConfusionNet::ReadFormat0(std::istream& in,
  const std::vector<FactorType>& factorOrder)
{
  Clear();
  std::string line;
  size_t numLinkParams = StaticData::Instance().GetNumLinkParams();
  size_t numLinkWeights = StaticData::Instance().GetNumInputScores();
  // One extra configured weight means the last feature slot carries a
  // real-word-count indicator instead of an arc score.
  bool addRealWordCount = ((numLinkParams + 1) == numLinkWeights);
  while(getline(in,line)) {
    std::istringstream is(line);
    std::string word;
    Column col;
    while(is>>word) {
      Word w;
      String2Word(word,w,factorOrder);
      std::vector<float> probs(numLinkWeights,0.0);
      for(size_t i=0;i<numLinkParams;i++) {
        double prob;
        if (!(is>>prob)) {
          TRACE_ERR("ERROR: unable to parse CN input - bad link probability, or wrong number of scores\n");
          return false;
        }
        // Clamp probabilities into [0,1] before taking logs.
        if(prob<0.0)
        {
          VERBOSE(1, "WARN: negative prob: "<<prob<<" ->set to 0.0\n");
          prob=0.0;
        }
        else if (prob>1.0)
        {
          VERBOSE(1, "WARN: prob > 1.0 : "<<prob<<" -> set to 1.0\n");
          prob=1.0;
        }
        // Store the log-probability, floored at LOWEST_SCORE since
        // log(0.0) would be -inf.
        probs[i] = (std::max(static_cast<float>(log(prob)),LOWEST_SCORE));
      }
      //store 'real' word count in last feature if we have one more weight than we do arc scores and not epsilon
      if (addRealWordCount && word!=EPSILON && word!="")
        probs[numLinkParams] = -1.0;
      col.push_back(std::make_pair(w,probs));
    }
    if(col.size()) {
      data.push_back(col);
      ShrinkToFit(data.back());
    }
    else break;
  }
  return !data.empty();
}
// Parse format 1: a header line, a line with the column count, then one
// line per column of "count word prob word prob ...".  Returns true iff at
// least one column was read.  This format carries a single score per arc.
bool ConfusionNet::ReadFormat1(std::istream& in,
  const std::vector<FactorType>& factorOrder)
{
  Clear();
  std::string line;
  if(!getline(in,line)) return 0;
  size_t s;
  if(getline(in,line)) s=atoi(line.c_str()); else return 0;
  data.resize(s);
  for(size_t i=0;i<data.size();++i) {
    if(!getline(in,line)) return 0;
    std::istringstream is(line);
    if(!(is>>s)) return 0;
    std::string word;double prob;
    data[i].resize(s);
    for(size_t j=0;j<s;++j)
      if(is>>word>>prob) {
        //TODO: we are only reading one prob from this input format, should read many... but this function is unused anyway. -JS
        // Store the score directly in slot 0.  The previous code built a
        // size-1 vector and then push_back'ed the score, leaving it at
        // index 1 while index 0 -- the slot checked and clamped below --
        // stayed 0.0.
        data[i][j].second.assign(1, (float) log(prob));
        if(data[i][j].second[0]<0) {
          VERBOSE(1, "WARN: neg costs: "<<data[i][j].second[0]<<" -> set to 0\n");
          data[i][j].second[0]=0.0;
        }
        String2Word(word,data[i][j].first,factorOrder);
      } else return 0;
  }
  return !data.empty();
}
// Dump the net, one line per column, as "(word, s1, s2, ...)" tuples.
void ConfusionNet::Print(std::ostream& out) const {
  out<<"conf net: "<<data.size()<<"\n";
  for(size_t i=0;i<data.size();++i) {
    out<<i<<" -- ";
    for(size_t j=0;j<data[i].size();++j) {
      // Print the word, then each score prefixed by ", ".  The original
      // also emitted ", " right after the word, yielding "(w, , s1...)"
      // with a doubled separator.
      out<<"("<<data[i][j].first.ToString();
      for(std::vector<float>::const_iterator scoreIterator = data[i][j].second.begin();scoreIterator<data[i][j].second.end();scoreIterator++) {
        out<<", "<<*scoreIterator;
      }
      out<<") ";
    }
    out<<"\n";
  }
  out<<"\n\n";
}
#ifdef _WIN32
// C4716 ("must return a value") is suppressed: abort() never returns.
#pragma warning(disable:4716)
#endif
// Extracting a contiguous sub-phrase is undefined for a confusion net
// (each position holds alternatives); calling this is always an error.
Phrase ConfusionNet::GetSubString(const WordsRange&) const {
  TRACE_ERR("ERROR: call to ConfusionNet::GetSubString\n");
  abort();
  //return Phrase(Input);
}
std::string ConfusionNet::GetStringRep(const vector<FactorType> factorsToPrint) const{ //not well defined yet
TRACE_ERR("ERROR: call to ConfusionNet::GeStringRep\n");
return "";
}
#ifdef _WIN32
// C4716 ("must return a value") is suppressed: abort() never returns.
#pragma warning(disable:4716)
#endif
// Random access to a single word is undefined for a confusion net (each
// position holds many alternatives); calling this is always an error.
const Word& ConfusionNet::GetWord(size_t) const {
  // Fixed stale error text: it previously named GetFactorArray instead of
  // this function.
  TRACE_ERR("ERROR: call to ConfusionNet::GetWord\n");
  abort();
}
#ifdef _WIN32
#pragma warning(default:4716)
#endif
// Stream a human-readable dump of the confusion net.
std::ostream& operator<<(std::ostream& out,const ConfusionNet& cn)
{
  cn.Print(out);
  return out;
}
// Build a confusion-net-specific translation option collection over this
// input, using the decoder-wide pruning settings.
TranslationOptionCollection*
ConfusionNet::CreateTranslationOptionCollection() const
{
  const StaticData &staticData = StaticData::Instance();
  TranslationOptionCollection *collection =
    new TranslationOptionCollectionConfusionNet(
      *this,
      staticData.GetMaxNoTransOptPerCoverage(),
      staticData.GetTranslationOptionThreshold());
  assert(collection);
  return collection;
}
}

63
src/ConfusionNet.h Normal file
View File

@ -0,0 +1,63 @@
// $Id: ConfusionNet.h 2939 2010-02-24 11:15:44Z jfouet $
#ifndef moses_ConfusionNet_h
#define moses_ConfusionNet_h
#include <vector>
#include <iostream>
#include "Word.h"
#include "InputType.h"
namespace Moses
{
class FactorCollection;
class TranslationOptionCollection;
class Sentence;
// A confusion-network input: a sequence of columns, each holding the
// alternative (Word, score-vector) pairs for one source position.
class ConfusionNet : public InputType {
public:
  typedef std::vector<std::pair<Word,std::vector<float> > > Column;
protected:
  std::vector<Column> data;  // one Column per source position
  // Readers for the two on-disk formats; both return success as bool.
  bool ReadFormat0(std::istream&,const std::vector<FactorType>& factorOrder);
  bool ReadFormat1(std::istream&,const std::vector<FactorType>& factorOrder);
  // Parse "f0|f1|..." into a Word using the given factor order.
  void String2Word(const std::string& s,Word& w,const std::vector<FactorType>& factorOrder);
public:
  ConfusionNet();
  virtual ~ConfusionNet();
  // Build a degenerate (single-alternative) net from a plain sentence.
  ConfusionNet(Sentence const& s);
  InputTypeEnum GetType() const
  { return ConfusionNetworkInput;}
  const Column& GetColumn(size_t i) const {assert(i<data.size());return data[i];}
  const Column& operator[](size_t i) const {return GetColumn(i);}
  virtual size_t GetColumnIncrement(size_t i, size_t j) const; //! returns 1 for CNs
  bool Empty() const {return data.empty();}
  size_t GetSize() const {return data.size();}
  void Clear() {data.clear();}
  // Read in the given format (0 or 1); collects no statistics itself.
  bool ReadF(std::istream&,const std::vector<FactorType>& factorOrder,int format=0);
  virtual void Print(std::ostream&) const;
  int Read(std::istream& in,const std::vector<FactorType>& factorOrder);
  Phrase GetSubString(const WordsRange&) const; //TODO not defined
  std::string GetStringRep(const std::vector<FactorType> factorsToPrint) const; //TODO not defined
  const Word& GetWord(size_t pos) const;
  TranslationOptionCollection* CreateTranslationOptionCollection() const;
};
std::ostream& operator<<(std::ostream& out,const ConfusionNet& cn);
}
#endif

366
src/DPR_reordering.cpp Normal file
View File

@ -0,0 +1,366 @@
/*
**********************************************************
Cpp file ---------- DPR_reordering.cpp
The reordering feature function for MOSES
based on the DPR model proposed in (Ni et al., 2009)
Components:
vector<unsigned long long> m_dprOptionStartPOS --- store the start pos for each sentence option (to read from the .txt file)
ifstream sentenceOptionFile --- the stream file storing the sentence options
int sentenceID --- the sentence ID (indicating which sentence option block is used)
mapPhraseOption sentencePhraseOption --- sentence phrase option <left bound, right bound> -> target (string) -> probs
Functions:
0. Constructor: DPR_reordering(ScoreIndexManager &scoreIndexManager, const std::string &filePath, const std::vector<float>& weights)
1. interface functions:
GetNumScoreComponents() --- return the number of scores the component used (usually 1)
GetScoreProducerDescription() --- return the name of the reordering model
GetScoreProducerWeightShortName() --- return the short name of the weight for the score
2. Score producers:
Evaluate() --- to evaluate the reordering scores and add the score to the score component collection
EmptyHypothesisState() --- create an empty hypothesis
3. Other functions:
constructSentencePhraseOption() --- Construct sentencePhraseOption using sentenceID
clearSentencePhraseOption() --- clear the sentence phrase options
**********************************************************
*/
#include "DPR_reordering.h"
namespace Moses
{
/*
1. constructor
*/
// Build the DPR reordering feature: register the producer, store its
// weights, configure the class setup (3- or 5-class model) and open the
// precomputed per-sentence phrase-option database at `filePath`.
DPR_reordering::DPR_reordering(ScoreIndexManager &scoreIndexManager, const string filePath, const string classString, const vector<float>& weights)
{
  //1. Add the function in the scoreIndexManager
  scoreIndexManager.AddScoreProducer(this);
  //2. Set the weight for this score producer
  const_cast<StaticData&>(StaticData::Instance()).SetWeightsForScoreProducer(this, weights);
  //3. Get the class setup
  istringstream tempClassSetup(classString);
  tempClassSetup>>classSetup;
  if (classSetup==3)
  {
    // Precompute word-distance reordering costs log10(e^-k) for k=0..24.
    for (int k=0; k<25; k++)
      WDR_cost.push_back(log10(exp(-(float) k)));
    // NOTE(review): 0.3333 here is a raw probability while the 5-class
    // branch stores log10(0.2) -- the branches use different scales;
    // confirm which one generateReorderingProb() expects.
    unDetectProb = 0.3333;
  }
  else if (classSetup==5)
    unDetectProb = log10(0.2);
  else
    // NOTE(review): on an unsupported setup we only warn and continue with
    // unDetectProb uninitialized -- confirm this is intended.
    cerr<<"Error in DPR_reordering: Currently there is no class setup: "<<classSetup<<" in our model.\n";
  //4. get the start position of the sentence options
  string fileStartPos = filePath+".startPosition"; //path of the sentence start position file
  ifstream sentencePOS((char*) fileStartPos.c_str(),ios::binary);
  string eachLine;
  while (getline(sentencePOS,eachLine,'\n'))
  {
    istringstream tempString(eachLine);
    unsigned long long tempValue;
    tempString>>tempValue;
    m_dprOptionStartPOS.push_back(tempValue); //Get the start position of each sentence option DB
  }
  //5. Read the first sentence option
  sentenceID=0;
  sentenceOptionFile.open((char*) filePath.c_str(),ios::binary);
  if (!sentenceOptionFile.is_open())
    cerr<<"Error in DPR_reordering.cpp: can not open the sentence options file!\n";
  else
    constructSentencePhraseOption(); //construct the first sentencePhraseOption
  sentencePOS.close();
}
/*
2. interface functions
*/
//return the number of score components
// This feature contributes exactly one score to the log-linear model.
size_t DPR_reordering::GetNumScoreComponents() const
{
  return 1;
}
//return the description of this feature function
// Human-readable name of this feature (shown e.g. in score breakdowns).
string DPR_reordering::GetScoreProducerDescription() const
{
  // Fixed typo: "produders" -> "producers".
  return "Distance_phrase_reordering_probabilities_producers";
}
//return the weight short name
// Short name used to bind the feature weight in the configuration.
string DPR_reordering::GetScoreProducerWeightShortName() const
{
  return "weight-DPR";
}
/*
3. the score producers
*/
// This feature keeps no recombination state, so the empty state is NULL.
const FFState* DPR_reordering::EmptyHypothesisState() const
{
  return NULL;
}
//given the hypothesis (and previous hypothesis) computed and add the reordering score
//given the hypothesis (and previous hypothesis) computed and add the reordering score
FFState* DPR_reordering::Evaluate(const Hypothesis& cur_hypo, const FFState* prev_state, ScoreComponentCollection* accumulator)
{
  //cerr << cur_hypo.GetInput();
  //cerr << cur_hypo.GetInput().GetTranslationId();
  //1. Check the sentence phrase option (check the ID starts from 0 or 1?)
  // Reload the per-sentence phrase-option block when the decoder moves on
  // to a new input sentence.
  long int currentSentenceID = cur_hypo.GetInput().GetTranslationId();
  if (sentenceID!=currentSentenceID)
  {
    sentenceID=currentSentenceID;
    constructSentencePhraseOption(); //construct the first sentencePhraseOption
  }
  //2. get the information current phrase: left_boundary, right_boundary, target translation
  //                       prev phrase: right_boundary
  size_t prev_right_boundary;
  size_t curr_left_boundary;
  size_t curr_right_boundary;
  // NOTE(review): assumes GetPrevHypo() never returns NULL here -- confirm.
  const Hypothesis* prevHypothesis = cur_hypo.GetPrevHypo();
  //check if there is a previous hypo
  if (prevHypothesis->GetId()==0)
    // NOTE(review): assigning -1 to a size_t wraps to SIZE_MAX; presumably
    // generateReorderingProb() treats this as the "no previous phrase"
    // sentinel -- confirm.
    prev_right_boundary=-1;
  else
    prev_right_boundary=prevHypothesis->GetCurrSourceWordsRange().GetEndPos();
  const WordsRange currWordsRange = cur_hypo.GetCurrSourceWordsRange();
  curr_left_boundary = currWordsRange.GetStartPos();
  curr_right_boundary = currWordsRange.GetEndPos();
  string targetTranslation = cur_hypo.GetCurrTargetPhrase().ToString();
  //3. Get the reordering probability
  float reorderingProb = generateReorderingProb(curr_left_boundary, curr_right_boundary, prev_right_boundary, targetTranslation);
  //simple, update the score -1.0
  accumulator->PlusEquals(this,reorderingProb);
  // No recombination state is produced by this feature.
  return NULL;
}
/*
4. Other functions
*/
/*
4.1 Clear the content in sentencePhraseOption
*/
void DPR_reordering::clearSentencePhraseOption()
{
for (mapPhraseOption::iterator iterator = sentencePhraseOption.begin(); iterator!= sentencePhraseOption.end(); iterator++)
{
iterator->second.clear(); //clear each map in mapTargetProbOption
}
sentencePhraseOption.clear(); //clear the components in sentencePhraseOption
}
/*
4.2 Construct sentencePhraseOption using sentenceID
*/
// Load the phrase-option block for the current sentenceID into sentencePhraseOption.
// Line format (as implied by the parsing below):
//   boundary " ::: " target " ||| " probs [" ||| " target " ||| " probs ...] " ;;; " boundary " ::: " ...
// where boundary is whitespace-separated unsigned shorts and probs are floats
// (converted to log10 for the 5-class setup).
void DPR_reordering::constructSentencePhraseOption()
{
//1. Get the start position of the sentence options
sentenceOptionFile.seekg(m_dprOptionStartPOS[sentenceID],ios::beg); //set the offset
string eachSentence;
getline(sentenceOptionFile,eachSentence,'\n');
//2. Search each separation
size_t boundaryFound = eachSentence.find(" ::: "); //find the separation between the boundary and the values
size_t boundaryFound_end; //position of the " ;;; " that closed the previous option
int countBoundaryOption=0;
while (boundaryFound!=string::npos)
{
//2.1 Get the boundary (create a phraseOption map)
vector<unsigned short> boundary; //store the boundary
unsigned short boundary_int;
string tempString; //store the boundary
if (countBoundaryOption==0)
tempString=eachSentence.substr(0,boundaryFound); //get the boundary string
else
tempString=eachSentence.substr(boundaryFound_end+5,boundaryFound-boundaryFound_end-5);
istringstream boundaryString(tempString);
while (boundaryString>>boundary_int)
boundary.push_back(boundary_int);
//2.2 Get the target string (all target translations)
boundaryFound_end=eachSentence.find(" ;;; ",boundaryFound+5);
string targetString=eachSentence.substr(boundaryFound+5,boundaryFound_end-boundaryFound-5);
size_t targetFound=targetString.find(" ||| ");
size_t probFound=targetString.find(" ||| ",targetFound+5);
size_t probFound_prev; //store the previous probs position
int countPhraseOption=0;
while (targetFound!=string::npos)
{
if (probFound==string::npos)
probFound=targetString.size();
string target; //store each target phrase
string tempProbString; //store the probability string
vector<float> tempProbs; //store the probabilities
float probValue; //store the probability value
//2.3 Get each target string
if (countPhraseOption==0)
target = targetString.substr(0,targetFound);
else
target= targetString.substr(probFound_prev+5,targetFound-probFound_prev-5);
//2.4 Get the probability vector
tempProbString=targetString.substr(targetFound+5,probFound-targetFound-5);
istringstream probString(tempProbString);
while(probString>>probValue)
{
if (classSetup==5)
probValue=log10(probValue); //get the log probability
tempProbs.push_back(probValue);
}
//2.5 Update the information
sentencePhraseOption[boundary][target]=tempProbs;
countPhraseOption++;
probFound_prev=probFound;
targetFound=targetString.find(" ||| ",probFound+5);
if (targetFound!=string::npos)
probFound=targetString.find(" ||| ",targetFound+5);
}
//3. Get the next boundary.
// BUGFIX: the original reset boundaryFound_end to boundaryFound (the previous
// " ::: "), so every boundary after the first was extracted from the target/prob
// text and failed to parse as unsigned shorts (empty key). Keep boundaryFound_end
// at the " ;;; " terminator found in step 2.2, matching the first-iteration branch.
if (boundaryFound_end==string::npos)
break; //malformed line (no " ;;; " terminator): stop instead of wrapping npos+5
countBoundaryOption++;
boundaryFound=eachSentence.find(" ::: ",boundaryFound_end+5); //Get next boundary found
}
}
/*
4.3 generate the reordering probability
*/
// Compute the reordering score for the phrase spanning [boundary_left, boundary_right]
// given the previous phrase's right boundary and the target-side translation string.
// Falls back to a distance-based cost (3-class) or a constant log-prob (5-class)
// when the source span or the target phrase is not in sentencePhraseOption.
// NOTE(review): reorderingProb is returned uninitialized if classSetup is neither
// 3 nor 5; createOrientationClass already reports that configuration as an error.
float DPR_reordering::generateReorderingProb(size_t boundary_left, size_t boundary_right, size_t prev_boundary_right, string targetPhrase)
{
float reorderingProb;
//1. get the distance reordering
// Unsigned arithmetic narrowed to int: for the sentence-initial prev boundary of
// (size_t)-1 the +1 wraps to 0, yielding the intended signed distance.
int reorderDistance = prev_boundary_right+1-boundary_left; //reordering distance
int reorderOrientation = createOrientationClass(reorderDistance); //reordering orientation
//2. get the boundary vector
vector<unsigned short> phrase_boundary;
phrase_boundary.push_back(boundary_left);
phrase_boundary.push_back(boundary_right);
mapPhraseOption::const_iterator boundaryFound = sentencePhraseOption.find(phrase_boundary);
//3.1 If no this source phrase (then return equal probability)
if (boundaryFound==sentencePhraseOption.end())
{
if (classSetup==3)
{
// NOTE(review): assumes WDR_cost is long enough for |distance| — confirm sizing at setup.
reorderingProb = WDR_cost[abs(reorderDistance)]; //using word-based distance reordering
}
else if (classSetup==5)
{
reorderingProb=unDetectProb;
}
}
else
{
mapTargetProbOption::const_iterator targetFound = boundaryFound->second.find(targetPhrase);
//3.2 if no this target phrase
if (targetFound == boundaryFound->second.end())
{
if (classSetup==3)
{
reorderingProb = WDR_cost[abs(reorderDistance)]; //using word-based distance reordering
}
else if (classSetup==5)
{
reorderingProb=unDetectProb;
}
}
//3.3 else, get normal reordering probability
else
{
if (classSetup ==3)
{
if (reorderOrientation==1) //special case: monotone
{
// Confident monotone prediction costs nothing; otherwise scale the
// unit word-distance cost by the (capped) inverse predicted probability.
if (targetFound->second[1]>0.5)
reorderingProb=0.0;
else
{
float ratio=min(MAXRATIO, 1.0/(3*targetFound->second[1]));
reorderingProb=ratio*WDR_cost[1];
}
}
else
{
float ratio=min(MAXRATIO, 1.0/(3*targetFound->second[reorderOrientation]));
reorderingProb=ratio*WDR_cost[abs(reorderDistance)];
}
}
else if (classSetup==5)
{
// 5-class model: the stored values are already log10 probabilities.
reorderingProb=targetFound->second[reorderOrientation];
}
}
}
return reorderingProb;
}
/*
4.4. int createOrientationClass(int dist) --- create the orientation class from the signed reordering distance
*/
// Map a signed reordering distance to an orientation class index.
// 3-class setup: 0 = reverse (dist<0), 1 = monotone (dist==0), 2 = forward (dist>0).
// 5-class setup: buckets at -5 and +5 give classes 0..4.
// Returns -1 (after reporting the error) for an unsupported classSetup.
int DPR_reordering::createOrientationClass(int dist)
{
// BUGFIX: initialize — the original returned an uninitialized value (UB) when
// classSetup was neither 3 nor 5.
int orientationClass = -1;
//If three-class setup
if (classSetup==3)
{
if (dist<0)
orientationClass=0;
else if (dist==0)
orientationClass=1;
else
orientationClass=2;
}
else if (classSetup==5)
{
if (dist<=-5)
orientationClass=0;
else if (dist>-5 and dist<0)
orientationClass=1;
else if (dist==0)
orientationClass=2;
else if (dist>0 and dist<5)
orientationClass=3;
else
orientationClass=4;
}
else
{
cerr<<"Error in DPR_reordering: Currently there is no class setup: "<<classSetup<<" in our model.\n";
}
return orientationClass; //return the orientation class
}
// Destructor: release the handle on the sentence-options file opened for this model.
DPR_reordering::~DPR_reordering()
{
sentenceOptionFile.close();
}
} // namespace

99
src/DPR_reordering.h Normal file
View File

@ -0,0 +1,99 @@
/*
**********************************************************
Head file ---------- DPR_reordering.h
The reordering feature function for MOSES
based on the DPR model proposed in (Ni et al., 2009)
Components:
vector<unsigned long long> m_dprOptionStartPOS --- store the start pos for each sentence option (to read from the .txt file)
ifstream sentenceOptionFile --- the stream file storing the sentence options
int sentenceID --- the sentence ID (indicating which sentence option block is used)
mapPhraseOption sentencePhraseOption --- sentence phrase option <left bound, right bound> -> target (string) -> probs
Functions:
0. Constructor: DPR_reordering(ScoreIndexManager &scoreIndexManager, const std::string &filePath, const std::vector<float>& weights)
1. interface functions:
GetNumScoreComponents() --- return the number of scores the component used (usually 1)
GetScoreProducerDescription() --- return the name of the reordering model
GetScoreProducerWeightShortName() --- return the short name of the weight for the score
2. Score producers:
Evaluate() --- to evaluate the reordering scores and add the score to the score component collection
EmptyHypothesisState() --- create an empty hypothesis
3. Other functions:
constructSentencePhraseOption() --- Construct sentencePhraseOption using sentenceID
clearSentencePhraseOption() --- clear the sentence phrase options
**********************************************************
*/
#pragma once
#ifndef DPR_REORDERING_H
#define DPR_REORDERING_H
#include <cstdlib>
#include <map>
#include <iostream>
#include <vector>
#include <string>
#include <sstream> //using istringstream
#include <fstream> //using ifstream
#include <math.h>
#include "FeatureFunction.h"
#include "Hypothesis.h"
#include "WordsRange.h"
#include "StaticData.h"
#include "InputType.h"
#define MAXRATIO 3.0 //the maximum ration for the 3-class setup
/*
#ifdef __GNUC__
#include <ext/hash_map>
#else
#include <hash_map>
#endif
namespace std{using namespace __gnu_cxx;}*/
using namespace std;
using std::ifstream;
using std::istringstream;
using std::vector;
using std::string;
//for sentencePhraseOption
typedef std::map<vector<unsigned short>, map<string, vector<float> > > mapPhraseOption;
typedef std::map<string, vector<float> > mapTargetProbOption;
namespace Moses
{
using namespace std;
//define the class DPR_reordering
class DPR_reordering : public StatefulFeatureFunction
{
public:
//constructor: filePath locates the sentence-options file, classString selects the
//orientation-class setup, weights are the feature weights registered with Moses
DPR_reordering(ScoreIndexManager &scoreIndexManager, const string filePath, const string classString, const vector<float>& weights);
~DPR_reordering();
public:
//interface: include 3 functions
size_t GetNumScoreComponents() const; //return the number of scores the component used
string GetScoreProducerDescription() const; //return the name of the reordering model
string GetScoreProducerWeightShortName() const; //return the short name of the weight for the score
public:
//The evaluation function and score calculation function
FFState* Evaluate(const Hypothesis& cur_hypo, const FFState* prev_state, ScoreComponentCollection* accumulator);
const FFState* EmptyHypothesisState() const;
public:
void clearSentencePhraseOption(); //clear the sentence phrase options
void constructSentencePhraseOption(); //construct sentence phrase options (for a sentence)
float generateReorderingProb(size_t boundary_left, size_t boundary_right, size_t prev_boundary_right, string targetPhrase); //generate the reordering probability
int createOrientationClass(int dist); //the create the orientation class
private:
vector<unsigned long long> m_dprOptionStartPOS; //store the start pos (file offset) for each sentence option
ifstream sentenceOptionFile; //the ifstream file of the sentenceOption
long int sentenceID; //store the ID of current sentence needed translation
mapPhraseOption sentencePhraseOption; //store the phrase option for each sentence
int classSetup; //store the number of orientations (3 or 5 supported)
float unDetectProb; //the const reordering prob if the phrase pair is not in sentence option
vector<float> WDR_cost; //the word distance reordering cost
};
};
#endif

36
src/DecodeGraph.cpp Normal file
View File

@ -0,0 +1,36 @@
// $Id: TranslationOptionCollection.cpp 1429 2007-07-20 13:03:12Z hieuhoang1972 $
// vim:tabstop=2
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2006 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#include "DecodeGraph.h"
#include "DecodeStep.h"
#include "TypeDef.h"
#include "Util.h"
namespace Moses
{
DecodeGraph::~DecodeGraph()
{
// RemoveAllInColl (Util.h) — presumably deletes each owned DecodeStep* before
// clearing the list; confirm the helper's semantics before relying on ownership here.
RemoveAllInColl(m_steps);
}
}

68
src/DecodeGraph.h Normal file
View File

@ -0,0 +1,68 @@
// $Id: TranslationOptionCollection.cpp 1429 2007-07-20 13:03:12Z hieuhoang1972 $
// vim:tabstop=2
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2006 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#ifndef moses_DecodeGraph_h
#define moses_DecodeGraph_h
#include <list>
#include <iterator>
namespace Moses
{
class DecodeStep;
//! list of DecodeStep s which factorizes the translation
class DecodeGraph
{
protected:
std::list<const DecodeStep*> m_steps; // decode steps in application order; deleted in the destructor (DecodeGraph.cpp)
size_t m_position; // index of this graph within the decode sequence
public:
/**
* position: The position of this graph within the decode sequence.
**/
DecodeGraph(size_t position): m_position(position) {}
//! iterators
typedef std::list<const DecodeStep*>::iterator iterator;
typedef std::list<const DecodeStep*>::const_iterator const_iterator;
const_iterator begin() const { return m_steps.begin(); }
const_iterator end() const { return m_steps.end(); }
//! position of this graph within the decode sequence
size_t GetPosition() const
{
return m_position;
}
// NOTE(review): non-virtual destructor — do not derive and delete through DecodeGraph*.
~DecodeGraph();
//! Add another decode step to the graph (takes ownership; freed in the destructor)
void Add(const DecodeStep *decodeStep)
{
m_steps.push_back(decodeStep);
}
};
}
#endif

66
src/DecodeStep.cpp Normal file
View File

@ -0,0 +1,66 @@
// $Id: DecodeStep.cpp 1897 2008-10-08 23:51:26Z hieuhoang1972 $
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2006 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#include "DecodeStep.h"
#include "PhraseDictionaryMemory.h"
#include "GenerationDictionary.h"
#include "StaticData.h"
namespace Moses
{
// Build a decode step over dictionary `ptr`, chained after `prev` (may be NULL for
// the first step). Derives, from the factor masks:
//  - m_outputFactors: factors present after this step (previous ones plus this
//    dictionary's outputs),
//  - m_conflictFactors: factors this step produces that already existed (mask AND),
//  - m_newOutputFactors: factors genuinely new in this step (mask XOR with previous).
DecodeStep::DecodeStep(Dictionary *ptr, const DecodeStep* prev)
:m_ptr(ptr)
{
FactorMask prevOutputFactors;
if (prev) prevOutputFactors = prev->m_outputFactors;
m_outputFactors = prevOutputFactors;
FactorMask conflictMask = (m_outputFactors & ptr->GetOutputFactorMask());
m_outputFactors |= ptr->GetOutputFactorMask();
FactorMask newOutputFactorMask = m_outputFactors ^ prevOutputFactors; //xor
m_newOutputFactors.resize(newOutputFactorMask.count());
m_conflictFactors.resize(conflictMask.count());
// Expand the bit masks into sorted lists of factor indices.
size_t j=0, k=0;
for (size_t i = 0; i < MAX_NUM_FACTORS; i++) {
if (newOutputFactorMask[i]) m_newOutputFactors[j++] = i;
if (conflictMask[i]) m_conflictFactors[k++] = i;
}
VERBOSE(2,"DecodeStep():\n\toutputFactors=" << m_outputFactors
<< "\n\tconflictFactors=" << conflictMask
<< "\n\tnewOutputFactors=" << newOutputFactorMask << std::endl);
}
// Out-of-line virtual destructor (declared virtual in DecodeStep.h); m_ptr is not deleted here.
DecodeStep::~DecodeStep() {}
/** returns phrase table (dictionary) for translation step */
const PhraseDictionary &DecodeStep::GetPhraseDictionary() const
{
const PhraseDictionary *table = static_cast<const PhraseDictionary*>(m_ptr);
return *table;
}
/** returns generation table (dictionary) for generation step */
const GenerationDictionary &DecodeStep::GetGenerationDictionary() const
{
const GenerationDictionary *table = static_cast<const GenerationDictionary*>(m_ptr);
return *table;
}
}

113
src/DecodeStep.h Normal file
View File

@ -0,0 +1,113 @@
// $Id: DecodeStep.h 2939 2010-02-24 11:15:44Z jfouet $
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2006 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#ifndef moses_DecodeStep_h
#define moses_DecodeStep_h
#include <cassert>
#include "TypeDef.h"
#include "Dictionary.h"
namespace Moses
{
class PhraseDictionary;
class GenerationDictionary;
class TranslationOption;
class TranslationOptionCollection;
class PartialTranslOptColl;
class FactorCollection;
class InputType;
/*! Specification for a decoding step.
* The factored translation model consists of Translation and Generation
* steps, which consult a Dictionary of phrase translations or word
* generations. This class implements the specification for one of these
* steps, both the DecodeType and a pointer to the Dictionary
**/
class DecodeStep
{
protected:
const Dictionary *m_ptr; //!< pointer to translation/generation table (not owned)
FactorMask m_outputFactors; //!< mask of what factors exist on the output side after this decode step
std::vector<FactorType> m_conflictFactors; //!< list of the factors that may conflict during this step
std::vector<FactorType> m_newOutputFactors; //!< list of the factors that are new in this step, may be empty
public:
DecodeStep(); //! not implemented
DecodeStep(Dictionary *ptr, const DecodeStep* prevDecodeStep);
virtual ~DecodeStep();
//! mask of factors that are present after this decode step
const FactorMask& GetOutputFactorMask() const
{
return m_outputFactors;
}
//! returns true if this decode step must match some pre-existing factors
bool IsFilteringStep() const
{
return !m_conflictFactors.empty();
}
//! returns true if this decode step produces one or more new factors
bool IsFactorProducingStep() const
{
return !m_newOutputFactors.empty();
}
/*! returns a list (possibly empty) of the (target side) factors that
* are produced in this decoding step. For example, if a previous step
* generated factor 1, and this step generates 1,2, then only 2 will be
* in the returned vector. */
const std::vector<FactorType>& GetNewOutputFactors() const
{
return m_newOutputFactors;
}
/*! returns a list (possibly empty) of the (target side) factors that
* are produced BUT ALREADY EXIST and therefore must be checked for
* conflict or compatibility */
const std::vector<FactorType>& GetConflictFactors() const
{
return m_conflictFactors;
}
/*! returns phrase table (dictionary) for translation step */
const PhraseDictionary &GetPhraseDictionary() const;
/*! returns generation table (dictionary) for generation step */
const GenerationDictionary &GetGenerationDictionary() const;
/*! returns dictionary in abstract class */
const Dictionary* GetDictionaryPtr() const {return m_ptr;}
/*! Given an input TranslationOption, extend it in some way (put results in outputPartialTranslOptColl) */
virtual void Process(const TranslationOption &inputPartialTranslOpt
, const DecodeStep &decodeStep
, PartialTranslOptColl &outputPartialTranslOptColl
, TranslationOptionCollection *toc
, bool adhereTableLimit) const = 0;
};
}
#endif

View File

@ -0,0 +1,176 @@
// $Id: DecodeStepGeneration.cpp 1897 2008-10-08 23:51:26Z hieuhoang1972 $
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2006 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#include "DecodeStepGeneration.h"
#include "GenerationDictionary.h"
#include "TranslationOption.h"
#include "TranslationOptionCollection.h"
#include "PartialTranslOptColl.h"
#include "FactorCollection.h"
namespace Moses
{
// Generation step over `dict`, chained after `prev`; all mask bookkeeping is done by the base ctor.
DecodeStepGeneration::DecodeStepGeneration(GenerationDictionary* dict, const DecodeStep* prev)
: DecodeStep(dict, prev)
{
}
// Downcast the stored dictionary to the generation table this step was built with.
const GenerationDictionary &DecodeStepGeneration::GetGenerationDictionary() const
{
const GenerationDictionary *table = static_cast<const GenerationDictionary*>(m_ptr);
return *table;
}
// Merge the generated phrase (and its score) into a copy of oldTO.
// Returns NULL when this is a filtering step and the generated factors
// conflict with factors oldTO already carries; otherwise a new heap-allocated
// TranslationOption owned by the caller.
TranslationOption *DecodeStepGeneration::MergeGeneration(const TranslationOption& oldTO, Phrase &mergePhrase
, const ScoreComponentCollection& generationScore) const
{
if (IsFilteringStep() && !oldTO.IsCompatible(mergePhrase, m_conflictFactors))
return NULL;
TranslationOption *merged = new TranslationOption(oldTO);
merged->MergeNewFeatures(mergePhrase, generationScore, m_newOutputFactors);
return merged;
}
// helpers
typedef pair<Word, ScoreComponentCollection> WordPair;
typedef list< WordPair > WordList;
// 1st = word
// 2nd = score
typedef list< WordPair >::const_iterator WordListIterator;
/** used in generation: increases iterators when looping through the exponential number of generation expansions */
inline void IncrementIterators(vector< WordListIterator > &wordListIterVector
, const vector< WordList > &wordListVector)
{
// Odometer-style increment: advance the first position that still has a next
// entry; any position that overflows wraps back to its first entry and the
// carry moves on to the next position.
for (size_t pos = 0 ; pos < wordListVector.size() ; pos++)
{
WordListIterator &cursor = wordListIterVector[pos];
++cursor;
if (cursor != wordListVector[pos].end())
return; // eg. 4 -> 5: no carry needed
// eg 9 -> 10: wrap this digit and carry into the next position
cursor = wordListVector[pos].begin();
}
}
// Expand inputPartialTranslOpt with every combination of generated factors for
// each target word (Cartesian product over the per-word generation lists),
// adding one merged TranslationOption per combination to outputPartialTranslOptColl.
// If any target word has no entry in the generation dictionary, nothing is added.
void DecodeStepGeneration::Process(const TranslationOption &inputPartialTranslOpt
, const DecodeStep &decodeStep
, PartialTranslOptColl &outputPartialTranslOptColl
, TranslationOptionCollection *toc
, bool adhereTableLimit) const
{
if (inputPartialTranslOpt.GetTargetPhrase().GetSize() == 0)
{ // word deletion: nothing to generate from, pass the option through unchanged
TranslationOption *newTransOpt = new TranslationOption(inputPartialTranslOpt);
outputPartialTranslOptColl.Add(newTransOpt);
return;
}
// normal generation step
const GenerationDictionary &generationDictionary = decodeStep.GetGenerationDictionary();
// const WordsRange &sourceWordsRange = inputPartialTranslOpt.GetSourceWordsRange();
const Phrase &targetPhrase = inputPartialTranslOpt.GetTargetPhrase();
size_t targetLength = targetPhrase.GetSize();
// generation list for each word in phrase
vector< WordList > wordListVector(targetLength);
// create generation list
int wordListVectorPos = 0;
for (size_t currPos = 0 ; currPos < targetLength ; currPos++) // going through all words
{
// generatable factors for this word to be put in wordList
WordList &wordList = wordListVector[wordListVectorPos];
const Word &word = targetPhrase.GetWord(currPos);
// consult dictionary for possible generations for this word
const OutputWordCollection *wordColl = generationDictionary.FindWord(word);
if (wordColl == NULL)
{ // word not found in generation dictionary
//toc->ProcessUnknownWord(sourceWordsRange.GetStartPos(), factorCollection);
return; // can't be part of a phrase, special handling
}
else
{
// sort(*wordColl, CompareWordCollScore);
OutputWordCollection::const_iterator iterWordColl;
for (iterWordColl = wordColl->begin() ; iterWordColl != wordColl->end(); ++iterWordColl)
{
const Word &outputWord = (*iterWordColl).first;
const ScoreComponentCollection& score = (*iterWordColl).second;
// enter into word list generated factor(s) and its(their) score(s)
wordList.push_back(WordPair(outputWord, score));
}
wordListVectorPos++; // done, next word
}
}
// use generation list (wordList)
// set up iterators (total number of expansions = product of per-word list sizes)
size_t numIteration = 1;
vector< WordListIterator > wordListIterVector(targetLength);
vector< const Word* > mergeWords(targetLength);
for (size_t currPos = 0 ; currPos < targetLength ; currPos++)
{
wordListIterVector[currPos] = wordListVector[currPos].begin();
numIteration *= wordListVector[currPos].size();
}
// go thru each possible factor for each word & create hypothesis
for (size_t currIter = 0 ; currIter < numIteration ; currIter++)
{
ScoreComponentCollection generationScore; // total score for this string of words
// create vector of words with new factors for last phrase
for (size_t currPos = 0 ; currPos < targetLength ; currPos++)
{
const WordPair &wordPair = *wordListIterVector[currPos];
mergeWords[currPos] = &(wordPair.first);
generationScore.PlusEquals(wordPair.second);
}
// merge with existing trans opt; MergeGeneration returns NULL on factor conflict
Phrase genPhrase(Output, mergeWords);
TranslationOption *newTransOpt = MergeGeneration(inputPartialTranslOpt, genPhrase, generationScore);
if (newTransOpt != NULL)
{
outputPartialTranslOptColl.Add(newTransOpt);
}
// increment iterators (odometer step to the next factor combination)
IncrementIterators(wordListIterVector, wordListVector);
}
}

View File

@ -0,0 +1,60 @@
// $Id: DecodeStepGeneration.h 2939 2010-02-24 11:15:44Z jfouet $
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2006 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#ifndef moses_DecodeStepGeneration_h
#define moses_DecodeStepGeneration_h
#include "DecodeStep.h"
namespace Moses
{
class GenerationDictionary;
class Phrase;
class ScoreComponentCollection;
//! subclass of DecodeStep for generation step
class DecodeStepGeneration : public DecodeStep
{
public:
DecodeStepGeneration(GenerationDictionary* dict, const DecodeStep* prev);
//! returns generation table (dictionary) for generation step
const GenerationDictionary &GetGenerationDictionary() const;
virtual void Process(const TranslationOption &inputPartialTranslOpt
, const DecodeStep &decodeStep
, PartialTranslOptColl &outputPartialTranslOptColl
, TranslationOptionCollection *toc
, bool adhereTableLimit) const;
private:
/*! create new TranslationOption from merging oldTO with mergePhrase
This function runs IsCompatible() to ensure the two can be merged;
returns NULL if they cannot
*/
TranslationOption *MergeGeneration(const TranslationOption& oldTO, Phrase &mergePhrase
, const ScoreComponentCollection& generationScore) const;
};
}
#endif

View File

@ -0,0 +1,136 @@
// $Id: DecodeStepTranslation.cpp 2477 2009-08-07 16:47:54Z bhaddow $
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2006 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#include "DecodeStepTranslation.h"
#include "PhraseDictionaryMemory.h"
#include "TranslationOption.h"
#include "TranslationOptionCollection.h"
#include "PartialTranslOptColl.h"
#include "FactorCollection.h"
namespace Moses
{
// Translation step over phrase table `dict`, chained after `prev`; keeps a typed
// copy of the dictionary pointer for ProcessInitialTranslation.
DecodeStepTranslation::DecodeStepTranslation(PhraseDictionary* dict, const DecodeStep* prev)
: DecodeStep(dict, prev), m_phraseDictionary(dict)
{
}
/*const PhraseDictionary &DecodeStepTranslation::GetPhraseDictionary() const
{
return *m_phraseDictionary;
}*/
// Merge targetPhrase (and its score breakdown) into a copy of oldTO.
// Returns NULL when this is a filtering step and the target phrase's factors
// conflict with factors oldTO already carries; otherwise a new heap-allocated
// TranslationOption owned by the caller.
TranslationOption *DecodeStepTranslation::MergeTranslation(const TranslationOption& oldTO, const TargetPhrase &targetPhrase) const
{
if (IsFilteringStep()) {
if (!oldTO.IsCompatible(targetPhrase, m_conflictFactors))
return NULL; // was `return 0` — NULL for consistency with DecodeStepGeneration::MergeGeneration
}
TranslationOption *newTransOpt = new TranslationOption(oldTO);
newTransOpt->MergeNewFeatures(targetPhrase, targetPhrase.GetScoreBreakdown(), m_newOutputFactors);
return newTransOpt;
}
// Extend inputPartialTranslOpt with every target phrase the phrase table offers
// for its source span (optionally capped at the table limit), adding each
// successful merge to outputPartialTranslOptColl.
void DecodeStepTranslation::Process(const TranslationOption &inputPartialTranslOpt
, const DecodeStep &decodeStep
, PartialTranslOptColl &outputPartialTranslOptColl
, TranslationOptionCollection *toc
, bool adhereTableLimit) const
{
if (inputPartialTranslOpt.GetTargetPhrase().GetSize() == 0)
{ // word deletion: pass the option through unchanged
outputPartialTranslOptColl.Add(new TranslationOption(inputPartialTranslOpt));
return;
}
// normal trans step
const WordsRange &sourceWordsRange = inputPartialTranslOpt.GetSourceWordsRange();
const PhraseDictionary &phraseDictionary = decodeStep.GetPhraseDictionary();
const size_t currSize = inputPartialTranslOpt.GetTargetPhrase().GetSize();
const size_t tableLimit = phraseDictionary.GetTableLimit();
const TargetPhraseCollection *phraseColl= phraseDictionary.GetTargetPhraseCollection(toc->GetSource(),sourceWordsRange);
if (phraseColl != NULL)
{
TargetPhraseCollection::const_iterator iterTargetPhrase, iterEnd;
// honour the table limit only when requested and the collection exceeds it
iterEnd = (!adhereTableLimit || tableLimit == 0 || phraseColl->GetSize() < tableLimit) ? phraseColl->end() : phraseColl->begin() + tableLimit;
for (iterTargetPhrase = phraseColl->begin(); iterTargetPhrase != iterEnd; ++iterTargetPhrase)
{
const TargetPhrase& targetPhrase = **iterTargetPhrase;
// skip if the target phrase length differs from the current option's length
if (targetPhrase.GetSize() != currSize) continue;
TranslationOption *newTransOpt = MergeTranslation(inputPartialTranslOpt, targetPhrase);
if (newTransOpt != NULL)
{
outputPartialTranslOptColl.Add( newTransOpt );
}
}
}
else if (sourceWordsRange.GetNumWordsCovered() == 1)
{ // unknown handler
//toc->ProcessUnknownWord(sourceWordsRange.GetStartPos(), factorCollection);
}
}
// Seed the partial-option collection for source span [startPos, endPos] with one
// TranslationOption per target phrase from this step's own phrase table
// (optionally capped at the table limit). Used for the first translation step only.
void DecodeStepTranslation::ProcessInitialTranslation(
const InputType &source
,PartialTranslOptColl &outputPartialTranslOptColl
, size_t startPos, size_t endPos, bool adhereTableLimit) const
{
const size_t tableLimit = m_phraseDictionary->GetTableLimit();
const WordsRange wordsRange(startPos, endPos);
const TargetPhraseCollection *phraseColl = m_phraseDictionary->GetTargetPhraseCollection(source,wordsRange);
if (phraseColl != NULL)
{
IFVERBOSE(3) {
if(StaticData::Instance().GetInputType() == SentenceInput)
TRACE_ERR("[" << source.GetSubString(wordsRange) << "; " << startPos << "-" << endPos << "]\n");
else
TRACE_ERR("[" << startPos << "-" << endPos << "]" << std::endl);
}
TargetPhraseCollection::const_iterator iterTargetPhrase, iterEnd;
// honour the table limit only when requested and the collection exceeds it
iterEnd = (!adhereTableLimit || tableLimit == 0 || phraseColl->GetSize() < tableLimit) ? phraseColl->end() : phraseColl->begin() + tableLimit;
for (iterTargetPhrase = phraseColl->begin() ; iterTargetPhrase != iterEnd ; ++iterTargetPhrase)
{
const TargetPhrase &targetPhrase = **iterTargetPhrase;
outputPartialTranslOptColl.Add ( new TranslationOption(wordsRange, targetPhrase, source) );
VERBOSE(3,"\t" << targetPhrase << "\n");
}
VERBOSE(3,endl);
}
}
}

View File

@ -0,0 +1,67 @@
// $Id: DecodeStepTranslation.h 2939 2010-02-24 11:15:44Z jfouet $
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2006 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#ifndef moses_DecodeStepTranslation_h
#define moses_DecodeStepTranslation_h
#include "DecodeStep.h"
#include "PhraseDictionary.h"
namespace Moses
{
class PhraseDictionary;
class TargetPhrase;
//! subclass of DecodeStep for translation step
class DecodeStepTranslation : public DecodeStep
{
public:
DecodeStepTranslation(); //! not implemented
//! construct a translation step backed by 'dict', chained after 'prev'
DecodeStepTranslation(PhraseDictionary* dict, const DecodeStep* prev);
//! returns phrase table (dictionary) for translation step
const PhraseDictionary &GetPhraseDictionary() const;
//! expand an existing partial option by applying this translation step;
//! compatible merged options are added to outputPartialTranslOptColl
virtual void Process(const TranslationOption &inputPartialTranslOpt
, const DecodeStep &decodeStep
, PartialTranslOptColl &outputPartialTranslOptColl
, TranslationOptionCollection *toc
, bool adhereTableLimit) const;
/*! initialize list of partial translation options by applying the first translation step
* Ideally, this function should be in DecodeStepTranslation class
*/
void ProcessInitialTranslation(
const InputType &source
, PartialTranslOptColl &outputPartialTranslOptColl
, size_t startPos, size_t endPos, bool adhereTableLimit) const;
private:
/*! create new TranslationOption from merging oldTO with mergePhrase
This function runs IsCompatible() to ensure the two can be merged
*/
TranslationOption *MergeTranslation(const TranslationOption& oldTO, const TargetPhrase &targetPhrase) const;
// non-owning pointer to the phrase table this step consults
PhraseDictionary* m_phraseDictionary;
};
}
#endif

38
src/Dictionary.cpp Normal file
View File

@ -0,0 +1,38 @@
// $Id: Dictionary.cpp 1897 2008-10-08 23:51:26Z hieuhoang1972 $
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2006 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#include "Dictionary.h"
#include "FactorTypeSet.h"
namespace Moses
{
// Base-class constructor: record how many score components this
// dictionary contributes to the model's score vector.
Dictionary::Dictionary(size_t numScoreComponent)
:m_numScoreComponent(numScoreComponent)
{
}
// No resources owned at this level.
Dictionary::~Dictionary() {}
// Per-sentence cleanup is a no-op by default; subclasses override.
void Dictionary::CleanUp() {}
}

68
src/Dictionary.h Normal file
View File

@ -0,0 +1,68 @@
// $Id: Dictionary.h 2939 2010-02-24 11:15:44Z jfouet $
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2006 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#ifndef moses_Dictionary_h
#define moses_Dictionary_h
#include <vector>
#include "FactorTypeSet.h"
#include "ScoreProducer.h"
namespace Moses
{
/** Abstract class from which PhraseDictionary and GenerationDictionary
* are inherited.
*/
class Dictionary
{
protected:
// number of score components this dictionary contributes
const size_t m_numScoreComponent;
// factor types consumed from the source side
FactorMask m_inputFactors;
// factor types produced on the target side
FactorMask m_outputFactors;
public:
//! Constructor
Dictionary(size_t numScoreComponent);
//!Destructor
virtual ~Dictionary();
//! returns output factor types as specified by the ini file
const FactorMask& GetOutputFactorMask() const
{
return m_outputFactors;
}
//! returns input factor types as specified by the ini file
const FactorMask& GetInputFactorMask() const
{
return m_inputFactors;
}
//! returns whether this dictionary is to be used for Translate or Generate
virtual DecodeType GetDecodeType() const = 0;
// clean up temporary memory, called after processing each sentence
virtual void CleanUp();
};
}
#endif

154
src/DummyScoreProducers.cpp Normal file
View File

@ -0,0 +1,154 @@
// $Id: DummyScoreProducers.cpp 2477 2009-08-07 16:47:54Z bhaddow $
#include <cassert>
#include "FFState.h"
#include "StaticData.h"
#include "DummyScoreProducers.h"
#include "WordsRange.h"
#include "TranslationOption.h"
namespace Moses
{
// Search state for the traditional distortion model: the source range the
// last hypothesis covered, plus the position of the first gap in coverage.
struct DistortionState_traditional : public FFState {
WordsRange range;
int first_gap;
DistortionState_traditional(const WordsRange& wr, int fg) : range(wr), first_gap(fg) {}
// States are ordered by the end position of the covered range only;
// hypotheses with equal states can be recombined.
int Compare(const FFState& other) const {
const DistortionState_traditional& o =
static_cast<const DistortionState_traditional&>(other);
if (range.GetEndPos() < o.range.GetEndPos()) return -1;
if (range.GetEndPos() > o.range.GetEndPos()) return 1;
return 0;
}
};
// Placeholder for the Moore & Quirk (MT Summit 2007) distortion state.
struct DistortionState_MQ2007 : public FFState {
//TODO
};
// Initial state: nothing covered yet (NOT_FOUND sentinels).
const FFState* DistortionScoreProducer::EmptyHypothesisState() const {
return new DistortionState_traditional(WordsRange(NOT_FOUND,NOT_FOUND), NOT_FOUND);
}
// Register this producer so the score index manager assigns it a slot in
// the global score vector.
DistortionScoreProducer::DistortionScoreProducer(ScoreIndexManager &scoreIndexManager)
{
scoreIndexManager.AddScoreProducer(this);
}
// Distortion contributes a single score component.
size_t DistortionScoreProducer::GetNumScoreComponents() const
{
return 1;
}
std::string DistortionScoreProducer::GetScoreProducerDescription() const
{
return "Distortion";
}
// "d" is the weight short name used in moses.ini ([weight-d]).
std::string DistortionScoreProducer::GetScoreProducerWeightShortName() const
{
return "d";
}
// Distortion penalty for jumping from source range 'prev' to 'curr'.
// With USE_OLD set (the current configuration) this is simply the negated
// distortion distance computed by the input.
float DistortionScoreProducer::CalculateDistortionScore(const Hypothesis& hypo,
const WordsRange &prev, const WordsRange &curr, const int FirstGap) const
{
const int USE_OLD = 1;
if (USE_OLD) {
return - (float) hypo.GetInput().ComputeDistortionDistance(prev, curr);
}
// NOTE(review): everything below is unreachable while USE_OLD == 1; it is
// the Moore & Quirk (MT Summit 2007) "pay early" variant kept for toggling.
// Pay distortion score as soon as possible, from Moore and Quirk MT Summit 2007
int prefixEndPos = FirstGap-1;
if ((int) curr.GetStartPos() == prefixEndPos+1) {
return 0;
}
if ((int) curr.GetEndPos() < (int) prev.GetEndPos()) {
return (float) -2*curr.GetNumWordsCovered();
}
if ((int) prev.GetEndPos() <= prefixEndPos) {
int z = curr.GetStartPos()-prefixEndPos;
return (float) -2*(z + curr.GetNumWordsCovered());
}
return (float) -2*(curr.GetNumWordsBetween(prev) + curr.GetNumWordsCovered());
}
// No scores come directly from the input (e.g. confusion network costs).
size_t DistortionScoreProducer::GetNumInputScores() const { return 0;}
// Score the distortion of 'hypo' relative to the previous state, add it to
// the accumulator, and return the new state (caller takes ownership).
FFState* DistortionScoreProducer::Evaluate(
const Hypothesis& hypo,
const FFState* prev_state,
ScoreComponentCollection* out) const {
const DistortionState_traditional* prev = static_cast<const DistortionState_traditional*>(prev_state);
const float distortionScore = CalculateDistortionScore(
hypo,
prev->range,
hypo.GetCurrSourceWordsRange(),
prev->first_gap);
out->PlusEquals(this, distortionScore);
// The new state's first_gap is taken from the PREVIOUS hypothesis'
// coverage bitmap (the gap before this expansion was applied).
DistortionState_traditional* res = new DistortionState_traditional(
hypo.GetCurrSourceWordsRange(),
hypo.GetPrevHypo()->GetWordsBitmap().GetFirstGapPos());
return res;
}
// Register with the score index manager to claim a score-vector slot.
WordPenaltyProducer::WordPenaltyProducer(ScoreIndexManager &scoreIndexManager)
{
scoreIndexManager.AddScoreProducer(this);
}
size_t WordPenaltyProducer::GetNumScoreComponents() const
{
return 1;
}
std::string WordPenaltyProducer::GetScoreProducerDescription() const
{
return "WordPenalty";
}
// "w" is the weight short name used in moses.ini ([weight-w]).
std::string WordPenaltyProducer::GetScoreProducerWeightShortName() const
{
return "w";
}
size_t WordPenaltyProducer::GetNumInputScores() const { return 0;}
// Penalty is minus the number of target words produced by the phrase.
void WordPenaltyProducer::Evaluate(const TargetPhrase& tp, ScoreComponentCollection* out) const
{
out->PlusEquals(this, -static_cast<float>(tp.GetSize()));
}
// Register with the score index manager to claim a score-vector slot.
UnknownWordPenaltyProducer::UnknownWordPenaltyProducer(ScoreIndexManager &scoreIndexManager)
{
scoreIndexManager.AddScoreProducer(this);
}
size_t UnknownWordPenaltyProducer::GetNumScoreComponents() const
{
return 1;
}
std::string UnknownWordPenaltyProducer::GetScoreProducerDescription() const
{
return "!UnknownWordPenalty";
}
// "u" is the weight short name used in moses.ini ([weight-u]).
std::string UnknownWordPenaltyProducer::GetScoreProducerWeightShortName() const
{
return "u";
}
size_t UnknownWordPenaltyProducer::GetNumInputScores() const { return 0;}
// The penalty is baked into each TranslationOption at creation time rather
// than recomputed during search.
bool UnknownWordPenaltyProducer::ComputeValueInTranslationOption() const {
return true;
}
}

70
src/DummyScoreProducers.h Normal file
View File

@ -0,0 +1,70 @@
// $Id: DummyScoreProducers.h 2939 2010-02-24 11:15:44Z jfouet $
#ifndef moses_DummyScoreProducers_h
#define moses_DummyScoreProducers_h
#include "FeatureFunction.h"
namespace Moses
{
class WordsRange;
/** Calculates Distortion scores
*/
class DistortionScoreProducer : public StatefulFeatureFunction {
public:
DistortionScoreProducer(ScoreIndexManager &scoreIndexManager);
//! raw distortion penalty for jumping from range 'prev' to 'curr'
float CalculateDistortionScore(const Hypothesis& hypo,
const WordsRange &prev, const WordsRange &curr, const int FirstGapPosition) const;
size_t GetNumScoreComponents() const;
std::string GetScoreProducerDescription() const;
std::string GetScoreProducerWeightShortName() const;
size_t GetNumInputScores() const;
//! state for the empty hypothesis (nothing covered); caller owns result
virtual const FFState* EmptyHypothesisState() const;
//! score one hypothesis expansion; returns the successor state
virtual FFState* Evaluate(
const Hypothesis& cur_hypo,
const FFState* prev_state,
ScoreComponentCollection* accumulator) const;
};
/** Doesn't do anything but provide a key into the global
* score array to store the word penalty in.
*/
class WordPenaltyProducer : public StatelessFeatureFunction {
public:
WordPenaltyProducer(ScoreIndexManager &scoreIndexManager);
size_t GetNumScoreComponents() const;
std::string GetScoreProducerDescription() const;
std::string GetScoreProducerWeightShortName() const;
size_t GetNumInputScores() const;
//! adds minus the target-phrase length to the accumulator
virtual void Evaluate(
const TargetPhrase& phrase,
ScoreComponentCollection* out) const;
};
/** unknown word penalty */
/** unknown word penalty */
class UnknownWordPenaltyProducer : public StatelessFeatureFunction {
public:
UnknownWordPenaltyProducer(ScoreIndexManager &scoreIndexManager);
size_t GetNumScoreComponents() const;
std::string GetScoreProducerDescription() const;
std::string GetScoreProducerWeightShortName() const;
size_t GetNumInputScores() const;
//! true: the penalty is stored in the TranslationOption itself
virtual bool ComputeValueInTranslationOption() const;
};
}
#endif

147
src/DynSAInclude/fdstream.h Normal file
View File

@ -0,0 +1,147 @@
/* Class modified by ADL for randlm namespace on Feb 15th, 2008.
*
* The following code declares classes to read from and write to
* file descriptore or file handles.
*
* See
* http://www.josuttis.com/cppcode
* for details and the latest version.
*
* - open:
* - integrating BUFSIZ on some systems?
* - optimized reading of multiple characters
* - stream for reading AND writing
* - i18n
*
* (C) Copyright Nicolai M. Josuttis 2001.
* Permission to copy, use, modify, sell and distribute this software
* is granted provided this copyright notice appears in all copies.
* This software is provided "as is" without express or implied
* warranty, and with no claim as to its suitability for any purpose.
*
* Version: Jul 28, 2002
* History:
* Jul 28, 2002: bugfix memcpy() => memmove()
* fdinbuf::underflow(): cast for return statements
* Aug 05, 2001: first public version
*/
#ifndef moses_DynSAInclude_fdstream_h
#define moses_DynSAInclude_fdstream_h
#include <streambuf>
// for EOF:
#include <cstdio>
// for memmove():
#include <cstring>
// low-level read and write functions
#ifdef _MSC_VER
# include <io.h>
#else
# include <unistd.h>
//extern "C" {
// int write (int fd, const char* buf, int num);
// int read (int fd, char* buf, int num);
//}
#endif
// BEGIN namespace
//namespace randlm {
/************************************************************
* fdstreambuf
* - a stream that reads on a file descriptor
************************************************************/
// Stream buffer over a raw POSIX/Win32 file descriptor, with a small
// putback area.  Used by FileHandler to wrap popen() pipes.
class fdstreambuf : public std::streambuf {
protected:
int fd; // file descriptor
protected:
/* data buffer:
* - at most, pbSize characters in putback area plus
* - at most, bufSize characters in ordinary read buffer
*/
static const int pbSize = 4; // size of putback area
static const int bufSize = 1024; // size of the data buffer
char buffer[bufSize+pbSize]; // data buffer
public:
/* constructor
* - initialize file descriptor
* - initialize empty data buffer
* - no putback area
* => force underflow()
*/
fdstreambuf (int _fd) : fd(_fd) {
setg (buffer+pbSize, // beginning of putback area
buffer+pbSize, // read position
buffer+pbSize); // end position
}
protected:
// insert new characters into the buffer
virtual int_type underflow () {
#ifndef _MSC_VER
using std::memmove;
#endif
// is read position before end of buffer?
if (gptr() < egptr()) {
return traits_type::to_int_type(*gptr());
}
/* process size of putback area
* - use number of characters read
* - but at most size of putback area
*/
int numPutback;
numPutback = gptr() - eback();
if (numPutback > pbSize) {
numPutback = pbSize;
}
/* copy up to pbSize characters previously read into
* the putback area
*/
memmove (buffer+(pbSize-numPutback), gptr()-numPutback,
numPutback);
// read at most bufSize new characters
int num;
num = read (fd, buffer+pbSize, bufSize);
if (num <= 0) {
// ERROR or EOF
return EOF;
}
// reset buffer pointers
setg (buffer+(pbSize-numPutback), // beginning of putback area
buffer+pbSize, // read position
buffer+pbSize+num); // end of buffer
// return next character
return traits_type::to_int_type(*gptr());
}
// write one character
// NOTE(review): compares against plain EOF rather than
// traits_type::eof(); equivalent for char streams but not traits-correct.
virtual int_type overflow (int_type c) {
if (c != EOF) {
char z = c;
if (write (fd, &z, 1) != 1) {
return EOF;
}
}
return c;
}
// write multiple characters
// NOTE(review): returns write()'s result directly, so a short or failed
// write (-1) is reported as-is with no retry -- confirm acceptable.
virtual
std::streamsize xsputn (const char* s,
std::streamsize num) {
return write(fd,s,num);
}
};
//} // END namespace
#endif

160
src/DynSAInclude/file.cpp Normal file
View File

@ -0,0 +1,160 @@
#include "file.h"
namespace Moses {
// FileHandler class
const std::string FileHandler::kStdInDescriptor = "___stdin___";
const std::string FileHandler::kStdOutDescriptor = "___stdout___";
// compression commands
const FileExtension FileHandler::kGzipped = ".gz";
const FileExtension FileHandler::kBzipped2 = ".bz2";
const std::string FileHandler::kCatCommand = "cat";
const std::string FileHandler::kGzipCommand = "gzip -f";
const std::string FileHandler::kGunzipCommand = "gunzip -f";
const std::string FileHandler::kBzip2Command = "bzip2 -f";
const std::string FileHandler::kBunzip2Command = "bunzip2 -f";
// Open 'path' for reading or writing (not both), transparently wiring up
// stdin/stdout wrappers or a compression pipe based on the path.
// Exits the process on unsupported flag combinations or open failure.
FileHandler::FileHandler(const std::string & path, std::ios_base::openmode flags, bool checkExists)
  : std::fstream(NULL), path_(path), flags_(flags), buffer_(NULL), fp_(NULL) {
  // Reject bidirectional streams: flags exactly equal to in|out.
  if( !(flags^(std::ios::in|std::ios::out)) ) {
    fprintf(stderr, "ERROR: FileHandler does not support bidirectional files (%s).\n", path_.c_str());
    exit(EXIT_FAILURE);
  }
  else {
    // BUG FIX: this call used to be wrapped in assert(), which is compiled
    // away under NDEBUG -- release builds never attached a stream buffer.
    // Run it unconditionally and assert on the result instead.
    // NOTE(review): the 'checkExists' ctor parameter is not forwarded here;
    // setStreamBuffer receives the is-input flag instead, matching the
    // original behavior -- confirm intent before changing.
    const bool bufferOk = setStreamBuffer(flags & std::ios::in);
    assert(bufferOk);
    (void)bufferOk; // silence unused-variable warning under NDEBUG
  }
  this->precision(32);
}
// Tear down in reverse order of construction: close the decompression
// pipe (if any), free the buffer we allocated, and close the stream.
FileHandler::~FileHandler() {
if( fp_ != 0 )
pclose(fp_);
// cin/cout rdbufs are borrowed, not owned -- never delete those
if( path_ != FileHandler::kStdInDescriptor &&
path_ != FileHandler::kStdOutDescriptor )
delete buffer_;
if( this->is_open() )
this->close();
}
// Spawn 'cmd' via popen() (reading when flags_ has std::ios::in, else
// writing) and wrap the pipe's descriptor in a new fdstreambuf.
// Exits the process if the pipe cannot be opened; caller owns the result.
fdstreambuf * FileHandler::openCompressedFile(const char * cmd) {
//bool isInput = (flags_ & std::ios::in);
//open pipe to file with compression/decompression command
const char * p_type = (flags_ & std::ios::in ? "r" : "w");
fp_ = popen(cmd, p_type);
if( fp_ == NULL ) {
//fprintf(stderr, "ERROR:Failed to open compressed file at %s\n", path_.c_str());
perror("openCompressedFile: ");
exit(EXIT_FAILURE);
}
//open streambuf with file descriptor
return new fdstreambuf(fileno(fp_));
}
// Choose and install the stream buffer for path_: the shared cin/cout
// buffer for the std descriptors, a compression pipe buffer for .gz/.bz2
// paths, or a plain filebuf otherwise.  Exits on failure; returns true.
bool FileHandler::setStreamBuffer(bool checkExists) {
// redirect stdin or stdout if necesary
if (path_ == FileHandler::kStdInDescriptor) {
assert(flags_ & std::ios::in);
std::streambuf* sb = std::cin.rdbuf();
buffer_ = sb;
} else if (path_ == FileHandler::kStdOutDescriptor) {
assert(flags_ & std::ios::out);
std::streambuf* sb = std::cout.rdbuf();
buffer_ = sb;
} else {
// real file
if( checkExists && ! fileExists() ) {
fprintf(stderr, "ERROR: Failed to find file at %s\n", path_.c_str());
exit(EXIT_FAILURE);
}
std::string cmd = "";
if( isCompressedFile(cmd) && (! cmd.empty()) ) {
// heap-allocated; released in the destructor
buffer_ = openCompressedFile(cmd.c_str());
} else {
// open underlying filebuf (heap-allocated; released in the destructor)
std::filebuf* fb = new std::filebuf();
fb->open(path_.c_str(), flags_);
buffer_ = fb;
}
}
if (!buffer_) {
fprintf(stderr, "ERROR:Failed to open file at %s\n", path_.c_str());
exit(EXIT_FAILURE);
}
// attach the chosen buffer to this fstream
this->init(buffer_);
return true;
}
/*
* Checks for compression via file extension. Currently checks for
* ".gz" and ".bz2".
*/
/*
* Checks for compression via file extension. Currently checks for
* ".gz" and ".bz2".  On match, fills 'cmd' with the shell command that
* (de)compresses path_ (direction chosen from flags_) and returns true.
*/
bool FileHandler::isCompressedFile(std::string & cmd)
{
bool compressed = false, isInput = (flags_ & std::ios::in);
cmd = "";
unsigned int len = path_.size();
// suffix test: the extension must sit at the very end of the path
if( len > kGzipped.size()
&& path_.find(kGzipped) == len - kGzipped.size()) {
//gzip file command to compress or decompress
compressed = true;
// cmd = (isInput ? "exec gunzip -cf " : "exec gzip -c > ") + path_;
cmd = (isInput ? "exec " + kGunzipCommand + "c "
: "exec " + kGzipCommand + "c > ") + path_;
} else if( len > kBzipped2.size() &&
path_.find(kBzipped2) == len - kBzipped2.size()) {
//do bzipped2 file command
compressed = true;
cmd = (isInput ? "exec " + kBunzip2Command + "c "
: "exec " + kBzip2Command + "c > ") + path_;
}
return compressed;
}
// True iff path_ names an existing filesystem object (stat() succeeds).
bool FileHandler::fileExists() {
  struct stat f_info;
  return stat(path_.c_str(), &f_info) == 0;
}
// static method used during preprocessing compressed files without
// opening fstream objects.
// Static helper used while preprocessing compressed files without opening
// fstream objects: pick (de)compression commands for 'filepath' from its
// extension.  Falls back to plain 'cat' when the path is neither gzip- nor
// bzip2-suffixed; returns true iff a real compressor was selected.
bool FileHandler::getCompressionCmds(const std::string & filepath, std::string & compressionCmd,
                                     std::string & decompressionCmd,
                                     std::string & compressionSuffix) {
  // assume "no compression" until a known suffix is recognized
  compressionCmd = kCatCommand;
  decompressionCmd = kCatCommand;
  const size_t pathLen = filepath.length();
  const bool hasGzipSuffix = pathLen > kGzipped.size() &&
    filepath.find(kGzipped) == pathLen - kGzipped.length();
  const bool hasBzip2Suffix = pathLen > kBzipped2.size() &&
    filepath.find(kBzipped2) == pathLen - kBzipped2.length();
  if (hasGzipSuffix) {
    compressionCmd = kGzipCommand;
    decompressionCmd = kGunzipCommand;
    compressionSuffix = kGzipped;
  } else if (hasBzip2Suffix) {
    compressionCmd = kBzip2Command;
    decompressionCmd = kBunzip2Command;
    compressionSuffix = kBzipped2;
  }
  return (compressionCmd != kCatCommand && decompressionCmd != kCatCommand);
}
// Rewind the stream to the beginning.  Pipes cannot seek, so a compressed
// stream is reopened from scratch; plain buffers are seeked in place.
bool FileHandler::reset() {
// move to beginning of file
if (fp_ != 0) {
//can't seek on a pipe so reopen
pclose(fp_);
std::string cmd = "";
if (isCompressedFile(cmd) && ! cmd.empty())
buffer_ = openCompressedFile(cmd.c_str());
//reinitialize
this->init(buffer_);
}
else
buffer_->pubseekoff(0, std::ios_base::beg); //sets both get and put pointers to beginning of stream
return true;
}
} //end namespace

61
src/DynSAInclude/file.h Normal file
View File

@ -0,0 +1,61 @@
#ifndef moses_File_h
#define moses_File_h
#include <iostream>
#include <fstream>
#include <cstdio>
#include <cstdlib>
#include <sys/stat.h>
#include <string>
#include <cassert>
#include "fdstream.h"
#include "utils.h"
namespace Moses {
typedef std::string FileExtension;
// fstream subclass that transparently handles stdin/stdout redirection and
// gzip/bzip2 (de)compression via external commands, chosen from the path.
class FileHandler: public std::fstream {
public:
// descriptors for stdin and stdout
static const std::string kStdInDescriptor; // file name for std::cin
static const std::string kStdOutDescriptor; // file name for std::cout
// compression commands
static const std::string kCatCommand; // i.e. no compression
static const std::string kGzipCommand; // gzip -f
static const std::string kGunzipCommand; // gunzip -f
static const std::string kBzip2Command; // bzip2 -f
static const std::string kBunzip2Command; // bunzip2 -f
// open file or wrap stdin or stdout
FileHandler(const std::string & path,
std::ios_base::openmode flags = std::ios::in,
bool checkExists = true);
~FileHandler();
// file utilities
static bool getCompressionCmds(const std::string & filepath,
std::string & compressionCmd,
std::string & decompressionCmd,
std::string & compressionSuffix);
// data accessors
std::string getPath() { return path_; }
std::ios_base::openmode getFlags() { return flags_; }
bool isStdIn() { return path_ == FileHandler::kStdInDescriptor; }
bool isStdOut() { return path_ == FileHandler::kStdOutDescriptor; }
// rewind to the beginning of the stream (reopens compression pipes)
bool reset();
protected:
static const FileExtension kGzipped;
static const FileExtension kBzipped2;
bool fileExists();
bool setStreamBuffer(bool checkExists);
bool isCompressedFile(std::string & cmd);
fdstreambuf* openCompressedFile(const char* cmd);
std::string path_; // file path
std::ios_base::openmode flags_; // open flags
std::streambuf* buffer_; // buffer to either gzipped or standard data
std::FILE* fp_; //file pointer to handle pipe data
};
} // end namespace
#endif

32
src/DynSAInclude/types.h Normal file
View File

@ -0,0 +1,32 @@
#ifndef moses_DynSAInclude_types_h
#define moses_DynSAInclude_types_h
#include <iostream>
#include <map>
#include <set>
#include <vector>
#include <typeinfo>
#include <stdint.h>
#define iterate(c, i) for(typeof(c.begin()) i = c.begin(); i != c.end(); ++i)
#define piterate(c, i) for(typeof(c->begin()) i = c->begin(); i != c->end(); ++i)
#define THREADED false
#define THREAD_MAX 2
#define MAX_NGRAM_ORDER 8
#define MAX_STR_LEN 300
#define PRIME 8589935681ULL
#define MAX_HASH_FUNCS 1000
//#define PRIME 409
using std::string;
using std::cout;
using std::cerr;
using std::endl;
//typedefs for projects
typedef std::string word_t; // word as string
typedef unsigned int wordID_t; // word mapped to integer
typedef std::string date_t; // a date marker
typedef unsigned int count_t; // for 64-bit to 32-bit compatibility
#endif

81
src/DynSAInclude/utils.h Normal file
View File

@ -0,0 +1,81 @@
#ifndef moses_DynSAInclude_utils_h
#define moses_DynSAInclude_utils_h
#include <cstdlib>
#include <vector>
#include <string>
#include <sstream>
#include <cctype>
#include <cmath>
#include <cstring>
/** Stateless string/number helpers used by the dynamic suffix-array code. */
class Utils {
public:
  //! strip 'dropChars' from both ends of 'str' (in place)
  static void trim(std::string& str, const std::string dropChars = " \t\n\r") {
    str.erase(str.find_last_not_of(dropChars)+1);
    str.erase(0, str.find_first_not_of(dropChars));
  }
  //! strip 'dropChars' from the right end only (in place)
  static void rtrim(std::string& str, const std::string dropChars = " \t\n\r") {
    str.erase(str.find_last_not_of(dropChars)+1);
  }
  //! strip 'dropChars' from the left end only (in place)
  static void ltrim(std::string& str, const std::string dropChars = " \t\n\r") {
    str.erase(0, str.find_first_not_of(dropChars));
  }
  //! decimal string representation of 'integer'
  static std::string IntToStr(int integer) {
    std::ostringstream stream;
    stream << integer;
    return stream.str();
  }
  /*! Tokenize 'str' on any character in 'delm', storing the tokens in
   *  'items' (cleared first).  Empty tokens are skipped, matching
   *  strtok() semantics.  Returns the number of tokens.
   *  BUG FIX: the original ran strtok() on a const_cast-ed input buffer;
   *  strtok mutates its argument, which is undefined behavior on string
   *  literals and on std::string::c_str() buffers.  This version never
   *  writes to the input. */
  static int splitToStr(const char * str,
                        std::vector<std::string> & items,
                        const char * delm = "\t") {
    items.clear();
    if (str == NULL) return 0;
    const std::string input(str);
    std::string::size_type tokStart = input.find_first_not_of(delm);
    while (tokStart != std::string::npos) {
      const std::string::size_type tokEnd = input.find_first_of(delm, tokStart);
      if (tokEnd == std::string::npos) {
        items.push_back(input.substr(tokStart));
        break;
      }
      items.push_back(input.substr(tokStart, tokEnd - tokStart));
      tokStart = input.find_first_not_of(delm, tokEnd);
    }
    return items.size();
  }
  //! convenience overload taking std::string arguments
  static int splitToStr(std::string buff,
                        std::vector<std::string> & items,
                        std::string delm = "\t") {
    return splitToStr(buff.c_str(), items, delm.c_str());
  }
  //! split on 'delm' and convert each token with atoi(); returns token count
  static int splitToInt(std::string buff, std::vector<int>& items,
                        std::string delm = ",") {
    items.clear();
    std::vector<std::string> tmpVector(0);
    const int numTokens = splitToStr(buff.c_str(), tmpVector, delm.c_str());
    for (int j = 0; j < numTokens; j++)
      items.push_back(atoi(tmpVector[j].c_str()));
    return numTokens;
  }
  //! lowercase 'str' in place (byte-wise, via tolower)
  static void strToLowercase(std::string& str) {
    for(unsigned i=0; i < str.length(); i++) {
      str[i] = tolower(str[i]);
    }
  }
  // TODO: interface with decent PRG
  /*! pseudo-random value of integral type T; if mod_bnd != 0 the result
   *  is reduced modulo mod_bnd.  64-bit types are assembled from two
   *  rand() calls. */
  template<typename T>
  static T rand(T mod_bnd = 0) {
    T random = 0;
    if(sizeof(T) <= 4) {
      random = static_cast<T>(std::rand());
    }
    else if(sizeof(T) == 8) {
      random = static_cast<T>(std::rand());
      random <<= 31; random <<= 1;
      random |= static_cast<T>(std::rand());
    }
    if(mod_bnd != 0)
      return random % mod_bnd;
    else return random;
  }
};
#endif

View File

@ -0,0 +1,93 @@
#include <sstream>
#include "vocab.h"
namespace Moses {
// Vocab class
const wordID_t Vocab::kOOVWordID;
const wordID_t Vocab::kBOSWordID;
const word_t Vocab::kBOS = "<s>";
const word_t Vocab::kEOS = "</s>";
const word_t Vocab::kOOVWord = "<unk>";
// Map a surface string to its integer id.  Open vocabularies grow on
// unseen words; closed ones return the OOV id instead.
wordID_t Vocab::getWordID(const word_t& word) {
// get id and possibly add to vocab
if (words2ids_.find(word) == words2ids_.end())
if (!closed_) {
// ids run sequentially from 1 (0 is reserved for OOV), so the next
// free id is the pre-insertion size + 1
wordID_t id = words2ids_.size() + 1;
words2ids_[word] = id; // forward mapping: word -> id
ids2words_[id] = word; // reverse mapping: id -> word
}
else {
return Vocab::kOOVWordID;
}
wordID_t id = words2ids_[word];
return id;
}
// Reverse lookup: id -> surface string; unknown ids yield the <unk> token.
word_t Vocab::getWord(wordID_t id) {
  Id2Word::iterator match = ids2words_.find(id);
  return (match == ids2words_.end()) ? Vocab::kOOVWord : match->second;
}
// True iff the id has an entry in the reverse mapping.
bool Vocab::inVocab(wordID_t id) {
return ids2words_.find(id) != ids2words_.end();
}
// True iff the surface string has an entry in the forward mapping.
bool Vocab::inVocab(const word_t & word) {
return words2ids_.find(word) != words2ids_.end();
}
// Convenience wrapper: open 'vocab_path' for writing and save into it.
bool Vocab::save(const std::string & vocab_path) {
// save vocab as id -> word
FileHandler vcbout(vocab_path, std::ios::out);
return save(&vcbout);
}
// Write the vocabulary: first line is the entry count, then one
// "word <tab> id" line per entry (the format load() expects).
bool Vocab::save(FileHandler* vcbout) {
// then each vcb entry
*vcbout << ids2words_.size() << "\n";
iterate(ids2words_, iter)
*vcbout << iter->second << "\t" << iter->first << "\n";
return true;
}
// Convenience wrapper: open 'vocab_path' for reading and load from it.
bool Vocab::load(const std::string & vocab_path, bool closed) {
FileHandler vcbin(vocab_path, std::ios::in);
std::cerr << "Loading vocab from " << vocab_path << std::endl;
return load(&vcbin, closed);
}
// Load an "id -> word" mapping written by save(): first line is the entry
// count, then one "word <tab> id" line per entry.  Lines without an id
// column receive sequential ids starting from 1.  'closed' freezes the
// vocabulary afterwards.  Returns true on success.
bool Vocab::load(FileHandler* vcbin, bool closed) {
  // load vocab id -> word mapping
  words2ids_.clear(); // reset mapping
  ids2words_.clear();
  std::string line;
  word_t word;
  // BUG FIX: this getline() used to sit inside assert(); with NDEBUG the
  // assert body is compiled away and the header line was never consumed.
  if (!getline(*vcbin, line)) {
    assert(false);
    return false;
  }
  std::istringstream first(line.c_str());
  uint32_t vcbsize(0);
  first >> vcbsize;
  uint32_t loadedsize = 0;
  while (loadedsize++ < vcbsize && getline(*vcbin, line)) {
    std::istringstream entry(line.c_str());
    // BUG FIX: 'id' must be zero-initialized -- extraction fails for plain
    // word lists and previously left it holding indeterminate (or stale)
    // data, defeating the id == 0 check below.
    wordID_t id = 0;
    entry >> word;
    entry >> id;
    // may be no id (i.e. file may just be a word list)
    if (id == 0 && word != Vocab::kOOVWord)
      id = ids2words_.size() + 1; // assign ids sequentially starting from 1
    assert(ids2words_.count(id) == 0 && words2ids_.count(word) == 0);
    ids2words_[id] = word;
    words2ids_[word] = id;
  }
  closed_ = closed; // once loaded fix vocab ?
  std::cerr << "Loaded vocab with " << ids2words_.size() << " words." << std::endl;
  return true;
}
// Debug dump of both mappings (id -> word, then word -> id) to stderr.
void Vocab::printVocab() {
iterate(ids2words_, iter)
std::cerr << iter->second << "\t" << iter->first << "\n";
iterate(words2ids_, iter)
std::cerr << iter->second << "\t" << iter->first << "\n";
}
} //end namespace

64
src/DynSAInclude/vocab.h Normal file
View File

@ -0,0 +1,64 @@
#ifndef moses_DynSAInclude_vocab_h
#define moses_DynSAInclude_vocab_h
#include <map>
#include <string>
#include "types.h"
#include "file.h"
#include "utils.h"
namespace Moses {
// Vocab maps between strings and uint32 ids.
// Vocab maps between strings and uint32 ids.
class Vocab {
public:
  typedef std::map<word_t, wordID_t> Word2Id;
  typedef std::map<wordID_t, word_t> Id2Word;
  static const wordID_t kOOVWordID = 0; // out of vocabulary word id
  static const wordID_t kBOSWordID = 1;
  static const word_t kBOS; // beginning of sentence marker
  static const word_t kEOS; // end of sentence marker
  static const word_t kOOVWord; // <unk>
  //! open vocabulary; optionally pre-register the sentence markers
  Vocab(bool sntMarkers = true):closed_(false) {
    if(sntMarkers) {
      getWordID(kBOS); // added in case not observed in corpus
      getWordID(kEOS);
    }
  }
  // if no file then must allow new words
  // specify whether more words can be added via 'closed'
  // assume that if a vocab is loaded from file then it should be closed.
  // BUG FIX: load() used to be invoked inside assert(), so under NDEBUG the
  // vocabulary was silently never loaded.  Call it unconditionally and
  // assert on the result instead.
  Vocab(const std::string & vocab_path, bool closed = true) {
    const bool loaded = load(vocab_path, closed);
    assert(loaded);
    (void)loaded;
  }
  Vocab(FileHandler* fin, bool closed = true) {
    const bool loaded = load(fin, closed);
    assert(loaded);
    (void)loaded;
  }
  ~Vocab() {}
  //! map word -> id, growing the vocab if it is open
  wordID_t getWordID(const word_t & word);
  //! map id -> word (<unk> for unknown ids)
  word_t getWord(wordID_t id);
  bool inVocab(wordID_t id);
  bool inVocab(const word_t & word);
  uint32_t size() { return words2ids_.size(); }
  void makeClosed() { closed_ = true; }
  void makeOpen() { closed_ = false; }
  bool isClosed() { return closed_; }
  bool save(const std::string & vocab_path);
  bool save(FileHandler* fout);
  bool load(const std::string & vocab_path, bool closed = true);
  bool load(FileHandler* fin, bool closed = true);
  void printVocab();
  Word2Id::const_iterator vocabStart() {
    return words2ids_.begin();
  }
  Word2Id::const_iterator vocabEnd() {
    return words2ids_.end();
  }
private:
  Word2Id words2ids_; // map from strings to word ids
  Id2Word ids2words_; // map from ids to strings
  bool closed_; // can more words be added
};
}
#endif

237
src/DynSuffixArray.cpp Normal file
View File

@ -0,0 +1,237 @@
#include "DynSuffixArray.h"
#include <iostream>
namespace Moses {
// Default constructor: allocate empty suffix array (SA_), inverse (ISA_)
// and the BWT first/last columns (F_/L_); no corpus attached yet.
DynSuffixArray::DynSuffixArray() {
SA_ = new vuint_t();
ISA_ = new vuint_t();
F_ = new vuint_t();
L_ = new vuint_t();
std::cerr << "DYNAMIC SUFFIX ARRAY CLASS INSTANTIATED" << std::endl;
}
// Release the four owned arrays (corpus_ is not owned and is not freed).
DynSuffixArray::~DynSuffixArray() {
delete SA_;
delete ISA_;
delete F_;
delete L_;
}
// Build the suffix array over corpus 'crp' (pointer is kept, not copied).
DynSuffixArray::DynSuffixArray(vuint_t* crp) {
// make native int array and pass to SA builder
corpus_ = crp;
int size = corpus_->size();
int* tmpArr = new int[size];
for(int i=0 ; i < size; ++i) tmpArr[i] = i;
// NOTE(review): signature (arr, lo, hi) does not match std::qsort, so this
// presumably resolves to the class's own suffix-comparing sort declared in
// DynSuffixArray.h -- confirm.
qsort(tmpArr, 0, size-1);
SA_ = new vuint_t(tmpArr, tmpArr + size);
//std::cerr << "printing SA " << std::endl;
//for(int i=0; i < size; ++i) std::cerr << SA_->at(i) << std::endl;
delete[] tmpArr;
std::cerr << "DYNAMIC SUFFIX ARRAY CLASS INSTANTIATED WITH SIZE " << size << std::endl;
buildAuxArrays();
//printAuxArrays();
}
// Derive ISA_ (inverse of SA_) and the BWT columns from SA_ and corpus_:
// F_[i] is the first word of the i-th suffix, L_[i] the word preceding it
// (wrapping to the corpus end for suffix 0).
// Assigns fresh arrays; only called once, from the corpus constructor.
void DynSuffixArray::buildAuxArrays() {
int size = SA_->size();
ISA_ = new vuint_t(size);
F_ = new vuint_t(size);
L_ = new vuint_t(size);
for(int i=0; i < size; ++i) {
ISA_->at(SA_->at(i)) = i;
//(*ISA_)[(*SA_)[i]] = i;
(*F_)[i] = (*corpus_)[SA_->at(i)];
(*L_)[i] = (*corpus_)[(SA_->at(i) == 0 ? size-1 : SA_->at(i)-1)];
}
}
/* use Gerlach's code to make rank faster */
// Number of occurrences of 'word' in L_[0..idx) -- a linear scan for now.
int DynSuffixArray::rank(unsigned word, unsigned idx) {
  int occurrences = 0;
  for (unsigned pos = 0; pos < idx; ++pos) {
    if (L_->at(pos) == word)
      ++occurrences;
  }
  return occurrences;
}
/* count function should be implemented
 * with binary search over suffix array!! */
// Index of the first row of F_ holding 'word', or -1 when the word does
// not occur.  F_ is kept sorted, so std::lower_bound applies.
int DynSuffixArray::F_firstIdx(unsigned word) {
  int low = std::lower_bound(F_->begin(), F_->end(), word) - F_->begin();
  // BUG FIX: when 'word' is greater than every element, lower_bound returns
  // end(); F_->at(low) then threw std::out_of_range instead of reporting
  // "not found".  Guard the index before dereferencing.
  if (low < (int)F_->size() && F_->at(low) == word) return low;
  else return -1;
}
/* uses rank() and c() to obtain the LF function */
// Last-to-first mapping: the F_ row corresponding to row L_idx of L_.
int DynSuffixArray::LF(unsigned L_idx) {
  int fIdx(-1);
  unsigned word = L_->at(L_idx);
  if((fIdx = F_firstIdx(word)) != -1)
    return fIdx + rank(word, L_idx);
  // BUG FIX: the original fell off the end of this non-void function
  // (undefined behavior) when 'word' was absent from F_.  Return -1;
  // callers such as insertFactor() already clamp non-positive LF values
  // to SA_->size().
  return -1;
}
// Incrementally insert sentence 'newSent' into the suffix array at corpus
// position 'newIndex', updating SA_/ISA_/F_/L_ in place (dynamic BWT-style
// update; words are inserted back to front).  Statement order here is
// load-bearing: each step depends on the arrays' state from the previous.
void DynSuffixArray::insertFactor(vuint_t* newSent, unsigned newIndex) {
// for sentences
//stages 1, 2, 4 stay same from 1char case
//(use last word of new text in step 2 and save Ltmp until last insert?)
//stage 3...all words of new sentence are inserted backwards
// stage 2: k=ISA[newIndex], tmp= L[k], L[k] = newChar
assert(newIndex <= SA_->size());
int k(-1), kprime(-1);
k = (newIndex < SA_->size() ? ISA_->at(newIndex) : ISA_->at(0)); // k is now index of the cycle that starts at newindex
int true_pos = LF(k); // track cycle shift (newIndex - 1)
int Ltmp = L_->at(k);
L_->at(k) = (*newSent)[newSent->size()-1]; // cycle k now ends with correct word
for(int j = newSent->size()-1; j > -1; --j) {
kprime = LF(k); // find cycle that starts with (newindex - 1)
//kprime += ((L_[k] == Ltmp) && (k > isa[k]) ? 1 : 0); // yada yada
// only terminal char can be 0 so add new vocab at end
kprime = (kprime > 0 ? kprime : SA_->size());
true_pos += (kprime <= true_pos ? 1 : 0); // track changes
// insert everything
F_->insert(F_->begin() + kprime, (*newSent)[j]);
int theLWord = (j == 0 ? Ltmp : (*newSent)[j-1]);
L_->insert(L_->begin() + kprime, theLWord);
// shift all suffix positions at or after the insertion point
piterate(SA_, itr)
if(*itr >= newIndex) ++(*itr);
SA_->insert(SA_->begin() + kprime, newIndex);
// and keep ISA_ consistent with the row shift
piterate(ISA_, itr)
if(*itr >= kprime) ++(*itr);
ISA_->insert(ISA_->begin() + newIndex, kprime);
k = kprime;
}
// Begin stage 4
reorder(true_pos, LF(kprime)); // actual position vs computed position of cycle (newIndex-1)
}
// Stage 4 of insertion/deletion: repeatedly swaps the row at position j with
// the row at its correct position jprime, following the LF mapping until the
// two agree.
// NOTE(review): the printf calls look like leftover debug output (and use %d
// for unsigned arguments) -- consider removing or routing through TRACE_ERR.
// NOTE(review): the comment below mentions F, but only L and SA entries are
// swapped -- presumably the F values of the two rows are equal here; confirm.
void DynSuffixArray::reorder(unsigned j, unsigned jprime) {
printf("j=%d\tj'=%d\n", j, jprime);
while(j != jprime) {
printf("j=%d\tj'=%d\n", j, jprime);
int tmp, isaIdx(-1);
int new_j = LF(j);
// for SA, L, and F, the element at pos j is moved to j'
tmp = L_->at(j); // L
L_->at(j) = L_->at(jprime);
L_->at(jprime) = tmp;
tmp = SA_->at(j); // SA
SA_->at(j) = SA_->at(jprime);
SA_->at(jprime) = tmp;
// all ISA values between (j...j'] decremented
for(int i = 0; i < ISA_->size(); ++i) {
if((ISA_->at(i) == j) && (isaIdx == -1))
isaIdx = i; // store index of ISA[i] = j
if((ISA_->at(i) > j) && (ISA_->at(i) <= jprime)) --(*ISA_)[i];
}
// replace j with j' in ISA
//isa[isaIdx] = jprime;
ISA_->at(isaIdx) = jprime;
j = new_j;
jprime = LF(jprime);
}
}
// Removes num2del consecutive words starting at corpus position `index`
// from the index structures: for each word the corresponding row is erased
// from L_/F_/SA_/ISA_ and surviving indices are shifted down, then reorder()
// repairs the displaced cycle.
// NOTE(review): the cerr lines look like leftover debug output; `int q` vs
// `unsigned num2del` mixes signedness -- harmless for sane arguments.
void DynSuffixArray::deleteFactor(unsigned index, unsigned num2del) {
int ltmp = L_->at(ISA_->at(index));
int true_pos = LF(ISA_->at(index)); // track cycle shift (newIndex - 1)
for(int q = 0; q < num2del; ++q) {
int row = ISA_->at(index); // gives the position of index in SA and F_
std::cerr << "row = " << row << std::endl;
std::cerr << "SA[r]/index = " << SA_->at(row) << "/" << index << std::endl;
true_pos -= (row <= true_pos ? 1 : 0); // track changes
L_->erase(L_->begin() + row);
F_->erase(F_->begin() + row);
ISA_->erase(ISA_->begin() + index); // order is important
// shift row references past the erased row ...
piterate(ISA_, itr)
if(*itr > row) --(*itr);
SA_->erase(SA_->begin() + row);
// ... and corpus positions past the erased word
piterate(SA_, itr)
if(*itr > index) --(*itr);
}
// restore the saved L value at the deletion boundary, then repair ordering
L_->at(ISA_->at(index))= ltmp;
reorder(LF(ISA_->at(index)), true_pos);
printAuxArrays();
}
// Stub: factor substitution is not implemented yet; callers get a warning on
// stderr and no index change.  (Fixed the garbled spelling of the original
// diagnostic, "NEEDS TO IMPELEMNT SUBSITITUTE FACTOR".)
void DynSuffixArray::substituteFactor(vuint_t* newSents, unsigned newIndex) {
  std::cerr << "NEEDS TO IMPLEMENT SUBSTITUTE FACTOR\n";
}
// Finds all corpus occurrences of `phrase`.  Rows whose suffix starts with
// phrase[0] are located by binary search on the sorted F_ column; longer
// phrases are then verified word-by-word against the corpus.
// Returns true iff at least one occurrence was found; `indices` is cleared
// and filled with the matches.
// NOTE(review): the single-word branch stores SA_[i] (leftmost position),
// while the multi-word branch stores crpIdx + pos (the rightmost word of the
// match) -- confirm callers expect this asymmetry.
// NOTE(review): the (crpIdx + phrasesize) >= corpus_->size() test also skips
// a match that would end exactly at the corpus end -- verify intended.
bool DynSuffixArray::getCorpusIndex(const vuint_t* phrase, vuint_t* indices) {
pair<vuint_t::iterator,vuint_t::iterator> bounds;
indices->clear();
int phrasesize = phrase->size();
// find lower and upper bounds on phrase[0]
bounds = std::equal_range(F_->begin(), F_->end(), phrase->at(0));
// bounds holds first and (last + 1) index of phrase[0] in SA_
int lwrBnd = int(bounds.first - F_->begin());
int uprBnd = int(bounds.second - F_->begin());
if(uprBnd - lwrBnd == 0) return false; // not found
if(phrasesize == 1) {
for(int i=lwrBnd; i < uprBnd; ++i) {
indices->push_back(SA_->at(i));
}
return (indices->size() > 0);
}
//find longer phrases if they exist
for(int i = lwrBnd; i < uprBnd; ++i) {
int crpIdx = SA_->at(i);
if((crpIdx + phrasesize) >= corpus_->size()) continue; // past end of corpus
for(int pos = 1; pos < phrasesize; ++pos) { // for all following words
if(corpus_->at(crpIdx + pos) != phrase->at(pos)) { // if word doesn't match
if(indices->size() > 0) i = uprBnd; // past the phrases since SA is ordered
break;
}
else if(pos == phrasesize-1) { // found phrase
indices->push_back(crpIdx + pos); // store rightmost index of phrase
}
}
}
//cerr << "Total count of phrase = " << indices->size() << endl;
return (indices->size() > 0);
}
// Serializes only the suffix array itself; the auxiliary arrays are rebuilt
// from it (and the corpus) on load.
void DynSuffixArray::save(FILE* fout) {
fWriteVector(fout, *SA_);
}
// Reads the suffix array written by save().
// NOTE(review): does not rebuild ISA_/F_/L_ here -- presumably the caller
// invokes buildAuxArrays() (or the constructor does) afterwards; confirm.
void DynSuffixArray::load(FILE* fin) {
fReadVector(fin, *SA_);
}
// Lexicographically compares the corpus suffixes starting at pos1 and pos2,
// examining at most `max` symbols.  A suffix that runs off the corpus end
// first compares as smaller.  Returns <0, 0 or >0 like strcmp.
int DynSuffixArray::compare(int pos1, int pos2, int max) {
  for(int off = 0; off < max; ++off) {
    const bool in1 = (pos1 + off < corpus_->size());
    const bool in2 = (pos2 + off < corpus_->size());
    if(in1 && !in2)
      return 1;
    if(in2 && !in1)
      return -1;
    const int diff = corpus_->at(pos1+off) - corpus_->at(pos2+off);
    if(diff != 0)
      return diff;
  }
  return 0;
}
// Randomised quicksort of suffix start positions in array[begin..end]
// (inclusive).  Ordering comes from compare(), which looks at no more than
// 20 symbols per pair -- a bounded-depth approximation of full suffix order.
// NOTE(review): `index` is declared in the outer if-block and is assigned by
// the partition loop before the recursive calls read it; recursion depth is
// O(n) in the worst case -- confirm acceptable for expected corpus sizes.
void DynSuffixArray::qsort(int* array, int begin, int end) {
if(end > begin)
{
int index;
{
// pick a random pivot and move it to the end
index = begin + (rand() % (end - begin + 1));
int pivot = array[index];
{
int tmp = array[index];
array[index] = array[end];
array[end] = tmp;
}
// Lomuto partition: everything <= pivot goes to the front
for(int i=index=begin; i < end; ++i) {
if (compare(array[i], pivot, 20) <= 0) {
{
int tmp = array[index];
array[index] = array[i];
array[i] = tmp;
index++;
}
}
}
// restore the pivot to its final position
{
int tmp = array[index];
array[index] = array[end];
array[end] = tmp;
}
}
qsort(array, begin, index - 1);
qsort(array, index + 1, end);
}
}
} // end namespace

50
src/DynSuffixArray.h Normal file
View File

@ -0,0 +1,50 @@
#ifndef moses_DynSuffixArray_h
#define moses_DynSuffixArray_h
#include <vector>
#include <set>
#include <algorithm>
#include <utility>
#include "Util.h"
#include "File.h"
#include "DynSAInclude/types.h"
namespace Moses {
using std::vector;
using std::pair;
typedef std::vector<unsigned> vuint_t;
/** Dynamic suffix array over a vocabulary-mapped corpus (vector<unsigned>).
 *  Maintains the suffix array SA_ together with its inverse (ISA_) and the
 *  first/last BWT columns (F_, L_) so factors can be inserted and deleted
 *  without a full rebuild, and phrases located via getCorpusIndex().
 */
class DynSuffixArray {
public:
  DynSuffixArray();
  DynSuffixArray(vuint_t*);
  ~DynSuffixArray();
  //! fills the second vector with all corpus occurrences of the phrase
  bool getCorpusIndex(const vuint_t*, vuint_t*);
  void load(FILE*);   //!< read SA_ back from file
  void save(FILE*);   //!< write SA_ to file
private:
  vuint_t* SA_;       //!< suffix array
  vuint_t* ISA_;      //!< inverse suffix array: ISA_[SA_[i]] == i
  vuint_t* F_;        //!< first symbol of each sorted suffix
  vuint_t* L_;        //!< symbol preceding each suffix (BWT column)
  vuint_t* corpus_;   //!< the corpus itself (ownership unclear here -- TODO confirm)
  void buildAuxArrays();
  void qsort(int* array, int begin, int end);
  int compare(int, int, int);
  void reorder(unsigned, unsigned);
  void insertFactor(vuint_t*, unsigned);
  void deleteFactor(unsigned, unsigned);
  void substituteFactor(vuint_t*, unsigned);
  int LF(unsigned);
  int rank(unsigned, unsigned);
  int F_firstIdx(unsigned);
  //! debug dump of all four auxiliary arrays, one row per line
  void printAuxArrays() {
    std::cerr << "SA\tISA\tF_\tL_\n";
    // size_t index fixes the signed/unsigned comparison of the original
    for(size_t i=0; i < SA_->size(); ++i)
      std::cerr << SA_->at(i) << "\t" << ISA_->at(i) << "\t" << F_->at(i) << "\t" << L_->at(i) << std::endl;
  }
};
} //end namespace
#endif

8
src/FFState.cpp Normal file
View File

@ -0,0 +1,8 @@
#include "FFState.h"
namespace Moses {
FFState::~FFState() {}
}

13
src/FFState.h Normal file
View File

@ -0,0 +1,13 @@
#ifndef moses_FFState_h
#define moses_FFState_h
namespace Moses {
// Abstract base class for the per-hypothesis state carried by stateful
// feature functions (see StatefulFeatureFunction::Evaluate).
class FFState {
public:
virtual ~FFState();
// Total-order comparison with another state of the same concrete type;
// returns <0, 0 or >0.  Must be implemented by subclasses.
virtual int Compare(const FFState& other) const = 0;
};
}
#endif

53
src/Factor.cpp Normal file
View File

@ -0,0 +1,53 @@
// $Id: Factor.cpp 1897 2008-10-08 23:51:26Z hieuhoang1972 $
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2006 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#include "Factor.h"
using namespace std;
namespace Moses
{
// Constructs a Factor with an assigned contiguous id.  direction and
// factorType are accepted but currently unused -- the corresponding members
// are commented out (see the initializer list).
Factor::Factor(FactorDirection direction, FactorType factorType, const std::string *factorString, size_t id)
://m_direction(direction)
//,m_factorType(factorType)
m_ptrString(factorString)
,m_id(id)
{}
// Constructs a search-key Factor with no id (NOT_FOUND); used only to test
// whether a factor already exists, never to create a real factor.
Factor::Factor(FactorDirection direction, FactorType factorType, const std::string *factorString)
//:m_direction(direction)
//,m_factorType(factorType)
:m_ptrString(factorString)
,m_id(NOT_FOUND)
{}
TO_STRING_BODY(Factor)
// friend
// Streams the factor's surface string.
ostream& operator<<(ostream& out, const Factor& factor)
{
  return out << factor.GetString();
}
}

147
src/Factor.h Normal file
View File

@ -0,0 +1,147 @@
// $Id: Factor.h 2939 2010-02-24 11:15:44Z jfouet $
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2006 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#ifndef moses_Factor_h
#define moses_Factor_h
#include <sstream>
#include <iostream>
#include <list>
#include <vector>
#include <map>
#include <string>
#include "TypeDef.h"
#include "Util.h"
#include "hash.h"
namespace Moses
{
class FactorCollection;
/** Represents a factor (word, POS, etc) on the E or F side
*
* A Factor object is a tuple of direction (Input or Output,
* corresponding to French or English), a type (surface form,
* POS, stem, etc), and the value of the factor.
*
* @TODO I find this design problematic- essentially, a factor should
* just be a value type and the factor type and "direction"
* should be the keys in a larger identification system that
* find instances of specific factors.
*
*/
class Factor
{
friend std::ostream& operator<<(std::ostream&, const Factor&);
// only these classes are allowed to instantiate this class
friend class FactorCollection;
protected:
//FactorDirection m_direction;
//FactorType m_factorType;
// Pointer into FactorCollection's string set; identity of the pointer, not
// the string contents, is used for comparison (see Compare below).
const std::string *m_ptrString;
const size_t m_id;
//! protected constructor. only friend class, FactorCollection, is allowed to create Factor objects
Factor(FactorDirection direction, FactorType factorType, const std::string *factorString, size_t id);
//! no id set. do not use to create new factors, only used for seeing if factor exists
Factor(FactorDirection direction, FactorType factorType, const std::string *factorString);
public:
//! returns whether this factor is part of the source ('Input') or target ('Output') language
//inline FactorDirection GetFactorDirection() const
//{
// return m_direction;
//}
//! index, FactorType. For example, 0=surface, 1=POS. The actual mapping is user defined
//inline FactorType GetFactorType() const
//{
// return m_factorType;
//}
//! original string representation of the factor
inline const std::string &GetString() const
{
return *m_ptrString;
}
//! contiguous ID
inline size_t GetId() const
{
return m_id;
}
/*
//! Alternative comparison between factors. Not yet used
inline unsigned int GetHash() const
{
unsigned int h=quick_hash((const char*)&m_direction, sizeof(FactorDirection), 0xc7e7f2fd);
h=quick_hash((const char*)&m_factorType, sizeof(FactorType), h);
h=quick_hash((const char*)&m_ptrString, sizeof(const std::string *), h);
return h;
}
*/
/** transitive comparison between 2 factors.
* -1 = less than
* +1 = more than
* 0 = same
* Used by operator< & operator==, as well as other classes
* NOTE(review): compares the string *pointers*, not the string values --
* presumably stable because strings are interned in FactorCollection's
* set, but the resulting order is address order, not lexicographic.
*/
inline int Compare(const Factor &compare) const
{
if (m_ptrString < compare.m_ptrString)
return -1;
if (m_ptrString > compare.m_ptrString)
return 1;
/*
if (m_direction < compare.m_direction)
return -1;
if (m_direction > compare.m_direction)
return 1;
if (m_factorType < compare.m_factorType)
return -1;
if (m_factorType > compare.m_factorType)
return 1;
*/
return 0;
}
//! transitive comparison used for adding objects into FactorCollection
inline bool operator<(const Factor &compare) const
{
return Compare(compare) < 0;
}
// quick equality comparison (object identity). Not used
inline bool operator==(const Factor &compare) const
{
return this == &compare;
}
TO_STRING();
};
}
#endif

117
src/FactorCollection.cpp Normal file
View File

@ -0,0 +1,117 @@
// $Id: FactorCollection.cpp 2477 2009-08-07 16:47:54Z bhaddow $
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2006 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#include <iostream>
#include <fstream>
#include <string>
#include <vector>
#include "FactorCollection.h"
#include "LanguageModel.h"
#include "Util.h"
using namespace std;
namespace Moses
{
FactorCollection FactorCollection::s_instance;
// Deprecated: bulk-loads a factor vocabulary from a file.  Each usable line
// has at least two whitespace-separated tokens; column 1 (token[1]) is the
// factor string added to the collection.
void FactorCollection::LoadVocab(FactorDirection direction, FactorType factorType, const string &filePath)
{
	ifstream inFile(filePath.c_str());
	string line;
#ifdef WITH_THREADS
	boost::upgrade_lock<boost::shared_mutex> lock(m_accessLock);
	boost::upgrade_to_unique_lock<boost::shared_mutex> uniqueLock(lock);
#endif
	// while(getline(...)) also processes a final line that has no trailing
	// newline; the original "!getline(...).eof()" test silently dropped it.
	while (getline(inFile, line))
	{
		vector<string> token = Tokenize( line );
		if (token.size() < 2)
		{
			continue;
		}
		// looks like good line
		AddFactor(direction, factorType, token[1]);
	}
}
// Read-only query: does a factor with this string already exist?
// The original called m_factorStringCollection.insert() here, mutating
// shared state while holding only a *shared* lock (a data race under
// WITH_THREADS) and growing the string set as a side effect of a query.
// A find() answers the same question without mutation: if the string has
// never been interned, no Factor can reference it, so the result is false.
bool FactorCollection::Exists(FactorDirection direction, FactorType factorType, const string &factorString)
{
#ifdef WITH_THREADS
	boost::shared_lock<boost::shared_mutex> lock(m_accessLock);
#endif
	// find string id
	StringSet::const_iterator iterString = m_factorStringCollection.find(factorString);
	if (iterString == m_factorStringCollection.end())
		return false; // string never interned => no such factor
	const string *ptrString = &(*iterString);
	Factor search(direction, factorType, ptrString); // id not used for searching
	return m_collection.find(search) != m_collection.end();
}
// Returns the unique Factor for (direction, factorType, factorString),
// creating and id-stamping it on first sight.  The string is interned in
// m_factorStringCollection so the Factor can hold a stable pointer.
const Factor *FactorCollection::AddFactor(FactorDirection direction
, FactorType factorType
, const string &factorString)
{
#ifdef WITH_THREADS
boost::upgrade_lock<boost::shared_mutex> lock(m_accessLock);
boost::upgrade_to_unique_lock<boost::shared_mutex> uniqueLock(lock);
#endif
// find string id
const string *ptrString=&(*m_factorStringCollection.insert(factorString).first);
pair<FactorSet::iterator, bool> ret = m_collection.insert( Factor(direction, factorType, ptrString, m_factorId) );
if (ret.second)
++m_factorId; // new factor, make sure next new factor has different id
const Factor *factor = &(*ret.first);
return factor;
}
// Nothing to free: factors are stored by value inside m_collection (the
// commented-out loop dates from when the set held pointers).
FactorCollection::~FactorCollection()
{
//FactorSet::iterator iter;
//for (iter = m_collection.begin() ; iter != m_collection.end() ; iter++)
//{
// delete (*iter);
//}
}
TO_STRING_BODY(FactorCollection);
// friend
// Streams every factor in the collection, in set order, with no separators.
ostream& operator<<(ostream& out, const FactorCollection& factorCollection)
{
	FactorSet::const_iterator it = factorCollection.m_collection.begin();
	for (; it != factorCollection.m_collection.end(); ++it)
	{
		out << *it;
	}
	return out;
}
}

91
src/FactorCollection.h Normal file
View File

@ -0,0 +1,91 @@
// $Id: FactorCollection.h 2939 2010-02-24 11:15:44Z jfouet $
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2006 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#ifndef moses_FactorCollection_h
#define moses_FactorCollection_h
#include <set>
#include <string>
#ifdef WITH_THREADS
#include <boost/thread/shared_mutex.hpp>
#endif
#include "Factor.h"
namespace Moses
{
class LanguageModel;
typedef std::set<Factor> FactorSet;
typedef std::set<std::string> StringSet;
/** collection of factors
*
* All Factors in moses are accessed and created by a FactorCollection.
* By enforcing this strict creation processes (ie, forbidding factors
* from being created on the stack, etc), their memory addresses can
* be used as keys to uniquely identify them.
* Only 1 FactorCollection object should be created.
*/
class FactorCollection
{
friend std::ostream& operator<<(std::ostream&, const FactorCollection&);
protected:
// the single global instance (singleton; see Instance())
static FactorCollection s_instance;
#ifdef WITH_THREADS
//reader-writer lock
boost::shared_mutex m_accessLock;
#endif
size_t m_factorId; /**< unique, contiguous ids, starting from 0, for each factor */
FactorSet m_collection; /**< collection of all factors */
StringSet m_factorStringCollection; /**< collection of unique strings used by factors */
//! constructor. only the 1 static variable can be created
FactorCollection()
:m_factorId(0)
{}
public:
//! access to the singleton
static FactorCollection& Instance() { return s_instance; }
//! Destructor
~FactorCollection();
//! Test to see whether a factor exists
bool Exists(FactorDirection direction, FactorType factorType, const std::string &factorString);
/** returns a factor with the same direction, factorType and factorString.
* If the factor already exists in the collection, return the existing factor; if not, create a new one
*/
const Factor *AddFactor(FactorDirection direction, FactorType factorType, const std::string &factorString);
//! Load list of factors. Deprecated
void LoadVocab(FactorDirection direction, FactorType factorType, const std::string &filePath);
TO_STRING();
};
}
#endif

59
src/FactorTypeSet.cpp Normal file
View File

@ -0,0 +1,59 @@
// $Id: FactorTypeSet.cpp 1897 2008-10-08 23:51:26Z hieuhoang1972 $
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2006 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#include "FactorTypeSet.h"
using namespace std;
namespace Moses
{
// Builds a mask with the bit of every listed FactorType switched on.
FactorMask::FactorMask(const vector<FactorType> &factors)
{
	for (size_t i = 0; i < factors.size(); ++i)
	{
		this->set(factors[i]);
	}
}
TO_STRING_BODY(FactorMask);
// friend
// Prints the indices of all set bits, comma-separated, e.g. "FactorMask<0,2>".
std::ostream& operator<<(std::ostream& out, const FactorMask& fm)
{
	out << "FactorMask<";
	const char* sep = "";
	for (size_t currFactor = 0 ; currFactor < MAX_NUM_FACTORS ; currFactor++)
	{
		if (fm[currFactor])
		{
			out << sep << currFactor;
			sep = ",";
		}
	}
	out << ">";
	return out;
}
}

53
src/FactorTypeSet.h Normal file
View File

@ -0,0 +1,53 @@
// $Id: FactorTypeSet.h 2939 2010-02-24 11:15:44Z jfouet $
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2006 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#ifndef moses_FactorTypeSet_h
#define moses_FactorTypeSet_h
#include <iostream>
#include <bitset>
#include <vector>
#include "TypeDef.h"
#include "Util.h"
namespace Moses
{
/** set of unique FactorTypes. Used to store what factor types are used in phrase tables etc
*/
// Thin wrapper around std::bitset<MAX_NUM_FACTORS>: one bit per FactorType.
class FactorMask : public std::bitset<MAX_NUM_FACTORS>
{
friend std::ostream& operator<<(std::ostream&, const FactorMask&);
public:
//! construct object from list of FactorType.
explicit FactorMask(const std::vector<FactorType> &factors);
//! default constructor
inline FactorMask() {}
//! copy constructor
FactorMask(const std::bitset<MAX_NUM_FACTORS>& rhs) : std::bitset<MAX_NUM_FACTORS>(rhs) { }
TO_STRING();
};
}
#endif

22
src/FeatureFunction.cpp Normal file
View File

@ -0,0 +1,22 @@
#include "FeatureFunction.h"
#include <cassert>
namespace Moses {
// Out-of-line virtual destructor for the abstract base.
FeatureFunction::~FeatureFunction() {}
// Stateless feature functions carry no per-hypothesis state.
bool StatelessFeatureFunction::IsStateless() const { return true; }
// NOTE(review): the header comment says "Default: true", but this
// implementation returns false -- confirm which is intended.
bool StatelessFeatureFunction::ComputeValueInTranslationOption() const {
return false;
}
// Fallback Evaluate: subclasses must either override this or arrange for
// ComputeValueInTranslationOption() to return true; otherwise this assert
// fires at runtime.
void StatelessFeatureFunction::Evaluate(
const TargetPhrase& cur_hypo,
ScoreComponentCollection* accumulator) const {
assert(!"Please implement Evaluate or set ComputeValueInTranslationOption to true");
}
bool StatefulFeatureFunction::IsStateless() const { return false; }
}

64
src/FeatureFunction.h Normal file
View File

@ -0,0 +1,64 @@
#ifndef moses_FeatureFunction_h
#define moses_FeatureFunction_h
#include <vector>
#include "ScoreProducer.h"
namespace Moses {
class TargetPhrase;
class Hypothesis;
class FFState;
class ScoreComponentCollection;
// Abstract base of all feature functions (stateless and stateful).
class FeatureFunction: public ScoreProducer {
public:
virtual bool IsStateless() const = 0;
virtual ~FeatureFunction();
};
// A feature function whose score depends only on the target phrase, not on
// any carried-over hypothesis state.
class StatelessFeatureFunction: public FeatureFunction {
public:
//! Evaluate for stateless feature functions. Implement this.
virtual void Evaluate(
const TargetPhrase& cur_hypo,
ScoreComponentCollection* accumulator) const;
// If true, this value is expected to be included in the
// ScoreBreakdown in the TranslationOption once it has been
// constructed.
// NOTE(review): comment said "Default: true", but the .cpp implementation
// returns false -- confirm which is intended.
virtual bool ComputeValueInTranslationOption() const;
bool IsStateless() const;
};
// A feature function that threads an FFState through the hypothesis chain.
class StatefulFeatureFunction: public FeatureFunction {
public:
/**
* \brief This interface should be implemented.
* Notes: When evaluating the value of this feature function, you should avoid
* calling hypo.GetPrevHypo(). If you need something from the "previous"
* hypothesis, you should store it in an FFState object which will be passed
* in as prev_state. If you don't do this, you will get in trouble.
*/
virtual FFState* Evaluate(
const Hypothesis& cur_hypo,
const FFState* prev_state,
ScoreComponentCollection* accumulator) const = 0;
//! return the state associated with the empty hypothesis
virtual const FFState* EmptyHypothesisState() const = 0;
bool IsStateless() const;
};
}
#endif

4
src/File.cpp Normal file
View File

@ -0,0 +1,4 @@
#include "File.h"

122
src/File.h Normal file
View File

@ -0,0 +1,122 @@
// $Id: File.h 2939 2010-02-24 11:15:44Z jfouet $
/* ---------------------------------------------------------------- */
/* Copyright 2005 (c) by RWTH Aachen - Lehrstuhl fuer Informatik VI */
/* Richard Zens */
/* ---------------------------------------------------------------- */
#ifndef moses_File_h
#define moses_File_h
#include <cstdio>
#include <iostream>
#include <vector>
#include <cassert>
#include "UserMessage.h"
#include "TypeDef.h"
#include "Util.h"
namespace Moses
{
#ifdef WIN32
#define OFF_T __int64
#define FTELLO(file) _ftelli64(file)
#define FSEEKO(file, offset, origin) _fseeki64(file, offset, origin)
#else
#define OFF_T off_t
#define FTELLO(f) ftello(f)
#define FSEEKO(file, offset, origin) fseeko(file, offset, origin)
#endif
static const OFF_T InvalidOffT=-1;
// WARNING:
// these functions work only for bitwise read/write-able types
// Writes one bitwise-serializable value; aborts on short write.
// Returns the number of bytes written.
template<typename T> inline size_t fWrite(FILE* f,const T& t) {
if(fwrite(&t,sizeof(t),1,f)!=1) {TRACE_ERR("ERROR:: fwrite!\n");abort();}
return sizeof(t);
}
// Reads one bitwise-serializable value; aborts on short read.
template<typename T> inline void fRead(FILE* f,T& t) {
if(fread(&t,sizeof(t),1,f)!=1) {TRACE_ERR("ERROR: fread!\n");abort();}
}
// Writes the element count (UINT32) followed by the raw bytes of [b,e).
template<typename T> inline size_t fWrite(FILE* f,const T* b,const T* e) {
UINT32 s=std::distance(b,e);size_t rv=fWrite(f,s);
if(fwrite(b,sizeof(T),s,f)!=s) {TRACE_ERR("ERROR: fwrite!\n");abort();}
return rv+sizeof(T)*s;
}
// Iterator-pair overload of the above.
// NOTE(review): here T is the *iterator* type, so sizeof(T) is the size of
// the iterator, not of the element -- this looks wrong for anything but raw
// pointers; confirm callers only ever pass pointers.
template<typename T> inline size_t fWrite(FILE* f,const T b,const T e) {
UINT32 s=std::distance(b,e);size_t rv=fWrite(f,s);
if(fwrite(&(*b),sizeof(T),s,f)!=s) {TRACE_ERR("ERROR: fwrite!\n");abort();}
return rv+sizeof(T)*s;
}
template<typename C> inline size_t fWriteVector(FILE* f,const C& v) {
UINT32 s=v.size();
size_t rv=fWrite(f,s);
if(fwrite(&v[0],sizeof(typename C::value_type),s,f)!=s) {TRACE_ERR("ERROR: fwrite!\n");abort();}
return rv+sizeof(typename C::value_type)*s;
}
// Reads a container written by fWriteVector: count first, then raw bytes.
template<typename C> inline void fReadVector(FILE* f, C& v) {
  UINT32 s;fRead(f,s);
  v.resize(s);
  // Guard the empty case: &(*v.begin()) dereferences end() when s == 0,
  // which is undefined behaviour.
  if(s==0) return;
  size_t r=fread(&(*v.begin()),sizeof(typename C::value_type),s,f);
  if(r!=s) {TRACE_ERR("ERROR: freadVec! "<<r<<" "<<s<<"\n");abort();}
}
// Writes a length-prefixed character buffer: length (UINT32) then s raw
// chars (no terminating NUL).  Returns the total bytes written.
inline size_t fWriteString(FILE* f,const char* e, UINT32 s) {
size_t rv=fWrite(f,s);
if(fwrite(e,sizeof(char),s,f)!=s) {TRACE_ERR("ERROR:: fwrite!\n");abort();}
return rv+sizeof(char)*s;
}
inline void fReadString(FILE* f,std::string& e) {
UINT32 s;fRead(f,s);
char* a=new char[s+1];
if(fread(a,sizeof(char),s,f)!=s) {TRACE_ERR("ERROR: fread!\n");abort();}
a[s]='\0';
e.assign(a);
}
// Writes a vector of strings: count first, then each string length-prefixed.
inline size_t fWriteStringVector(FILE* f,const std::vector<std::string>& v) {
UINT32 s=v.size();
size_t totrv=fWrite(f,s);
for (size_t i=0;i<s;i++){ totrv+=fWriteString(f,v.at(i).c_str(),v.at(i).size()); }
return totrv;
}
// Reads a vector of strings written by fWriteStringVector.
inline void fReadStringVector(FILE* f, std::vector<std::string>& v) {
UINT32 s;fRead(f,s);v.resize(s);
for (size_t i=0;i<s;i++){ fReadString(f,v.at(i)); }
}
//! current file offset (64-bit-safe via FTELLO)
inline OFF_T fTell(FILE* f) {return FTELLO(f);}
//! absolute seek; aborts on failure, warning specially about InvalidOffT
inline void fSeek(FILE* f,OFF_T o) {
if(FSEEKO(f,o,SEEK_SET)<0) {
TRACE_ERR("ERROR: could not fseeko position "<<o<<"\n");
if(o==InvalidOffT) TRACE_ERR("You tried to seek for 'InvalidOffT'!\n");
abort();
}
}
//! fopen wrapper that reports the failing filename/mode and asserts on error
inline FILE* fOpen(const char* fn,const char* m) {
if(FILE* f=fopen(fn,m))
return f;
else {
UserMessage::Add(std::string("ERROR: could not open file ") + fn + " with mode " + m + "\n");
assert(false);
// NOTE(review): with NDEBUG the assert vanishes and NULL is returned --
// callers must be prepared for that, or this should abort unconditionally.
return NULL;
}
}
inline void fClose(FILE* f) {fclose(f);} // for consistent function names only
}
#endif

55
src/FilePtr.h Normal file
View File

@ -0,0 +1,55 @@
// $Id: FilePtr.h 2939 2010-02-24 11:15:44Z jfouet $
/* ---------------------------------------------------------------- */
/* Copyright 2005 (c) by RWTH Aachen - Lehrstuhl fuer Informatik VI */
/* Richard Zens */
/* ---------------------------------------------------------------- */
#ifndef moses_FilePtr_h
#define moses_FilePtr_h
#include "File.h"
namespace Moses
{
// smart pointer for on-demand loading from file
// requirement: T has a constructor T(FILE*)
// smart pointer for on-demand loading from file
// requirement: T has a constructor T(FILE*)
template<typename T> class FilePtr {
public:
typedef T* Ptr;
private:
FILE* f;      // file the object is loaded from (not owned)
OFF_T pos;    // offset of the serialized object, or InvalidOffT
mutable Ptr t; // lazily-constructed cache; mutable so const access can load
public:
FilePtr(FILE* f_=0,OFF_T p=0) : f(f_),pos(p),t(0) {}
// NOTE(review): the destructor does not delete t -- the owner must call
// free() explicitly.  Presumably deliberate so copies can share the raw
// pointer safely, but confirm there is no leak at the call sites.
~FilePtr() {}
void set(FILE* f_,OFF_T p) {f=f_;pos=p;}
void free() {delete t; t=0;}
// All accessors trigger a lazy load() first.
T& operator* () {load();return *t;}
Ptr operator->() {load();return t;}
operator Ptr () {load();return t;}
const T& operator* () const {load();return *t;}
Ptr operator->() const {load();return t;}
operator Ptr () const {load();return t;}
// direct access to pointer, use with care!
Ptr getPtr() {return t;}
Ptr getPtr() const {return t;}
// true when a file position has been set (loadable)
operator bool() const {return (f && pos!=InvalidOffT);}
// seek to the stored offset and construct T from the file, once
void load() const {
if(t) return;
if(f && pos!=InvalidOffT) {fSeek(f,pos); t=new T(f);}
}
};
}
#endif

34
src/FloydWarshall.cpp Normal file
View File

@ -0,0 +1,34 @@
#include <cassert>
#include <climits>
#include <vector>
#define MAX_DIST (INT_MAX / 2)  // "infinity", halved so dist[i][k]+dist[k][j] cannot overflow
//#include "FloydWarshall.h"

// All-pairs shortest path algorithm (Floyd-Warshall).
// edges[i][j] == true means a directed edge of length 1 from i to j.
// On return, dist[i][j] is the shortest path length from i to j, or
// MAX_DIST when unreachable.  The diagonal is initialised to MAX_DIST as
// well, so dist[i][i] ends up as the length of the shortest cycle through i.
void floyd_warshall(const std::vector<std::vector<bool> >& edges, std::vector<std::vector<int> >& dist)
{
  dist.clear();
  // Guard the empty graph: the original evaluated edges.front() (undefined
  // behaviour on an empty vector) inside the assert before doing any work.
  if (edges.empty()) return;
  assert(edges.size() == edges.front().size());
  dist.resize(edges.size(), std::vector<int>(edges.size(), 0));
  size_t num_edges = edges.size();
  // Initialise: direct edges cost 1, everything else (incl. diagonal) "infinite".
  for (size_t i=0; i<num_edges; ++i)
    for (size_t j=0; j<num_edges; ++j)
      dist[i][j] = (edges[i][j] && i != j) ? 1 : MAX_DIST;
  // Relax through every intermediate vertex k.
  for (size_t k=0; k<num_edges; ++k)
    for (size_t i=0; i<num_edges; ++i)
      for (size_t j=0; j<num_edges; ++j)
        if (dist[i][j] > (dist[i][k] + dist[k][j]))
          dist[i][j] = dist[i][k] + dist[k][j];
}

12
src/FloydWarshall.h Normal file
View File

@ -0,0 +1,12 @@
#ifndef moses_FloydWarshall_h
#define moses_FloydWarshall_h
#include <vector>
/**
* Floyd-Warshall all-pairs shortest path algorithm
* See CLR (1990). Introduction to Algorithms, p. 558-565
*/
void floyd_warshall(const std::vector<std::vector<bool> >& edges, std::vector<std::vector<int> >& distances);
#endif

View File

@ -0,0 +1,164 @@
// $Id: GenerationDictionary.cpp 2087 2009-02-06 15:43:06Z redpony $
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2006 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#include <fstream>
#include <string>
#include "GenerationDictionary.h"
#include "FactorCollection.h"
#include "Word.h"
#include "Util.h"
#include "InputFileStream.h"
#include "StaticData.h"
#include "UserMessage.h"
using namespace std;
namespace Moses
{
// Registers this dictionary as a producer of numFeatures scores with the
// global score index.
GenerationDictionary::GenerationDictionary(size_t numFeatures, ScoreIndexManager &scoreIndexManager)
: Dictionary(numFeatures)
{
scoreIndexManager.AddScoreProducer(this);
}
// Loads a generation table.  Each line: input word | output word | feature
// values, with factors inside a word separated by '|'.  Returns false (with
// a user message) on unreadable file or too few feature values on a line.
// NOTE(review): token[0]/token[1]/factorString[i] are indexed without size
// checks -- a malformed line with too few columns or factors would go out of
// range; confirm input files are always well-formed.
bool GenerationDictionary::Load(const std::vector<FactorType> &input
, const std::vector<FactorType> &output
, const std::string &filePath
, FactorDirection direction)
{
FactorCollection &factorCollection = FactorCollection::Instance();
const size_t numFeatureValuesInConfig = this->GetNumScoreComponents();
//factors
m_inputFactors = FactorMask(input);
m_outputFactors = FactorMask(output);
VERBOSE(2,"GenerationDictionary: input=" << m_inputFactors << " output=" << m_outputFactors << std::endl);
// data from file
InputFileStream inFile(filePath);
if (!inFile.good()) {
UserMessage::Add(string("Couldn't read ") + filePath);
return false;
}
m_filePath = filePath;
string line;
size_t lineNum = 0;
while(getline(inFile, line))
{
++lineNum;
vector<string> token = Tokenize( line );
// add each line in generation file into class
Word *inputWord = new Word(); // deleted in destructor
Word outputWord;
// create word with certain factors filled out
// inputs
vector<string> factorString = Tokenize( token[0], "|" );
for (size_t i = 0 ; i < input.size() ; i++)
{
FactorType factorType = input[i];
const Factor *factor = factorCollection.AddFactor( direction, factorType, factorString[i]);
inputWord->SetFactor(factorType, factor);
}
factorString = Tokenize( token[1], "|" );
for (size_t i = 0 ; i < output.size() ; i++)
{
FactorType factorType = output[i];
const Factor *factor = factorCollection.AddFactor( direction, factorType, factorString[i]);
outputWord.SetFactor(factorType, factor);
}
size_t numFeaturesInFile = token.size() - 2;
if (numFeaturesInFile < numFeatureValuesInConfig) {
stringstream strme;
strme << filePath << ":" << lineNum << ": expected " << numFeatureValuesInConfig
<< " feature values, but found " << numFeaturesInFile << std::endl;
UserMessage::Add(strme.str());
// NOTE(review): inputWord allocated above is leaked on this error path.
return false;
}
std::vector<float> scores(numFeatureValuesInConfig, 0.0f);
for (size_t i = 0; i < numFeatureValuesInConfig; i++)
scores[i] = FloorScore(TransformScore(Scan<float>(token[2+i])));
Collection::iterator iterWord = m_collection.find(inputWord);
if (iterWord == m_collection.end())
{
m_collection[inputWord][outputWord].Assign(this, scores);
}
else
{ // source word already in there. delete input word to avoid mem leak
(iterWord->second)[outputWord].Assign(this, scores);
delete inputWord;
}
}
inFile.Close();
return true;
}
// Free the heap-allocated source-side Word keys owned by m_collection.
GenerationDictionary::~GenerationDictionary()
{
	Collection::const_iterator entry = m_collection.begin();
	while (entry != m_collection.end())
	{
		delete entry->first;
		++entry;
	}
}
// Number of feature values per generation-table entry (as configured in the
// ini file and passed to the constructor).
size_t GenerationDictionary::GetNumScoreComponents() const
{
	return m_numScoreComponent;
}
// Human-readable feature name, including the table file it was loaded from.
std::string GenerationDictionary::GetScoreProducerDescription() const
{
	std::string description("Generation score, file=");
	description += m_filePath;
	return description;
}
/** Look up the bag of output words for a particular input word.
 * Returns NULL when the word has no generation entries; lookup uses the
 * WordComparer functor of the underlying map. */
const OutputWordCollection *GenerationDictionary::FindWord(const Word &word) const
{
	Collection::const_iterator entry = m_collection.find(&word);
	if (entry == m_collection.end())
	{ // can't find source phrase
		return NULL;
	}
	return &entry->second;
}
// Generation scores are fixed per translation option, so they can be computed
// once and cached on the option rather than re-evaluated per hypothesis.
bool GenerationDictionary::ComputeValueInTranslationOption() const {
	return true;
}
}

View File

@ -0,0 +1,96 @@
// $Id: GenerationDictionary.h 2939 2010-02-24 11:15:44Z jfouet $
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2006 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#ifndef moses_GenerationDictionary_h
#define moses_GenerationDictionary_h
#include <list>
#include <map>
#include <vector>
#include "ScoreComponentCollection.h"
#include "Phrase.h"
#include "TypeDef.h"
#include "Dictionary.h"
#include "FeatureFunction.h"
namespace Moses
{
class FactorCollection;
typedef std::map < Word , ScoreComponentCollection > OutputWordCollection;
// 1st = output phrase
// 2nd = log probability (score)
/** Implementation of a generation table in a trie.
*/
class GenerationDictionary : public Dictionary, public StatelessFeatureFunction
{
	// maps a source word to its bag of output words + feature scores;
	// the Word* keys are heap-allocated by Load() and freed in the destructor
	typedef std::map<const Word* , OutputWordCollection, WordComparer> Collection;
protected:
	Collection m_collection;
	// 1st = source
	// 2nd = target
	std::string m_filePath;	// table file path, reported in the producer description

public:
	/** constructor.
	 * \param numFeatures number of score components, as specified in ini file
	 */
	GenerationDictionary(size_t numFeatures, ScoreIndexManager &scoreIndexManager);
	virtual ~GenerationDictionary();

	// returns Generate
	DecodeType GetDecodeType() const
	{
		return Generate;
	}

	//! load data file
	bool Load(const std::vector<FactorType> &input
					, const std::vector<FactorType> &output
					, const std::string &filePath
					, FactorDirection direction);

	size_t GetNumScoreComponents() const;
	std::string GetScoreProducerDescription() const;
	//! short name used for the weight in the ini file
	std::string GetScoreProducerWeightShortName() const
	{
		return "g";
	}

	/** number of unique input entries in the generation table.
	 * NOT the number of lines in the generation table
	 */
	size_t GetSize() const
	{
		return m_collection.size();
	}

	/** returns a bag of output words, OutputWordCollection, for a particular input word.
	 * Or NULL if the input word isn't found. The search function used is the WordComparer functor
	 */
	const OutputWordCollection *FindWord(const Word &word) const;
	virtual bool ComputeValueInTranslationOption() const;
};
}
#endif

185
src/GlobalLexicalModel.cpp Normal file
View File

@ -0,0 +1,185 @@
#include <fstream>
#include "GlobalLexicalModel.h"
#include "StaticData.h"
#include "InputFileStream.h"
namespace Moses
{
// Build the model: register as a score producer, install its single weight,
// load the lexicon file, and create the artificial **BIAS** input word that
// fires for every target word.
GlobalLexicalModel::GlobalLexicalModel(const string &filePath,
                                       const float weight,
                                       const vector< FactorType >& inFactors,
                                       const vector< FactorType >& outFactors)
{
	std::cerr << "Creating global lexical model...\n";

	// register as score producer
	// NOTE(review): const_cast works around StaticData exposing only const
	// accessors during construction — confirm this matches the other models
	const_cast<ScoreIndexManager&>(StaticData::Instance().GetScoreIndexManager()).AddScoreProducer(this);
	std::vector< float > weights;
	weights.push_back( weight );
	const_cast<StaticData&>(StaticData::Instance()).SetWeightsForScoreProducer(this, weights);

	// load model
	LoadData( filePath, inFactors, outFactors );

	// define bias word (always treated as present in the input sentence)
	FactorCollection &factorCollection = FactorCollection::Instance();
	m_bias = new Word();
	const Factor* factor = factorCollection.AddFactor( Input, inFactors[0], "**BIAS**" );
	m_bias->SetFactor( inFactors[0], factor );

	// per-sentence cache is created lazily in InitializeForInput()
	m_cache = NULL;
}
GlobalLexicalModel::~GlobalLexicalModel(){
	// release every Word key owned by the two-level hash
	for (DoubleHash::const_iterator outer = m_hash.begin(); outer != m_hash.end(); ++outer)
	{
		const map< const Word*, float, WordComparer > &inner = outer->second;
		for (map< const Word*, float, WordComparer >::const_iterator innerIter = inner.begin();
				 innerIter != inner.end(); ++innerIter)
		{
			delete innerIter->first; // delete input word
		}
		delete outer->first; // delete output word
	}
	delete m_cache; // deleting NULL is a no-op
}
// Read the global lexicon from disk.
// File format, one entry per line: "<target-word> <source-word> <score>".
// Both Word objects are heap-allocated here and freed in the destructor.
void GlobalLexicalModel::LoadData(const string &filePath,
                                  const vector< FactorType >& inFactors,
                                  const vector< FactorType >& outFactors)
{
	FactorCollection &factorCollection = FactorCollection::Instance();
	const std::string& factorDelimiter = StaticData::Instance().GetFactorDelimiter();

	VERBOSE(2, "Loading global lexical model from file " << filePath << endl);

	m_inputFactors = FactorMask(inFactors);
	m_outputFactors = FactorMask(outFactors);
	InputFileStream inFile(filePath);

	// reading in data one line at a time
	size_t lineNum = 0;
	string line;
	while(getline(inFile, line))
	{
		++lineNum;
		vector<string> token = Tokenize<string>(line, " ");

		if (token.size() != 3) // format checking
		{
			stringstream errorMessage;
			errorMessage << "Syntax error at " << filePath << ":" << lineNum << endl << line << endl;
			UserMessage::Add(errorMessage.str());
			abort();
		}

		// create the output word
		Word *outWord = new Word();
		vector<string> factorString = Tokenize( token[0], factorDelimiter );
		for (size_t i=0 ; i < outFactors.size() ; i++)
		{
			const FactorDirection& direction = Output;
			const FactorType& factorType = outFactors[i];
			const Factor* factor = factorCollection.AddFactor( direction, factorType, factorString[i] );
			outWord->SetFactor( factorType, factor );
		}

		// create the input word
		Word *inWord = new Word();
		factorString = Tokenize( token[1], factorDelimiter );
		for (size_t i=0 ; i < inFactors.size() ; i++)
		{
			const FactorDirection& direction = Input;
			const FactorType& factorType = inFactors[i];
			const Factor* factor = factorCollection.AddFactor( direction, factorType, factorString[i] );
			inWord->SetFactor( factorType, factor );
		}

		// maximum entropy feature score
		float score = Scan<float>(token[2]);

		// store feature in hash
		DoubleHash::iterator keyOutWord = m_hash.find( outWord );
		if( keyOutWord == m_hash.end() )
		{
			m_hash[outWord][inWord] = score;
		}
		else // already have hash for outword, delete the word to avoid leaks
		{
			SingleHash &inner = keyOutWord->second;
			SingleHash::iterator keyInWord = inner.find( inWord );
			if( keyInWord == inner.end() )
			{
				inner[inWord] = score;
			}
			else // duplicate entry: map keeps the OLD key pointer, so the new
			{    // Word would leak — overwrite the score and delete it
				keyInWord->second = score;
				delete inWord;
			}
			delete outWord;
		}
	}
}
/** called before each sentence is translated: remember the input and start a
 * fresh phrase-score cache */
void GlobalLexicalModel::InitializeForInput( Sentence const& in )
{
	m_input = &in;
	delete m_cache; // deleting NULL is a no-op
	m_cache = new map< const TargetPhrase*, float >;
}
/** Sum, over all words of targetPhrase, the logistic-transformed sum of the
 * active lexicon features (bias + every distinct input-sentence word) for
 * that target word. */
float GlobalLexicalModel::ScorePhrase( const TargetPhrase& targetPhrase ) const
{
	float score = 0;
	for(size_t targetIndex = 0; targetIndex < targetPhrase.GetSize(); targetIndex++ )
	{
		float sum = 0;
		const Word& targetWord = targetPhrase.GetWord( targetIndex );
		VERBOSE(2,"glm " << targetWord << ": ");
		const DoubleHash::const_iterator targetWordHash = m_hash.find( &targetWord );
		if( targetWordHash != m_hash.end() )
		{
			// the bias feature fires whenever the target word has any entry
			SingleHash::const_iterator inputWordHash = targetWordHash->second.find( m_bias );
			if( inputWordHash != targetWordHash->second.end() )
			{
				VERBOSE(2,"*BIAS* " << inputWordHash->second);
				sum += inputWordHash->second;
			}

			set< const Word*, WordComparer > alreadyScored; // do not score a word twice
			for(size_t inputIndex = 0; inputIndex < m_input->GetSize(); inputIndex++ )
			{
				const Word& inputWord = m_input->GetWord( inputIndex );
				if ( alreadyScored.find( &inputWord ) == alreadyScored.end() )
				{
					SingleHash::const_iterator inputWordHash = targetWordHash->second.find( &inputWord );
					if( inputWordHash != targetWordHash->second.end() )
					{
						VERBOSE(2," " << inputWord << " " << inputWordHash->second);
						sum += inputWordHash->second;
					}
					alreadyScored.insert( &inputWord );
				}
			}
		}
		// Hal Daume says: 1/( 1 + exp [ - sum_i w_i * f_i ] )
		VERBOSE(2," p=" << FloorScore( log(1/(1+exp(-sum))) ) << endl);
		score += FloorScore( log(1/(1+exp(-sum))) );
	}
	return score;
}
/** memoizing wrapper around ScorePhrase(): each distinct TargetPhrase is
 * scored at most once per input sentence */
float GlobalLexicalModel::GetFromCacheOrScorePhrase( const TargetPhrase& targetPhrase ) const
{
	map< const TargetPhrase*, float >::const_iterator query = m_cache->find( &targetPhrase );
	if ( query != m_cache->end() )
	{
		return query->second;
	}

	const float score = ScorePhrase( targetPhrase );
	(*m_cache)[ &targetPhrase ] = score;
	std::cerr << "add to cache " << targetPhrase << ": " << score << endl;
	return score;
}
/** stateless-feature hook: add this phrase's (cached) global-lexicon score
 * to the score accumulator */
void GlobalLexicalModel::Evaluate(const TargetPhrase& targetPhrase, ScoreComponentCollection* accumulator) const
{
	accumulator->PlusEquals( this, GetFromCacheOrScorePhrase( targetPhrase ) );
}
}

76
src/GlobalLexicalModel.h Normal file
View File

@ -0,0 +1,76 @@
#ifndef moses_GlobalLexicalModel_h
#define moses_GlobalLexicalModel_h
#include <string>
#include <vector>
#include "Factor.h"
#include "Phrase.h"
#include "TypeDef.h"
#include "Util.h"
#include "WordsRange.h"
#include "ScoreProducer.h"
#include "FeatureFunction.h"
#include "FactorTypeSet.h"
#include "Sentence.h"
namespace Moses
{
class Factor;
class Phrase;
class Hypothesis;
class InputType;
using namespace std;
/** Discriminatively trained global lexicon model
* This is a implementation of Mauser et al., 2009's model that predicts
* each output word from _all_ the input words. The intuition behind this
* feature is that it uses context words for disambiguation
*/
class GlobalLexicalModel : public StatelessFeatureFunction {
	// outer key = target (output) word, inner key = source (input) word;
	// both Word* keys are heap-allocated and owned by this model (freed in dtor)
	typedef map< const Word*, map< const Word*, float, WordComparer >, WordComparer > DoubleHash;
	typedef map< const Word*, float, WordComparer > SingleHash;
private:
	DoubleHash m_hash;
	map< const TargetPhrase*, float > *m_cache; // per-sentence score cache, rebuilt in InitializeForInput()
	const Sentence *m_input;                    // current input sentence, set in InitializeForInput()
	Word *m_bias;                               // artificial **BIAS** word, always "present" in the input
	FactorMask m_inputFactors;
	FactorMask m_outputFactors;

	//! read the lexicon file: one "<target> <source> <score>" entry per line
	void LoadData(const string &filePath,
	              const vector< FactorType >& inFactors,
	              const vector< FactorType >& outFactors);

	float ScorePhrase( const TargetPhrase& targetPhrase ) const;
	float GetFromCacheOrScorePhrase( const TargetPhrase& targetPhrase ) const;

public:
	GlobalLexicalModel(const string &filePath,
	                   const float weight,
	                   const vector< FactorType >& inFactors,
	                   const vector< FactorType >& outFactors);
	virtual ~GlobalLexicalModel();

	virtual size_t GetNumScoreComponents() const {
		return 1;
	};

	virtual string GetScoreProducerDescription() const {
		return "GlobalLexicalModel";
	};

	virtual string GetScoreProducerWeightShortName() const {
		return "lex";
	};

	void InitializeForInput( Sentence const& in );

	void Evaluate(const TargetPhrase&, ScoreComponentCollection* ) const;
};
}
#endif

512
src/Hypothesis.cpp Normal file
View File

@ -0,0 +1,512 @@
// $Id: Hypothesis.cpp 2929 2010-02-22 23:42:35Z bhaddow $
// vim:tabstop=2
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2006 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#include <cassert>
#include <iostream>
#include <limits>
#include <vector>
#include <algorithm>
#include "FFState.h"
#include "TranslationOption.h"
#include "TranslationOptionCollection.h"
#include "DummyScoreProducers.h"
#include "Hypothesis.h"
#include "Util.h"
#include "SquareMatrix.h"
#include "LexicalReordering.h"
#include "StaticData.h"
#include "InputType.h"
#include "LMList.h"
#include "Manager.h"
#include "hash.h"
using namespace std;
namespace Moses
{
unsigned int Hypothesis::s_HypothesesCreated = 0;
#ifdef USE_HYPO_POOL
ObjectPool<Hypothesis> Hypothesis::s_objectPool("Hypothesis", 300000);
#endif
/** seed constructor: an empty hypothesis covering no source words, used once
 * per sentence to start the beam search; creates one empty feature-function
 * state per stateful feature */
Hypothesis::Hypothesis(Manager& manager, InputType const& source, const TargetPhrase &emptyTarget)
	: m_prevHypo(NULL)
	, m_targetPhrase(emptyTarget)
	, m_sourcePhrase(0)
	, m_sourceCompleted(source.GetSize())
	, m_sourceInput(source)
	, m_currSourceWordsRange(NOT_FOUND, NOT_FOUND)
	, m_currTargetWordsRange(NOT_FOUND, NOT_FOUND)
	, m_wordDeleted(false)
	, m_ffStates(StaticData::Instance().GetScoreIndexManager().GetStatefulFeatureFunctions().size())
	, m_arcList(NULL)
	, m_transOpt(NULL)
	, m_manager(manager)
	, m_id(0)
{	// used for initial seeding of trans process
	// initialize scores
	//_hash_computed = false;
	// reset the global hypothesis counter for this sentence (seed gets id 0)
	s_HypothesesCreated = 1;
	ResetScore();
	const vector<const StatefulFeatureFunction*>& ffs = StaticData::Instance().GetScoreIndexManager().GetStatefulFeatureFunctions();
	for (unsigned i = 0; i < ffs.size(); ++i)
		m_ffStates[i] = ffs[i]->EmptyHypothesisState();
}
/***
* continue prevHypo by appending the phrases in transOpt
*/
/** extension constructor: continue prevHypo by appending the target phrase of
 * transOpt; copies coverage and score breakdown, then marks the newly
 * translated source span */
Hypothesis::Hypothesis(const Hypothesis &prevHypo, const TranslationOption &transOpt)
	: m_prevHypo(&prevHypo)
	, m_targetPhrase(transOpt.GetTargetPhrase())
	, m_sourcePhrase(transOpt.GetSourcePhrase())
	, m_sourceCompleted (prevHypo.m_sourceCompleted )
	, m_sourceInput (prevHypo.m_sourceInput)
	, m_currSourceWordsRange (transOpt.GetSourceWordsRange())
	// target range starts right after the previous hypothesis' last word
	, m_currTargetWordsRange ( prevHypo.m_currTargetWordsRange.GetEndPos() + 1
						 ,prevHypo.m_currTargetWordsRange.GetEndPos() + transOpt.GetTargetPhrase().GetSize())
	, m_wordDeleted(false)
	, m_totalScore(0.0f)
	, m_futureScore(0.0f)
	, m_scoreBreakdown (prevHypo.m_scoreBreakdown)
	, m_ffStates(prevHypo.m_ffStates.size())
	, m_arcList(NULL)
	, m_transOpt(&transOpt)
	, m_manager(prevHypo.GetManager())
	, m_id(s_HypothesesCreated++)
{
	// assert that we are not extending our hypothesis by retranslating something
	// that this hypothesis has already translated!
	assert(!m_sourceCompleted.Overlap(m_currSourceWordsRange));

	//_hash_computed = false;
	m_sourceCompleted.SetValue(m_currSourceWordsRange.GetStartPos(), m_currSourceWordsRange.GetEndPos(), true);
	m_wordDeleted = transOpt.IsDeletionOption();
}
Hypothesis::~Hypothesis()
{
	// release the feature-function states owned by this hypothesis
	for (size_t i = 0; i < m_ffStates.size(); ++i)
		delete m_ffStates[i];

	if (m_arcList == NULL)
		return;

	// free every recombined (loser) hypothesis hanging off this one
	for (ArcList::iterator arcIter = m_arcList->begin(); arcIter != m_arcList->end(); ++arcIter)
	{
		FREEHYPO(*arcIter);
	}
	m_arcList->clear();
	delete m_arcList;
	m_arcList = NULL;
}
/** record loserHypo as recombined into this hypothesis; takes over (or
 * merges in) the loser's own arc list so no alternative derivation is lost */
void Hypothesis::AddArc(Hypothesis *loserHypo)
{
	if (!m_arcList) {
		if (loserHypo->m_arcList) // we don't have an arcList, but loser does
		{
			this->m_arcList = loserHypo->m_arcList;	// take ownership, we'll delete
			loserHypo->m_arcList = 0;	// prevent a double deletion
		}
		else
			{ this->m_arcList = new ArcList(); }
	} else {
		if (loserHypo->m_arcList) {	// both have an arc list: merge. delete loser
			size_t my_size = m_arcList->size();
			size_t add_size = loserHypo->m_arcList->size();
			this->m_arcList->resize(my_size + add_size, 0);
			// bulk-copy the loser's arcs; safe since ArcList stores raw
			// Hypothesis* contiguously (std::vector)
			std::memcpy(&(*m_arcList)[0] + my_size, &(*loserHypo->m_arcList)[0], add_size * sizeof(Hypothesis *));
			delete loserHypo->m_arcList;
			loserHypo->m_arcList = 0;
		} else { // loserHypo doesn't have any arcs
			// DO NOTHING
		}
	}
	m_arcList->push_back(loserHypo);
}
/***
* return the subclass of Hypothesis most appropriate to the given translation option
*/
/** convenience wrapper: extend this hypothesis with transOpt, delegating to
 * Create(prevHypo, transOpt, constraint); may return NULL under constraint
 * decoding */
Hypothesis* Hypothesis::CreateNext(const TranslationOption &transOpt, const Phrase* constraint) const
{
	return Create(*this, transOpt, constraint);
}
/***
* return the subclass of Hypothesis most appropriate to the given translation option
*/
/** factory: extend prevHypo with transOpt, optionally checking the proposed
 * output against constrainingPhrase (constraint decoding); returns NULL when
 * the extension would violate or overrun the constraint */
Hypothesis* Hypothesis::Create(const Hypothesis &prevHypo, const TranslationOption &transOpt, const Phrase* constrainingPhrase)
{
	// This method includes code for constraint decoding
	bool createHypothesis = true;

	if (constrainingPhrase != NULL)
	{
		size_t constraintSize = constrainingPhrase->GetSize();
		// first target position the new phrase would occupy
		size_t start = 1 + prevHypo.GetCurrTargetWordsRange().GetEndPos();
		const Phrase &transOptPhrase = transOpt.GetTargetPhrase();
		size_t transOptSize = transOptPhrase.GetSize();
		size_t endpoint = start + transOptSize - 1;

		if (endpoint < constraintSize)
		{
			// compare the new phrase against the matching slice of the constraint
			WordsRange range(start, endpoint);
			Phrase relevantConstraint = constrainingPhrase->GetSubString(range);
			if ( ! relevantConstraint.IsCompatible(transOptPhrase) )
			{
				createHypothesis = false;
			}
		}
		else
		{
			// proposed extension would run past the end of the constraint
			createHypothesis = false;
		}
	}

	if (createHypothesis)
	{
#ifdef USE_HYPO_POOL
		Hypothesis *ptr = s_objectPool.getPtr();
		return new(ptr) Hypothesis(prevHypo, transOpt);
#else
		return new Hypothesis(prevHypo, transOpt);
#endif
	}
	else
	{
		// If the previous hypothesis plus the proposed translation option
		// fail to match the provided constraint,
		// return a null hypothesis.
		return NULL;
	}
}
/***
* return the subclass of Hypothesis most appropriate to the given target phrase
*/
/** factory for the initial (empty) seed hypothesis; allocates from the
 * object pool when USE_HYPO_POOL is defined, otherwise from the heap */
Hypothesis* Hypothesis::Create(Manager& manager, InputType const& m_source, const TargetPhrase &emptyTarget)
{
#ifdef USE_HYPO_POOL
	Hypothesis *ptr = s_objectPool.getPtr();
	return new(ptr) Hypothesis(manager, m_source, emptyTarget);
#else
	return new Hypothesis(manager, m_source, emptyTarget);
#endif
}
/** check, if two hypothesis can be recombined.
this is actually a sorting function that allows us to
keep an ordered list of hypotheses. This makes recombination
much quicker.
*/
/** three-way comparison used to keep hypothesis stacks ordered so that
 * recombinable hypotheses are adjacent: first by source coverage bitmap,
 * then by each stateful feature-function state */
int Hypothesis::RecombineCompare(const Hypothesis &compare) const
{	// -1 = this < compare
	// +1 = this > compare
	// 0	= this ==compare
	int comp = m_sourceCompleted.Compare(compare.m_sourceCompleted);
	if (comp != 0)
		return comp;

	for (unsigned i = 0; i < m_ffStates.size(); ++i) {
		if (m_ffStates[i] == NULL || compare.m_ffStates[i] == NULL) {
			// NOTE(review): raw pointer difference narrowed to int — only used
			// to impose *some* deterministic order when a state is NULL
			comp = m_ffStates[i] - compare.m_ffStates[i];
		} else {
			comp = m_ffStates[i]->Compare(*compare.m_ffStates[i]);
		}
		if (comp != 0) return comp;
	}
	return 0;
}
/** zero the score breakdown and the total/future score accumulators */
void Hypothesis::ResetScore()
{
	m_scoreBreakdown.ZeroAll();
	m_futureScore = m_totalScore = 0.0f;
}
/***
* calculate the logarithm of our total translation score (sum up components)
*/
/** calculate the logarithm of the total translation score: cached
 * translation-option scores + stateless features + stateful features +
 * future-cost estimate, combined with the global weights */
void Hypothesis::CalcScore(const SquareMatrix &futureScore)
{
	// some stateless score producers cache their values in the translation
	// option: add these here
	// language model scores for n-grams completely contained within a target
	// phrase are also included here
	m_scoreBreakdown.PlusEquals(m_transOpt->GetScoreBreakdown());

	const StaticData &staticData = StaticData::Instance();
	clock_t t=0; // used to track time

	// compute values of stateless feature functions that were not
	// cached in the translation option-- there is no principled distinction
	const vector<const StatelessFeatureFunction*>& sfs =
		staticData.GetScoreIndexManager().GetStatelessFeatureFunctions();
	for (unsigned i = 0; i < sfs.size(); ++i) {
		sfs[i]->Evaluate(m_targetPhrase, &m_scoreBreakdown);
	}

	// stateful features also produce the state object stored on this hypothesis
	const vector<const StatefulFeatureFunction*>& ffs =
		staticData.GetScoreIndexManager().GetStatefulFeatureFunctions();
	for (unsigned i = 0; i < ffs.size(); ++i) {
		m_ffStates[i] = ffs[i]->Evaluate(
			*this,
			m_prevHypo ? m_prevHypo->m_ffStates[i] : NULL,
			&m_scoreBreakdown);
	}

	IFVERBOSE(2) { t = clock(); } // track time excluding LM

	// FUTURE COST
	m_futureScore = futureScore.CalcFutureScore( m_sourceCompleted );

	// TOTAL
	m_totalScore = m_scoreBreakdown.InnerProduct(staticData.GetAllWeights()) + m_futureScore;

	IFVERBOSE(2) { m_manager.GetSentenceStats().AddTimeOtherScore( clock()-t ); }
}
/** Calculates the expected score of extending this hypothesis with the
* specified translation option. Includes actual costs for everything
* except for expensive actual language model score.
* This function is used by early discarding.
* /param transOpt - translation option being considered
*/
/** Calculates the expected score of extending this hypothesis with the
 * specified translation option, used by early discarding.
 * NOTE(review): the distortion-score part is unimplemented — the assert
 * below fires (aborting in non-NDEBUG builds) whenever this is called. */
float Hypothesis::CalcExpectedScore( const SquareMatrix &futureScore ) {
	const StaticData &staticData = StaticData::Instance();
	clock_t t=0;
	IFVERBOSE(2) { t = clock(); } // track time excluding LM

	assert(!"Need to add code to get the distortion scores");
	//CalcDistortionScore();

	// LANGUAGE MODEL ESTIMATE (includes word penalty cost)
	float estimatedLMScore = m_transOpt->GetFutureScore() - m_transOpt->GetScoreBreakdown().InnerProduct(staticData.GetAllWeights());

	// FUTURE COST
	m_futureScore = futureScore.CalcFutureScore( m_sourceCompleted );

	//LEXICAL REORDERING COST
	const std::vector<LexicalReordering*> &reorderModels = staticData.GetReorderModels();
	for(unsigned int i = 0; i < reorderModels.size(); i++)
	{
		m_scoreBreakdown.PlusEquals(reorderModels[i], reorderModels[i]->CalcScore(this));
	}

	// TOTAL
	float total = m_scoreBreakdown.InnerProduct(staticData.GetAllWeights()) + m_futureScore + estimatedLMScore;

	IFVERBOSE(2) { m_manager.GetSentenceStats().AddTimeEstimateScore( clock()-t ); }
	return total;
}
/** completes scoring for a hypothesis whose cheap parts were estimated by
 * CalcExpectedScore().
 * NOTE(review): the LM part is unimplemented — the assert below fires
 * (aborting in non-NDEBUG builds) whenever this is called. */
void Hypothesis::CalcRemainingScore()
{
	const StaticData &staticData = StaticData::Instance();
	clock_t t=0; // used to track time

	// LANGUAGE MODEL COST
	assert(!"Need to add code to get the LM score(s)");
	//CalcLMScore(staticData.GetAllLM());

	IFVERBOSE(2) { t = clock(); } // track time excluding LM

	// WORD PENALTY
	m_scoreBreakdown.PlusEquals(staticData.GetWordPenaltyProducer(), - (float) m_currTargetWordsRange.GetNumWordsCovered());

	// TOTAL
	m_totalScore = m_scoreBreakdown.InnerProduct(staticData.GetAllWeights()) + m_futureScore;

	IFVERBOSE(2) { m_manager.GetSentenceStats().AddTimeOtherScore( clock()-t ); }
}
//! backpointer to the hypothesis this one was expanded from (NULL for the seed)
const Hypothesis* Hypothesis::GetPrevHypo()const{
	return m_prevHypo;
}
/**
* print hypothesis information for pharaoh-style logging
*/
/** print hypothesis information for pharaoh-style logging (verbose trace):
 * previous-context words, covered source span, translation, and scores */
void Hypothesis::PrintHypothesis() const
{
	if (!m_prevHypo) { TRACE_ERR(endl << "NULL hypo" << endl); return; }
	TRACE_ERR(endl << "creating hypothesis "<< m_id <<" from "<< m_prevHypo->m_id<<" ( ");
	// show (up to) the last two target words of the previous hypothesis as context
	int end = (int)(m_prevHypo->m_targetPhrase.GetSize()-1);
	int start = end-1;
	if ( start < 0 ) start = 0;
	if ( m_prevHypo->m_currTargetWordsRange.GetStartPos() == NOT_FOUND ) {
		TRACE_ERR( "<s> ");
	}
	else {
		TRACE_ERR( "... ");
	}
	if (end>=0) {
		WordsRange range(start, end);
		TRACE_ERR( m_prevHypo->m_targetPhrase.GetSubString(range) << " ");
	}
	TRACE_ERR( ")"<<endl);
	TRACE_ERR( "\tbase score "<< (m_prevHypo->m_totalScore - m_prevHypo->m_futureScore) <<endl);
	TRACE_ERR( "\tcovering "<<m_currSourceWordsRange.GetStartPos()<<"-"<<m_currSourceWordsRange.GetEndPos()<<": "
	  << *m_sourcePhrase <<endl);
	TRACE_ERR( "\ttranslated as: "<<(Phrase&) m_targetPhrase<<endl); // <<" => translation cost "<<m_score[ScoreType::PhraseTrans];
	if (m_wordDeleted) TRACE_ERR( "\tword deleted"<<endl);
	// TRACE_ERR( "\tdistance: "<<GetCurrSourceWordsRange().CalcDistortion(m_prevHypo->GetCurrSourceWordsRange())); // << " => distortion cost "<<(m_score[ScoreType::Distortion]*weightDistortion)<<endl;
	// TRACE_ERR( "\tlanguage model cost "); // <<m_score[ScoreType::LanguageModelScore]<<endl;
	// TRACE_ERR( "\tword penalty "); // <<(m_score[ScoreType::WordPenalty]*weightWordPenalty)<<endl;
	TRACE_ERR( "\tscore "<<m_totalScore - m_futureScore<<" + future cost "<<m_futureScore<<" = "<<m_totalScore<<endl);
	TRACE_ERR( "\tunweighted feature scores: " << m_scoreBreakdown << endl);
	//PrintLMScores();
}
/** make this hypothesis the winner of its recombination group, prune the arc
 * list down to what n-best extraction needs, and repoint remaining arcs here */
void Hypothesis::CleanupArcList()
{
	// point this hypo's main hypo to itself
	SetWinningHypo(this);

	if (!m_arcList) return;

	/* keep only number of arcs we need to create all n-best paths.
	 * However, may not be enough if only unique candidates are needed,
	 * so we'll keep all of arc list if we need a distinct n-best list
	 */
	const StaticData &staticData = StaticData::Instance();
	size_t nBestSize = staticData.GetNBestSize();
	bool distinctNBest = staticData.GetDistinctNBest() || staticData.UseMBR() || staticData.GetOutputSearchGraph() || staticData.UseLatticeMBR() ;

	if (!distinctNBest && m_arcList->size() > nBestSize * 5)
	{ // prune arc list only if there are too many arcs
		// partial sort: the nBestSize best arcs end up at the front
		nth_element(m_arcList->begin()
							, m_arcList->begin() + nBestSize - 1
							, m_arcList->end()
							, CompareHypothesisTotalScore());

		// delete bad ones
		ArcList::iterator iter;
		for (iter = m_arcList->begin() + nBestSize ; iter != m_arcList->end() ; ++iter)
		{
			Hypothesis *arc = *iter;
			FREEHYPO(arc);
		}
		m_arcList->erase(m_arcList->begin() + nBestSize
										, m_arcList->end());
	}

	// set all arc's main hypo variable to this hypo
	ArcList::iterator iter = m_arcList->begin();
	for (; iter != m_arcList->end() ; ++iter)
	{
		Hypothesis *arc = *iter;
		arc->SetWinningHypo(this);
	}
}
TO_STRING_BODY(Hypothesis)
// friend
/** stream a one-line summary: partial translation, coverage bitmap, total
 * score and unweighted feature breakdown */
ostream& operator<<(ostream& out, const Hypothesis& hypothesis)
{
	hypothesis.ToStream(out);
	// words bitmap
	out << "[" << hypothesis.m_sourceCompleted << "] ";

	// scores
	out << " [total=" << hypothesis.GetTotalScore() << "]";
	out << " " << hypothesis.GetScoreBreakdown();

	// alignment
	return out;
}
/** string representation of the source phrase translated in the last
 * expansion step (empty for the seed hypothesis) */
std::string Hypothesis::GetSourcePhraseStringRep(const vector<FactorType> factorsToPrint) const
{
	if (!m_prevHypo) { return ""; }
	return m_sourcePhrase->GetStringRep(factorsToPrint);
#if 0
	// dead alternative kept from an earlier revision: derive the string from
	// the covered source range instead of the stored phrase
	if(m_sourcePhrase)
	{
		return m_sourcePhrase->GetSubString(m_currSourceWordsRange).GetStringRep(factorsToPrint);
	}
	else
	{
		return m_sourceInput.GetSubString(m_currSourceWordsRange).GetStringRep(factorsToPrint);
	}
#endif
}
/** string representation of the target phrase produced in the last expansion
 * step (empty for the seed hypothesis) */
std::string Hypothesis::GetTargetPhraseStringRep(const vector<FactorType> factorsToPrint) const
{
	if (m_prevHypo == NULL)
	{
		return "";
	}
	return m_targetPhrase.GetStringRep(factorsToPrint);
}
/** string representation of the last-translated source phrase using every
 * source factor */
std::string Hypothesis::GetSourcePhraseStringRep() const
{
	const size_t maxSourceFactors = StaticData::Instance().GetMaxNumFactors(Input);
	vector<FactorType> allFactors;
	allFactors.reserve(maxSourceFactors);
	for (size_t factor = 0; factor < maxSourceFactors; ++factor)
	{
		allFactors.push_back(factor);
	}
	return GetSourcePhraseStringRep(allFactors);
}
/** string representation of the last-produced target phrase using every
 * target factor */
std::string Hypothesis::GetTargetPhraseStringRep() const
{
	const size_t maxTargetFactors = StaticData::Instance().GetMaxNumFactors(Output);
	vector<FactorType> allFactors;
	allFactors.reserve(maxTargetFactors);
	for (size_t factor = 0; factor < maxTargetFactors; ++factor)
	{
		allFactors.push_back(factor);
	}
	return GetTargetPhraseStringRep(allFactors);
}
/** lexicalized-reordering scores cached on the translation option that
 * created this hypothesis */
const ScoreComponentCollection &Hypothesis::GetCachedReorderingScore() const
{
	return m_transOpt->GetReorderingScore();
}
}

322
src/Hypothesis.h Normal file
View File

@ -0,0 +1,322 @@
// $Id: Hypothesis.h 2939 2010-02-24 11:15:44Z jfouet $
// vim:tabstop=2
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2006 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#ifndef moses_Hypothesis_h
#define moses_Hypothesis_h
#include <iostream>
#include <vector>
#include "Phrase.h"
#include "TypeDef.h"
#include "WordsBitmap.h"
#include "Sentence.h"
#include "Phrase.h"
#include "PhraseDictionaryMemory.h"
#include "GenerationDictionary.h"
#include "LanguageModelSingleFactor.h"
#include "ScoreComponentCollection.h"
#include "LexicalReordering.h"
#include "InputType.h"
#include "ObjectPool.h"
namespace Moses
{
class SquareMatrix;
class StaticData;
class TranslationOption;
class WordsRange;
class Hypothesis;
class FFState;
class Manager;
typedef std::vector<Hypothesis*> ArcList;
/** Used to store a state in the beam search
for the best translation. With its link back to the previous hypothesis
m_prevHypo, we can trace back to the sentence start to read of the
(partial) translation to this point.
The expansion of hypotheses is handled in the class Manager, which
stores active hypothesis in the search in hypothesis stacks.
***/
class Hypothesis
{
	friend std::ostream& operator<<(std::ostream&, const Hypothesis&);

protected:
	static ObjectPool<Hypothesis> s_objectPool; // pool used by FREEHYPO when USE_HYPO_POOL is defined

	const Hypothesis* m_prevHypo; /*! backpointer to previous hypothesis (from which this one was created) */
	// const Phrase &m_targetPhrase; /*! target phrase being created at the current decoding step */
	const TargetPhrase &m_targetPhrase; /*! target phrase being created at the current decoding step */
	Phrase const* m_sourcePhrase; /*! input sentence */
	WordsBitmap m_sourceCompleted; /*! keeps track of which words have been translated so far */
	//TODO: how to integrate this into confusion network framework; what if
	//it's a confusion network in the end???
	InputType const& m_sourceInput;
	WordsRange m_currSourceWordsRange; /*! source word positions of the last phrase that was used to create this hypothesis */
	WordsRange m_currTargetWordsRange; /*! target word positions of the last phrase that was used to create this hypothesis */
	bool m_wordDeleted; // whether the last expansion deleted a source word -- set in ctor (not visible here); TODO confirm
	float m_totalScore; /*! score so far */
	float m_futureScore; /*! estimated future cost to translate rest of sentence */
	ScoreComponentCollection m_scoreBreakdown; /*! detailed score break-down by components (for instance language model, word penalty, etc) */
	std::vector<const FFState*> m_ffStates; // per-feature-function states carried through the search
	const Hypothesis *m_winningHypo; // best hypothesis among those recombined into the same trellis point
	ArcList *m_arcList; /*! all arcs that end at the same trellis point as this hypothesis */
	const TranslationOption *m_transOpt; // option applied to m_prevHypo to create this hypothesis
	Manager& m_manager; // owning search manager
	int m_id; /*! numeric ID of this hypothesis, used for logging */

	static unsigned int s_HypothesesCreated; // Statistics: how many hypotheses were created in total

	/*! used by initial seeding of the translation process */
	Hypothesis(Manager& manager, InputType const& source, const TargetPhrase &emptyTarget);
	/*! used when creating a new hypothesis using a translation option (phrase translation) */
	Hypothesis(const Hypothesis &prevHypo, const TranslationOption &transOpt);

public:
	//! access the shared object pool (only meaningful with USE_HYPO_POOL)
	static ObjectPool<Hypothesis> &GetObjectPool()
	{
		return s_objectPool;
	}

	~Hypothesis();

	/** return the subclass of Hypothesis most appropriate to the given translation option */
	static Hypothesis* Create(const Hypothesis &prevHypo, const TranslationOption &transOpt, const Phrase* constraint);
	static Hypothesis* Create(Manager& manager, const WordsBitmap &initialCoverage);
	/** return the subclass of Hypothesis most appropriate to the given target phrase */
	static Hypothesis* Create(Manager& manager, InputType const& source, const TargetPhrase &emptyTarget);
	/** return the subclass of Hypothesis most appropriate to the given translation option */
	Hypothesis* CreateNext(const TranslationOption &transOpt, const Phrase* constraint) const;

	void PrintHypothesis() const;

	//! source sentence (or other input type) being decoded
	const InputType& GetInput() const {return m_sourceInput;}

	/** return target phrase used to create this hypothesis */
	// const Phrase &GetCurrTargetPhrase() const
	const TargetPhrase &GetCurrTargetPhrase() const
	{
		return m_targetPhrase;
	}

	// void PrintLMScores(const LMList &lmListInitial, const LMList &lmListEnd) const;
	/** return input positions covered by the translation option (phrasal translation) used to create this hypothesis */
	inline const WordsRange &GetCurrSourceWordsRange() const
	{
		return m_currSourceWordsRange;
	}

	//! target positions filled by the last applied phrase
	inline const WordsRange &GetCurrTargetWordsRange() const
	{
		return m_currTargetWordsRange;
	}

	//! the search manager that owns this hypothesis
	Manager& GetManager() const
	{
		return m_manager;
	}

	/** output length of the translation option used to create this hypothesis */
	inline size_t GetCurrTargetLength() const
	{
		return m_currTargetWordsRange.GetNumWordsCovered();
	}

	void ResetScore();
	void CalcScore(const SquareMatrix &futureScore);
	float CalcExpectedScore( const SquareMatrix &futureScore );
	void CalcRemainingScore();

	int GetId()const
	{
		return m_id;
	}

	const Hypothesis* GetPrevHypo() const;

	/** length of the partial translation (from the start of the sentence) */
	inline size_t GetSize() const
	{
		return m_currTargetWordsRange.GetEndPos() + 1;
	}

	inline const Phrase* GetSourcePhrase() const
	{
		return m_sourcePhrase;
	}

	std::string GetSourcePhraseStringRep(const vector<FactorType> factorsToPrint) const;
	std::string GetTargetPhraseStringRep(const vector<FactorType> factorsToPrint) const;
	// NOTE: returns by value (a copy of the TargetPhrase), unlike GetCurrTargetPhrase()
	inline const TargetPhrase GetTargetPhrase() const { return m_targetPhrase; }
	std::string GetSourcePhraseStringRep() const;
	std::string GetTargetPhraseStringRep() const;

	/** curr - pos is relative from CURRENT hypothesis's starting index
	 * (ie, start of sentence would be some negative number, which is
	 * not allowed- USE WITH CAUTION) */
	inline const Word &GetCurrWord(size_t pos) const
	{
		return m_targetPhrase.GetWord(pos);
	}
	inline const Factor *GetCurrFactor(size_t pos, FactorType factorType) const
	{
		return m_targetPhrase.GetFactor(pos, factorType);
	}
	/** recursive - pos is relative from start of sentence */
	inline const Word &GetWord(size_t pos) const
	{
		// walk back along the back-pointer chain until we reach the
		// hypothesis whose target span contains pos
		const Hypothesis *hypo = this;
		while (pos < hypo->GetCurrTargetWordsRange().GetStartPos())
		{
			hypo = hypo->GetPrevHypo();
			assert(hypo != NULL);
		}
		return hypo->GetCurrWord(pos - hypo->GetCurrTargetWordsRange().GetStartPos());
	}
	inline const Factor* GetFactor(size_t pos, FactorType factorType) const
	{
		return GetWord(pos)[factorType];
	}

	/***
	 * \return The bitmap of source words we cover
	 */
	inline const WordsBitmap &GetWordsBitmap() const
	{
		return m_sourceCompleted;
	}

	//! true once every source word has been translated
	inline bool IsSourceCompleted() const {
		return m_sourceCompleted.IsComplete();
	}

	//! three-way comparison used for recombination; 0 means "recombinable"
	int RecombineCompare(const Hypothesis &compare) const;

	//! recursively print the full partial translation, oldest phrase first
	void ToStream(std::ostream& out) const
	{
		if (m_prevHypo != NULL)
		{
			m_prevHypo->ToStream(out);
		}
		out << (Phrase) GetCurrTargetPhrase();
	}

	inline bool PrintAlignmentInfo() const{ return GetCurrTargetPhrase().PrintAlignmentInfo(); }

	TO_STRING();

	inline void SetWinningHypo(const Hypothesis *hypo)
	{
		m_winningHypo = hypo;
	}
	inline const Hypothesis *GetWinningHypo() const
	{
		return m_winningHypo;
	}

	void AddArc(Hypothesis *loserHypo);
	void CleanupArcList();

	//! returns a list alternative previous hypotheses (or NULL if n-best support is disabled)
	inline const ArcList* GetArcList() const
	{
		return m_arcList;
	}
	const ScoreComponentCollection& GetScoreBreakdown() const
	{
		return m_scoreBreakdown;
	}
	float GetTotalScore() const { return m_totalScore; }
	//! total score with the future-cost estimate subtracted back out
	float GetScore() const { return m_totalScore-m_futureScore; }
	//! target span that trans opt would populate if applied to this hypo. Used for alignment check
	size_t GetNextStartPos(const TranslationOption &transOpt) const;

	// LM statistics collection is disabled in this implementation
	std::vector<std::vector<unsigned int> > *GetLMStats() const { return NULL; }

	static unsigned int GetHypothesesCreated()
	{
		return s_HypothesesCreated;
	}

	const ScoreComponentCollection &GetCachedReorderingScore() const;

	const TranslationOption &GetTranslationOption() const
	{ return *m_transOpt; }
};
std::ostream& operator<<(std::ostream& out, const Hypothesis& hypothesis);
// sorting helper
struct CompareHypothesisTotalScore
{
bool operator()(const Hypothesis* hypo1, const Hypothesis* hypo2) const
{
return hypo1->GetTotalScore() > hypo2->GetTotalScore();
}
};
#ifdef USE_HYPO_POOL
#define FREEHYPO(hypo) \
{ \
ObjectPool<Hypothesis> &pool = Hypothesis::GetObjectPool(); \
pool.freeObject(hypo); \
} \
#else
#define FREEHYPO(hypo) delete hypo
#endif
/** defines less-than relation on hypotheses.
* The particular order is not important for us, we need just to figure out
* which hypothesis are equal based on:
* the last n-1 target words are the same
* and the covers (source words translated) are the same
*/
class HypothesisRecombinationOrderer
{
public:
bool operator()(const Hypothesis* hypoA, const Hypothesis* hypoB) const
{
return hypoA->RecombineCompare(*hypoB) < 0;
}
};
}
#endif

31
src/HypothesisStack.cpp Normal file
View File

@ -0,0 +1,31 @@
#include "HypothesisStack.h"
namespace Moses
{
HypothesisStack::~HypothesisStack()
{
	// Free every remaining hypothesis; Remove() both detaches and destroys.
	while (!m_hypos.empty())
	{
		Remove(m_hypos.begin());
	}
}
/** Remove hypothesis pointed to by iterator but don't delete the object. */
void HypothesisStack::Detach(const HypothesisStack::iterator &iter)
{
	// only unlinks from the set; ownership of *iter passes to the caller
	m_hypos.erase(iter);
}
/** Destroy the hypothesis pointed to by iter and drop it from the stack. */
void HypothesisStack::Remove(const HypothesisStack::iterator &iter)
{
	Hypothesis *doomed = *iter;
	Detach(iter);
	// FREEHYPO either deletes or returns the object to the pool
	FREEHYPO(doomed);
}
}

48
src/HypothesisStack.h Normal file
View File

@ -0,0 +1,48 @@
#ifndef moses_HypothesisStack_h
#define moses_HypothesisStack_h
#include <limits>
#include <set>
#include <vector>

#include "Hypothesis.h"
#include "WordsBitmap.h"
namespace Moses
{
class Manager;
/** Abstract base class for hypothesis stacks: a recombining set of
 * Hypothesis pointers keyed by HypothesisRecombinationOrderer, owned by
 * a Manager. Concrete subclasses implement pruning policies. */
class HypothesisStack
{
protected:
	typedef std::set< Hypothesis*, HypothesisRecombinationOrderer > _HCType;
	_HCType m_hypos; /**< contains hypotheses */
	Manager& m_manager;

public:
	HypothesisStack(Manager& manager): m_manager(manager) {}
	typedef _HCType::iterator iterator;
	typedef _HCType::const_iterator const_iterator;
	//! iterators
	const_iterator begin() const { return m_hypos.begin(); }
	const_iterator end() const { return m_hypos.end(); }
	size_t size() const { return m_hypos.size(); }
	// BUGFIX: numeric_limits must be fully qualified -- this is a header and
	// cannot rely on a 'using namespace std' in whichever file includes it.
	virtual inline float GetWorstScore() const { return -std::numeric_limits<float>::infinity(); };
	virtual float GetWorstScoreForBitmap( WordsBitmapID ) { return -std::numeric_limits<float>::infinity(); };
	virtual float GetWorstScoreForBitmap( WordsBitmap ) { return -std::numeric_limits<float>::infinity(); };

	virtual ~HypothesisStack();
	//! add hypothesis, pruning if necessary; false if it was recombined or discarded
	virtual bool AddPrune(Hypothesis *hypothesis) = 0;
	//! hypothesis with the highest total score on this stack (NULL if empty)
	virtual const Hypothesis *GetBestHypothesis() const = 0;
	//! all hypotheses, sorted by descending total score
	virtual std::vector<const Hypothesis*> GetSortedList() const = 0;
	//! remove hypothesis pointed to by iterator but don't delete the object
	virtual void Detach(const HypothesisStack::iterator &iter);
	/** destroy Hypothesis pointed to by iterator (object pool version) */
	virtual void Remove(const HypothesisStack::iterator &iter);
};
}
#endif

View File

@ -0,0 +1,315 @@
// $Id: HypothesisStackCubePruning.cpp 2477 2009-08-07 16:47:54Z bhaddow $
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2006 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#include <algorithm>
#include <set>
#include <queue>
#include "HypothesisStackCubePruning.h"
#include "TypeDef.h"
#include "Util.h"
#include "StaticData.h"
#include "Manager.h"
using namespace std;
namespace Moses
{
HypothesisStackCubePruning::HypothesisStackCubePruning(Manager& manager) :
	HypothesisStack(manager)
{
	// empty stack: best/worst scores start at -infinity
	m_nBestIsEnabled = StaticData::Instance().IsNBestEnabled();
	m_worstScore = -std::numeric_limits<float>::infinity();
	m_bestScore = -std::numeric_limits<float>::infinity();
}
/** remove all hypotheses from the collection */
void HypothesisStackCubePruning::RemoveAll()
{
	// delete all bitmap accessors;
	// NOTE(review): the hypotheses themselves are not freed here -- presumably
	// they are owned/cleaned up via the BitmapContainers; the map entries are
	// cleared by the destructor right after this call. Verify against callers.
	_BMType::iterator iter;
	for (iter = m_bitmapAccessor.begin(); iter != m_bitmapAccessor.end(); ++iter)
	{
		delete iter->second;
	}
}
/** Insert hypo into the recombining set; on success, update the best/worst
 * score bookkeeping and trigger lazy pruning when the stack overflows.
 * ret.second is false when an equivalent (recombinable) hypothesis exists. */
pair<HypothesisStackCubePruning::iterator, bool> HypothesisStackCubePruning::Add(Hypothesis *hypo)
{
	std::pair<iterator, bool> ret = m_hypos.insert(hypo);
	if (ret.second)
	{ // equiv hypo doesn't exists
		VERBOSE(3,"added hyp to stack");

		// Update best score, if this hypothesis is new best
		if (hypo->GetTotalScore() > m_bestScore)
		{
			VERBOSE(3,", best on stack");
			m_bestScore = hypo->GetTotalScore();
			// this may also affect the worst score
			// (worst = best + beamWidth; m_beamWidth is presumably <= 0 -- TODO confirm)
			if ( m_bestScore + m_beamWidth > m_worstScore )
				m_worstScore = m_bestScore + m_beamWidth;
		}

		// Prune only if stack is twice as big as needed (lazy pruning)
		VERBOSE(3,", now size " << m_hypos.size());
		if (m_hypos.size() > 2*m_maxHypoStackSize-1)
		{
			PruneToSize(m_maxHypoStackSize);
		}
		else {
			VERBOSE(3,std::endl);
		}
	}

	return ret;
}
/** Public entry point for adding a hypothesis: discard if below the stack's
 * worst score, otherwise insert, recombining with an equivalent hypothesis
 * if one exists (the better of the two survives).
 * Returns true only when hypo was added as a genuinely new entry. */
bool HypothesisStackCubePruning::AddPrune(Hypothesis *hypo)
{
	if (hypo->GetTotalScore() < m_worstScore)
	{ // too bad for stack. don't bother adding hypo into collection
		m_manager.GetSentenceStats().AddDiscarded();
		VERBOSE(3,"discarded, too bad for stack" << std::endl);
		FREEHYPO(hypo);
		return false;
	}

	// over threshold, try to add to collection
	std::pair<iterator, bool> addRet = Add(hypo);
	if (addRet.second)
	{ // nothing found. add to collection
		return true;
	}

	// equiv hypo exists, recombine with other hypo
	iterator &iterExisting = addRet.first;
	Hypothesis *hypoExisting = *iterExisting;
	assert(iterExisting != m_hypos.end());

	m_manager.GetSentenceStats().AddRecombination(*hypo, **iterExisting);

	// found existing hypo with same target ending.
	// keep the best 1
	if (hypo->GetTotalScore() > hypoExisting->GetTotalScore())
	{ // incoming hypo is better than the one we have
		VERBOSE(3,"better than matching hyp " << hypoExisting->GetId() << ", recombining, ");
		if (m_nBestIsEnabled) {
			// keep the displaced hypothesis alive as an arc for n-best extraction
			hypo->AddArc(hypoExisting);
			Detach(iterExisting);
		} else {
			Remove(iterExisting);
		}

		// re-insert must succeed: the only equivalent hypothesis was just removed
		bool added = Add(hypo).second;
		if (!added)
		{
			iterExisting = m_hypos.find(hypo);
			TRACE_ERR("Offending hypo = " << **iterExisting << endl);
			assert(false);
		}
		return false;
	}
	else
	{ // already storing the best hypo. discard current hypo
		VERBOSE(3,"worse than matching hyp " << hypoExisting->GetId() << ", recombining" << std::endl)
		if (m_nBestIsEnabled) {
			hypoExisting->AddArc(hypo);
		} else {
			FREEHYPO(hypo);
		}
		return false;
	}
}
/** Seed the stack with the initial (empty) hypothesis and create the
 * BitmapContainer for its (empty) source coverage. The assert holds
 * because the stack contains nothing it could recombine with. */
void HypothesisStackCubePruning::AddInitial(Hypothesis *hypo)
{
	std::pair<iterator, bool> addRet = Add(hypo);
	assert (addRet.second);

	const WordsBitmap &bitmap = hypo->GetWordsBitmap();
	m_bitmapAccessor[bitmap] = new BitmapContainer(bitmap, *this);
}
/** Prune the stack down to at most newSize hypotheses.
 * Finds a score threshold (the newSize-th best score above the beam
 * threshold) and deletes every hypothesis scoring below it; the threshold
 * then becomes the stack's worst admissible score.
 * \param newSize maximum size */
void HypothesisStackCubePruning::PruneToSize(size_t newSize)
{
	if (m_hypos.size() > newSize) // ok, if not over the limit
	{
		priority_queue<float> bestScores;

		// push all scores to a heap
		// (but never push scores below m_bestScore+m_beamWidth)
		iterator iter = m_hypos.begin();
		while (iter != m_hypos.end())
		{
			Hypothesis *hypo = *iter;
			float score = hypo->GetTotalScore();
			if (score > m_bestScore+m_beamWidth)
			{
				bestScores.push(score);
			}
			++iter;
		}

		float scoreThreshold;
		if (bestScores.empty())
		{
			// BUGFIX: if no score beat the beam threshold (possible e.g. when
			// m_beamWidth == 0), calling top() on the empty heap below would
			// be undefined behaviour. Fall back to the beam threshold itself.
			scoreThreshold = m_bestScore + m_beamWidth;
		}
		else
		{
			// pop the top newSize scores (and ignore them, these are the scores of hyps that will remain)
			// ensure to never pop beyond heap size
			size_t minNewSizeHeapSize = newSize > bestScores.size() ? bestScores.size() : newSize;
			for (size_t i = 1 ; i < minNewSizeHeapSize ; i++)
				bestScores.pop();

			// and remember the threshold
			scoreThreshold = bestScores.top();
		}

		// delete all hypos under score threshold
		iter = m_hypos.begin();
		while (iter != m_hypos.end())
		{
			Hypothesis *hypo = *iter;
			float score = hypo->GetTotalScore();
			if (score < scoreThreshold)
			{
				// advance before erasing so the loop iterator stays valid
				iterator iterRemove = iter++;
				Remove(iterRemove);
				m_manager.GetSentenceStats().AddPruning();
			}
			else
			{
				++iter;
			}
		}
		VERBOSE(3,", pruned to size " << size() << endl);

		IFVERBOSE(3)
		{
			TRACE_ERR("stack now contains: ");
			for(iter = m_hypos.begin(); iter != m_hypos.end(); iter++)
			{
				Hypothesis *hypo = *iter;
				TRACE_ERR( hypo->GetId() << " (" << hypo->GetTotalScore() << ") ");
			}
			TRACE_ERR( endl);
		}

		// set the worstScore, so that newly generated hypotheses will not be added if worse than the worst in the stack
		m_worstScore = scoreThreshold;
	}
}
/** Linear scan for the hypothesis with the highest total score;
 * returns NULL when the stack is empty. */
const Hypothesis *HypothesisStackCubePruning::GetBestHypothesis() const
{
	const Hypothesis *best = NULL;
	for (const_iterator iter = m_hypos.begin(); iter != m_hypos.end(); ++iter)
	{
		if (best == NULL || (*iter)->GetTotalScore() > best->GetTotalScore())
			best = *iter;
	}
	return best;
}
/** Copy the hypothesis pointers out of the set and order them best-first. */
vector<const Hypothesis*> HypothesisStackCubePruning::GetSortedList() const
{
	vector<const Hypothesis*> sorted(m_hypos.begin(), m_hypos.end());
	std::sort(sorted.begin(), sorted.end(), CompareHypothesisTotalScore());
	return sorted;
}
/** Clean up every hypothesis's arc list; arcs are only maintained
 * when an n-best list was requested, so this is a no-op otherwise. */
void HypothesisStackCubePruning::CleanupArcList()
{
	if (!m_nBestIsEnabled)
		return;

	for (iterator iter = m_hypos.begin(); iter != m_hypos.end(); ++iter)
		(*iter)->CleanupArcList();
}
/** Look up (lazily creating) the BitmapContainer for newBitmap and register
 * a new BackwardsEdge on it, connecting bitmapContainer's hypotheses to the
 * given translation options for cube-pruning expansion. */
void HypothesisStackCubePruning::SetBitmapAccessor(const WordsBitmap &newBitmap
					, HypothesisStackCubePruning &stack
					, const WordsRange &range
					, BitmapContainer &bitmapContainer
					, const SquareMatrix &futureScore
					, const TranslationOptionList &transOptList)
{
	// find the container for this coverage vector, creating it on first use
	_BMType::iterator bcExists = m_bitmapAccessor.find(newBitmap);

	BitmapContainer *bmContainer;
	if (bcExists == m_bitmapAccessor.end()) {
		bmContainer = new BitmapContainer(newBitmap, stack);
		m_bitmapAccessor[newBitmap] = bmContainer;
	}
	else {
		bmContainer = bcExists->second;
	}

	// NOTE(review): the 'range' parameter is unused in this function --
	// presumably the span is implied by transOptList; verify against callers.
	// The container takes ownership of the edge.
	BackwardsEdge *edge = new BackwardsEdge(bitmapContainer
										  , *bmContainer
										  , transOptList
										  , futureScore,
										  m_manager.GetSource());
	bmContainer->AddBackwardsEdge(edge);
}
TO_STRING_BODY(HypothesisStackCubePruning);
// friend
/** Stream every hypothesis on its own line, in set order. */
std::ostream& operator<<(std::ostream& out, const HypothesisStackCubePruning& hypoColl)
{
	for (HypothesisStackCubePruning::const_iterator iter = hypoColl.begin();
			iter != hypoColl.end(); ++iter)
	{
		out << **iter << endl;
	}
	return out;
}
/** Distribute every hypothesis on this stack into the BitmapContainer
 * matching its source-coverage bitmap. */
void
HypothesisStackCubePruning::AddHypothesesToBitmapContainers()
{
	HypothesisStackCubePruning::const_iterator iter;
	for (iter = m_hypos.begin() ; iter != m_hypos.end() ; ++iter)
	{
		Hypothesis *h = *iter;
		const WordsBitmap &bitmap = h->GetWordsBitmap();
		// NOTE(review): operator[] default-inserts NULL if this bitmap has no
		// container yet -- AddInitial/SetBitmapAccessor are assumed to have
		// created it beforehand; a missing entry would deref NULL here.
		BitmapContainer *container = m_bitmapAccessor[bitmap];
		container->AddHypothesis(h);
	}
}
}

View File

@ -0,0 +1,154 @@
// $Id: HypothesisStackCubePruning.h 2939 2010-02-24 11:15:44Z jfouet $
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2006 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#ifndef moses_HypothesisStackCubePruning_h
#define moses_HypothesisStackCubePruning_h
#include <limits>
#include <map>
#include <set>
#include "Hypothesis.h"
#include "BitmapContainer.h"
#include "HypothesisStack.h"
namespace Moses
{
class BitmapContainer;
class TranslationOptionList;
class Manager;
typedef std::map<WordsBitmap, BitmapContainer*> _BMType;
/** Stack for instances of Hypothesis, includes functions for pruning. */
class HypothesisStackCubePruning : public HypothesisStack
{
public:
	friend std::ostream& operator<<(std::ostream&, const HypothesisStackCubePruning&);

protected:
	_BMType m_bitmapAccessor; /**< maps each source-coverage bitmap to its BitmapContainer (owned) */
	float m_bestScore; /**< score of the best hypothesis in collection */
	float m_worstScore; /**< score of the worst hypothesis in collection */
	float m_beamWidth; /**< minimum score due to threshold pruning */
	size_t m_maxHypoStackSize; /**< maximum number of hypothesis allowed in this stack */
	bool m_nBestIsEnabled; /**< flag to determine whether to keep track of old arcs */

	/** add hypothesis to stack. Prune if necessary.
	 * Returns false if equiv hypo exists in collection, otherwise returns true
	 */
	std::pair<HypothesisStackCubePruning::iterator, bool> Add(Hypothesis *hypothesis);

	/** destroy all instances of Hypothesis in this collection */
	void RemoveAll();

public:
	HypothesisStackCubePruning(Manager& manager);

	~HypothesisStackCubePruning()
	{
		// frees the BitmapContainers, then drops the (now dangling) map entries;
		// base-class dtor frees the remaining hypotheses
		RemoveAll();
		m_bitmapAccessor.clear();
	}

	/** adds the hypo, but only if within thresholds (beamThr, stackSize).
	 * This function will recombine hypotheses silently! There is no record
	 * (could affect n-best list generation...TODO)
	 * Call stack for adding hypothesis is
			AddPrune()
				Add()
					AddNoPrune()
	*/
	bool AddPrune(Hypothesis *hypothesis);

	//! seed the stack with the initial hypothesis and its BitmapContainer
	void AddInitial(Hypothesis *hypo);

	/** set maximum number of hypotheses in the collection
	 * \param maxHypoStackSize maximum number (typical number: 100)
	 */
	inline void SetMaxHypoStackSize(size_t maxHypoStackSize)
	{
		m_maxHypoStackSize = maxHypoStackSize;
	}

	inline size_t GetMaxHypoStackSize() const
	{
		return m_maxHypoStackSize;
	}

	/** set beam threshold, hypotheses in the stack must not be worse than
	 * this factor times the best score to be allowed in the stack
	 * \param beamThreshold minimum factor (typical number: 0.03)
	 */
	inline void SetBeamWidth(float beamWidth)
	{
		m_beamWidth = beamWidth;
	}
	/** return score of the best hypothesis in the stack */
	inline float GetBestScore() const
	{
		return m_bestScore;
	}
	/** return worst score allowed for the stack */
	inline float GetWorstScore() const
	{
		return m_worstScore;
	}

	//! move every hypothesis on this stack into its coverage-matching BitmapContainer
	void AddHypothesesToBitmapContainers();

	const _BMType& GetBitmapAccessor() const
	{
		return m_bitmapAccessor;
	}

	//! register a BackwardsEdge from bitmapContainer to the container for newBitmap
	void SetBitmapAccessor(const WordsBitmap &newBitmap
						   , HypothesisStackCubePruning &stack
						   , const WordsRange &range
						   , BitmapContainer &bitmapContainer
						   , const SquareMatrix &futureScore
						   , const TranslationOptionList &transOptList);

	/** pruning, if too large.
	 * Pruning algorithm: find a threshold and delete all hypothesis below it.
	 * The threshold is chosen so that exactly newSize top items remain on the
	 * stack in fact, in situations where some of the hypothesis fell below
	 * m_beamWidth, the stack will contain less items.
	 * \param newSize maximum size */
	void PruneToSize(size_t newSize);

	//! return the hypothesis with best score. Used to get the translated at end of decoding
	const Hypothesis *GetBestHypothesis() const;
	//! return all hypothesis, sorted by descending score. Used in creation of N best list
	std::vector<const Hypothesis*> GetSortedList() const;

	/** make all arcs in point to the equiv hypothesis that contains them.
	 * Ie update doubly linked list be hypo & arcs
	 */
	void CleanupArcList();

	TO_STRING();
};
}
#endif

View File

@ -0,0 +1,303 @@
// $Id: HypothesisStackNormal.cpp 1511 2007-11-12 20:21:44Z hieuhoang1972 $
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2006 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#include <algorithm>
#include <set>
#include <queue>
#include "HypothesisStackNormal.h"
#include "TypeDef.h"
#include "Util.h"
#include "StaticData.h"
#include "Manager.h"
using namespace std;
namespace Moses
{
HypothesisStackNormal::HypothesisStackNormal(Manager& manager) :
	HypothesisStack(manager)
{
	// empty stack: best/worst scores start at -infinity
	m_nBestIsEnabled = StaticData::Instance().IsNBestEnabled();
	m_worstScore = -std::numeric_limits<float>::infinity();
	m_bestScore = -std::numeric_limits<float>::infinity();
}
/** remove all hypotheses from the collection */
void HypothesisStackNormal::RemoveAll()
{
	// Remove() frees each hypothesis and erases it from the set
	while (!m_hypos.empty())
	{
		Remove(m_hypos.begin());
	}
}
/** Insert hypo into the recombining set; on success, update best/worst
 * score and per-coverage diversity bookkeeping, then trigger lazy pruning
 * when the stack overflows its tolerated size.
 * ret.second is false when an equivalent (recombinable) hypothesis exists. */
pair<HypothesisStackNormal::iterator, bool> HypothesisStackNormal::Add(Hypothesis *hypo)
{
	std::pair<iterator, bool> ret = m_hypos.insert(hypo);
	if (ret.second)
	{ // equiv hypo doesn't exists
		VERBOSE(3,"added hyp to stack");

		// Update best score, if this hypothesis is new best
		if (hypo->GetTotalScore() > m_bestScore)
		{
			VERBOSE(3,", best on stack");
			m_bestScore = hypo->GetTotalScore();
			// this may also affect the worst score
			if ( m_bestScore + m_beamWidth > m_worstScore )
				m_worstScore = m_bestScore + m_beamWidth;
		}

		// update best/worst score for stack diversity 1
		if ( m_minHypoStackDiversity == 1 &&
			 hypo->GetTotalScore() > GetWorstScoreForBitmap( hypo->GetWordsBitmap() ) )
		{
			SetWorstScoreForBitmap( hypo->GetWordsBitmap().GetID(), hypo->GetTotalScore() );
		}

		VERBOSE(3,", now size " << m_hypos.size());

		// prune only if stack is twice as big as needed (lazy pruning)
		size_t toleratedSize = 2*m_maxHypoStackSize-1;
		// add in room for stack diversity
		// (diversity count scaled by 2^maxDistortion extra slots)
		if (m_minHypoStackDiversity)
			toleratedSize += m_minHypoStackDiversity << StaticData::Instance().GetMaxDistortion();
		if (m_hypos.size() > toleratedSize)
		{
			PruneToSize(m_maxHypoStackSize);
		}
		else {
			VERBOSE(3,std::endl);
		}
	}

	return ret;
}
/** Public entry point for adding a hypothesis: discard if below the stack's
 * worst score (unless it helps satisfy the per-coverage diversity minimum,
 * or discarding is disabled globally), otherwise insert, recombining with
 * an equivalent hypothesis if one exists (the better of the two survives).
 * Returns true only when hypo was added as a genuinely new entry. */
bool HypothesisStackNormal::AddPrune(Hypothesis *hypo)
{
	// too bad for stack. don't bother adding hypo into collection
	if (!StaticData::Instance().GetDisableDiscarding() &&
		hypo->GetTotalScore() < m_worstScore
		&& ! ( m_minHypoStackDiversity > 0
			   && hypo->GetTotalScore() >= GetWorstScoreForBitmap( hypo->GetWordsBitmap() ) ) )
	{
		m_manager.GetSentenceStats().AddDiscarded();
		VERBOSE(3,"discarded, too bad for stack" << std::endl);
		FREEHYPO(hypo);
		return false;
	}

	// over threshold, try to add to collection
	std::pair<iterator, bool> addRet = Add(hypo);
	if (addRet.second)
	{ // nothing found. add to collection
		return true;
	}

	// equiv hypo exists, recombine with other hypo
	iterator &iterExisting = addRet.first;
	Hypothesis *hypoExisting = *iterExisting;
	assert(iterExisting != m_hypos.end());

	m_manager.GetSentenceStats().AddRecombination(*hypo, **iterExisting);

	// found existing hypo with same target ending.
	// keep the best 1
	if (hypo->GetTotalScore() > hypoExisting->GetTotalScore())
	{ // incoming hypo is better than the one we have
		VERBOSE(3,"better than matching hyp " << hypoExisting->GetId() << ", recombining, ");
		if (m_nBestIsEnabled) {
			// keep the displaced hypothesis alive as an arc for n-best extraction
			hypo->AddArc(hypoExisting);
			Detach(iterExisting);
		} else {
			Remove(iterExisting);
		}

		// re-insert must succeed: the only equivalent hypothesis was just removed
		bool added = Add(hypo).second;
		if (!added)
		{
			iterExisting = m_hypos.find(hypo);
			TRACE_ERR("Offending hypo = " << **iterExisting << endl);
			abort();
		}
		return false;
	}
	else
	{ // already storing the best hypo. discard current hypo
		VERBOSE(3,"worse than matching hyp " << hypoExisting->GetId() << ", recombining" << std::endl)
		if (m_nBestIsEnabled) {
			hypoExisting->AddArc(hypo);
		} else {
			FREEHYPO(hypo);
		}
		return false;
	}
}
void HypothesisStackNormal::PruneToSize(size_t newSize)
{
if ( size() <= newSize ) return; // ok, if not over the limit
// we need to store a temporary list of hypotheses
vector< Hypothesis* > hypos = GetSortedListNOTCONST();
bool* included = (bool*) malloc(sizeof(bool) * hypos.size());
for(size_t i=0; i<hypos.size(); i++) included[i] = false;
// clear out original set
for( iterator iter = m_hypos.begin(); iter != m_hypos.end(); )
{
iterator removeHyp = iter++;
Detach(removeHyp);
}
// add best hyps for each coverage according to minStackDiversity
if ( m_minHypoStackDiversity > 0 )
{
map< WordsBitmapID, size_t > diversityCount;
for(size_t i=0; i<hypos.size(); i++)
{
Hypothesis *hyp = hypos[i];
WordsBitmapID coverage = hyp->GetWordsBitmap().GetID();;
if (diversityCount.find( coverage ) == diversityCount.end())
diversityCount[ coverage ] = 0;
if (diversityCount[ coverage ] < m_minHypoStackDiversity)
{
m_hypos.insert( hyp );
included[i] = true;
diversityCount[ coverage ]++;
if (diversityCount[ coverage ] == m_minHypoStackDiversity)
SetWorstScoreForBitmap( coverage, hyp->GetTotalScore());
}
}
}
// only add more if stack not full after satisfying minStackDiversity
if ( size() < newSize ) {
// add best remaining hypotheses
for(size_t i=0; i<hypos.size()
&& size() < newSize
&& hypos[i]->GetTotalScore() > m_bestScore+m_beamWidth; i++)
{
if (! included[i])
{
m_hypos.insert( hypos[i] );
included[i] = true;
if (size() == newSize)
m_worstScore = hypos[i]->GetTotalScore();
}
}
}
// delete hypotheses that have not been included
for(size_t i=0; i<hypos.size(); i++)
{
if (! included[i])
{
FREEHYPO( hypos[i] );
m_manager.GetSentenceStats().AddPruning();
}
}
free(included);
// some reporting....
VERBOSE(3,", pruned to size " << size() << endl);
IFVERBOSE(3)
{
TRACE_ERR("stack now contains: ");
for(iterator iter = m_hypos.begin(); iter != m_hypos.end(); iter++)
{
Hypothesis *hypo = *iter;
TRACE_ERR( hypo->GetId() << " (" << hypo->GetTotalScore() << ") ");
}
TRACE_ERR( endl);
}
}
/** Linear scan for the hypothesis with the highest total score;
 * returns NULL when the stack is empty. */
const Hypothesis *HypothesisStackNormal::GetBestHypothesis() const
{
	const Hypothesis *best = NULL;
	for (const_iterator iter = m_hypos.begin(); iter != m_hypos.end(); ++iter)
	{
		if (best == NULL || (*iter)->GetTotalScore() > best->GetTotalScore())
			best = *iter;
	}
	return best;
}
/** Copy the hypothesis pointers out of the set and order them best-first. */
vector<const Hypothesis*> HypothesisStackNormal::GetSortedList() const
{
	vector<const Hypothesis*> sorted(m_hypos.begin(), m_hypos.end());
	std::sort(sorted.begin(), sorted.end(), CompareHypothesisTotalScore());
	return sorted;
}
/** Non-const variant of GetSortedList(), used internally by PruneToSize(). */
vector<Hypothesis*> HypothesisStackNormal::GetSortedListNOTCONST()
{
	vector<Hypothesis*> sorted(m_hypos.begin(), m_hypos.end());
	std::sort(sorted.begin(), sorted.end(), CompareHypothesisTotalScore());
	return sorted;
}
/** Clean up every hypothesis's arc list; arcs are only maintained
 * when an n-best list was requested, so this is a no-op otherwise. */
void HypothesisStackNormal::CleanupArcList()
{
	if (!m_nBestIsEnabled)
		return;

	for (iterator iter = m_hypos.begin(); iter != m_hypos.end(); ++iter)
		(*iter)->CleanupArcList();
}
TO_STRING_BODY(HypothesisStackNormal);
// friend
/** Stream every hypothesis on its own line, in set order. */
std::ostream& operator<<(std::ostream& out, const HypothesisStackNormal& hypoColl)
{
	for (HypothesisStackNormal::const_iterator iter = hypoColl.begin();
			iter != hypoColl.end(); ++iter)
	{
		out << **iter << endl;
	}
	return out;
}
}

137
src/HypothesisStackNormal.h Normal file
View File

@ -0,0 +1,137 @@
// $Id: HypothesisStackNormal.h 1511 2007-11-12 20:21:44Z hieuhoang1972 $
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2006 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#ifndef moses_HypothesisStackNormal_h
#define moses_HypothesisStackNormal_h
#include <limits>
#include <set>
#include "Hypothesis.h"
#include "HypothesisStack.h"
#include "WordsBitmap.h"
namespace Moses
{
// class WordsBitmap;
// typedef size_t WordsBitmapID;
/** Stack for instances of Hypothesis, includes functions for pruning. */
class HypothesisStackNormal: public HypothesisStack
{
public:
friend std::ostream& operator<<(std::ostream&, const HypothesisStackNormal&);
protected:
float m_bestScore; /**< score of the best hypothesis in collection */
float m_worstScore; /**< score of the worse hypothesis in collection */
map< WordsBitmapID, float > m_diversityWorstScore; /**< score of worst hypothesis for particular source word coverage */
float m_beamWidth; /**< minimum score due to threashold pruning */
size_t m_maxHypoStackSize; /**< maximum number of hypothesis allowed in this stack */
size_t m_minHypoStackDiversity; /**< minimum number of hypothesis with different source word coverage */
bool m_nBestIsEnabled; /**< flag to determine whether to keep track of old arcs */
/** add hypothesis to stack. Prune if necessary.
* Returns false if equiv hypo exists in collection, otherwise returns true
*/
std::pair<HypothesisStackNormal::iterator, bool> Add(Hypothesis *hypothesis);
/** destroy all instances of Hypothesis in this collection */
void RemoveAll();
void SetWorstScoreForBitmap( WordsBitmapID id, float worstScore ) {
m_diversityWorstScore[ id ] = worstScore;
}
public:
float GetWorstScoreForBitmap( WordsBitmapID id ) {
if (m_diversityWorstScore.find( id ) == m_diversityWorstScore.end())
return -numeric_limits<float>::infinity();
return m_diversityWorstScore[ id ];
}
float GetWorstScoreForBitmap( const WordsBitmap &coverage ) {
return GetWorstScoreForBitmap( coverage.GetID() );
}
HypothesisStackNormal(Manager& manager);
/** adds the hypo, but only if within thresholds (beamThr, stackSize).
* This function will recombine hypotheses silently! There is no record
* (could affect n-best list generation...TODO)
* Call stack for adding hypothesis is
AddPrune()
Add()
AddNoPrune()
*/
bool AddPrune(Hypothesis *hypothesis);
/** set maximum number of hypotheses in the collection
* \param maxHypoStackSize maximum number (typical number: 100)
* \param maxHypoStackSize maximum number (defauly: 0)
*/
inline void SetMaxHypoStackSize(size_t maxHypoStackSize, size_t minHypoStackDiversity)
{
m_maxHypoStackSize = maxHypoStackSize;
m_minHypoStackDiversity = minHypoStackDiversity;
}
/** set beam threshold, hypotheses in the stack must not be worse than
* this factor times the best score to be allowed in the stack
* \param beamThreshold minimum factor (typical number: 0.03)
*/
inline void SetBeamWidth(float beamWidth)
{
m_beamWidth = beamWidth;
}
/** return score of the best hypothesis in the stack */
inline float GetBestScore() const
{
return m_bestScore;
}
/** return worst allowable score */
inline float GetWorstScore() const
{
return m_worstScore;
}
/** pruning, if too large.
* Pruning algorithm: find a threshold and delete all hypothesis below it.
* The threshold is chosen so that exactly newSize top items remain on the
* stack in fact, in situations where some of the hypothesis fell below
* m_beamWidth, the stack will contain less items.
* \param newSize maximum size */
void PruneToSize(size_t newSize);
//! return the hypothesis with best score. Used to get the translated at end of decoding
const Hypothesis *GetBestHypothesis() const;
//! return all hypothesis, sorted by descending score. Used in creation of N best list
std::vector<const Hypothesis*> GetSortedList() const;
std::vector<Hypothesis*> GetSortedListNOTCONST();
/** make all arcs in point to the equiv hypothesis that contains them.
* Ie update doubly linked list be hypo & arcs
*/
void CleanupArcList();
TO_STRING();
};
}
#endif

62
src/InputFileStream.cpp Normal file
View File

@ -0,0 +1,62 @@
// $Id: InputFileStream.cpp 2780 2010-01-29 17:11:17Z bojar $
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2006 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#include "InputFileStream.h"
#include "gzfilebuf.h"
#include <iostream>
using namespace std;
namespace Moses
{
/** Opens filePath for reading; a ".gz" suffix selects a gzip-decompressing
 * buffer. On failure to open a plain file, prints to stderr and exits.
 */
InputFileStream::InputFileStream(const std::string &filePath)
	: std::istream(NULL)
	, m_streambuf(NULL)
{
	if (filePath.size() > 3 &&
	    filePath.substr(filePath.size() - 3, 3) == ".gz")
	{
		m_streambuf = new gzfilebuf(filePath.c_str());
	} else {
		std::filebuf* fb = new std::filebuf();
		// filebuf::open returns NULL on failure; check the result instead of
		// overwriting the owning pointer (the original clobbered fb, leaking
		// the buffer on the error path).
		if (fb->open(filePath.c_str(), std::ios::in) == NULL) {
			cerr << "Can't read " << filePath.c_str() << endl;
			delete fb;
			exit(1);
		}
		m_streambuf = fb;
	}
	this->init(m_streambuf);
}
// Releases the owned stream buffer (gzfilebuf or std::filebuf).
InputFileStream::~InputFileStream()
{
	delete m_streambuf;
	m_streambuf = NULL;
}

// Intentionally a no-op: the buffer is freed by the destructor.
void InputFileStream::Close()
{
}
}

48
src/InputFileStream.h Normal file
View File

@ -0,0 +1,48 @@
// $Id: InputFileStream.h 2939 2010-02-24 11:15:44Z jfouet $
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2006 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#ifndef moses_InputFileStream_h
#define moses_InputFileStream_h

#include <cstdlib>
#include <fstream>
#include <string>

namespace Moses
{

/** Used in place of std::istream, can read zipped files if it ends in .gz
 */
class InputFileStream : public std::istream
{
protected:
	std::streambuf *m_streambuf; // owned; freed by the destructor
public:
	/** Opens filePath; exits the process if a plain file cannot be read. */
	InputFileStream(const std::string &filePath);
	~InputFileStream();
	/** Currently a no-op; cleanup happens in the destructor. */
	void Close();
};

}
#endif

59
src/InputType.cpp Normal file
View File

@ -0,0 +1,59 @@
// $Id: InputType.cpp 1897 2008-10-08 23:51:26Z hieuhoang1972 $
// vim:tabstop=2
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2006 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#include <cstdlib>
#include "InputType.h"
namespace Moses
{
InputType::InputType(long translationId) : m_translationId(translationId) {}
InputType::~InputType() {}

TO_STRING_BODY(InputType);

// Delegates to the virtual Print() so derived classes control formatting.
std::ostream& operator<<(std::ostream& out,InputType const& x)
{
	x.Print(out); return out;
}
// default implementation is one column equals one word
int InputType::ComputeDistortionDistance(const WordsRange& prev, const WordsRange& current) const
{
	// Distortion is the gap between the end of the previously covered range
	// and the start of the current one; when nothing is covered yet, count
	// from the start of the sentence.
	const int jump = (prev.GetNumWordsCovered() == 0)
		? (int) current.GetStartPos()
		: (int) prev.GetEndPos() - (int) current.GetStartPos() + 1;
	return abs(jump);
}
// Base implementation: a plain sentence is fully connected, so every
// transition is possible. Lattice-like inputs are expected to override
// this (see the virtual declaration in InputType.h).
bool InputType::CanIGetFromAToB(size_t start, size_t end) const
{
	return true;
}
}

132
src/InputType.h Normal file
View File

@ -0,0 +1,132 @@
// $Id: InputType.h 2939 2010-02-24 11:15:44Z jfouet $
// vim:tabstop=2
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2006 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#ifndef moses_InputType_h
#define moses_InputType_h
#include <string>
#include "TypeDef.h"
#include "Phrase.h"
#include "TargetPhraseCollection.h"
#include "ReorderingConstraint.h"
namespace Moses
{
class WordsRange;
class Factor;
class PhraseDictionary;
class TranslationOptionCollection;
//! base class for sentences and confusion networks
class InputType
{
protected:
	long m_translationId; 	//< contiguous Id
	bool m_hasMetaData;
	long m_segId;
	ReorderingConstraint m_reorderingConstraint; /**< limits on reordering specified either by "-mp" switch or xml tags */

public:
	InputType(long translationId = 0);
	virtual ~InputType();

	//! which concrete kind of input this is (sentence, confusion net, ...)
	virtual InputTypeEnum GetType() const = 0;

	long GetTranslationId() const
	{
		return m_translationId;
	}
	void SetTranslationId(long translationId)
	{
		m_translationId = translationId;
	}
	//! returns the number of words moved
	virtual int ComputeDistortionDistance(const WordsRange& prev, const WordsRange& current) const;

	//! In a word lattice, tells you if there's a path from node start to node end
	virtual bool CanIGetFromAToB(size_t start, size_t end) const;

	//! is there a path covering [range] (lattice only, otherwise true)
	inline bool IsCoveragePossible(const WordsRange& range) const
	{
		return CanIGetFromAToB(range.GetStartPos(), range.GetEndPos() + 1);
	}

	//! In a word lattice, you can't always get from node A to node B
	inline bool IsExtensionPossible(const WordsRange& prev, const WordsRange& current) const
	{
		//	return ComputeDistortionDistance(prev, current) < 100000;

		// first check: is there a lattice path between the position just
		// after prev and the position just after current (whichever order)?
		size_t t = prev.GetEndPos()+1; // 2
		size_t l = current.GetEndPos()+1; //l=1
		size_t r = l;
		if (l<t) { r = t; } else { l = t; } //r=2
		if (!CanIGetFromAToB(l,r)) return false;

		// there's another check here: a current span may end at a place that previous could get to,
		// but it may not *START* at a place it can get to.  We'll also have to check if we're going left or right
		r = current.GetStartPos();
		l = prev.GetEndPos()+1;
		if (l == r) return true;
		// jumping backwards: check the reverse pair of endpoints instead
		if (prev.GetEndPos() > current.GetStartPos()) {
			r = prev.GetStartPos();
			l = current.GetEndPos()+1;
			if (r == l) return true;
		}
		return CanIGetFromAToB(l,r);
	}

	//! number of words in this sentence/confusion network
	virtual size_t GetSize() const =0;

	//! populate this InputType with data from in stream
	virtual int Read(std::istream& in,const std::vector<FactorType>& factorOrder) =0;

	//! Output debugging info to stream out
	virtual void Print(std::ostream&) const =0;

	//! create trans options specific to this InputType
	virtual TranslationOptionCollection* CreateTranslationOptionCollection() const=0;

	//! return substring. Only valid for Sentence class. TODO - get rid of this fn
	virtual Phrase GetSubString(const WordsRange&) const =0;

	//! return substring at a particular position. Only valid for Sentence class. TODO - get rid of this fn
	virtual const Word& GetWord(size_t pos) const=0;

	//! Returns the reordering constraints
	const ReorderingConstraint& GetReorderingConstraint() const
	{
		return m_reorderingConstraint;
	};

	TO_STRING();
};
std::ostream& operator<<(std::ostream&,InputType const&);
}
#endif

54
src/LMList.cpp Normal file
View File

@ -0,0 +1,54 @@
// $Id: LMList.cpp 1897 2008-10-08 23:51:26Z hieuhoang1972 $
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2006 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#include "LMList.h"
#include "Phrase.h"
#include "LanguageModelSingleFactor.h"
#include "ScoreComponentCollection.h"
using namespace std;
namespace Moses
{
void LMList::CalcScore(const Phrase &phrase, float &retFullScore, float &retNGramScore, ScoreComponentCollection* breakdown) const
{
const_iterator lmIter;
for (lmIter = begin(); lmIter != end(); ++lmIter)
{
const LanguageModel &lm = **lmIter;
const float weightLM = lm.GetWeight();
float fullScore, nGramScore;
// do not process, if factors not defined yet (happens in partial translation options)
if (!lm.Useable(phrase))
continue;
lm.CalcScore(phrase, fullScore, nGramScore);
breakdown->Assign(&lm, nGramScore); // I'm not sure why += doesn't work here- it should be 0.0 right?
retFullScore += fullScore * weightLM;
retNGramScore += nGramScore * weightLM;
}
}
}

23
src/LMList.h Normal file
View File

@ -0,0 +1,23 @@
#ifndef moses_LMList_h
#define moses_LMList_h

#include <list>
#include "LanguageModel.h"

namespace Moses
{

class Phrase;
class ScoreColl;
class ScoreComponentCollection;

//! List of language models
class LMList : public std::list < LanguageModel* >
{
public:
	/** Sum the weighted scores of every usable LM over the phrase.
	 * \param retFullScore accumulates the weighted score of all n-grams
	 * \param retNGramScore accumulates the weighted full-order n-gram score
	 * \param breakdown receives each LM's unweighted n-gram score
	 */
	void CalcScore(const Phrase &phrase, float &retFullScore, float &retNGramScore, ScoreComponentCollection* breakdown) const;
};

}
#endif

7
src/LVoc.cpp Normal file
View File

@ -0,0 +1,7 @@
#include<limits>
#include "LVoc.h"

// Rather pointless file because LVoc is a template; all we need here are
// the definitions of the constants declared extern in LVoc.h.
const LabelId InvalidLabelId = std::numeric_limits<LabelId>::max();
const LabelId Epsilon = InvalidLabelId-1;

68
src/LVoc.h Normal file
View File

@ -0,0 +1,68 @@
#ifndef moses_LVoc_h
#define moses_LVoc_h

// <cassert> and <string> added: this header uses assert() and std::string
// but previously relied on transitive includes.
#include <cassert>
#include <map>
#include <vector>
#include <iostream>
#include <fstream>
#include <sstream>
#include <string>

typedef unsigned LabelId;
extern const LabelId InvalidLabelId;
extern const LabelId Epsilon;

typedef std::vector<LabelId> IPhrase;

/** Bidirectional mapping between symbols and dense integer ids.
 * A = type of things to numberize, ie, std::string
 * B = map type to use, might consider using hash_map for better performance
 */
template<typename A,typename B=std::map<A,LabelId> >
class LVoc {
  typedef A Key;
  typedef B M;
  typedef std::vector<Key> V;
  M m;      // symbol -> id
  V data;   // id -> symbol
public:
  LVoc() {}
  /** true iff k has already been assigned an id */
  bool isKnown(const Key& k) const {return m.find(k)!=m.end();}
  /** id of k, or InvalidLabelId if k is unknown */
  LabelId index(const Key& k) const {
    typename M::const_iterator i=m.find(k);
    return i!=m.end()? i->second : InvalidLabelId;}
  /** id of k, assigning the next free id if k is new */
  LabelId add(const Key& k) {
    std::pair<typename M::iterator,bool> p
      =m.insert(std::make_pair(k,data.size()));
    if(p.second) data.push_back(k);
    assert(static_cast<size_t>(p.first->second)<data.size());
    return p.first->second;
  }
  /** symbol for a known id; asserts that i is in range */
  Key const& symbol(LabelId i) const {
    assert(static_cast<size_t>(i)<data.size());
    return data[i];}

  typedef typename V::const_iterator const_iterator;
  const_iterator begin() const {return data.begin();}
  const_iterator end() const {return data.end();}

  /** write "<id> <symbol>" lines, highest id first */
  void Write(const std::string& fname) const {
    std::ofstream out(fname.c_str()); Write(out);}
  void Write(std::ostream& out) const {
    for(int i=data.size()-1;i>=0;--i)
      out<<i<<' '<<data[i]<<'\n';
  }
  /** rebuild the vocabulary from lines produced by Write() */
  void Read(const std::string& fname) {
    std::ifstream in(fname.c_str());Read(in);}
  void Read(std::istream& in) {
    Key k;size_t i;std::string line;
    while(getline(in,line)) {
      std::istringstream is(line);
      if(is>>i>>k) {
        if(i>=data.size()) data.resize(i+1);
        data[i]=k;
        m[k]=i;
      }
    }
  }
};

#endif

191
src/LanguageModel.cpp Normal file
View File

@ -0,0 +1,191 @@
// $Id: LanguageModel.cpp 2477 2009-08-07 16:47:54Z bhaddow $
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2006 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#include <cassert>
#include <limits>
#include <iostream>
#include <sstream>
#include "FFState.h"
#include "LanguageModel.h"
#include "TypeDef.h"
#include "Util.h"
#include "Manager.h"
#include "FactorCollection.h"
#include "Phrase.h"
#include "StaticData.h"
using namespace std;
namespace Moses
{
LanguageModel::LanguageModel(bool registerScore, ScoreIndexManager &scoreIndexManager)
{
	// Only register as a score producer when this LM scores sentences
	// directly (false when it is a component of a composite LM, see header).
	if (registerScore)
		scoreIndexManager.AddScoreProducer(this);
}
LanguageModel::~LanguageModel() {}

// don't inline virtual funcs...
size_t LanguageModel::GetNumScoreComponents() const
{
	return 1;
}
/** Sum unweighted LM scores (natural log, per the header contract) over
 * the phrase. fullScore additionally includes the short (order <
 * m_nGramOrder) n-grams at the start of the phrase; ngramScore counts
 * full-order n-grams only.
 */
void LanguageModel::CalcScore(const Phrase &phrase
							, float &fullScore
							, float &ngramScore) const
{
	fullScore = 0;
	ngramScore = 0;

	size_t phraseSize = phrase.GetSize();
	vector<const Word*> contextFactor;
	contextFactor.reserve(m_nGramOrder);

	// start of sentence: score the growing prefix as a 1-gram, 2-gram, ...
	for (size_t currPos = 0 ; currPos < m_nGramOrder - 1 && currPos < phraseSize ; currPos++)
	{
		contextFactor.push_back(&phrase.GetWord(currPos));
		fullScore += GetValue(contextFactor);
	}

	// first full-order n-gram, if the phrase is long enough
	if (phraseSize >= m_nGramOrder)
	{
		contextFactor.push_back(&phrase.GetWord(m_nGramOrder - 1));
		ngramScore = GetValue(contextFactor);
	}

	// main loop: slide the full-order window across the rest of the phrase
	for (size_t currPos = m_nGramOrder; currPos < phraseSize ; currPos++)
	{ // used by hypo to speed up lm score calc
		// shift the context window down one place
		for (size_t currNGramOrder = 0 ; currNGramOrder < m_nGramOrder - 1 ; currNGramOrder++)
		{
			contextFactor[currNGramOrder] = contextFactor[currNGramOrder + 1];
		}
		contextFactor[m_nGramOrder - 1] = &phrase.GetWord(currPos);
		float partScore = GetValue(contextFactor);
		ngramScore += partScore;
	}
	fullScore += ngramScore;
}
/** Query the LM state reached after scoring the given n-gram, discarding
 * the score itself. When the caller passes no length pointer, a throwaway
 * local satisfies GetValue()'s output argument.
 */
LanguageModel::State LanguageModel::GetState(const std::vector<const Word*> &contextFactor, unsigned int* len) const
{
	unsigned int scratch;
	State result;
	GetValue(contextFactor, &result, len ? len : &scratch);
	return result;
}
/** FFState for LMs: wraps the opaque state pointer returned by the
 * underlying LM implementation. Compare() orders by raw pointer value,
 * so two hypotheses compare equal only when they share the same state.
 */
struct LMState : public FFState {
	const void* lmstate;
	LMState(const void* lms) { lmstate = lms; }
	virtual int Compare(const FFState& o) const {
		const LMState& other = static_cast<const LMState&>(o);
		if (other.lmstate > lmstate) return 1;
		else if (other.lmstate < lmstate) return -1;
		return 0;
	}
};

//! initial state for an empty hypothesis: no LM context yet
const FFState* LanguageModel::EmptyHypothesisState() const {
	return new LMState(NULL);
}
FFState* LanguageModel::Evaluate(
	const Hypothesis& hypo,
	const FFState* ps,
	ScoreComponentCollection* out) const {
	// In this function, we only compute the LM scores of n-grams that overlap a
	// phrase boundary. Phrase-internal scores are taken directly from the
	// translation option. In the unigram case, there is no overlap, so we don't
	// need to do anything.
	if(m_nGramOrder <= 1)
		return NULL;

	clock_t t=0;
	IFVERBOSE(2) { t = clock(); } // track time

	// by default, carry the predecessor's LM state forward
	const void* prevlm = ps ? (static_cast<const LMState *>(ps)->lmstate) : NULL;
	LMState* res = new LMState(prevlm);
	// empty target phrase: nothing to score, state is unchanged
	if (hypo.GetCurrTargetLength() == 0)
		return res;
	const size_t currEndPos = hypo.GetCurrTargetWordsRange().GetEndPos();
	const size_t startPos = hypo.GetCurrTargetWordsRange().GetStartPos();

	// 1st n-gram: the history may reach back before this phrase; pad with
	// the sentence-start marker where it reaches before the sentence
	vector<const Word*> contextFactor(m_nGramOrder);
	size_t index = 0;
	for (int currPos = (int) startPos - (int) m_nGramOrder + 1 ; currPos <= (int) startPos ; currPos++)
	{
		if (currPos >= 0)
			contextFactor[index++] = &hypo.GetWord(currPos);
		else
			contextFactor[index++] = &GetSentenceStartArray();
	}
	float lmScore = GetValue(contextFactor);
	//cout<<"context factor: "<<GetValue(contextFactor)<<endl;

	// main loop: the remaining n-grams that still straddle the boundary
	size_t endPos = std::min(startPos + m_nGramOrder - 2
			, currEndPos);
	for (size_t currPos = startPos + 1 ; currPos <= endPos ; currPos++)
	{
		// shift all args down 1 place
		for (size_t i = 0 ; i < m_nGramOrder - 1 ; i++)
			contextFactor[i] = contextFactor[i + 1];

		// add last factor
		contextFactor.back() = &hypo.GetWord(currPos);

		lmScore += GetValue(contextFactor);
	}

	// end of sentence: also score the </s> n-gram and capture the state
	if (hypo.IsSourceCompleted())
	{
		const size_t size = hypo.GetSize();
		contextFactor.back() = &GetSentenceEndArray();

		for (size_t i = 0 ; i < m_nGramOrder - 1 ; i ++)
		{
			int currPos = (int)(size - m_nGramOrder + i + 1);
			if (currPos < 0)
				contextFactor[i] = &GetSentenceStartArray();
			else
				contextFactor[i] = &hypo.GetWord((size_t)currPos);
		}
		lmScore += GetValue(contextFactor, &res->lmstate);
	} else {
		// not finished: advance the window over the phrase's last words so
		// the outgoing state reflects them — no score is added here because
		// those n-grams were counted phrase-internally (see top comment)
		for (size_t currPos = endPos+1; currPos <= currEndPos; currPos++) {
			for (size_t i = 0 ; i < m_nGramOrder - 1 ; i++)
				contextFactor[i] = contextFactor[i + 1];
			contextFactor.back() = &hypo.GetWord(currPos);
		}
		res->lmstate = GetState(contextFactor);
	}

	out->PlusEquals(this, lmScore);
	IFVERBOSE(2) { hypo.GetManager().GetSentenceStats().AddTimeCalcLM( clock()-t ); }
	return res;
}
}

146
src/LanguageModel.h Normal file
View File

@ -0,0 +1,146 @@
// $Id: LanguageModel.h 2939 2010-02-24 11:15:44Z jfouet $
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2006 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#ifndef moses_LanguageModel_h
#define moses_LanguageModel_h
#include <string>
#include <vector>
#include "Factor.h"
#include "TypeDef.h"
#include "Util.h"
#include "FeatureFunction.h"
#include "Word.h"
namespace Moses
{
class FactorCollection;
class Factor;
class Phrase;
//! Abstract base class which represent a language model on a contiguous phrase
class LanguageModel : public StatefulFeatureFunction
{
protected:
	float m_weight; //! scoring weight. Shouldn't this now be superseded by ScoreProducer???
	std::string m_filePath; //! for debugging purposes
	size_t m_nGramOrder; //! max n-gram length contained in this LM
	Word m_sentenceStartArray, m_sentenceEndArray; //! Contains factors which represents the beginning and end words for this LM.
																								//! Usually <s> and </s>

	/** constructor to be called by inherited class
	 * \param registerScore whether this LM will be directly used to score sentence.
	 * 						Usually true, except where LM is a component in a composite LM, eg. LanguageModelJoint
	 */
	LanguageModel(bool registerScore, ScoreIndexManager &scoreIndexManager);

public:
	/* Returned from LM implementations which points at the state used. For example, if a trigram score was requested
	 * but the LM backed off to using the bigram, the State pointer will point to the bigram.
	 * Used for more aggressive pruning of hypothesis
	 */
	typedef const void* State;

	virtual ~LanguageModel();

	//! see ScoreProducer.h
	size_t GetNumScoreComponents() const;

	//! Single or multi-factor
	virtual LMType GetLMType() const = 0;

	/* whether this LM can be used on a particular phrase.
	 * Should return false if phrase size = 0 or factor types required don't exists
	 */
	virtual bool Useable(const Phrase &phrase) const = 0;

	/* calc total unweighted LM score of this phrase and return score via arguments.
	 * Return scores should always be in natural log, regardless of representation with LM implementation.
	 * Uses GetValue() of inherited class.
	 * Useable() should be called beforehand on the phrase
	 * \param fullScore scores of all unigram, bigram... of contiguous n-gram of the phrase
	 * \param ngramScore score of only n-gram of order m_nGramOrder
	 */
	void CalcScore(const Phrase &phrase
							, float &fullScore
							, float &ngramScore) const;

	/* get score of n-gram. n-gram should not be bigger than m_nGramOrder
	 * Specific implementation can return State and len data to be used in hypothesis pruning
	 * \param contextFactor n-gram to be scored
	 * \param finalState state used by LM. Return arg
	 * \param len ???
	 */
	virtual float GetValue(const std::vector<const Word*> &contextFactor
											, State* finalState = 0
											, unsigned int* len = 0) const = 0;

	//! get State for a particular n-gram
	State GetState(const std::vector<const Word*> &contextFactor, unsigned int* len = 0) const;

	//! max n-gram order of LM
	size_t GetNGramOrder() const
	{
		return m_nGramOrder;
	}

	//! Contains factors which represents the beginning and end words for this LM. Usually <s> and </s>
	const Word &GetSentenceStartArray() const
	{
		return m_sentenceStartArray;
	}
	const Word &GetSentenceEndArray() const
	{
		return m_sentenceEndArray;
	}

	//! scoring weight. Shouldn't this now be superseded by ScoreProducer???
	float GetWeight() const
	{
		return m_weight;
	}
	void SetWeight(float weight)
	{
		m_weight = weight;
	}

	virtual std::string GetScoreProducerDescription() const = 0;

	std::string GetScoreProducerWeightShortName() const
	{
		return "lm";
	}

	//! overridable functions for IRST LM to cleanup. Maybe something to do with on demand/cache loading/unloading
	virtual void InitializeBeforeSentenceProcessing(){};
	virtual void CleanUpAfterSentenceProcessing() {};

	virtual const FFState* EmptyHypothesisState() const;

	virtual FFState* Evaluate(
		const Hypothesis& cur_hypo,
		const FFState* prev_state,
		ScoreComponentCollection* accumulator) const;
};
}
#endif

View File

@ -0,0 +1,151 @@
// $Id: LanguageModelFactory.cpp 2180 2009-02-18 11:35:41Z hieuhoang1972 $
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2006 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#include <iostream>
#include "LanguageModelFactory.h"
#include "UserMessage.h"
#include "TypeDef.h"
#include "FactorCollection.h"
// include appropriate header
#ifdef LM_SRI
# include "LanguageModelSRI.h"
#endif
#ifdef LM_IRST
# include "LanguageModelIRST.h"
#endif
#ifdef LM_RAND
# include "LanguageModelRandLM.h"
#endif
#ifdef LM_REMOTE
# include "LanguageModelRemote.h"
#endif
#include "LanguageModelInternal.h"
#include "LanguageModelSkip.h"
#include "LanguageModelJoint.h"
using namespace std;
namespace Moses
{
namespace LanguageModelFactory
{
	/** Instantiate the configured LM implementation and load its model file.
	 * Each case compiles to nothing when the corresponding toolkit was not
	 * built in, leaving lm == NULL and triggering the user message below.
	 * \param dub dictionary upper bound, forwarded to the IRST LM only
	 * \return the loaded model, or NULL if the type is unavailable or the
	 *         file fails to load
	 */
	LanguageModel* CreateLanguageModel(LMImplementation lmImplementation
									, const std::vector<FactorType> &factorTypes
									, size_t nGramOrder
									, const std::string &languageModelFile
									, float weight
									, ScoreIndexManager &scoreIndexManager
									, int dub)
	{
	  LanguageModel *lm = NULL;
	  switch (lmImplementation)
	  {
	  	case RandLM:
			#ifdef LM_RAND
				lm = new LanguageModelRandLM(true,
						scoreIndexManager);
			#endif
			break;
		case Remote:
			#ifdef LM_REMOTE
				lm = new LanguageModelRemote(true,scoreIndexManager);
			#endif
			break;

	  	case SRI:
			// fall back to the internal LM when SRI was not compiled in
			#ifdef LM_SRI
				lm = new LanguageModelSRI(true, scoreIndexManager);
			#elif LM_INTERNAL
				lm = new LanguageModelInternal(true, scoreIndexManager);
			#endif
			break;
		case IRST:
			#ifdef LM_IRST
				lm = new LanguageModelIRST(true, scoreIndexManager, dub);
			#endif
			break;
		case Skip:
			// Skip wraps an underlying single-factor LM
			#ifdef LM_SRI
				lm = new LanguageModelSkip(new LanguageModelSRI(false, scoreIndexManager)
																	, true
																	, scoreIndexManager);
			#elif LM_INTERNAL
				lm = new LanguageModelSkip(new LanguageModelInternal(false, scoreIndexManager)
																	, true
																	, scoreIndexManager);
			#endif
			break;
		case Joint:
			// Joint likewise wraps an underlying single-factor LM
			#ifdef LM_SRI
				lm = new LanguageModelJoint(new LanguageModelSRI(false, scoreIndexManager)
																	, true
																	, scoreIndexManager);
			#elif LM_INTERNAL
				lm = new LanguageModelJoint(new LanguageModelInternal(false, scoreIndexManager)
																	, true
																	, scoreIndexManager);
			#endif
			break;
		case Internal:
			#ifdef LM_INTERNAL
				lm = new LanguageModelInternal(true, scoreIndexManager);
			#endif
			break;
	  }

	  if (lm == NULL)
	  {
	  	UserMessage::Add("Language model type unknown. Probably not compiled into library");
	  }
	  else
	  {
	  	// dispatch the actual file load on single- vs multi-factor type
	  	switch (lm->GetLMType())
	  	{
	  	case SingleFactor:
	  		if (! static_cast<LanguageModelSingleFactor*>(lm)->Load(languageModelFile, factorTypes[0], weight, nGramOrder))
				{
					cerr << "single factor model failed" << endl;
					delete lm;
					lm = NULL;
				}
				break;
	  	case MultiFactor:
	  		if (! static_cast<LanguageModelMultiFactor*>(lm)->Load(languageModelFile, factorTypes, weight, nGramOrder))
				{
					cerr << "multi factor model failed" << endl;
					delete lm;
					lm = NULL;
				}
				break;
	  	}
	  }

	  return lm;
	}
}
}

View File

@ -0,0 +1,34 @@
// $Id: LanguageModelFactory.h 2939 2010-02-24 11:15:44Z jfouet $
#ifndef moses_LanguageModelFactory_h
#define moses_LanguageModelFactory_h

#include <string>
#include <vector>

#include "TypeDef.h"

namespace Moses
{

class LanguageModel;
class ScoreIndexManager;

namespace LanguageModelFactory {

	/**
	 * creates a language model that will use the appropriate
	 * language model toolkit as its underlying implementation
	 * \param dub dictionary upper bound (used by the IRST LM only)
	 * \return NULL when the requested implementation was not compiled in
	 *         or the model file fails to load
	 */
	LanguageModel* CreateLanguageModel(LMImplementation lmImplementation
									, const std::vector<FactorType> &factorTypes
									, size_t nGramOrder
									, const std::string &languageModelFile
									, float weight
									, ScoreIndexManager &scoreIndexManager
									, int dub);

};

}

#endif

236
src/LanguageModelIRST.cpp Normal file
View File

@ -0,0 +1,236 @@
// $Id: LanguageModelIRST.cpp 2650 2010-01-09 19:00:37Z hieuhoang1972 $
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2006 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#include <cassert>
#include <limits>
#include <iostream>
#include <fstream>
#include "dictionary.h"
#include "n_gram.h"
#include "lmtable.h"
#include "lmmacro.h"
#include "LanguageModelIRST.h"
#include "TypeDef.h"
#include "Util.h"
#include "FactorCollection.h"
#include "Phrase.h"
#include "InputFileStream.h"
#include "StaticData.h"
using namespace std;
namespace Moses
{
/** \param dub dictionary upper bound, forwarded to the IRST toolkit in Load().
 * m_lmtb_ng is zero-initialized here as well: the destructor deletes it
 * unconditionally, so leaving it uninitialized (as before) is undefined
 * behaviour whenever Load() is never called.
 */
LanguageModelIRST::LanguageModelIRST(bool registerScore, ScoreIndexManager &scoreIndexManager, int dub)
:LanguageModelSingleFactor(registerScore, scoreIndexManager)
,m_lmtb(0),m_lmtb_ng(0),m_lmtb_dub(dub)
{
}
LanguageModelIRST::~LanguageModelIRST()
{
	// NOTE(review): m_lmtb_ng is only assigned in Load(); the constructor
	// above does not initialize it, so this delete is undefined behaviour
	// if Load() was never called — confirm it is zero-initialized.
	delete m_lmtb;
	delete m_lmtb_ng;
}
/** Load an IRST LM.
 * @param filePath either "<lmfile>" or whitespace-separated "<lmfile> <mapfile>";
 *                 with a map file an lmmacro (micro/macro tag) model is built,
 *                 otherwise a plain lmtable.
 * @return false if a given map file does not exist, true otherwise.
 */
bool LanguageModelIRST::Load(const std::string &filePath,
                             FactorType factorType,
                             float weight,
                             size_t nGramOrder)
{
  const char *SepString = " \t\n"; // FIX: string literal must bind to const char*
  cerr << "In LanguageModelIRST::Load: nGramOrder = " << nGramOrder << "\n";
  FactorCollection &factorCollection = FactorCollection::Instance();
  m_factorType = factorType;
  m_weight = weight;
  m_nGramOrder = nGramOrder;
  // get name of LM file and, if any, of the micro-macro map file
  char *filenamesOrig = strdup(filePath.c_str());
  char *filenames = filenamesOrig;
  m_filePath = strsep(&filenames, SepString);
  // Open the input file (possibly gzipped)
  InputFileStream inp(m_filePath);
  if (filenames) {
    // case LMfile + MAPfile: create an object of lmmacro class and load both LM file and map
    cerr << "Loading LM file + MAP\n";
    m_mapFilePath = strsep(&filenames, SepString);
    if (!FileExists(m_mapFilePath)) {
      cerr << "ERROR: Map file <" << m_mapFilePath << "> does not exist\n";
      free(filenamesOrig);
      return false;
    }
    InputFileStream inpMap(m_mapFilePath);
    m_lmtb = new lmmacro(m_filePath, inp, inpMap);
  } else {
    // case (standard) LMfile only: create an object of lmtable
    cerr << "Loading LM file (no MAP)\n";
    m_lmtb = (lmtable *)new lmtable;
    // Load the (possibly binary) model
#ifdef WIN32
    m_lmtb->load(inp); //don't use memory map
#else
    // FIX: guard the ".mm" suffix test — the old unchecked
    // compare(size()-3, ...) underflowed and threw std::out_of_range
    // for paths shorter than 3 characters.
    if (m_filePath.size() >= 3 && m_filePath.compare(m_filePath.size()-3,3,".mm")==0)
      m_lmtb->load(inp,m_filePath.c_str(),NULL,1); // memory-mapped load
    else
      m_lmtb->load(inp,m_filePath.c_str(),NULL,0);
#endif
  }
  m_lmtb_ng = new ngram(m_lmtb->getDict()); // ngram of words/micro tags
  m_lmtb_size = m_lmtb->maxlevel();
  // LM can be ok, just outputs warnings
  // Mauro: in the original, the following two instructions are wrongly switched:
  m_unknownId = m_lmtb->getDict()->oovcode(); // at the level of micro tags
  CreateFactors(factorCollection);
  VERBOSE(1, "IRST: m_unknownId=" << m_unknownId << std::endl);
  // install caches
  m_lmtb->init_probcache();
  m_lmtb->init_statecache();
  m_lmtb->init_lmtcaches(m_lmtb->maxlevel()>2 ? m_lmtb->maxlevel()-1 : 2);
  if (m_lmtb_dub > 0) m_lmtb->setlogOOVpenalty(m_lmtb_dub);
  free(filenamesOrig);
  return true;
}
// Map every word in the IRST dictionary to a Moses factor id, then flatten the
// mapping into m_lmIdLookup so factor-id -> IRST-code is a plain vector lookup.
void LanguageModelIRST::CreateFactors(FactorCollection &factorCollection)
{ // add factors which have srilm id
  // code copied & paste from SRI LM class. should do template function
  std::map<size_t, int> lmIdMap;
  size_t maxFactorId = 0; // to create lookup vector later on
  dict_entry *entry;
  dictionary_iter iter(m_lmtb->getDict()); // at the level of micro tags
  while ( (entry = iter.next()) != NULL)
  {
    size_t factorId = factorCollection.AddFactor(Output, m_factorType, entry->word)->GetId();
    lmIdMap[factorId] = entry->code;
    maxFactorId = (factorId > maxFactorId) ? factorId : maxFactorId;
  }
  size_t factorId;
  // Sentence boundary markers get explicit entries; their IRST codes are
  // cached in m_lmtb_sentenceStart/End so GetValue() can push them directly.
  m_sentenceStart = factorCollection.AddFactor(Output, m_factorType, BOS_);
  factorId = m_sentenceStart->GetId();
  m_lmtb_sentenceStart=lmIdMap[factorId] = GetLmID(BOS_);
  maxFactorId = (factorId > maxFactorId) ? factorId : maxFactorId;
  m_sentenceStartArray[m_factorType] = m_sentenceStart;
  m_sentenceEnd = factorCollection.AddFactor(Output, m_factorType, EOS_);
  factorId = m_sentenceEnd->GetId();
  m_lmtb_sentenceEnd=lmIdMap[factorId] = GetLmID(EOS_);
  maxFactorId = (factorId > maxFactorId) ? factorId : maxFactorId;
  m_sentenceEndArray[m_factorType] = m_sentenceEnd;
  // add to lookup vector in object
  // Factors never seen by this LM map to m_unknownId (OOV code).
  m_lmIdLookup.resize(maxFactorId+1);
  fill(m_lmIdLookup.begin(), m_lmIdLookup.end(), m_unknownId);
  map<size_t, int>::iterator iterMap;
  for (iterMap = lmIdMap.begin() ; iterMap != lmIdMap.end() ; ++iterMap)
  {
    m_lmIdLookup[iterMap->first] = iterMap->second;
  }
}
int LanguageModelIRST::GetLmID( const std::string &str ) const
{
return m_lmtb->getDict()->encode( str.c_str() ); // at the level of micro tags
}
// Score the last word of contextFactor given the preceding words.
// finalState (if non-NULL) receives an opaque IRST suffix pointer used for
// hypothesis recombination; len is always set to 0 (back-off length not
// reported by this implementation).
// NOTE(review): m_lmtb_ng is shared mutable scratch written by this const
// method — not safe for concurrent calls; confirm single-threaded use.
float LanguageModelIRST::GetValue(const vector<const Word*> &contextFactor, State* finalState, unsigned int* len) const
{
  unsigned int dummy;
  if (!len) { len = &dummy; }
  FactorType factorType = GetFactorType();
  // set up context
  size_t count = contextFactor.size();
  m_lmtb_ng->size=0;
  // Pad contexts shorter than the table order with the cached sentence-end /
  // sentence-start codes so the ngram always has a full history.
  if (count< (size_t)(m_lmtb_size-1)) m_lmtb_ng->pushc(m_lmtb_sentenceEnd);
  if (count< (size_t)m_lmtb_size) m_lmtb_ng->pushc(m_lmtb_sentenceStart);
  for (size_t i = 0 ; i < count ; i++)
  {
    //int lmId = GetLmID((*contextFactor[i])[factorType]);
#ifdef DEBUG
    cout << "i=" << i << " -> " << (*contextFactor[i])[factorType]->GetString() << "\n";
#endif
    // Look up by surface string (not via the factor-id vector).
    int lmId = GetLmID((*contextFactor[i])[factorType]->GetString());
    // cerr << (*contextFactor[i])[factorType]->GetString() << " = " << lmId;
    m_lmtb_ng->pushc(lmId);
  }
  if (finalState){
    *finalState=(State *)m_lmtb->cmaxsuffptr(*m_lmtb_ng);
    // back off stats not currently available
    *len = 0;
  }
  float prob = m_lmtb->clprob(*m_lmtb_ng);
  return TransformIRSTScore(prob);
}
// Per-sentence teardown: drop the IRST probability/state caches (and the
// memory map on non-Windows builds) so the next sentence starts clean.
void LanguageModelIRST::CleanUpAfterSentenceProcessing(){
  TRACE_ERR( "reset caches\n");
  m_lmtb->reset_caches();
#ifndef WIN32
  TRACE_ERR( "reset mmap\n");
  m_lmtb->reset_mmap();
#endif
}
// Per-sentence setup; only bumps IRST's sentence id when cache tracing is
// compiled in.
void LanguageModelIRST::InitializeBeforeSentenceProcessing(){
  //nothing to do
#ifdef TRACE_CACHE
  m_lmtb->sentence_id++;
#endif
}
}

88
src/LanguageModelIRST.h Normal file
View File

@ -0,0 +1,88 @@
// $Id: LanguageModelIRST.h 2939 2010-02-24 11:15:44Z jfouet $
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2006 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#ifndef moses_LanguageModelIRST_h
#define moses_LanguageModelIRST_h
#include <string>
#include <vector>
#include "Factor.h"
#include "TypeDef.h"
#include "Util.h"
#include "LanguageModelSingleFactor.h"
class lmtable; // irst lm table
class lmmacro; // irst lm for macro tags
class ngram;
namespace Moses
{
class Phrase;
/** Implementation of single factor LM using IRST's code.
* This is the default LM for Moses and is available from the same sourceforge repository
*/
// Single-factor LM backed by the IRST toolkit (lmtable, or lmmacro when a
// micro/macro map file is supplied in Load()).
class LanguageModelIRST : public LanguageModelSingleFactor
{
protected:
  std::vector<int> m_lmIdLookup; // Moses factor id -> IRST dictionary code
  lmtable* m_lmtb;               // owned; lmtable or lmmacro (see Load)
  ngram* m_lmtb_ng;              // owned scratch ngram reused by GetValue
  int m_unknownId;               // IRST OOV code
  int m_lmtb_sentenceStart; //lmtb symbols to initialize ngram with
  int m_lmtb_sentenceEnd; //lmt symbol to initialize ngram with
  int m_lmtb_size; //max ngram stored in the table
  int m_lmtb_dub; //dictionary upperboud
  std::string m_mapFilePath;
  //	float GetValue(LmId wordId, ngram *context) const;
  void CreateFactors(FactorCollection &factorCollection);
  int GetLmID( const std::string &str ) const;
  // Fast path: factor id -> IRST code via the lookup vector; OOV for unseen ids.
  int GetLmID( const Factor *factor ) const{
    size_t factorId = factor->GetId();
    return ( factorId >= m_lmIdLookup.size()) ? m_unknownId : m_lmIdLookup[factorId];
  };
public:
  LanguageModelIRST(bool registerScore, ScoreIndexManager &scoreIndexManager, int dub);
  ~LanguageModelIRST();
  bool Load(const std::string &filePath
            , FactorType factorType
            , float weight
            , size_t nGramOrder);
  virtual float GetValue(const std::vector<const Word*> &contextFactor, State* finalState = NULL, unsigned int* len=0) const;
  void CleanUpAfterSentenceProcessing();
  void InitializeBeforeSentenceProcessing();
  // NOTE(review): despite its name this sets m_lmtb_size (table order), not
  // m_lmtb_dub (dictionary upper bound) — looks suspicious; confirm intent.
  void set_dictionary_upperbound(int dub){ m_lmtb_size=dub ;
  //m_lmtb->set_dictionary_upperbound(dub);
  };
};
}
#endif

View File

@ -0,0 +1,272 @@
#include "LanguageModelInternal.h"
#include "FactorCollection.h"
#include "NGramNode.h"
#include "InputFileStream.h"
#include "StaticData.h"
using namespace std;
namespace Moses
{
// Trivial constructor: score registration happens in the
// LanguageModelSingleFactor base; the trie is filled by Load().
LanguageModelInternal::LanguageModelInternal(bool registerScore, ScoreIndexManager &scoreIndexManager)
:LanguageModelSingleFactor(registerScore, scoreIndexManager)
{
}
/** Load an ARPA-style LM (up to trigram) into the internal NGramCollection trie.
 * Data lines look like "<logprob>\t<w1 .. wn>[\t<backoff>]"; header/marker
 * lines beginning with '\' are skipped. Aborts for order > 3.
 */
bool LanguageModelInternal::Load(const std::string &filePath
, FactorType factorType
, float weight
, size_t nGramOrder)
{
  assert(nGramOrder <= 3);
  if (nGramOrder > 3)
  {
    UserMessage::Add("Can only do up to trigram. Aborting");
    abort();
  }
  VERBOSE(1, "Loading Internal LM: " << filePath << endl);
  FactorCollection &factorCollection = FactorCollection::Instance();
  m_filePath = filePath;
  m_factorType = factorType;
  m_weight = weight;
  m_nGramOrder = nGramOrder;
  // make sure start & end tags in factor collection
  m_sentenceStart = factorCollection.AddFactor(Output, m_factorType, BOS_);
  m_sentenceStartArray[m_factorType] = m_sentenceStart;
  m_sentenceEnd = factorCollection.AddFactor(Output, m_factorType, EOS_);
  m_sentenceEndArray[m_factorType] = m_sentenceEnd;
  // read in file
  VERBOSE(1, filePath << endl);
  InputFileStream inFile(filePath);
  // to create lookup vector later on
  size_t maxFactorId = 0;
  map<size_t, const NGramNode*> lmIdMap;
  string line;
  int lineNo = 0;
  while( !getline(inFile, line, '\n').eof())
  {
    lineNo++;
    if (line.size() != 0 && line.substr(0,1) != "\\")
    {
      vector<string> tokens = Tokenize(line, "\t");
      if (tokens.size() >= 2)
      {
        // split unigram/bigram trigrams
        vector<string> factorStr = Tokenize(tokens[1], " ");
        // FIX: a line whose n-gram column is empty used to leave `factor`
        // and `nGram` uninitialized and then dereference them — skip it.
        if (factorStr.empty())
          continue;
        // create / traverse down tree (last word first, so the leaf is
        // reached via the full reversed history)
        NGramCollection *ngramColl = &m_map;
        NGramNode *nGram = NULL;
        const Factor *factor = NULL;
        for (int currFactor = (int) factorStr.size() - 1 ; currFactor >= 0 ; currFactor--)
        {
          factor = factorCollection.AddFactor(Output, m_factorType, factorStr[currFactor]);
          nGram = ngramColl->GetOrCreateNGram(factor);
          ngramColl = nGram->GetNGramColl();
        }
        // link every n-gram to the unigram node of its first word (used for backoff)
        NGramNode *rootNGram = m_map.GetNGram(factor);
        nGram->SetRootNGram(rootNGram);
        // create vector of factors used in this LM
        size_t factorId = factor->GetId();
        maxFactorId = (factorId > maxFactorId) ? factorId : maxFactorId;
        lmIdMap[factorId] = rootNGram;
        //factorCollection.SetFactorLmId(factor, rootNGram);
        float score = TransformSRIScore(Scan<float>(tokens[0]));
        nGram->SetScore( score );
        if (tokens.size() == 3)
        {
          float logBackOff = TransformSRIScore(Scan<float>(tokens[2]));
          nGram->SetLogBackOff( logBackOff );
        }
        else
        {
          nGram->SetLogBackOff( 0 );
        }
      }
    }
  }
  // build the factor-id -> unigram-node lookup vector; unseen factors -> NULL
  m_lmIdLookup.resize(maxFactorId+1);
  fill(m_lmIdLookup.begin(), m_lmIdLookup.end(), static_cast<const NGramNode*>(NULL));
  map<size_t, const NGramNode*>::iterator iterMap;
  for (iterMap = lmIdMap.begin() ; iterMap != lmIdMap.end() ; ++iterMap)
  {
    m_lmIdLookup[iterMap->first] = iterMap->second;
  }
  return true;
}
// Dispatch to the specialised uni/bi/trigram scorer based on context length.
// `len` is unused by this implementation.
float LanguageModelInternal::GetValue(const std::vector<const Word*> &contextFactor
, State* finalState
, unsigned int* len) const
{
  const size_t contextSize = contextFactor.size();
  if (contextSize == 1)
    return GetValue((*contextFactor[0])[m_factorType], finalState);
  if (contextSize == 2)
    return GetValue((*contextFactor[0])[m_factorType]
                  , (*contextFactor[1])[m_factorType], finalState);
  if (contextSize == 3)
    return GetValue((*contextFactor[0])[m_factorType]
                  , (*contextFactor[1])[m_factorType]
                  , (*contextFactor[2])[m_factorType], finalState);
  // contexts of any other size are a caller error
  assert (false);
  return 0;
}
float LanguageModelInternal::GetValue(const Factor *factor0, State* finalState) const
{
float prob;
const NGramNode *nGram = GetLmID(factor0);
if (nGram == NULL)
{
if (finalState != NULL)
*finalState = NULL;
prob = -numeric_limits<float>::infinity();
}
else
{
if (finalState != NULL)
*finalState = static_cast<const void*>(nGram);
prob = nGram->GetScore();
}
return FloorScore(prob);
}
// Bigram score P(factor1 | factor0) with back-off:
//   bigram found            -> its score
//   only unigrams found     -> P(factor1) + backoff(factor0)
//   factor0 unseen          -> P(factor1)
//   factor1 unseen          -> -infinity
// finalState gets the longest matched node (bigram if found, else unigram).
float LanguageModelInternal::GetValue(const Factor *factor0, const Factor *factor1, State* finalState) const
{
  float score;
  const NGramNode *nGram[2];
  nGram[1] = GetLmID(factor1);
  if (nGram[1] == NULL)
  {
    if (finalState != NULL)
      *finalState = NULL;
    score = -numeric_limits<float>::infinity();
  }
  else
  {
    nGram[0] = nGram[1]->GetNGram(factor0);
    if (nGram[0] == NULL)
    { // something unigram
      if (finalState != NULL)
        *finalState = static_cast<const void*>(nGram[1]);
      nGram[0] = GetLmID(factor0);
      if (nGram[0] == NULL)
      { // stops at unigram
        score = nGram[1]->GetScore();
      }
      else
      { // unigram unigram
        score = nGram[1]->GetScore() + nGram[0]->GetLogBackOff();
      }
    }
    else
    { // bigram
      if (finalState != NULL)
        *finalState = static_cast<const void*>(nGram[0]);
      score = nGram[0]->GetScore();
    }
  }
  return FloorScore(score);
}
// Trigram score P(factor2 | factor0 factor1) with SRI-style back-off.
// nGram[i] holds the node matched for the i-th history position; the code
// walks from the newest word (factor2) back, adding back-off weights each
// time a longer match is missing. finalState gets the longest matched node.
float LanguageModelInternal::GetValue(const Factor *factor0, const Factor *factor1, const Factor *factor2, State* finalState) const
{
  float score;
  const NGramNode *nGram[3];
  nGram[2] = GetLmID(factor2);
  if (nGram[2] == NULL)
  { // predicted word unseen -> -infinity
    if (finalState != NULL)
      *finalState = NULL;
    score = -numeric_limits<float>::infinity();
  }
  else
  {
    nGram[1] = nGram[2]->GetNGram(factor1);
    if (nGram[1] == NULL)
    { // something unigram
      if (finalState != NULL)
        *finalState = static_cast<const void*>(nGram[2]);
      nGram[1] = GetLmID(factor1);
      if (nGram[1] == NULL)
      { // stops at unigram
        score = nGram[2]->GetScore();
      }
      else
      {
        nGram[0] = nGram[1]->GetNGram(factor0);
        if (nGram[0] == NULL)
        { // unigram unigram
          score = nGram[2]->GetScore() + nGram[1]->GetLogBackOff();
        }
        else
        { // unigram bigram
          score = nGram[2]->GetScore() + nGram[1]->GetLogBackOff() + nGram[0]->GetLogBackOff();
        }
      }
    }
    else
    { // trigram, or something bigram
      nGram[0] = nGram[1]->GetNGram(factor0);
      if (nGram[0] != NULL)
      { // trigram
        if (finalState != NULL)
          *finalState = static_cast<const void*>(nGram[0]);
        score = nGram[0]->GetScore();
      }
      else
      { // bigram matched; back off through the root (unigram) node of factor1
        if (finalState != NULL)
          *finalState = static_cast<const void*>(nGram[1]);
        score = nGram[1]->GetScore();
        nGram[1] = nGram[1]->GetRootNGram();
        nGram[0] = nGram[1]->GetNGram(factor0);
        if (nGram[0] == NULL)
        { // just bigram
          // do nothing
        }
        else
        {
          score += nGram[0]->GetLogBackOff();
        }
      }
      // else do nothing. just use 1st bigram
    }
  }
  return FloorScore(score);
}
}

View File

@ -0,0 +1,41 @@
#ifndef moses_LanguageModelInternal_h
#define moses_LanguageModelInternal_h
#include "LanguageModelSingleFactor.h"
#include "NGramCollection.h"
namespace Moses
{
/** Guaranteed cross-platform LM implementation designed to mimic LM used in regression tests
*/
// Self-contained trigram LM (no external toolkit); backs the regression tests.
class LanguageModelInternal : public LanguageModelSingleFactor
{
protected:
  std::vector<const NGramNode*> m_lmIdLookup; // factor id -> unigram node (NULL if unseen)
  NGramCollection m_map;                      // root of the n-gram trie
  // factor id -> unigram node; NULL when the factor is not in this LM
  const NGramNode* GetLmID( const Factor *factor ) const
  {
    size_t factorId = factor->GetId();
    return ( factorId >= m_lmIdLookup.size()) ? NULL : m_lmIdLookup[factorId];
  };
  float GetValue(const Factor *factor0, State* finalState) const;
  float GetValue(const Factor *factor0, const Factor *factor1, State* finalState) const;
  float GetValue(const Factor *factor0, const Factor *factor1, const Factor *factor2, State* finalState) const;
public:
  LanguageModelInternal(bool registerScore, ScoreIndexManager &scoreIndexManager);
  // Loads an ARPA-style file; aborts for nGramOrder > 3.
  bool Load(const std::string &filePath
            , FactorType factorType
            , float weight
            , size_t nGramOrder);
  float GetValue(const std::vector<const Word*> &contextFactor
                 , State* finalState = 0
                 , unsigned int* len = 0) const;
};
}
#endif

View File

@ -0,0 +1,22 @@
// $Id: LanguageModelJoint.cpp 886 2006-10-17 11:07:17Z hieuhoang1972 $
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2006 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#include "LanguageModelJoint.h"

133
src/LanguageModelJoint.h Normal file
View File

@ -0,0 +1,133 @@
// $Id: LanguageModelJoint.h 2939 2010-02-24 11:15:44Z jfouet $
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2006 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#ifndef moses_LanguageModelJoint_h
#define moses_LanguageModelJoint_h
#include <vector>
#include <string>
#include <sstream>
#include "LanguageModelSingleFactor.h"
#include "LanguageModelMultiFactor.h"
#include "Word.h"
#include "FactorTypeSet.h"
#include "FactorCollection.h"
namespace Moses
{
class Phrase;
class FactorCollection;
/** LM of multiple factors. A simple extension of single factor LM - factors backoff together.
* Rather slow as this uses string concatenation/split
*/
/** LM over multiple factors that back off together: the factors of each word
 *  are concatenated with '|' into one string and scored by a wrapped
 *  single-factor LM. Rather slow due to string concatenation/split.
 */
class LanguageModelJoint : public LanguageModelMultiFactor
{
protected:
  LanguageModelSingleFactor *m_lmImpl;          // owned; deleted in destructor
  std::vector<FactorType> m_factorTypesOrdered; // order in which factors are joined
  size_t m_implFactor;                          // factor slot used inside the wrapped LM (always 0)
public:
  // Takes ownership of lmImpl.
  LanguageModelJoint(LanguageModelSingleFactor *lmImpl, bool registerScore, ScoreIndexManager &scoreIndexManager)
  :LanguageModelMultiFactor(registerScore, scoreIndexManager)
  {
    m_lmImpl = lmImpl;
  }
  ~LanguageModelJoint()
  {
    delete m_lmImpl;
  }
  // Register sentence markers for every covered factor, then delegate the
  // actual file loading to the wrapped single-factor LM.
  bool Load(const std::string &filePath
  , const std::vector<FactorType> &factorTypes
  , float weight
  , size_t nGramOrder)
  {
    m_factorTypes = FactorMask(factorTypes);
    m_weight = weight;
    m_filePath = filePath;
    m_nGramOrder = nGramOrder;
    m_factorTypesOrdered= factorTypes;
    m_implFactor = 0;
    FactorCollection &factorCollection = FactorCollection::Instance();
    // sentence markers
    for (size_t index = 0 ; index < factorTypes.size() ; ++index)
    {
      FactorType factorType = factorTypes[index];
      m_sentenceStartArray[factorType] = factorCollection.AddFactor(Output, factorType, BOS_);
      m_sentenceEndArray[factorType] = factorCollection.AddFactor(Output, factorType, EOS_);
    }
    return m_lmImpl->Load(filePath, m_implFactor, weight, nGramOrder);
  }
  // Build a joint ("w|pos|stem") word for each context position and score the
  // resulting single-factor context with the wrapped LM.
  float GetValue(const std::vector<const Word*> &contextFactor, State* finalState = NULL, unsigned int* len = NULL) const
  {
    if (contextFactor.size() == 0)
    {
      return 0;
    }
    // BUG FIX: iterate over at most contextFactor.size() words. The previous
    // code always read m_nGramOrder entries and so indexed past the end of
    // the context vector for short (sentence-initial) contexts.
    const size_t contextSize = (contextFactor.size() < m_nGramOrder) ? contextFactor.size() : m_nGramOrder;
    // joint context for internal LM
    std::vector<const Word*> jointContext;
    for (size_t currPos = 0 ; currPos < contextSize ; ++currPos )
    {
      const Word &word = *contextFactor[currPos];
      // add word to chunked context
      std::stringstream stream("");
      const Factor *factor = word[ m_factorTypesOrdered[0] ];
      stream << factor->GetString();
      for (size_t index = 1 ; index < m_factorTypesOrdered.size() ; ++index)
      {
        FactorType factorType = m_factorTypesOrdered[index];
        const Factor *factor = word[factorType];
        stream << "|" << factor->GetString();
      }
      factor = FactorCollection::Instance().AddFactor(Output, m_implFactor, stream.str());
      Word* jointWord = new Word;
      jointWord->SetFactor(m_implFactor, factor);
      jointContext.push_back(jointWord);
    }
    // calc score on chunked phrase; temporary joint words are freed afterwards
    float ret = m_lmImpl->GetValue(jointContext, finalState, len);
    RemoveAllInColl(jointContext);
    return ret;
  }
};
}
#endif

View File

@ -0,0 +1,56 @@
// $Id: LanguageModelMultiFactor.cpp 1897 2008-10-08 23:51:26Z hieuhoang1972 $
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2006 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#include "LanguageModelMultiFactor.h"
#include "Phrase.h"
namespace Moses
{
// Forward construction to the LanguageModel base (handles score registration).
LanguageModelMultiFactor::LanguageModelMultiFactor(bool registerScore, ScoreIndexManager &scoreIndexManager)
:LanguageModel(registerScore, scoreIndexManager)
{}
// Human-readable description, e.g. "3-gram LM score, factor-type= ??? , file=lm.gz".
// TODO: the factor types themselves are not printed for multi-factor LMs
// (POS + stem, for example).
std::string LanguageModelMultiFactor::GetScoreProducerDescription() const
{
  std::ostringstream description;
  description << GetNGramOrder();
  description << "-gram LM score, factor-type= ??? ";
  description << ", file=" << m_filePath;
  return description.str();
}
bool LanguageModelMultiFactor::Useable(const Phrase &phrase) const
{
if (phrase.GetSize()==0)
return false;
// whether phrase contains all factors in this LM
const Word &word = phrase.GetWord(0);
for (size_t currFactor = 0 ; currFactor < MAX_NUM_FACTORS ; ++currFactor)
{
if (m_factorTypes[currFactor] && word[currFactor] == NULL)
return false;
}
return true;
}
}

View File

@ -0,0 +1,60 @@
// $Id: LanguageModelMultiFactor.h 2939 2010-02-24 11:15:44Z jfouet $
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2006 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#ifndef moses_LanguageModelMultiFactor_h
#define moses_LanguageModelMultiFactor_h
#include <vector>
#include <string>
#include "LanguageModel.h"
#include "Word.h"
#include "FactorTypeSet.h"
namespace Moses
{
class Phrase;
//! Abstract class for for multi factor LM
//! Abstract class for for multi factor LM
class LanguageModelMultiFactor : public LanguageModel
{
protected:
  FactorMask m_factorTypes; // which factors this LM conditions on
  LanguageModelMultiFactor(bool registerScore, ScoreIndexManager &scoreIndexManager);
public:
  // Load a model over the given ordered factor types.
  virtual bool Load(const std::string &filePath
                    , const std::vector<FactorType> &factorTypes
                    , float weight
                    , size_t nGramOrder) = 0;
  LMType GetLMType() const
  {
    return MultiFactor;
  }
  std::string GetScoreProducerDescription() const;
  // True iff the phrase's first word carries every factor in m_factorTypes.
  bool Useable(const Phrase &phrase) const;
};
}
#endif

114
src/LanguageModelRandLM.cpp Normal file
View File

@ -0,0 +1,114 @@
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2006 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#include <cassert>
#include <limits>
#include <iostream>
#include <fstream>
#include "LanguageModelRandLM.h"
#include "FactorCollection.h"
#include "Phrase.h"
#include "InputFileStream.h"
#include "StaticData.h"
namespace Moses
{
// Load a randomized (Bloom-filter based) LM from filePath with a 50 MB cache.
// Asserts (rather than returning false) if initialisation fails.
bool LanguageModelRandLM::Load(const std::string &filePath, FactorType factorType, float weight,
                               size_t nGramOrder) {
  cerr << "Loading LanguageModelRandLM..." << endl;
  FactorCollection &factorCollection = FactorCollection::Instance();
  m_filePath = filePath;
  m_factorType = factorType;
  m_weight = weight;
  m_nGramOrder = nGramOrder;
  int cache_MB = 50; // increase cache size
  m_lm = randlm::RandLM::initRandLM(filePath, nGramOrder, cache_MB);
  assert(m_lm != NULL);
  // get special word ids
  m_oov_id = m_lm->getWordID(m_lm->getOOV());
  CreateFactors(factorCollection);
  return true;
}
// Register every RandLM vocabulary word as a Moses factor and flatten the
// mapping into m_randlm_ids_vec so factor-id -> RandLM word id is a vector
// lookup; unseen factors map to the OOV id.
void LanguageModelRandLM::CreateFactors(FactorCollection &factorCollection) { // add factors which have randlm id
  // code copied & paste from SRI LM class. should do template function
  // first get all bf vocab in map
  std::map<size_t, randlm::WordID> randlm_ids_map; // map from factor id -> randlm id
  size_t maxFactorId = 0; // to create lookup vector later on
  for(std::map<randlm::Word, randlm::WordID>::const_iterator vIter = m_lm->vocabStart();
      vIter != m_lm->vocabEnd(); vIter++){
    // get word from randlm vocab and associate with (new) factor id
    size_t factorId=factorCollection.AddFactor(Output,m_factorType,vIter->first)->GetId();
    randlm_ids_map[factorId] = vIter->second;
    maxFactorId = (factorId > maxFactorId) ? factorId : maxFactorId;
  }
  // add factors for BOS and EOS and store bf word ids
  size_t factorId;
  m_sentenceStart = factorCollection.AddFactor(Output, m_factorType, m_lm->getBOS());
  factorId = m_sentenceStart->GetId();
  maxFactorId = (factorId > maxFactorId) ? factorId : maxFactorId;
  m_sentenceStartArray[m_factorType] = m_sentenceStart;
  m_sentenceEnd = factorCollection.AddFactor(Output, m_factorType, m_lm->getEOS());
  factorId = m_sentenceEnd->GetId();
  maxFactorId = (factorId > maxFactorId) ? factorId : maxFactorId;
  m_sentenceEndArray[m_factorType] = m_sentenceEnd;
  // add to lookup vector in object
  m_randlm_ids_vec.resize(maxFactorId+1);
  // fill with OOV code
  fill(m_randlm_ids_vec.begin(), m_randlm_ids_vec.end(), m_oov_id);
  for (map<size_t, randlm::WordID>::const_iterator iter = randlm_ids_map.begin();
       iter != randlm_ids_map.end() ; ++iter)
    m_randlm_ids_vec[iter->first] = iter->second;
}
// Delegate string -> word-id mapping to the underlying RandLM vocabulary.
randlm::WordID LanguageModelRandLM::GetLmID( const std::string &str ) const {
  const randlm::WordID wordId = m_lm->getWordID(str);
  return wordId;
}
// Score the last word of contextFactor given the preceding words via RandLM.
// finalState is filled by getProb(); len is always set to 0 (no back-off info).
// NOTE(review): ngram[] is a fixed MAX_NGRAM_SIZE buffer — assumes
// contextFactor.size() <= MAX_NGRAM_SIZE; confirm the caller guarantees this.
float LanguageModelRandLM::GetValue(const vector<const Word*> &contextFactor,
                                    State* finalState, unsigned int* len) const {
  unsigned int dummy; // is this needed ?
  if (!len) { len = &dummy; }
  FactorType factorType = GetFactorType();
  // set up context
  randlm::WordID ngram[MAX_NGRAM_SIZE];
  int count = contextFactor.size();
  for (int i = 0 ; i < count ; i++) {
    ngram[i] = GetLmID((*contextFactor[i])[factorType]);
    //std::cerr << m_lm->getWord(ngram[i]) << " ";
  }
  int found = 0;
  float logprob = FloorScore(TransformSRIScore(m_lm->getProb(&ngram[0], count, &found, finalState)));
  *len = 0; // not available
  //if (finalState)
  //  std::cerr << " = " << logprob << "(" << *finalState << ", " << *len <<")"<< std::endl;
  //else
  //  std::cerr << " = " << logprob << std::endl;
  return logprob;
}
}

67
src/LanguageModelRandLM.h Normal file
View File

@ -0,0 +1,67 @@
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2006 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#ifndef moses_LanguageModelRandLM_h
#define moses_LanguageModelRandLM_h
#include <string>
#include <vector>
#include "Factor.h"
#include "Util.h"
#include "LanguageModelSingleFactor.h"
#include "RandLM.h"
class randlm::RandLM;
namespace Moses
{
class Factor;
class Phrase;
// RandLM wrapper (single factor LM)
// RandLM wrapper (single factor LM)
class LanguageModelRandLM : public LanguageModelSingleFactor {
public:
  LanguageModelRandLM(bool registerScore, ScoreIndexManager &scoreIndexManager)
    : LanguageModelSingleFactor(registerScore, scoreIndexManager), m_lm(0) {}
  bool Load(const std::string &filePath, FactorType factorType, float weight, size_t nGramOrder);
  virtual float GetValue(const std::vector<const Word*> &contextFactor, State* finalState = NULL, unsigned int* len=0) const;
  ~LanguageModelRandLM() {
    delete m_lm; // owned; created in Load()
  }
  void CleanUpAfterSentenceProcessing() {
    m_lm->clearCaches(); // clear caches
  }
  void InitializeBeforeSentenceProcessing() {} // nothing to do
protected:
  std::vector<randlm::WordID> m_randlm_ids_vec; // factor id -> randlm word id
  randlm::RandLM* m_lm;                         // owned RandLM instance
  randlm::WordID m_oov_id;                      // id unknown factors map to
  void CreateFactors(FactorCollection &factorCollection);
  randlm::WordID GetLmID( const std::string &str ) const;
  // Fast path: factor id -> randlm id via the lookup vector; OOV if unseen.
  randlm::WordID GetLmID( const Factor *factor ) const{
    size_t factorId = factor->GetId();
    return ( factorId >= m_randlm_ids_vec.size()) ? m_oov_id : m_randlm_ids_vec[factorId];
  };
};
}
#endif

139
src/LanguageModelRemote.cpp Normal file
View File

@ -0,0 +1,139 @@
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <netdb.h>
#include "LanguageModelRemote.h"
#include "Factor.h"
namespace Moses {
// Sentinel keys for the per-sentence cache trie: BOS is the null pointer and
// EOS the "next" address. They are only ever used as map keys, never
// dereferenced. NOTE(review): forming BOS + 1 from a null pointer is
// technically undefined behaviour — consider distinct static dummies.
const Factor* LanguageModelRemote::BOS = NULL;
const Factor* LanguageModelRemote::EOS = (LanguageModelRemote::BOS + 1);
// Trivial constructor; the server connection is established in Load().
LanguageModelRemote::LanguageModelRemote(bool registerScore, ScoreIndexManager &scoreIndexManager)
:LanguageModelSingleFactor(registerScore, scoreIndexManager)
{
}
/** Parse "<host>:<port>" from filePath and connect to the remote LM server.
 * The model itself lives on the server; only the socket is set up here.
 * @return false when the address is malformed or the connection fails.
 */
bool LanguageModelRemote::Load(const std::string &filePath
, FactorType factorType
, float weight
, size_t nGramOrder)
{
  m_factorType = factorType;
  m_weight = weight;
  m_nGramOrder = nGramOrder;
  // FIX: use string::size_type and reject strings without a ':' — the old
  // int/atoi path silently produced host=<whole string>, port=0 on
  // malformed input instead of failing.
  const std::string::size_type cutAt = filePath.find(':', 0);
  if (cutAt == std::string::npos) {
    std::cerr << "LanguageModelRemote::Load: expected <host>:<port>, got '" << filePath << "'" << std::endl;
    return false;
  }
  std::string host = filePath.substr(0, cutAt);
  //std::cerr << "port string = '" << filePath.substr(cutAt+1,filePath.size()-cutAt) << "'\n";
  int port = atoi(filePath.substr(cutAt + 1, filePath.size() - cutAt).c_str());
  bool good = start(host, port);
  if (!good) {
    std::cerr << "failed to connect to lm server on " << host << " on port " << port << std::endl;
  }
  ClearSentenceCache();
  return good;
}
// Open a TCP connection to host:port, retrying up to 5 times (1 s apart).
// Exits the process if the hostname cannot be resolved.
// NOTE(review): gethostbyname/bzero/bcopy are legacy APIs (getaddrinfo is the
// modern replacement) and the socket() return value is never checked.
bool LanguageModelRemote::start(const std::string& host, int port) {
  //std::cerr << "host = " << host << ", port = " << port << "\n";
  sock = socket(AF_INET, SOCK_STREAM, 0);
  hp = gethostbyname(host.c_str());
  if (hp==NULL) { herror("gethostbyname failed"); exit(1); }
  bzero((char *)&server, sizeof(server));
  bcopy(hp->h_addr, (char *)&server.sin_addr, hp->h_length);
  server.sin_family = hp->h_addrtype;
  server.sin_port = htons(port);
  int errors = 0;
  while (connect(sock, (struct sockaddr *)&server, sizeof(server)) < 0) {
    //std::cerr << "Error: connect()\n";
    sleep(1);
    errors++;
    if (errors > 5) return false;
  }
  return true;
}
/** Score the last word of contextFactor given the preceding words.
 *  Results are cached per sentence in a trie keyed on Factor pointers;
 *  on a miss the server is queried with a "prob <word> <context...>" line
 *  and a 6-byte binary float reply is expected.
 */
float LanguageModelRemote::GetValue(const std::vector<const Word*> &contextFactor, State* finalState, unsigned int* len) const {
  size_t count = contextFactor.size();
  if (count == 0) {
    if (finalState) *finalState = NULL;
    return 0;
  }
  size_t max = m_nGramOrder;
  const FactorType factor = GetFactorType();
  if (max > count) max = count;

  // walk the cache trie over the context words; null factors stand in for
  // the sentence boundaries (BOS/EOS sentinels)
  Cache* cur = &m_cache;
  int pc = static_cast<int>(count) - 1;
  for (int i = 0; i < pc; ++i) {
    const Factor* f = contextFactor[i]->GetFactor(factor);
    cur = &cur->tree[f ? f : BOS];
  }
  const Factor* event_word = contextFactor[pc]->GetFactor(factor);
  cur = &cur->tree[event_word ? event_word : EOS];
  if (cur->prob) {  // prob==0 means "not queried yet"
    if (finalState) *finalState = cur->boState;
    if (len) *len = m_nGramOrder;
    return cur->prob;
  }
  // cache miss: mint a fresh state id for recombination, then ask the server.
  // NOTE(review): reinterpreting m_curId as a State assumes sizeof(State) <=
  // sizeof(size_t) -- confirm against the State typedef.
  cur->boState = *reinterpret_cast<const State*>(&m_curId);
  ++m_curId;

  std::ostringstream os;
  os << "prob ";
  if (event_word == NULL) {
    os << "</s>";
  } else {
    os << event_word->GetString();
  }
  // context is sent most-recent-first, truncated to the model order
  for (size_t i=1; i<max; i++) {
    const Factor* f = contextFactor[count-1-i]->GetFactor(factor);
    if (f == NULL) {
      os << " <s>";
    } else {
      os << ' ' << f->GetString();
    }
  }
  os << std::endl;
  std::string out = os.str();
  write(sock, out.c_str(), out.size());

  // read the 6-byte reply, tolerating short reads and transient errors
  char res[6];
  int r = read(sock, res, 6);
  int errors = 0;
  int cnt = 0;
  while (1) {
    if (r < 0) {
      errors++; sleep(1);
      if (errors > 5) exit(1);
      r = read(sock, &res[cnt], 6-cnt);  // BUGFIX: the read was never retried
    } else if (r == 0 || res[cnt] == '\n') {
      break;
    } else {
      cnt += r;
      if (cnt == 6) break;
      r = read(sock, &res[cnt], 6-cnt);  // BUGFIX: result was discarded, so the
                                         // loop tested stale data on short reads
    }
  }
  cur->prob = FloorScore(TransformSRIScore(*reinterpret_cast<float*>(res)));
  if (finalState) {
    *finalState = cur->boState;
    if (len) *len = m_nGramOrder;
  }
  return cur->prob;
}
// Close the server connection; the cache trie cleans itself up.
LanguageModelRemote::~LanguageModelRemote() {
  // Step 8 When finished send all lingering transmissions and close the connection
  close(sock);
}
}

43
src/LanguageModelRemote.h Normal file
View File

@ -0,0 +1,43 @@
#ifndef moses_LanguageModelRemote_h
#define moses_LanguageModelRemote_h
#include "LanguageModelSingleFactor.h"
#include "TypeDef.h"
#include "Factor.h"
#include <sys/socket.h>
#include <sys/types.h>
#include <netinet/in.h>
namespace Moses
{
/** Single-factor LM that delegates n-gram scoring to a remote server over a
 *  TCP socket; the server address is passed to Load() as "host:port".
 *  Replies are cached per sentence in a trie keyed on Factor pointers.
 */
class LanguageModelRemote : public LanguageModelSingleFactor {
	private:
		// One trie node per context word; prob == 0 means "not queried yet".
		struct Cache {
			std::map<const Factor*, Cache> tree;
			float prob;
			State boState;
			Cache() : prob(0) {}
		};

		int sock, port;
		struct hostent *hp;
		struct sockaddr_in server;
		mutable size_t m_curId;   // source of fresh backoff-state ids (reset per sentence)
		mutable Cache m_cache;    // per-sentence reply cache, mutated by const GetValue()
		bool start(const std::string& host, int port);
		// sentinel keys for sentence-boundary words in the cache trie
		static const Factor* BOS;
		static const Factor* EOS;
	public:
		LanguageModelRemote(bool registerScore, ScoreIndexManager &scoreIndexManager);
		~LanguageModelRemote();
		// Drop all cached replies and restart the state-id counter.
		void ClearSentenceCache() { m_cache.tree.clear(); m_curId = 1000; }
		virtual float GetValue(const std::vector<const Word*> &contextFactor, State* finalState = 0, unsigned int* len = 0) const;
		bool Load(const std::string &filePath
					, FactorType factorType
					, float weight
					, size_t nGramOrder);
};
}
#endif

174
src/LanguageModelSRI.cpp Normal file
View File

@ -0,0 +1,174 @@
// $Id: LanguageModelSRI.cpp 1897 2008-10-08 23:51:26Z hieuhoang1972 $
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2006 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#include <cassert>
#include <limits>
#include <iostream>
#include <fstream>
#include "Ngram.h"
#include "Vocab.h"
#include "LanguageModelSRI.h"
#include "TypeDef.h"
#include "Util.h"
#include "FactorCollection.h"
#include "Phrase.h"
#include "StaticData.h"
using namespace std;
namespace Moses
{
// Register the feature; the SRILM vocab and model are created in Load().
LanguageModelSRI::LanguageModelSRI(bool registerScore, ScoreIndexManager &scoreIndexManager)
:LanguageModelSingleFactor(registerScore, scoreIndexManager)
, m_srilmVocab(0)
, m_srilmModel(0)
{
}
// Release the SRILM objects (model first, since it references the vocab).
LanguageModelSRI::~LanguageModelSRI()
{
  delete m_srilmModel;
  delete m_srilmVocab;
}
/** Load an SRILM model from an ARPA file and build the factor-id lookup.
 *  \return always true; SRILM only emits warnings for recoverable problems.
 */
bool LanguageModelSRI::Load(const std::string &filePath
			, FactorType factorType
			, float weight
			, size_t nGramOrder)
{
  m_srilmVocab  = new Vocab();
  m_srilmModel	= new Ngram(*m_srilmVocab, nGramOrder);
  m_factorType 	= factorType;
  m_weight			= weight;
  m_nGramOrder	= nGramOrder;
  m_filePath		= filePath;

  // score OOVs with the model's <unk> entry rather than skipping them
  m_srilmModel->skipOOVs() = false;

  File file( filePath.c_str(), "r" );
  m_srilmModel->read(file);

  // LM can be ok, just outputs warnings
  CreateFactors();
  m_unknownId = m_srilmVocab->unkIndex();

  return true;
}
// Build m_lmIdLookup, mapping every Moses factor id to its SRILM vocab index.
// Walks the whole SRILM vocabulary, registers each surface string as a Moses
// factor, then adds the sentence-boundary factors explicitly.
void LanguageModelSRI::CreateFactors()
{ // add factors which have srilm id
  FactorCollection &factorCollection = FactorCollection::Instance();

  std::map<size_t, VocabIndex> lmIdMap;
  size_t maxFactorId = 0; // to create lookup vector later on
  VocabString str;
  VocabIter iter(*m_srilmVocab);
  while ( (str = iter.next()) != NULL)
  {
    VocabIndex lmId = GetLmID(str);
    size_t factorId = factorCollection.AddFactor(Output, m_factorType, str)->GetId();
    lmIdMap[factorId] = lmId;
    maxFactorId = (factorId > maxFactorId) ? factorId : maxFactorId;
  }

  size_t factorId;

  // <s>
  m_sentenceStart = factorCollection.AddFactor(Output, m_factorType, BOS_);
  factorId = m_sentenceStart->GetId();
  lmIdMap[factorId] = GetLmID(BOS_);
  maxFactorId = (factorId > maxFactorId) ? factorId : maxFactorId;
  m_sentenceStartArray[m_factorType] = m_sentenceStart;

  // </s>
  m_sentenceEnd	= factorCollection.AddFactor(Output, m_factorType, EOS_);
  factorId = m_sentenceEnd->GetId();
  lmIdMap[factorId] = GetLmID(EOS_);
  maxFactorId = (factorId > maxFactorId) ? factorId : maxFactorId;
  m_sentenceEndArray[m_factorType] = m_sentenceEnd;

  // add to lookup vector in object; unmapped slots score as unknown words
  m_lmIdLookup.resize(maxFactorId+1);
  fill(m_lmIdLookup.begin(), m_lmIdLookup.end(), m_unknownId);

  map<size_t, VocabIndex>::iterator iterMap;
  for (iterMap = lmIdMap.begin() ; iterMap != lmIdMap.end() ; ++iterMap)
  {
    m_lmIdLookup[iterMap->first] = iterMap->second;
  }
}
// Look up a surface string in the SRILM vocab; unseen strings map to <unk>.
VocabIndex LanguageModelSRI::GetLmID( const std::string &str ) const
{
  return m_srilmVocab->getIndex( str.c_str(), m_unknownId );
}
// Map a Moses factor onto its SRILM vocab index. Factors created after the
// lookup table was built fall outside the vector and score as unknown words.
VocabIndex LanguageModelSRI::GetLmID( const Factor *factor ) const
{
  const size_t factorId = factor->GetId();
  if (factorId < m_lmIdLookup.size()) {
    return m_lmIdLookup[factorId];
  }
  return m_unknownId;
}
// Query SRILM for log10 P(wordId | context) and convert to natural log,
// flooring impossible events.
float LanguageModelSRI::GetValue(VocabIndex wordId, VocabIndex *context) const
{
  float p = m_srilmModel->wordProb( wordId, context );
  return FloorScore(TransformSRIScore(p));  // log10->log
}
/** Score the last word of contextFactor given the preceding words.
 *  \param finalState if non-null, receives the SRILM context id for
 *         hypothesis recombination
 *  \param len if non-null, receives the matched context length + 1
 */
float LanguageModelSRI::GetValue(const vector<const Word*> &contextFactor, State* finalState, unsigned int *len) const
{
  FactorType	factorType = GetFactorType();
  size_t count = contextFactor.size();
  if (count == 0)
  {
    // BUGFIX: the original assigned the local parameter (finalState = NULL),
    // which never reached the caller; reset the caller's state instead, as
    // the other LM implementations do.
    if (finalState) *finalState = NULL;
    return 0;
  }

  // set up context: most-recent-first, excluding the predicted word,
  // terminated by Vocab_None as SRILM requires
  VocabIndex context[MAX_NGRAM_SIZE];
  for (size_t i = 0 ; i < count - 1 ; i++)
  {
    context[i] = GetLmID((*contextFactor[count-2-i])[factorType]);
  }
  context[count-1] = Vocab_None;

  assert((*contextFactor[count-1])[factorType] != NULL);
  // call sri lm fn
  VocabIndex lmId = GetLmID((*contextFactor[count-1])[factorType]);
  float ret = GetValue(lmId, context);

  if (finalState) {
    // shift the context and prepend the predicted word to obtain the
    // n-gram state SRILM uses for recombination
    for (int i = count - 2 ; i >= 0 ; i--)
      context[i+1] = context[i];
    context[0] = lmId;
    unsigned int dummy;
    if (!len) { len = &dummy; }
    *finalState = m_srilmModel->contextID(context,*len);
    (*len)++;
  }
  return ret;
}
}

65
src/LanguageModelSRI.h Normal file
View File

@ -0,0 +1,65 @@
// $Id: LanguageModelSRI.h 2939 2010-02-24 11:15:44Z jfouet $
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2006 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#ifndef moses_LanguageModelSRI_h
#define moses_LanguageModelSRI_h
#include <string>
#include <vector>
#include "Factor.h"
#include "TypeDef.h"
#include "Vocab.h"
#include "LanguageModelSingleFactor.h"
class Factor;
class Phrase;
class Ngram; // SRI forward decl
namespace Moses
{
/** Single-factor LM backed by the SRILM toolkit (Vocab + Ngram). */
class LanguageModelSRI : public LanguageModelSingleFactor
{
protected:
	std::vector<VocabIndex> m_lmIdLookup;  // Moses factor id -> SRILM vocab index
	Vocab 		*m_srilmVocab;               // owned
	Ngram 		*m_srilmModel;               // owned; references m_srilmVocab
	VocabIndex	m_unknownId;                 // SRILM <unk> index

	float GetValue(VocabIndex wordId, VocabIndex *context) const;
	void CreateFactors();
	VocabIndex GetLmID( const std::string &str ) const;
	VocabIndex GetLmID( const Factor *factor ) const;

public:
	LanguageModelSRI(bool registerScore, ScoreIndexManager &scoreIndexManager);
	~LanguageModelSRI();
	bool Load(const std::string &filePath
					, FactorType factorType
					, float weight
					, size_t nGramOrder);
	virtual float GetValue(const std::vector<const Word*> &contextFactor, State* finalState = 0, unsigned int* len = 0) const;
};
}
#endif

View File

@ -0,0 +1,60 @@
// $Id: LanguageModelSingleFactor.cpp 1897 2008-10-08 23:51:26Z hieuhoang1972 $
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2006 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#include <cassert>
#include <limits>
#include <iostream>
#include <sstream>
#include "LanguageModelSingleFactor.h"
#include "TypeDef.h"
#include "Util.h"
#include "FactorCollection.h"
#include "Phrase.h"
#include "StaticData.h"
using namespace std;
namespace Moses
{
// static variable init: shared "no state" marker for all single-factor LMs
LanguageModelSingleFactor::State LanguageModelSingleFactor::UnknownState=0;
// Forward registration to the LanguageModel base; nothing else to set up.
LanguageModelSingleFactor::LanguageModelSingleFactor(bool registerScore, ScoreIndexManager &scoreIndexManager)
:LanguageModel(registerScore, scoreIndexManager)
{
}
// No resources owned at this level; subclasses clean up their own models.
LanguageModelSingleFactor::~LanguageModelSingleFactor() {}
// Human-readable feature label such as "LM_3gram".
// what about LMs that are over multiple factors at once, POS + stem, for example?
std::string LanguageModelSingleFactor::GetScoreProducerDescription() const
{
  std::ostringstream description;
  description << "LM_" << GetNGramOrder() << "gram";
  return description.str();
}
}

View File

@ -0,0 +1,87 @@
// $Id: LanguageModelSingleFactor.h 2939 2010-02-24 11:15:44Z jfouet $
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2006 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#ifndef moses_LanguageModelSingleFactor_h
#define moses_LanguageModelSingleFactor_h
#include "LanguageModel.h"
#include "Phrase.h"
namespace Moses
{
class FactorCollection;
class Factor;
//! Abstract class for for single factor LM
//! Abstract class for single factor LM
class LanguageModelSingleFactor : public LanguageModel
{
protected:
	const Factor *m_sentenceStart, *m_sentenceEnd;  // boundary factors set by Load()
	FactorType	m_factorType;                       // which factor this LM scores
	LanguageModelSingleFactor(bool registerScore, ScoreIndexManager &scoreIndexManager);

public:
	static State UnknownState;  // shared "no state" marker

	virtual ~LanguageModelSingleFactor();
	virtual bool Load(const std::string &filePath
					, FactorType factorType
					, float weight
					, size_t nGramOrder) = 0;

	LMType GetLMType() const
	{
		return SingleFactor;
	}

	// A phrase is scorable iff its first word carries this LM's factor.
	bool Useable(const Phrase &phrase) const
	{
		return (phrase.GetSize()>0 && phrase.GetFactor(0, m_factorType) != NULL);
	}

	const Factor *GetSentenceStart() const
	{
		return m_sentenceStart;
	}
	const Factor *GetSentenceEnd() const
	{
		return m_sentenceEnd;
	}
	FactorType GetFactorType() const
	{
		return m_factorType;
	}
	float GetWeight() const
	{
		return m_weight;
	}
	void SetWeight(float weight)
	{
		m_weight = weight;
	}
	std::string GetScoreProducerDescription() const;
};
}
#endif

22
src/LanguageModelSkip.cpp Normal file
View File

@ -0,0 +1,22 @@
// $Id: LanguageModelSkip.cpp 916 2006-10-24 16:27:13Z hieuhoang1972 $
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2006 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#include "LanguageModelSkip.h"

129
src/LanguageModelSkip.h Normal file
View File

@ -0,0 +1,129 @@
// $Id: LanguageModelSkip.h 2939 2010-02-24 11:15:44Z jfouet $
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2006 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#ifndef moses_LanguageModelSkip_h
#define moses_LanguageModelSkip_h
#include <vector>
#include <algorithm>
#include "LanguageModelMultiFactor.h"
#include "LanguageModelSingleFactor.h"
#include "Phrase.h"
#include "FactorCollection.h"
namespace Moses
{
/* Hacked up LM which skips any factor with string '---'
* order of chunk hardcoded to 3 (m_realNGramOrder)
*/
/* Hacked up LM which skips any factor with string '---'
 * order of chunk hardcoded to 3 (m_realNGramOrder)
 */
class LanguageModelSkip : public LanguageModelSingleFactor
{
protected:
	size_t m_realNGramOrder;               // order applied to the filtered ("chunked") context
	LanguageModelSingleFactor *m_lmImpl;   // owned; deleted in the destructor

public:
	/** Constructor
	*		\param lmImpl SRI or IRST LM which this LM can use to load data
	*/
	LanguageModelSkip(LanguageModelSingleFactor *lmImpl
									, bool registerScore
									, ScoreIndexManager &scoreIndexManager)
	: LanguageModelSingleFactor(registerScore, scoreIndexManager)
	{
		m_lmImpl = lmImpl;
	}
	~LanguageModelSkip()
	{
		delete m_lmImpl;
	}
	// Record parameters and delegate the actual model loading to the wrapped LM.
	bool Load(const std::string &filePath
					, FactorType factorType
					, float weight
					, size_t nGramOrder)
	{
		m_factorType 				= factorType;
		m_weight 						= weight;
		m_filePath 					= filePath;
		m_nGramOrder 				= nGramOrder;

		m_realNGramOrder 		= 3;

		FactorCollection &factorCollection = FactorCollection::Instance();

		m_sentenceStartArray[m_factorType] = factorCollection.AddFactor(Output, m_factorType, BOS_);
		m_sentenceEndArray[m_factorType] = factorCollection.AddFactor(Output, m_factorType, EOS_);

		return m_lmImpl->Load(filePath, m_factorType, weight, nGramOrder);
	}

	// Score the last word against a context rebuilt from the nearest
	// non-'---' words, then delegate to the wrapped LM.
	float GetValue(const std::vector<const Word*> &contextFactor, State* finalState = NULL, unsigned int* len = NULL) const
	{
		if (contextFactor.size() == 0)
		{
			return 0;
		}
		// only process context where last word is a word we want
		const Factor *factor = (*contextFactor.back())[m_factorType];
		std::string strWord = factor->GetString();
		if (strWord.find("---") == 0)
			return 0;

		// add last word
		std::vector<const Word*> chunkContext;
		Word* chunkWord = new Word;
		chunkWord->SetFactor(m_factorType, factor);
		chunkContext.push_back(chunkWord);

		// create context in reverse 'cos we skip words we don't want
		for (int currPos = (int)contextFactor.size() - 2 ; currPos >= 0 && chunkContext.size() < m_realNGramOrder ; --currPos )
		{
			const Word &word = *contextFactor[currPos];
			factor = word[m_factorType];
			// NOTE(review): this declaration shadows the outer strWord; harmless
			// here, but worth cleaning up.
			std::string strWord = factor->GetString();
			bool skip = strWord.find("---") == 0;
			if (skip)
				continue;

			// add word to chunked context
			Word* chunkWord = new Word;
			chunkWord->SetFactor(m_factorType, factor);
			chunkContext.push_back(chunkWord);
		}

		// create context factor the right way round
		std::reverse(chunkContext.begin(), chunkContext.end());

		// calc score on chunked phrase; temporary Words are freed afterwards
		float ret = m_lmImpl->GetValue(chunkContext, finalState, len);

		RemoveAllInColl(chunkContext);

		return ret;
	}
};
}
#endif

269
src/LexicalReordering.cpp Normal file
View File

@ -0,0 +1,269 @@
#include "LexicalReordering.h"
#include "StaticData.h"
namespace Moses
{
/** Construct a lexicalized reordering feature: registers it with StaticData,
 *  decodes the direction/condition options into their atomic parts, selects
 *  the factor masks per conditioning variable, and loads the score table.
 */
LexicalReordering::LexicalReordering(const std::string &filePath,
									 const std::vector<float>& weights,
									 Direction direction,
									 Condition condition,
									 std::vector< FactorType >& f_factors,
									 std::vector< FactorType >& e_factors)
  : m_NumScoreComponents(weights.size()), m_MaxContextLength(0)
{
  std::cerr << "Creating lexical reordering...\n";
  //add ScoreProducer
  const_cast<ScoreIndexManager&>(StaticData::Instance().GetScoreIndexManager()).AddScoreProducer(this);
  const_cast<StaticData&>(StaticData::Instance()).SetWeightsForScoreProducer(this, weights);
  std::cerr << "weights: ";
  for(size_t w = 0; w < weights.size(); ++w){
    std::cerr << weights[w] << " ";
  }
  std::cerr << "\n";
  m_Direction = DecodeDirection(direction);
  m_Condition = DecodeCondition(condition);

  //m_FactorsE = e_factors;
  //m_FactorsF = f_factors;
  //Todo:should check that
  //- if condition contains e or c than e_factors non empty
  //- if condition contains f f_factors non empty
  for(size_t i = 0; i < m_Condition.size(); ++i){
    switch(m_Condition[i]){
    case E:
      m_FactorsE = e_factors;
      if(m_FactorsE.empty()){
        //problem
        std::cerr << "Problem e factor mask is unexpectedly empty\n";
      }
      break;
    case F:
      m_FactorsF = f_factors;
      if(m_FactorsF.empty()){
        //problem
        std::cerr << "Problem f factor mask is unexpectedly empty\n";
      }
      break;
    case C:
      // context conditioning uses the target-side factors and a 1-word window
      m_FactorsC         = e_factors;
      m_MaxContextLength = 1;
      if(m_FactorsC.empty()){
        //problem
        std::cerr << "Problem c factor mask is unexpectedly empty\n";
      }
      break;
    default:
      //problem
      std::cerr << "Unknown conditioning option!\n";
      break;
    }
  }
  // one weight per direction means orientations share a single score slot
  if(weights.size() == m_Direction.size()){
    m_OneScorePerDirection = true;
    std::cerr << "Reordering types NOT individualy weighted!\n";
  } else {
    m_OneScorePerDirection = false;
  }
  m_Table = LexicalReorderingTable::LoadAvailable(filePath, m_FactorsF, m_FactorsE, m_FactorsC);
}
// Release the owned score table.
LexicalReordering::~LexicalReordering(){
  // delete of a null pointer is a no-op, so no guard is needed
  delete m_Table;
}
/** Compute this feature's score vector for one hypothesis.
 *  For each configured direction, the cached table probabilities are read
 *  (from the previous hypothesis for Forward, the current one for Backward),
 *  the orientation is classified, and the matching probability is copied
 *  into the output slot. Unused slots stay 0.
 */
std::vector<float> LexicalReordering::CalcScore(Hypothesis* hypothesis) const {
  std::vector<float> score(GetNumScoreComponents(), 0);
  std::vector<float> values;

  //for every direction
  for(size_t i = 0; i < m_Direction.size(); ++i){
    //grab data
    if(Forward == m_Direction[i]){
      //relates to prev hypothesis as we dont know next phrase for current yet
      //sanity check: is there a previous hypothesis?
      if(0 == hypothesis->GetPrevHypo()->GetId()){
        continue; //no score continue with next direction
      }
      //grab probs for prev hypothesis
      const ScoreComponentCollection &reorderingScoreColl =
        hypothesis->GetPrevHypo()->GetCachedReorderingScore();
      values = reorderingScoreColl.GetScoresForProducer(this);
      /*
      values = m_Table->GetScore((hypothesis->GetPrevHypo()->GetSourcePhrase()).GetSubString(hypothesis->GetPrevHypo()->GetCurrSourceWordsRange()),
                                 hypothesis->GetPrevHypo()->GetCurrTargetPhrase(),
                                 auxGetContext(hypothesis->GetPrevHypo()));
      */
    }
    if(Backward == m_Direction[i])
    {
      const ScoreComponentCollection &reorderingScoreColl =
        hypothesis->GetCachedReorderingScore();
      values = reorderingScoreColl.GetScoresForProducer(this);
      /*
      values = m_Table->GetScore(hypothesis->GetSourcePhrase().GetSubString(hypothesis->GetCurrSourceWordsRange()),
                                 hypothesis->GetCurrTargetPhrase(),
                                 auxGetContext(hypothesis));
      */
    }

    //add score
    //sanity check: do we have any probs?
    assert(values.size() == (GetNumOrientationTypes() * m_Direction.size()));

    OrientationType orientation = GetOrientationType(hypothesis);
    float value = values[orientation + i * GetNumOrientationTypes()];
    if(m_OneScorePerDirection){
      //one score per direction
      score[i] = value;
    } else {
      //one score per direction and orientation
      score[orientation + i * GetNumOrientationTypes()] = value;
    }
  }
  return score;
}
/** Collect up to m_MaxContextLength target words immediately preceding the
 *  given hypothesis's phrase, walking back over previous hypotheses.
 *  Returns an empty phrase for the empty (initial) hypothesis.
 */
Phrase LexicalReordering::auxGetContext(const Hypothesis* hypothesis) const {
  // CLEANUP: removed the unused local 'h' the original declared here.
  Phrase c(Output);
  if(0 == hypothesis->GetId()){
    return c;
  }
  while(0 != hypothesis->GetPrevHypo()->GetId() && c.GetSize() < m_MaxContextLength){
    hypothesis = hypothesis->GetPrevHypo();
    int needed = m_MaxContextLength - c.GetSize();
    const Phrase& p = hypothesis->GetCurrTargetPhrase();
    Phrase tmp(Output);
    if(needed > p.GetSize()){
      // whole phrase still fits into the remaining window
      tmp = p;
    } else {
      // only the last 'needed' words of this phrase
      WordsRange range(p.GetSize() - needed, p.GetSize()-1);
      tmp = p.GetSubString(range);
    }
    // prepend: Append() returns void, so splice via tmp
    tmp.Append(c); c = tmp;
  }
  return c;
}
// Expand a (possibly compound) conditioning option into its atomic parts,
// always in F, E, C order.
std::vector<LexicalReordering::Condition> LexicalReordering::DecodeCondition(LexicalReordering::Condition c){
  std::vector<LexicalReordering::Condition> atomic;
  switch(c){
  case FEC:
    atomic.push_back(F);
    atomic.push_back(E);
    atomic.push_back(C);
    break;
  case FE:
    atomic.push_back(F);
    atomic.push_back(E);
    break;
  case F:
  case E:
  case C:
    // already atomic
    atomic.push_back(c);
    break;
  }
  return atomic;
}
// Expand Bidirectional into Backward then Forward; anything else is atomic.
std::vector<LexicalReordering::Direction> LexicalReordering::DecodeDirection(LexicalReordering::Direction d){
  std::vector<Direction> dirs;
  if(d != Bidirectional){
    dirs.push_back(d);
  } else {
    dirs.push_back(Backward);
    dirs.push_back(Forward);
  }
  return dirs;
}
// Two-way classification: a phrase is Monotone if it directly continues the
// previous source phrase (or starts the sentence); everything else is
// NonMonotone.
LexicalReordering::OrientationType LexicalMonotonicReordering::GetOrientationType(Hypothesis* currHypothesis) const
{
  const Hypothesis* prev = currHypothesis->GetPrevHypo();
  const WordsRange curr = currHypothesis->GetCurrSourceWordsRange();
  if(0 == prev->GetId()){
    // sentence-initial phrase
    return (0 == curr.GetStartPos()) ? Monotone : NonMonotone;
  }
  const WordsRange prevRange = prev->GetCurrSourceWordsRange();
  return (prevRange.GetEndPos() == curr.GetStartPos()-1) ? Monotone : NonMonotone;
}
// Three-way msd-style classification relative to the previous source phrase:
// Monotone (adjacent, same order), Swap (adjacent, reversed), else
// Discontinuous.
LexicalReordering::OrientationType LexicalOrientationReordering::GetOrientationType(Hypothesis* currHypothesis) const
{
  const Hypothesis* prev = currHypothesis->GetPrevHypo();
  const WordsRange curr = currHypothesis->GetCurrSourceWordsRange();
  if(0 == prev->GetId()){
    // sentence-initial phrase
    return (0 == curr.GetStartPos()) ? Monotone : Discontinuous;
  }
  const WordsRange prevRange = prev->GetCurrSourceWordsRange();
  if(prevRange.GetEndPos() == curr.GetStartPos()-1){
    return Monotone;
  }
  if(prevRange.GetStartPos() == curr.GetEndPos()+1){
    return Swap;
  }
  return Discontinuous;
}
// Two-way directional classification: Right when the current source phrase
// lies at or after the previous one (or starts the sentence), Left otherwise.
LexicalReordering::OrientationType LexicalDirectionalReordering::GetOrientationType(Hypothesis* currHypothesis) const{
  const Hypothesis* prev = currHypothesis->GetPrevHypo();
  const WordsRange curr = currHypothesis->GetCurrSourceWordsRange();
  if(0 == prev->GetId()){
    return Right;
  }
  const WordsRange prevRange = prev->GetCurrSourceWordsRange();
  return (prevRange.GetEndPos() <= curr.GetStartPos()) ? Right : Left;
}
// Look up the reordering scores for phrase pair (f,e) with an empty context.
Score LexicalReordering::GetProb(const Phrase& f, const Phrase& e) const
{
  return m_Table->GetScore(f, e, Phrase(Output));
}
/** Stateful-feature hook: accumulate this model's scores for the hypothesis.
 *  Currently returns no state (NULL) and reaches back into the hypothesis
 *  chain via CalcScore instead of using prev_state.
 */
FFState* LexicalReordering::Evaluate(
  const Hypothesis& hypo,
  const FFState* prev_state,
  ScoreComponentCollection* out) const {
  // const_cast is needed because CalcScore takes a non-const Hypothesis*
  out->PlusEquals(this, CalcScore(const_cast<Hypothesis*>(&hypo)));

  //TODO need to return proper state, calc score should not use previous
  //hypothesis, it should use the state.
  return NULL;
}
// No decoder-visible state is kept (see Evaluate), so the empty state is NULL.
const FFState* LexicalReordering::EmptyHypothesisState() const {
  return NULL;
}
}

159
src/LexicalReordering.h Normal file
View File

@ -0,0 +1,159 @@
#ifndef moses_LexicalReordering_h
#define moses_LexicalReordering_h
#include <string>
#include <vector>
#include "Factor.h"
#include "Phrase.h"
#include "TypeDef.h"
#include "Util.h"
#include "WordsRange.h"
#include "ScoreProducer.h"
#include "FeatureFunction.h"
#include "LexicalReorderingTable.h"
namespace Moses
{
class Factor;
class Phrase;
class Hypothesis;
class InputType;
// NOTE(review): 'using namespace std;' at namespace scope in a header leaks
// into every includer; kept for compatibility, but it should be removed and
// the few unqualified std names below qualified instead.
using namespace std;
/** Abstract base for lexicalized reordering models. Subclasses define the
 *  orientation taxonomy (GetNumOrientationTypes / GetOrientationType); this
 *  class owns the score table and turns orientations into feature scores.
 */
class LexicalReordering : public StatefulFeatureFunction {
 public: //types & consts
	typedef int OrientationType;
	enum Direction {Forward, Backward, Bidirectional, Unidirectional = Backward};
	enum Condition {F,E,C,FE,FEC};
 public: //con- & destructors
	LexicalReordering(const std::string &filePath,
					  const std::vector<float>& weights,
					  Direction direction,
					  Condition condition,
					  std::vector< FactorType >& f_factors,
					  std::vector< FactorType >& e_factors);
	virtual ~LexicalReordering();
 public: //interface
	//inherited
	virtual size_t GetNumScoreComponents() const {
		return m_NumScoreComponents;
	};

	virtual FFState* Evaluate(
		const Hypothesis& cur_hypo,
		const FFState* prev_state,
		ScoreComponentCollection* accumulator) const;

	const FFState* EmptyHypothesisState() const;

	virtual std::string GetScoreProducerDescription() const {
		return "Generic Lexical Reordering Model... overwrite in subclass.";
	};

	std::string GetScoreProducerWeightShortName() const {
		return "d";
	};

	//new
	virtual int GetNumOrientationTypes() const = 0;
	virtual OrientationType GetOrientationType(Hypothesis*) const = 0;

	// One score per component for the given hypothesis (see .cpp for layout).
	std::vector<float> CalcScore(Hypothesis* hypothesis) const;

	void InitializeForInput(const InputType& i){
		m_Table->InitializeForInput(i);
	}

	Score GetProb(const Phrase& f, const Phrase& e) const;
	//helpers
	static std::vector<Condition> DecodeCondition(Condition c);
	static std::vector<Direction> DecodeDirection(Direction d);
 private:
	Phrase auxGetContext(const Hypothesis* hypothesis) const;
 private:
	LexicalReorderingTable* m_Table;        // owned; deleted in the destructor
	size_t m_NumScoreComponents;
	std::vector< Direction > m_Direction;   // atomic directions (see DecodeDirection)
	std::vector< Condition > m_Condition;   // atomic conditions (see DecodeCondition)
	bool m_OneScorePerDirection;            // true when orientations share a weight
	std::vector< FactorType > m_FactorsE, m_FactorsF, m_FactorsC;
	int m_MaxContextLength;
};
class LexicalMonotonicReordering : public LexicalReordering {
private:
enum {Monotone = 0, NonMonotone = 1};
public:
LexicalMonotonicReordering(const std::string &filePath,
const std::vector<float>& w,
Direction direction,
Condition condition,
std::vector< FactorType >& f_factors,
std::vector< FactorType >& e_factors)
: LexicalReordering(filePath, w, direction, condition, f_factors, e_factors){
std::cerr << "Created lexical monotonic reordering\n";
}
public:
virtual int GetNumOrientationTypes() const {
return 2;
};
virtual std::string GetScoreProducerDescription() const {
return "MonotonicLexicalReorderingModel";
};
virtual int GetOrientationType(Hypothesis* currHypothesis) const;
};
/** Three-orientation (msd-style) model: Monotone / Swap / Discontinuous. */
class LexicalOrientationReordering : public LexicalReordering {
 private:
	enum {Monotone = 0, Swap = 1, Discontinuous = 2};
 public:
	LexicalOrientationReordering(const std::string &filePath,
								 const std::vector<float>& w,
								 Direction direction,
								 Condition condition,
								 std::vector< FactorType >& f_factors,
								 std::vector< FactorType >& e_factors)
	: LexicalReordering(filePath, w, direction, condition, f_factors, e_factors){
		std::cerr << "Created lexical orientation reordering\n";
	}
 public:
	virtual int GetNumOrientationTypes() const {
		return 3;
	}
	virtual std::string GetScoreProducerDescription() const {
		return "OrientationLexicalReorderingModel";
	};
	virtual OrientationType GetOrientationType(Hypothesis* currHypothesis) const;
};
/** Two-orientation directional model: Left vs Right movement. */
class LexicalDirectionalReordering : public LexicalReordering {
 private:
	enum {Left = 0, Right = 1};
 public:
	LexicalDirectionalReordering(const std::string &filePath,
								 const std::vector<float>& w,
								 Direction direction,
								 Condition condition,
								 std::vector< FactorType >& f_factors,
								 std::vector< FactorType >& e_factors)
	: LexicalReordering(filePath, w, direction, condition, f_factors, e_factors){
		std::cerr << "Created lexical directional Reordering\n";
	}
 public:
	virtual int GetNumOrientationTypes() const {
		return 2;
	};
	virtual std::string GetScoreProducerDescription() const {
		return "DirectionalLexicalReorderingModel";
	};
	virtual OrientationType GetOrientationType(Hypothesis* currHypothesis) const;
};
}
#endif

View File

@ -0,0 +1,686 @@
#include "LexicalReorderingTable.h"
#include "InputFileStream.h"
//#include "LVoc.h" //need IPhrase
#include "StaticData.h"
#include "PhraseDictionary.h"
#include "GenerationDictionary.h"
#include "TargetPhrase.h"
#include "TargetPhraseCollection.h"
namespace Moses
{
/*
* local helper functions
*/
//cleans str of leading and tailing spaces
// Strip leading and trailing spaces from str (tabs and other whitespace are
// deliberately left alone, matching the original behavior).
// IDIOM: the hand-rolled index loops are replaced by the standard
// find_first_not_of / find_last_not_of pair.
std::string auxClearString(const std::string& str){
  const std::string::size_type first = str.find_first_not_of(' ');
  if(std::string::npos == first){
    return std::string();  // empty or all spaces
  }
  const std::string::size_type last = str.find_last_not_of(' ');
  return str.substr(first, last - first + 1);
}
// Append all ids of tail to head.
// IDIOM: a single range insert replaces the manual reserve + push_back loop
// (same effect, one call).
void auxAppend(IPhrase& head, const IPhrase& tail){
  head.insert(head.end(), tail.begin(), tail.end());
}
/*
* functions for LexicalReorderingTable
*/
/** Factory: pick the table implementation based on what is on disk.
 *  If a compiled binary index ("<filePath>.binlexr.idx") exists, use the
 *  on-disk tree table; otherwise load the plain-text table into memory.
 *  The caller owns the returned object.
 */
LexicalReorderingTable* LexicalReorderingTable::LoadAvailable(const std::string& filePath, const FactorList& f_factors, const FactorList& e_factors, const FactorList& c_factors){
  //decide use Tree or Memory table
  if(FileExists(filePath+".binlexr.idx")){
    //there exists a binary version use that
    return new LexicalReorderingTableTree(filePath, f_factors, e_factors, c_factors);
  } else {
    //use plain memory
    return new LexicalReorderingTableMemory(filePath, f_factors, e_factors, c_factors);
  }
}
/*
* functions for LexicalReorderingTableMemory
*/
// Load the whole plain-text reordering table into memory at construction.
LexicalReorderingTableMemory::LexicalReorderingTableMemory(
  const std::string& filePath,
  const std::vector<FactorType>& f_factors,
  const std::vector<FactorType>& e_factors,
  const std::vector<FactorType>& c_factors)
  : LexicalReorderingTable(f_factors, e_factors, c_factors)
{
  LoadFromFile(filePath);
}
LexicalReorderingTableMemory::~LexicalReorderingTableMemory(){
}
// Looks up the score vector for (f,e), trying the full context c first and
// then successively shorter suffixes; returns an empty Score on no match.
std::vector<float> LexicalReorderingTableMemory::GetScore(const Phrase& f,
                                                          const Phrase& e,
                                                          const Phrase& c) {
  //rather complicated because of const can't use []... as [] might enter new things into std::map
  //also can't have to be careful with words range if c is empty can't use c.GetSize()-1 will underflow and be large
  TableType::const_iterator r;
  std::string key;
  if(0 == c.GetSize()){
    // No context present: single exact lookup on (f,e).
    key = MakeKey(f,e,c);
    r = m_Table.find(key);
    if(m_Table.end() != r){
      return r->second;
    }
  } else {
    //right try from large to smaller context
    // NOTE(review): at i == c.GetSize() the WordsRange start exceeds its end;
    // presumably GetSubString then yields the empty context — confirm
    // WordsRange/GetSubString semantics before touching this loop bound.
    for(size_t i = 0; i <= c.GetSize(); ++i){
      Phrase sub_c(c.GetSubString(WordsRange(i,c.GetSize()-1)));
      key = MakeKey(f,e,sub_c);
      r = m_Table.find(key);
      if(m_Table.end() != r){
        return r->second;
      }
    }
  }
  // Unseen phrase pair: empty score vector.
  return Score();
}
void LexicalReorderingTableMemory::DbgDump(std::ostream* out) const{
TableType::const_iterator i;
for(i = m_Table.begin(); i != m_Table.end(); ++i){
*out << " key: '" << i->first << "' score: ";
*out << "(num scores: " << (i->second).size() << ")";
for(size_t j = 0; j < (i->second).size(); ++j){
*out << (i->second)[j] << " ";
}
*out << "\n";
}
};
// Builds the map key from the phrases' factor string representations,
// trimming spaces from each part before joining (see the string overload).
std::string LexicalReorderingTableMemory::MakeKey(const Phrase& f,
                                                  const Phrase& e,
                                                  const Phrase& c) const {
  /*
  std::string key;
  if(!m_FactorsF.empty()){
    key += f.GetStringRep(m_FactorsF);
  }
  if(!m_FactorsE.empty()){
    if(!key.empty()){
      key += " ||| ";
    }
    key += e.GetStringRep(m_FactorsE);
  }
  */
  return MakeKey(auxClearString(f.GetStringRep(m_FactorsF)),
                 auxClearString(e.GetStringRep(m_FactorsE)),
                 auxClearString(c.GetStringRep(m_FactorsC)));
}
// Joins the parts with "|||". Note the asymmetry, kept from the original:
// f is included only when the *string* is non-empty, while e and c are
// included whenever their factor masks are configured, even if empty —
// this keeps the separator positions stable for configured factors.
std::string LexicalReorderingTableMemory::MakeKey(const std::string& f,
                                                  const std::string& e,
                                                  const std::string& c) const{
  std::string key(f);
  if(!m_FactorsE.empty()){
    if(!key.empty()){
      key += "|||";
    }
    key += e;
  }
  if(!m_FactorsC.empty()){
    if(!key.empty()){
      key += "|||";
    }
    key += c;
  }
  return key;
}
// Reads a plain-text (optionally gzipped) reordering table of the form
//   f ||| e ||| c ||| p1 p2 ... pk
// (parts present according to the configured factor masks) into m_Table.
// Probabilities are log-transformed and floored on the way in.
// Terminates the process if a line has an inconsistent number of scores.
void LexicalReorderingTableMemory::LoadFromFile(const std::string& filePath){
  std::string fileName = filePath;
  // Fall back to the gzipped variant when the plain file is absent.
  if(!FileExists(fileName) && FileExists(fileName+".gz")){
    fileName += ".gz";
  }
  InputFileStream file(fileName);
  std::string line(""), key("");
  int numScores = -1; // -1 until the first line fixes the expected count
  std::cerr << "Loading table into memory...";
  while(!getline(file, line).eof()){
    std::vector<std::string> tokens = TokenizeMultiCharSeparator(line, "|||");
    int t = 0;
    std::string f(""),e(""),c("");
    if(!m_FactorsF.empty()){
      //there should be something for f
      f = auxClearString(tokens.at(t));
      ++t;
    }
    if(!m_FactorsE.empty()){
      //there should be something for e
      e = auxClearString(tokens.at(t));
      ++t;
    }
    if(!m_FactorsC.empty()){
      //there should be something for c
      c = auxClearString(tokens.at(t));
      ++t;
    }
    //last token are the probs
    std::vector<float> p = Scan<float>(Tokenize(tokens.at(t)));
    //sanity check: all lines must have equal number of probs
    if(-1 == numScores){
      numScores = (int)p.size(); //set in first line
    }
    if((int)p.size() != numScores){
      TRACE_ERR( "found inconsistent number of probabilities... found " << p.size() << " expected " << numScores << std::endl);
      // Bug fix: was exit(0), which reported success to the caller on a
      // fatal input-format error; a non-zero status signals failure.
      exit(1);
    }
    std::transform(p.begin(),p.end(),p.begin(),TransformScore);
    std::transform(p.begin(),p.end(),p.begin(),FloorScore);
    //save it all into our map
    m_Table[MakeKey(f,e,c)] = p;
  }
  std::cerr << "done.\n";
}
/*
* functions for LexicalReorderingTableTree
*/
// Constructor: opens the binary prefix-tree table (<filePath>.binlexr.*).
// Candidate caching starts disabled; InitializeForInput decides per sentence.
LexicalReorderingTableTree::LexicalReorderingTableTree(
  const std::string& filePath,
  const std::vector<FactorType>& f_factors,
  const std::vector<FactorType>& e_factors,
  const std::vector<FactorType>& c_factors)
  : LexicalReorderingTable(f_factors, e_factors, c_factors), m_UseCache(false), m_FilePath(filePath)
{
  m_Table.reset(new PrefixTreeMap());
  m_Table->Read(m_FilePath+".binlexr");
}
// m_Table is a smart pointer; the tree is released automatically.
LexicalReorderingTableTree::~LexicalReorderingTableTree(){
}
// Score lookup for the tree-backed table: consults the per-sentence cache
// first, then the on-disk prefix tree; stores the fetched candidates in the
// cache slot when caching is enabled.
Score LexicalReorderingTableTree::GetScore(const Phrase& f, const Phrase& e, const Phrase& c) {
  if( (!m_FactorsF.empty() && 0 == f.GetSize())
      || (!m_FactorsE.empty() && 0 == e.GetSize())){
    //NOTE: no check for c as c might be empty, e.g. start of sentence
    //not a proper key
    // phi: commented out, since e may be empty (drop-unknown)
    //std::cerr << "Not a proper key!\n";
    return Score();
  }
  CacheType::iterator i;;
  if(m_UseCache){
    // insert() doubles as a lookup: when the key already exists we get the
    // cached candidates back; otherwise we reserve the slot to fill below.
    std::pair<CacheType::iterator, bool> r = m_Cache.insert(std::make_pair(MakeCacheKey(f,e),Candidates()));
    if(!r.second){
      return auxFindScoreForContext((r.first)->second, c);
    }
    i = r.first;
  } else if(!m_Cache.empty()) {
    //although we might not be caching now, cache might be none empty!
    i = m_Cache.find(MakeCacheKey(f,e));
    if(i != m_Cache.end()){
      return auxFindScoreForContext(i->second, c);
    }
  }
  //not in cache go to file...
  Score score;
  Candidates cands;
  m_Table->GetCandidates(MakeTableKey(f,e), &cands);
  if(cands.empty()){
    return Score();
  }
  if(m_FactorsC.empty()){
    // Without context factors exactly one candidate entry can exist.
    assert(1 == cands.size());
    return cands[0].GetScore(0);
  } else {
    score = auxFindScoreForContext(cands, c);
  }
  //cache for future use
  if(m_UseCache){
    i->second = cands;
  }
  return score;
};
// Matches the context phrase against the candidate list: tries the full
// context first, then successively shorter suffixes, returning the score of
// the first candidate whose stored context phrase matches. Returns an empty
// Score when nothing matches.
Score LexicalReorderingTableTree::auxFindScoreForContext(const Candidates& cands, const Phrase& context){
  if(m_FactorsC.empty()){
    // No context factors: at most one candidate can exist.
    assert(cands.size() <= 1);
    return (1 == cands.size())?(cands[0].GetScore(0)):(Score());
  } else {
    std::vector<std::string> cvec;
    for(size_t i = 0; i < context.GetSize(); ++i){
      cvec.push_back(context.GetWord(i).GetString(m_FactorsC, false));
    }
    IPhrase c = m_Table->ConvertPhrase(cvec,TargetVocId);
    IPhrase sub_c;
    // Index-based suffix loop; the original advanced an iterator one step
    // past c.end() on the final increment, which is undefined behaviour.
    for(size_t j = 0; j <= c.size(); ++j){
      sub_c.assign(c.begin() + j, c.end());
      for(size_t cand = 0; cand < cands.size(); ++cand){
        // Reuse the fetched phrase instead of calling GetPhrase(0) twice.
        IPhrase p = cands[cand].GetPhrase(0);
        if(p == sub_c){
          return cands[cand].GetScore(0);
        }
      }
    }
    return Score();
  }
}
/*
void LexicalReorderingTableTree::DbgDump(std::ostream* pout){
std::ostream& out = *pout;
//TODO!
}
*/
void LexicalReorderingTableTree::InitializeForInput(const InputType& input){
ClearCache();
if(ConfusionNet const* cn = dynamic_cast<ConfusionNet const*>(&input)){
Cache(*cn);
} else if(Sentence const* s = dynamic_cast<Sentence const*>(&input)){
// Cache(*s); ... this just takes up too much memory, we cache elsewhere
DisableCache();
}
if (!m_Table.get()) {
//load thread specific table.
m_Table.reset(new PrefixTreeMap());
m_Table->Read(m_FilePath+".binlexr");
}
};
// Binarizes a plain-text reordering table (read from inFile, which must be
// grouped by source phrase) into the on-disk prefix-tree representation:
// <out>.binlexr.srctree / .tgtdata / .idx / .voc0 / .voc1.
// Returns false when the same source key appears in non-adjacent groups.
bool LexicalReorderingTableTree::Create(std::istream& inFile,
                                        const std::string& outFileName){
  std::string line;
  //TRACE_ERR("Entering Create...\n");
  std::string
    ofn(outFileName+".binlexr.srctree"),
    oft(outFileName+".binlexr.tgtdata"),
    ofi(outFileName+".binlexr.idx"),
    ofsv(outFileName+".binlexr.voc0"),
    oftv(outFileName+".binlexr.voc1");
  FILE *os = fOpen(ofn.c_str(),"wb");
  FILE *ot = fOpen(oft.c_str(),"wb");
  //TRACE_ERR("opend files....\n");
  typedef PrefixTreeSA<LabelId,OFF_T> PSA;
  PSA *psa = new PSA;
  PSA::setDefault(InvalidOffT);
  // NOTE(review): voc[] is only initialized below for first lines with 2, 3
  // or 4 tokens; any other token count leaves it uninitialized before use.
  WordVoc* voc[3];
  LabelId currFirstWord = InvalidLabelId;
  IPhrase currKey;
  Candidates cands;
  std::vector<OFF_T> vo; // per-first-word offsets into the srctree file
  size_t lnc = 0;
  size_t numTokens = 0;
  size_t numKeyTokens = 0;
  while(getline(inFile, line)){
    //TRACE_ERR(lnc<<":"<<line<<"\n");
    ++lnc;
    if(0 == lnc % 10000){
      TRACE_ERR(".");
    }
    IPhrase key;
    Score score;
    std::vector<std::string> tokens = TokenizeMultiCharSeparator(line, "|||");
    std::string w;
    if(1 == lnc){
      //do some init stuff in the first line
      numTokens = tokens.size();
      if(tokens.size() == 2){ //f ||| score
        numKeyTokens = 1;
        voc[0] = new WordVoc();
        voc[1] = 0;
      } else if(3 == tokens.size() || 4 == tokens.size()){ //either f ||| e ||| score or f ||| e ||| c ||| score
        numKeyTokens = 2;
        voc[0] = new WordVoc(); //f voc
        voc[1] = new WordVoc(); //e voc
        voc[2] = voc[1]; //c & e share voc
      }
    } else {
      //sanity check ALL lines must have same number of tokens
      assert(numTokens == tokens.size());
    }
    // Build the integer key: f words, then MagicWord separator, then e words.
    int phrase = 0;
    for(; phrase < numKeyTokens; ++phrase){
      //conditioned on more than just f... need |||
      if(phrase >=1){
        key.push_back(PrefixTreeMap::MagicWord);
      }
      std::istringstream is(tokens[phrase]);
      while(is >> w) {
        key.push_back(voc[phrase]->add(w));
      }
    }
    //collect all non key phrases, i.e. c
    std::vector<IPhrase> tgt_phrases;
    tgt_phrases.resize(numTokens - numKeyTokens - 1);
    for(int j = 0; j < tgt_phrases.size(); ++j, ++phrase){
      std::istringstream is(tokens[numKeyTokens + j]);
      while(is >> w) {
        tgt_phrases[j].push_back(voc[phrase]->add(w));
      }
    }
    //last token is score
    std::istringstream is(tokens[numTokens-1]);
    while(is >> w) {
      score.push_back(atof(w.c_str()));
    }
    //transform score now...
    std::transform(score.begin(),score.end(),score.begin(),TransformScore);
    std::transform(score.begin(),score.end(),score.begin(),FloorScore);
    std::vector<Score> scores;
    scores.push_back(score);
    if(key.empty()) {
      TRACE_ERR("WARNING: empty source phrase in line '"<<line<<"'\n");
      continue;
    }
    //first time inits
    if(currFirstWord == InvalidLabelId){
      currFirstWord = key[0];
    }
    if(currKey.empty()){
      currKey = key;
      //insert key into tree
      assert(psa);
      PSA::Data& d = psa->insert(key);
      if(d == InvalidOffT) {
        // Record where this key's candidates will start in the tgtdata file.
        d = fTell(ot);
      } else {
        TRACE_ERR("ERROR: source phrase already inserted (A)!\nline(" << lnc << "): '" << line << "\n");
        return false;
      }
    }
    if(currKey != key){
      //ok new key
      currKey = key;
      //a) write cands for old key
      cands.writeBin(ot);
      cands.clear();
      //b) check if we need to move on to new tree root
      if(key[0] != currFirstWord){
        // write key prefix tree to file and clear
        PTF pf;
        if(currFirstWord >= vo.size()){
          vo.resize(currFirstWord+1,InvalidOffT);
        }
        vo[currFirstWord] = fTell(os);
        pf.create(*psa, os);
        // clear
        delete psa; psa = new PSA;
        currFirstWord = key[0];
      }
      //c) insert key into tree
      assert(psa);
      PSA::Data& d = psa->insert(key);
      if(d == InvalidOffT) {
        d = fTell(ot);
      } else {
        TRACE_ERR("ERROR: source phrase already inserted (A)!\nline(" << lnc << "): '" << line << "\n");
        return false;
      }
    }
    cands.push_back(GenericCandidate(tgt_phrases, scores));
  }
  //flush remainders
  cands.writeBin(ot);
  cands.clear();
  //process last currFirstWord
  PTF pf;
  if(currFirstWord >= vo.size()) {
    vo.resize(currFirstWord+1,InvalidOffT);
  }
  vo[currFirstWord] = fTell(os);
  pf.create(*psa,os);
  delete psa;
  psa=0;
  fClose(os);
  fClose(ot);
  /*
  std::vector<size_t> inv;
  for(size_t i = 0; i < vo.size(); ++i){
    if(vo[i] == InvalidOffT){
      inv.push_back(i);
    }
  }
  if(inv.size()) {
    TRACE_ERR("WARNING: there are src voc entries with no phrase "
              "translation: count "<<inv.size()<<"\n"
              "There exists phrase translations for "<<vo.size()-inv.size()
              <<" entries\n");
  }
  */
  // Write the root-offset index and the vocabularies.
  FILE *oi = fOpen(ofi.c_str(),"wb");
  fWriteVector(oi,vo);
  fClose(oi);
  if(voc[0]){
    voc[0]->Write(ofsv);
    delete voc[0];
  }
  if(voc[1]){
    voc[1]->Write(oftv);
    delete voc[1];
  }
  return true;
}
// Builds the string cache key "f|||e" from the configured factor
// representations, trimming surrounding spaces from each part.
std::string LexicalReorderingTableTree::MakeCacheKey(const Phrase& f,
                                                     const Phrase& e) const {
  std::string key;
  if(!m_FactorsF.empty()){
    key.append(auxClearString(f.GetStringRep(m_FactorsF)));
  }
  if(!m_FactorsE.empty()){
    if(!key.empty()){
      key.append("|||");
    }
    key.append(auxClearString(e.GetStringRep(m_FactorsE)));
  }
  return key;
}
// Converts (f,e) into the integer-phrase key used by the prefix tree:
// f's word ids, then the MagicWord separator, then e's word ids.
IPhrase LexicalReorderingTableTree::MakeTableKey(const Phrase& f,
                                                 const Phrase& e) const {
  IPhrase key;
  std::vector<std::string> keyPart;
  if(!m_FactorsF.empty()){
    // size_t indices: the original used int, comparing signed against
    // GetSize() (signed/unsigned mismatch warning).
    for(size_t i = 0; i < f.GetSize(); ++i){
      keyPart.push_back(f.GetWord(i).GetString(m_FactorsF, false));
    }
    auxAppend(key, m_Table->ConvertPhrase(keyPart, SourceVocId));
    keyPart.clear();
  }
  if(!m_FactorsE.empty()){
    if(!key.empty()){
      key.push_back(PrefixTreeMap::MagicWord);
    }
    for(size_t i = 0; i < e.GetSize(); ++i){
      keyPart.push_back(e.GetWord(i).GetString(m_FactorsE, false));
    }
    auxAppend(key, m_Table->ConvertPhrase(keyPart,TargetVocId));
    //keyPart.clear();
  }
  return key;
};
// Stack frame for the depth-first traversal in auxCacheForSrcPhrase:
// a tree position plus the target-word path that led to it.
struct State {
  State(PPimp* t, const std::string& p) : pos(t), path(p){
  }
  PPimp* pos;        // current position in the prefix tree
  std::string path;  // space-separated target words accumulated so far
};
// Pre-computes and caches candidate lists for every table entry whose source
// side equals f, by walking the e-subtree below f in the prefix tree.
void LexicalReorderingTableTree::auxCacheForSrcPhrase(const Phrase& f){
  if(m_FactorsE.empty()){
    //f is all of key...
    Candidates cands;
    m_Table->GetCandidates(MakeTableKey(f,Phrase(Output)),&cands);
    m_Cache[MakeCacheKey(f,Phrase(Output))] = cands;
  } else {
    ObjectPool<PPimp> pool;
    PPimp* pPos = m_Table->GetRoot();
    //1) goto subtree for f
    for(int i = 0; i < f.GetSize() && 0 != pPos && pPos->isValid(); ++i){
      /* old code
      pPos = m_Table.Extend(pPos, auxClearString(f.GetWord(i).ToString(m_FactorsF)), SourceVocId);
      */
      pPos = m_Table->Extend(pPos, f.GetWord(i).GetString(m_FactorsF, false), SourceVocId);
    }
    // Step over the f/e separator to reach the target-side subtree.
    if(0 != pPos && pPos->isValid()){
      pPos = m_Table->Extend(pPos, PrefixTreeMap::MagicWord);
    }
    if(0 == pPos || !pPos->isValid()){
      return; // f not in the table: nothing to cache
    }
    //2) explore whole subtree depth first & cache
    std::string cache_key = auxClearString(f.GetStringRep(m_FactorsF)) + "|||";
    std::vector<State> stack;
    stack.push_back(State(pool.get(PPimp(pPos->ptr()->getPtr(pPos->idx),0,0)),""));
    Candidates cands;
    while(!stack.empty()){
      if(stack.back().pos->isValid()){
        LabelId w = stack.back().pos->ptr()->getKey(stack.back().pos->idx);
        std::string next_path = stack.back().path + " " + m_Table->ConvertWord(w,TargetVocId);
        //cache this
        m_Table->GetCandidates(*stack.back().pos,&cands);
        if(!cands.empty()){
          m_Cache[cache_key + auxClearString(next_path)] = cands;
        }
        cands.clear();
        // Descend into the child, then advance this frame to its next sibling.
        PPimp* next_pos = pool.get(PPimp(stack.back().pos->ptr()->getPtr(stack.back().pos->idx),0,0));
        ++stack.back().pos->idx;
        stack.push_back(State(next_pos,next_path));
      } else {
        stack.pop_back();
      }
    }
  }
}
// Confusion-network input is not pre-cached: intentional no-op (a phrase/
// generation-table based implementation exists only in comments below).
void LexicalReorderingTableTree::Cache(const ConfusionNet& input){
  return;
}
// Pre-caches reordering candidates for every sub-phrase of the sentence and
// reports how many new cache entries were created.
void LexicalReorderingTableTree::Cache(const Sentence& input){
  //only works with sentences...
  int prev_cache_size = m_Cache.size();
  int max_phrase_length = input.GetSize();
  // NOTE(review): len appears to be phrase-length-minus-one (WordsRange end
  // is start+len); at len == GetSize(), start == 0 gives an end index equal
  // to GetSize(), which looks one past the last word — confirm WordsRange
  // bounds before relying on the upper loop limit.
  for(size_t len = 0; len <= max_phrase_length; ++len){
    for(size_t start = 0; start+len <= input.GetSize(); ++start){
      Phrase f = input.GetSubString(WordsRange(start, start+len));
      auxCacheForSrcPhrase(f);
    }
  }
  std::cerr << "Cached " << m_Cache.size() - prev_cache_size << " new primary reordering table keys\n";
}
/*
Pre fetching implementation using Phrase and Generation Dictionaries
*//*
void LexicalReorderingTableTree::Cache(const ConfusionNet& input){
typedef TargetPhraseCollection::iterator Iter;
typedef TargetPhraseCollection::const_iterator ConstIter;
//not implemented for confusion networks...
Sentence const* s = dynamic_cast<Sentence const*>(&input);
if(!s){
return;
}
int max_phrase_length = input.GetSize();
std::vector<PhraseDictionaryBase*> PhraseTables = StaticData::Instance()->GetPhraseDictionaries();
//new code:
//std::vector<PhraseDictionary*> PhraseTables = StaticData::Instance()->GetPhraseDictionaries();
std::vector<GenerationDictionary*> GenTables = StaticData::Instance()->GetGenerationDictionaries();
for(size_t len = 1; len <= max_phrase_length; ++len){
for(size_t start = 0; start+len <= input.GetSize(); ++start){
Phrase f = s->GetSubString(WordsRange(start, start+len));
//find all translations of f
TargetPhraseCollection list;
for(size_t t = 0; t < PhraseTables.size(); ++t){
//if(doIntersect(PhraseTables[t]->GetOutputFactorMask(),FactorMask(m_FactorsE))){
//this table gives us something we need
const TargetPhraseCollection* new_list = PhraseTables[t]->GetTargetPhraseCollection(f);
TargetPhraseCollection curr_list;
for(ConstIter i = new_list->begin(); i != new_list->end(); ++i){
for(Iter j = list.begin(); j != list.end(); ++j){
curr_list.Add((*j)->MergeNext(*(*i)));
}
}
if(list.IsEmpty()){
list = *new_list;
} else {
list = curr_list;
}
//}
}
for(size_t g = 0; g < GenTables.size(); ++g){
//if(doIntersect(GenTables[g]->GetOutputFactorMask(),FactorMask(m_FactorsE))){
TargetPhraseCollection curr_list;
for(Iter j = list.begin(); j != list.end(); ++j){
for(size_t w = 0; w < (*j)->GetSize(); ++w){
const OutputWordCollection* words = GenTables[g]->FindWord((*j)->GetWord(w));
for(OutputWordCollection::const_iterator i = words->begin(); i != words->end(); ++i){
TargetPhrase* p = new TargetPhrase(*(*j));
Word& pw = p->GetWord(w);
pw.Merge(i->first);
curr_list.Add(p);
}
}
}
list = curr_list;
//}
}
//cache for each translation
for(Iter e = list.begin(); e < list.end(); ++e){
Candidates cands;
m_Table.GetCandidates(MakeTableKey(f,*(*e)), &cands);
m_Cache.insert(std::make_pair(MakeCacheKey(f,*(*e)),cands));
}
}
}
};
*/
}

View File

@ -0,0 +1,158 @@
#ifndef moses_LexicalReorderingTable_h
#define moses_LexicalReorderingTable_h
//stdlib dependencies:
#include <vector>
#include <map>
#include <memory>
#include <string>
#include <iostream>
#ifdef WITH_THREADS
#include <boost/thread/tss.hpp>
#endif
//moses dependencies:
#include "TypeDef.h"
#include "Phrase.h"
#include "InputType.h"
#include "ConfusionNet.h"
#include "Sentence.h"
#include "PrefixTreeMap.h"
namespace Moses
{
class Phrase;
class InputType;
class ConfusionNet;
//additional types
typedef std::vector<float> Score;
typedef std::vector<FactorType> FactorList;
// Abstract base for lexicalized-reordering score tables, keyed on the factor
// string representations of (source phrase f, target phrase e, context c).
class LexicalReorderingTable {
public:
  LexicalReorderingTable(const FactorList& f_factors, const FactorList& e_factors, const FactorList& c_factors)
    : m_FactorsF(f_factors), m_FactorsE(e_factors), m_FactorsC(c_factors) {
  }
  virtual ~LexicalReorderingTable(){
  }
public:
  // Factory: picks the binary (tree) or plain-text (memory) implementation
  // depending on which files exist at filePath.
  static LexicalReorderingTable* LoadAvailable(const std::string& filePath, const FactorList& f_factors, const FactorList& e_factors, const FactorList& c_factors);
public:
  // Returns the score vector for (f,e,c); empty Score when the entry is absent.
  virtual Score GetScore(const Phrase& f, const Phrase& e, const Phrase& c) = 0;
  virtual void InitializeForInput(const InputType&){
    /* override for on-demand loading */
  };
  virtual void InitializeForInputPhrase(const Phrase&){
  };
  /*
  int GetNumScoreComponents() const {
    return m_NumScores;
  }
  */
  // Accessors for the factor masks used to build lookup keys.
  const FactorList& GetFFactorMask() const {
    return m_FactorsF;
  }
  const FactorList& GetEFactorMask() const {
    return m_FactorsE;
  }
  const FactorList& GetCFactorMask() const {
    return m_FactorsC;
  }
  virtual void DbgDump(std::ostream* out) const{
    *out << "Overwrite in subclass...\n";
  };
protected:
  FactorList m_FactorsF;
  FactorList m_FactorsE;
  FactorList m_FactorsC;
};
// In-memory implementation: the whole plain-text table is held in one
// std::map from "f|||e|||c" key strings to score vectors.
class LexicalReorderingTableMemory : public LexicalReorderingTable {
  //implements LexicalReorderingTable saving all scores in one large std::map<> thingy
  //to be used for non binary tables... uses a LOT of memory
public:
  LexicalReorderingTableMemory( const std::string& filePath,
                                const std::vector<FactorType>& f_factors,
                                const std::vector<FactorType>& e_factors,
                                const std::vector<FactorType>& c_factors);
  virtual ~LexicalReorderingTableMemory();
public:
  virtual std::vector<float> GetScore(const Phrase& f, const Phrase& e, const Phrase& c);
  void DbgDump(std::ostream* out) const;
private:
  // Key builders: parts joined with "|||" according to the factor masks.
  std::string MakeKey(const Phrase& f, const Phrase& e, const Phrase& c) const;
  std::string MakeKey(const std::string& f, const std::string& e, const std::string& c) const;
  void LoadFromFile(const std::string& filePath);
private:
  typedef std::map< std::string, std::vector<float> > TableType;
  TableType m_Table;
};
// Prefix-tree-backed implementation reading the binary ".binlexr.*" files,
// with an optional per-sentence candidate cache.
class LexicalReorderingTableTree : public LexicalReorderingTable {
  //implements LexicalReorderingTable using the crafty PDT code...
public:
  LexicalReorderingTableTree(const std::string& filePath,
                             const std::vector<FactorType>& f_factors,
                             const std::vector<FactorType>& e_factors,
                             const std::vector<FactorType>& c_factors);
  ~LexicalReorderingTableTree();
public:
  // Cache control: GetScore consults m_Cache before hitting the tree.
  bool IsCacheEnabled() const {
    return m_UseCache;
  };
  void EnableCache() {
    m_UseCache = true;
  };
  void DisableCache() {
    m_UseCache = false;
  };
  void ClearCache(){
    if (m_UseCache) {
      m_Cache.clear();
    }
  };
  virtual std::vector<float> GetScore(const Phrase& f, const Phrase& e, const Phrase& c);
  virtual void InitializeForInput(const InputType& input);
  virtual void InitializeForInputPhrase(const Phrase& f){
    ClearCache();
    auxCacheForSrcPhrase(f);
  }
public:
  // Binarizes a plain-text table (grouped by source phrase) into the
  // ".binlexr.*" files; returns false on malformed/ungrouped input.
  static bool Create(std::istream& inFile, const std::string& outFileName);
private:
  std::string MakeCacheKey(const Phrase& f, const Phrase& e) const;
  IPhrase MakeTableKey(const Phrase& f, const Phrase& e) const;
  void Cache(const ConfusionNet& input);
  void Cache(const Sentence& input);
  void auxCacheForSrcPhrase(const Phrase& f);
  Score auxFindScoreForContext(const Candidates& cands, const Phrase& contex);
private:
  //typedef LexicalReorderingCand CandType;
  typedef std::map< std::string, Candidates > CacheType;
#ifdef WITH_THREADS
  // Thread builds use a thread-local table pointer.
  typedef boost::thread_specific_ptr<PrefixTreeMap> TableType;
#else
  typedef std::auto_ptr<PrefixTreeMap> TableType;
#endif
  static const int SourceVocId = 0;
  static const int TargetVocId = 1;
  bool m_UseCache;
  std::string m_FilePath;
  CacheType m_Cache;
  TableType m_Table;
};
}
#endif

230
src/Makefile.am Normal file
View File

@ -0,0 +1,230 @@
lib_LTLIBRARIES = libmoses.la
AM_CPPFLAGS = -W -Wall -ffor-scope -D_FILE_OFFSET_BITS=64 -D_LARGE_FILES $(BOOST_CPPFLAGS)
libmoses_ladir = ${includedir}
libmoses_la_HEADERS = \
TypeDef.h \
PrefixTree.h \
File.h \
FilePtr.h \
ObjectPool.h \
BitmapContainer.h \
ConfusionNet.h \
DecodeGraph.h \
DecodeStep.h \
DecodeStepGeneration.h \
DecodeStepTranslation.h \
Dictionary.h \
DummyScoreProducers.h \
Factor.h \
FactorCollection.h \
FactorTypeSet.h \
FeatureFunction.h \
FFState.h \
FloydWarshall.h \
GenerationDictionary.h \
GlobalLexicalModel.h \
hash.h \
Hypothesis.h \
HypothesisStack.h \
HypothesisStackCubePruning.h \
HypothesisStackNormal.h \
InputType.h \
InputFileStream.h \
LMList.h \
LVoc.h \
LanguageModel.h \
LanguageModelFactory.h \
LanguageModelInternal.h \
LanguageModelMultiFactor.h \
LanguageModelRemote.h \
LanguageModelSingleFactor.h \
LanguageModelSkip.h \
TrellisPath.h \
TrellisPathList.h \
TrellisPathCollection.h \
LexicalReordering.h \
LexicalReorderingTable.h \
Manager.h \
NGramCollection.h \
NGramNode.h \
PCNTools.h \
Parameter.h \
PartialTranslOptColl.h \
Phrase.h \
PhraseDictionary.h \
PhraseDictionaryMemory.h \
PhraseDictionaryNode.h \
PhraseDictionaryTree.h \
PhraseDictionaryTreeAdaptor.h \
PrefixTreeMap.h \
ReorderingConstraint.h \
ScoreComponentCollection.h \
ScoreIndexManager.h \
ScoreProducer.h \
Search.h \
SearchCubePruning.h \
SearchNormal.h \
Sentence.h \
SentenceStats.h \
SquareMatrix.h \
StaticData.h \
TargetPhrase.h \
TargetPhraseCollection.h \
Timer.h \
TranslationOption.h \
TranslationOptionCollection.h \
TranslationOptionCollectionText.h \
TranslationOptionCollectionConfusionNet.h \
TranslationOptionList.h \
UserMessage.h \
Util.h \
Word.h \
WordsBitmap.h \
WordLattice.h \
WordsRange.h \
XmlOption.h
if PROTOBUF
libmoses_la_HEADERS += rule.pb.h hypergraph.pb.h
endif
if SRI_LM
libmoses_la_HEADERS += LanguageModelSRI.h
endif
if IRST_LM
libmoses_la_HEADERS += LanguageModelIRST.h
endif
if RAND_LM
libmoses_la_HEADERS += LanguageModelRandLM.h
endif
if INTERNAL_LM
libmoses_la_HEADERS += LanguageModelInternal.h \
NGramCollection.h \
NGramNode.h
endif
libmoses_la_SOURCES = \
BitmapContainer.cpp \
ConfusionNet.cpp \
DecodeGraph.cpp \
DecodeStep.cpp \
DecodeStepGeneration.cpp \
DecodeStepTranslation.cpp \
Dictionary.cpp \
DummyScoreProducers.cpp \
Factor.cpp \
FactorCollection.cpp \
FactorTypeSet.cpp \
FeatureFunction.cpp \
FFState.cpp \
FloydWarshall.cpp \
GenerationDictionary.cpp \
GlobalLexicalModel.cpp \
hash.cpp \
Hypothesis.cpp \
HypothesisStack.cpp \
HypothesisStackCubePruning.cpp \
HypothesisStackNormal.cpp \
InputType.cpp \
InputFileStream.cpp \
LMList.cpp \
LVoc.cpp \
LanguageModel.cpp \
LanguageModelFactory.cpp \
LanguageModelInternal.cpp \
LanguageModelMultiFactor.cpp \
LanguageModelRemote.cpp \
LanguageModelSingleFactor.cpp \
LanguageModelSkip.cpp \
TrellisPath.cpp \
TrellisPathCollection.cpp \
LexicalReordering.cpp \
LexicalReorderingTable.cpp \
Manager.cpp \
NGramCollection.cpp \
NGramNode.cpp \
PCNTools.cpp \
Parameter.cpp \
PartialTranslOptColl.cpp \
Phrase.cpp \
PhraseDictionary.cpp \
PhraseDictionaryMemory.cpp \
PhraseDictionaryNode.cpp \
PhraseDictionaryTree.cpp \
PhraseDictionaryTreeAdaptor.cpp \
PrefixTreeMap.cpp \
ReorderingConstraint.cpp \
ScoreComponentCollection.cpp \
ScoreIndexManager.cpp \
ScoreProducer.cpp \
Search.cpp \
SearchCubePruning.cpp \
SearchNormal.cpp \
Sentence.cpp \
SentenceStats.cpp \
SquareMatrix.cpp \
StaticData.cpp \
TargetPhrase.cpp \
TargetPhraseCollection.cpp \
Timer.cpp \
TranslationOption.cpp \
TranslationOptionCollection.cpp \
TranslationOptionCollectionText.cpp \
TranslationOptionCollectionConfusionNet.cpp \
TranslationOptionList.cpp \
UserMessage.cpp \
Util.cpp \
Word.cpp \
WordsBitmap.cpp \
WordLattice.cpp \
WordsRange.cpp \
XmlOption.cpp
if PROTOBUF
BUILT_SOURCES = \
rule.pb.h \
rule.pb.cc \
hypergraph.pb.h \
hypergraph.pb.cc
CLEANFILES = $(BUILT_SOURCES)
SUFFIXES = .proto
rule.pb.cc: rule.proto
@PROTOC@ --cpp_out=. $<
rule.pb.h: rule.proto
@PROTOC@ --cpp_out=. $<
hypergraph.pb.cc: hypergraph.proto
@PROTOC@ --cpp_out=. $<
hypergraph.pb.h: hypergraph.proto
@PROTOC@ --cpp_out=. $<
libmoses_la_SOURCES += rule.pb.cc hypergraph.pb.cc
endif
if SRI_LM
libmoses_la_SOURCES += LanguageModelSRI.cpp
endif
if IRST_LM
libmoses_la_SOURCES += LanguageModelIRST.cpp
endif
if RAND_LM
libmoses_la_SOURCES += LanguageModelRandLM.cpp
endif
if INTERNAL_LM
libmoses_la_SOURCES += LanguageModelInternal.cpp \
NGramCollection.cpp \
NGramNode.cpp
endif
libmoses_la_LIBADD = $(BOOST_LDFLAGS) $(BOOST_THREAD_LIB)

859
src/Manager.cpp Normal file
View File

@ -0,0 +1,859 @@
// $Id: Manager.cpp 2958 2010-03-08 16:30:31Z abarun $
// vim:tabstop=2
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2006 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#ifdef WIN32
#include <hash_set>
#else
#include <ext/hash_set>
#endif
#include <limits>
#include <cmath>
#include "Manager.h"
#include "TypeDef.h"
#include "Util.h"
#include "TargetPhrase.h"
#include "TrellisPath.h"
#include "TrellisPathCollection.h"
#include "TranslationOption.h"
#include "LMList.h"
#include "TranslationOptionCollection.h"
#include "DummyScoreProducers.h"
#if HAVE_CONFIG_H
#include "config.h"
#endif
#ifdef HAVE_PROTOBUF
#include "hypergraph.pb.h"
#include "rule.pb.h"
#endif
using namespace std;
namespace Moses
{
// Sets up per-sentence decoding state: the translation-option collection,
// the search object for the requested algorithm, and a start timestamp.
Manager::Manager(InputType const& source, SearchAlgorithm searchAlgorithm)
  :m_source(source)
  ,m_transOptColl(source.CreateTranslationOptionCollection())
  ,m_search(Search::CreateSearch(*this, source, searchAlgorithm, *m_transOptColl))
  ,m_start(clock())
  ,interrupted_flag(0)
{
  const StaticData &staticData = StaticData::Instance();
  staticData.InitializeBeforeSentenceProcessing(source);
}
// Releases per-sentence state and reports CPU decoding time.
Manager::~Manager()
{
  delete m_transOptColl;
  delete m_search;
  StaticData::Instance().CleanUpAfterSentenceProcessing();
  clock_t end = clock();
  float et = (end - m_start);
  et /= (float)CLOCKS_PER_SEC;
  VERBOSE(1, "Translation took " << et << " seconds" << endl);
  VERBOSE(1, "Finished translating" << endl);
}
/**
* Main decoder loop that translates a sentence by expanding
* hypotheses stack by stack, until the end of the sentence.
*/
void Manager::ProcessSentence()
{
  // reset statistics
  const StaticData &staticData = StaticData::Instance();
  ResetSentenceStats(m_source);
  // collect translation options for this sentence
  vector <DecodeGraph*>
    decodeStepVL = staticData.GetDecodeStepVL(m_source);
  m_transOptColl->CreateTranslationOptions(decodeStepVL);
  // some reporting on how long this took
  clock_t gotOptions = clock();
  float et = (gotOptions - m_start);
  IFVERBOSE(2) { GetSentenceStats().AddTimeCollectOpts( gotOptions - m_start ); }
  et /= (float)CLOCKS_PER_SEC;
  VERBOSE(1, "Collecting options took " << et << " seconds" << endl);
  // search for best translation with the specified algorithm
  m_search->ProcessSentence();
  VERBOSE(1, "Search took " << ((clock()-m_start)/(float)CLOCKS_PER_SEC) << " seconds" << endl);
  // the decode graphs were created for this sentence; free them here
  RemoveAllInColl(decodeStepVL);
}
/**
* Print all derivations in search graph. Note: The number of derivations is exponential in the sentence length
*
*/
void Manager::PrintAllDerivations(long translationId ) const
{
  const std::vector < HypothesisStack* > &hypoStackColl = m_search->GetHypothesisStacks();
  vector<const Hypothesis*> sortedPureHypo = hypoStackColl.back()->GetSortedList();
  if (sortedPureHypo.size() == 0)
    return;
  // Start with no pending phrases/score; the recursion accumulates both.
  float remainingScore = 0;
  vector<const TargetPhrase*> remainingPhrases;
  // add all pure paths
  vector<const Hypothesis*>::const_iterator iterBestHypo;
  for (iterBestHypo = sortedPureHypo.begin()
       ; iterBestHypo != sortedPureHypo.end()
       ; ++iterBestHypo)
  {
    printThisHypothesis(translationId, *iterBestHypo, remainingPhrases, remainingScore);
    printDivergentHypothesis(translationId, *iterBestHypo, remainingPhrases, remainingScore);
  }
}
// Recursively prints alternative derivations: walks back through the
// predecessor chain and, at each step, substitutes every recombined arc.
void Manager::printDivergentHypothesis(long translationId, const Hypothesis* hypo, const vector <const TargetPhrase*> & remainingPhrases, float remainingScore ) const
{
  //Backtrack from the predecessor
  if (hypo->GetId() > 0) {
    vector <const TargetPhrase*> followingPhrases;
    followingPhrases.push_back(& (hypo->GetCurrTargetPhrase()));
    ///((Phrase) hypo->GetPrevHypo()->GetTargetPhrase());
    // NOTE(review): `end()--` post-decrements a temporary iterator, so this
    // inserts at end(); if insertion before the last element was intended,
    // this is a bug — confirm against expected derivation output.
    followingPhrases.insert(followingPhrases.end()--, remainingPhrases.begin(), remainingPhrases.end());
    printDivergentHypothesis(translationId, hypo->GetPrevHypo(), followingPhrases , remainingScore + hypo->GetScore() - hypo->GetPrevHypo()->GetScore());
  }
  //Process the arcs
  const ArcList *pAL = hypo->GetArcList();
  if (pAL) {
    const ArcList &arcList = *pAL;
    // every possible Arc to replace this edge
    ArcList::const_iterator iterArc;
    for (iterArc = arcList.begin() ; iterArc != arcList.end() ; ++iterArc)
    {
      const Hypothesis *loserHypo = *iterArc;
      const Hypothesis* loserPrevHypo = loserHypo->GetPrevHypo();
      // Score contributed by taking this arc instead of the winning edge.
      float arcScore = loserHypo->GetScore() - loserPrevHypo->GetScore();
      vector <const TargetPhrase* > followingPhrases;
      followingPhrases.push_back(&(loserHypo->GetCurrTargetPhrase()));
      followingPhrases.insert(followingPhrases.end()--, remainingPhrases.begin(), remainingPhrases.end());
      printThisHypothesis(translationId, loserPrevHypo, followingPhrases, remainingScore + arcScore);
      printDivergentHypothesis(translationId, loserPrevHypo, followingPhrases, remainingScore + arcScore);
    }
  }
}
void Manager::printThisHypothesis(long translationId, const Hypothesis* hypo, const vector <const TargetPhrase*> & remainingPhrases, float remainingScore ) const
{
cerr << translationId << " ||| ";
//Yield of this hypothesis
hypo->ToStream(cerr);
for (size_t p = 0; p < remainingPhrases.size(); ++p) {
const TargetPhrase * phrase = remainingPhrases[p];
size_t size = phrase->GetSize();
for (size_t pos = 0 ; pos < size ; pos++)
{
const Factor *factor = phrase->GetFactor(pos, 0);
cerr << *factor;
cerr << " ";
}
}
cerr << "||| " << hypo->GetScore() + remainingScore;
cerr << endl;
}
/**
* After decoding, the hypotheses in the stacks and additional arcs
* form a search graph that can be mined for n-best lists.
* The heavy lifting is done in the TrellisPath and TrellisPathCollection
* this function controls this for one sentence.
*
* \param count the number of n-best translations to produce
* \param ret holds the n-best list that was calculated
*/
void Manager::CalcNBest(size_t count, TrellisPathList &ret,bool onlyDistinct) const
{
  if (count <= 0)
    return;
  const std::vector < HypothesisStack* > &hypoStackColl = m_search->GetHypothesisStacks();
  vector<const Hypothesis*> sortedPureHypo = hypoStackColl.back()->GetSortedList();
  if (sortedPureHypo.size() == 0)
    return;
  TrellisPathCollection contenders;
  set<Phrase> distinctHyps; // surface forms already emitted (distinct mode)
  // add all pure paths
  vector<const Hypothesis*>::const_iterator iterBestHypo;
  for (iterBestHypo = sortedPureHypo.begin()
       ; iterBestHypo != sortedPureHypo.end()
       ; ++iterBestHypo)
  {
    contenders.Add(new TrellisPath(*iterBestHypo));
  }
  // factor defines stopping point for distinct n-best list if too many candidates identical
  size_t nBestFactor = StaticData::Instance().GetNBestFactor();
  if (nBestFactor < 1) nBestFactor = 1000; // 0 = unlimited
  // MAIN loop
  for (size_t iteration = 0 ; (onlyDistinct ? distinctHyps.size() : ret.GetSize()) < count && contenders.GetSize() > 0 && (iteration < count * nBestFactor) ; iteration++)
  {
    // get next best from list of contenders
    TrellisPath *path = contenders.pop();
    assert(path);
    if(onlyDistinct)
    {
      // Keep the path only if its surface string has not been seen yet.
      // NOTE(review): a duplicate path is dropped without delete here —
      // confirm TrellisPath ownership; this looks like a leak.
      Phrase tgtPhrase = path->GetSurfacePhrase();
      if (distinctHyps.insert(tgtPhrase).second)
        ret.Add(path);
    }
    else
    {
      ret.Add(path);
    }
    // create deviations from current best
    path->CreateDeviantPaths(contenders);
    if(onlyDistinct)
    {
      const size_t nBestFactor = StaticData::Instance().GetNBestFactor();
      if (nBestFactor > 0)
        contenders.Prune(count * nBestFactor);
    }
    else
    {
      contenders.Prune(count);
    }
  }
}
/** Collect final per-sentence statistics for the best hypothesis and, at
 * verbosity >= 2, trace the phrase segmentation as a sequence of
 * "[<source range>:<target phrase>] " chunks. Chunks are built walking the
 * derivation backwards, so each one is prepended (insert at position 0) to
 * make the trace read left-to-right in source order.
 */
void Manager::CalcDecoderStatistics() const
{
	const Hypothesis *hypo = GetBestHypothesis();
	if (hypo != NULL)
	{
		GetSentenceStats().CalcFinalStats(*hypo);
		IFVERBOSE(2) {
			if (hypo != NULL) {
				string buff;  // accumulates the full segmentation trace
				string buff2; // chunk for the hypothesis currently visited
				TRACE_ERR( "Source and Target Units:"
									<< hypo->GetInput());
				// build the chunk for the final hypothesis
				buff2.insert(0,"] ");
				buff2.insert(0,(hypo->GetCurrTargetPhrase()).ToString());
				buff2.insert(0,":");
				buff2.insert(0,(hypo->GetCurrSourceWordsRange()).ToString());
				buff2.insert(0,"[");
				hypo = hypo->GetPrevHypo();
				while (hypo != NULL) {
					//dont print out the empty final hypo
					// (the chunk built for the initial empty hypothesis is the last
					// buff2 constructed and is never copied into buff)
					buff.insert(0,buff2);
					buff2.clear();
					buff2.insert(0,"] ");
					buff2.insert(0,(hypo->GetCurrTargetPhrase()).ToString());
					buff2.insert(0,":");
					buff2.insert(0,(hypo->GetCurrSourceWordsRange()).ToString());
					buff2.insert(0,"[");
					hypo = hypo->GetPrevHypo();
				}
				TRACE_ERR( buff << endl);
			}
		}
	}
}
void OutputWordGraph(std::ostream &outputWordGraphStream, const Hypothesis *hypo, size_t &linkId)
{
const StaticData &staticData = StaticData::Instance();
const Hypothesis *prevHypo = hypo->GetPrevHypo();
outputWordGraphStream << "J=" << linkId++
<< "\tS=" << prevHypo->GetId()
<< "\tE=" << hypo->GetId()
<< "\ta=";
// phrase table scores
const std::vector<PhraseDictionaryFeature*> &phraseTables = staticData.GetPhraseDictionaries();
std::vector<PhraseDictionaryFeature*>::const_iterator iterPhraseTable;
for (iterPhraseTable = phraseTables.begin() ; iterPhraseTable != phraseTables.end() ; ++iterPhraseTable)
{
const PhraseDictionaryFeature *phraseTable = *iterPhraseTable;
vector<float> scores = hypo->GetScoreBreakdown().GetScoresForProducer(phraseTable);
outputWordGraphStream << scores[0];
vector<float>::const_iterator iterScore;
for (iterScore = ++scores.begin() ; iterScore != scores.end() ; ++iterScore)
{
outputWordGraphStream << ", " << *iterScore;
}
}
// language model scores
outputWordGraphStream << "\tl=";
const LMList &lmList = staticData.GetAllLM();
LMList::const_iterator iterLM;
for (iterLM = lmList.begin() ; iterLM != lmList.end() ; ++iterLM)
{
LanguageModel *lm = *iterLM;
vector<float> scores = hypo->GetScoreBreakdown().GetScoresForProducer(lm);
outputWordGraphStream << scores[0];
vector<float>::const_iterator iterScore;
for (iterScore = ++scores.begin() ; iterScore != scores.end() ; ++iterScore)
{
outputWordGraphStream << ", " << *iterScore;
}
}
// re-ordering
outputWordGraphStream << "\tr=";
outputWordGraphStream << hypo->GetScoreBreakdown().GetScoreForProducer(staticData.GetDistortionScoreProducer());
// lexicalised re-ordering
const std::vector<LexicalReordering*> &lexOrderings = staticData.GetReorderModels();
std::vector<LexicalReordering*>::const_iterator iterLexOrdering;
for (iterLexOrdering = lexOrderings.begin() ; iterLexOrdering != lexOrderings.end() ; ++iterLexOrdering)
{
LexicalReordering *lexicalReordering = *iterLexOrdering;
vector<float> scores = hypo->GetScoreBreakdown().GetScoresForProducer(lexicalReordering);
outputWordGraphStream << scores[0];
vector<float>::const_iterator iterScore;
for (iterScore = ++scores.begin() ; iterScore != scores.end() ; ++iterScore)
{
outputWordGraphStream << ", " << *iterScore;
}
}
// words !!
outputWordGraphStream << "\tw=" << hypo->GetCurrTargetPhrase();
outputWordGraphStream << endl;
}
/** Write the whole search space for one sentence as a word graph, one link
 * per hypothesis expansion (and optionally per recombination arc).
 * \param translationId sentence id, written into the UTTERANCE header
 * \param outputWordGraphStream stream the graph is written to
 */
void Manager::GetWordGraph(long translationId, std::ostream &outputWordGraphStream) const
{
	const StaticData &staticData = StaticData::Instance();
	// NOTE(review): fileName is fetched but never used here; the initializer
	// still side-effects (indexes into the "output-word-graph" parameter), so
	// it is kept — confirm the parameter always has two arguments upstream.
	string fileName = staticData.GetParam("output-word-graph")[0];
	// second argument: whether recombination arcs are also written
	bool outputNBest = Scan<bool>(staticData.GetParam("output-word-graph")[1]);
	const std::vector < HypothesisStack* > &hypoStackColl = m_search->GetHypothesisStacks();
	outputWordGraphStream << "VERSION=1.0" << endl
						<< "UTTERANCE=" << translationId << endl;
	size_t linkId = 0;
	size_t stackNo = 1;
	std::vector < HypothesisStack* >::const_iterator iterStack;
	// start at the second stack: stack 0 only holds the empty initial hypothesis
	for (iterStack = ++hypoStackColl.begin() ; iterStack != hypoStackColl.end() ; ++iterStack)
	{
		cerr << endl << stackNo++ << endl; // progress trace (stack number) on stderr
		const HypothesisStack &stack = **iterStack;
		HypothesisStack::const_iterator iterHypo;
		for (iterHypo = stack.begin() ; iterHypo != stack.end() ; ++iterHypo)
		{
			const Hypothesis *hypo = *iterHypo;
			OutputWordGraph(outputWordGraphStream, hypo, linkId);
			if (outputNBest)
			{
				// also emit a link for every hypothesis recombined into this one
				const ArcList *arcList = hypo->GetArcList();
				if (arcList != NULL)
				{
					ArcList::const_iterator iterArcList;
					for (iterArcList = arcList->begin() ; iterArcList != arcList->end() ; ++iterArcList)
					{
						const Hypothesis *loserHypo = *iterArcList;
						OutputWordGraph(outputWordGraphStream, loserHypo, linkId);
					}
				}
			} //if (outputNBest)
		} //for (iterHypo
	} // for (iterStack
}
/** Write one search-graph line for hypo in either the traditional format
 * (ids, aggregate scores, source coverage, output string) or the extended
 * format (per-feature transition-score breakdown).
 * \param recombinationHypo winner this hypothesis was recombined into, or NULL
 * \param forward id of the best following hypothesis (forward pointer)
 * \param fscore  best forward score from this hypothesis to a complete translation
 */
void OutputSearchGraph(long translationId, std::ostream &outputSearchGraphStream, const Hypothesis *hypo, const Hypothesis *recombinationHypo, int forward, double fscore)
{
	const vector<FactorType> &outputFactorOrder = StaticData::Instance().GetOutputFactorOrder();
	bool extendedFormat = StaticData::Instance().GetOutputSearchGraphExtended();
	outputSearchGraphStream << translationId;
	// special case: initial hypothesis
	if ( hypo->GetId() == 0 )
	{
		outputSearchGraphStream << " hyp=0 stack=0";
		if (!extendedFormat)
		{
			outputSearchGraphStream << " forward=" << forward << " fscore=" << fscore;
		}
		outputSearchGraphStream << endl;
		return;
	}
	const Hypothesis *prevHypo = hypo->GetPrevHypo();
	// output in traditional format
	if (!extendedFormat)
	{
		outputSearchGraphStream << " hyp=" << hypo->GetId()
							<< " stack=" << hypo->GetWordsBitmap().GetNumWordsCovered()
							<< " back=" << prevHypo->GetId()
							<< " score=" << hypo->GetScore()
							<< " transition=" << (hypo->GetScore() - prevHypo->GetScore());
		if (recombinationHypo != NULL)
			outputSearchGraphStream << " recombined=" << recombinationHypo->GetId();
		outputSearchGraphStream << " forward=" << forward << " fscore=" << fscore
							<< " covered=" << hypo->GetCurrSourceWordsRange().GetStartPos()
							<< "-" << hypo->GetCurrSourceWordsRange().GetEndPos()
							<< " out=" << hypo->GetCurrTargetPhrase().GetStringRep(outputFactorOrder)
							<< endl;
		return;
	}
	// output in extended format: recombined hypotheses report the winner's id
	if (recombinationHypo != NULL)
		outputSearchGraphStream << " hyp=" << recombinationHypo->GetId();
	else
		outputSearchGraphStream << " hyp=" << hypo->GetId();
	outputSearchGraphStream << " back=" << prevHypo->GetId();
	// per-feature transition scores = this breakdown minus the predecessor's
	ScoreComponentCollection scoreBreakdown = hypo->GetScoreBreakdown();
	scoreBreakdown.MinusEquals( prevHypo->GetScoreBreakdown() );
	outputSearchGraphStream << " [ ";
	StaticData::Instance().GetScoreIndexManager().PrintLabeledScores( outputSearchGraphStream, scoreBreakdown );
	outputSearchGraphStream << " ]";
	outputSearchGraphStream << " out=" << hypo->GetCurrTargetPhrase().GetStringRep(outputFactorOrder) << endl;
}
/** Collect every hypothesis that can reach a complete translation, walking
 * backwards from the final stack over predecessor pointers and recombination
 * arcs.
 * \param pConnected     out: hypothesis id -> true for each reachable hypothesis
 * \param pConnectedList out: the same hypotheses in discovery order; doubles
 *                       as the work list of the traversal
 */
void Manager::GetConnectedGraph(
		std::map< int, bool >* pConnected,
		std::vector< const Hypothesis* >* pConnectedList) const {
	std::map < int, bool >& connected = *pConnected;
	std::vector< const Hypothesis *>& connectedList = *pConnectedList;
	// start with the ones in the final stack
	const std::vector < HypothesisStack* > &hypoStackColl = m_search->GetHypothesisStacks();
	const HypothesisStack &finalStack = *hypoStackColl.back();
	HypothesisStack::const_iterator iterHypo;
	for (iterHypo = finalStack.begin() ; iterHypo != finalStack.end() ; ++iterHypo)
	{
		const Hypothesis *hypo = *iterHypo;
		connected[ hypo->GetId() ] = true;
		connectedList.push_back( hypo );
	}
	// move back from known connected hypotheses
	// (connectedList grows during the loop, so newly found hypotheses are visited too)
	for(size_t i=0; i<connectedList.size(); i++) {
		const Hypothesis *hypo = connectedList[i];
		// add back pointer
		const Hypothesis *prevHypo = hypo->GetPrevHypo();
		if (prevHypo->GetId() > 0 // don't add empty hypothesis
				&& connected.find( prevHypo->GetId() ) == connected.end()) // don't add already added
		{
			connected[ prevHypo->GetId() ] = true;
			connectedList.push_back( prevHypo );
		}
		// add arcs (hypotheses recombined into this one)
		const ArcList *arcList = hypo->GetArcList();
		if (arcList != NULL)
		{
			ArcList::const_iterator iterArcList;
			for (iterArcList = arcList->begin() ; iterArcList != arcList->end() ; ++iterArcList)
			{
				const Hypothesis *loserHypo = *iterArcList;
				if (connected.find( loserHypo->GetId() ) == connected.end()) // don't add already added
				{
					connected[ loserHypo->GetId() ] = true;
					connectedList.push_back( loserHypo );
				}
			}
		}
	}
}
/** Like GetConnectedGraph(), but for a recombination arc it adds the loser's
 * *predecessor* rather than the loser itself — the loser hypothesis is
 * represented by the winner it was recombined into.
 * \param pConnected     out: hypothesis id -> true for each reachable hypothesis
 * \param pConnectedList out: the same hypotheses in discovery order (work list)
 */
void Manager::GetWinnerConnectedGraph(
		std::map< int, bool >* pConnected,
		std::vector< const Hypothesis* >* pConnectedList) const {
	std::map < int, bool >& connected = *pConnected;
	std::vector< const Hypothesis *>& connectedList = *pConnectedList;
	// start with the ones in the final stack
	const std::vector < HypothesisStack* > &hypoStackColl = m_search->GetHypothesisStacks();
	const HypothesisStack &finalStack = *hypoStackColl.back();
	HypothesisStack::const_iterator iterHypo;
	for (iterHypo = finalStack.begin() ; iterHypo != finalStack.end() ; ++iterHypo)
	{
		const Hypothesis *hypo = *iterHypo;
		connected[ hypo->GetId() ] = true;
		connectedList.push_back( hypo );
	}
	// move back from known connected hypotheses
	// (connectedList grows during the loop, so newly found hypotheses are visited too)
	for(size_t i=0; i<connectedList.size(); i++) {
		const Hypothesis *hypo = connectedList[i];
		// add back pointer
		const Hypothesis *prevHypo = hypo->GetPrevHypo();
		if (prevHypo->GetId() > 0 // don't add empty hypothesis
				&& connected.find( prevHypo->GetId() ) == connected.end()) // don't add already added
		{
			connected[ prevHypo->GetId() ] = true;
			connectedList.push_back( prevHypo );
		}
		// add arcs: follow each loser back to its predecessor
		const ArcList *arcList = hypo->GetArcList();
		if (arcList != NULL)
		{
			ArcList::const_iterator iterArcList;
			for (iterArcList = arcList->begin() ; iterArcList != arcList->end() ; ++iterArcList)
			{
				const Hypothesis *loserHypo = *iterArcList;
				if (connected.find( loserHypo->GetPrevHypo()->GetId() ) == connected.end() && loserHypo->GetPrevHypo()->GetId() > 0) // don't add already added & don't add hyp 0
				{
					connected[ loserHypo->GetPrevHypo()->GetId() ] = true;
					connectedList.push_back( loserHypo->GetPrevHypo() );
				}
			}
		}
	}
}
#ifdef HAVE_PROTOBUF
/** Fill a protobuf hypergraph edge with the target phrase (as a rule) and the
 * per-feature transition scores of one hypothesis expansion.
 */
void SerializeEdgeInfo(const Hypothesis* hypo, hgmert::Hypergraph_Edge* edge) {
	hgmert::Rule* rule = edge->mutable_rule();
	hypo->GetCurrTargetPhrase().WriteToRulePB(rule);
	const Hypothesis* prev = hypo->GetPrevHypo();
	// if the feature values are empty, they default to 0
	// (only the initial hypothesis has no predecessor)
	if (!prev) return;
	// score breakdown is an aggregate (forward) quantity, but the exported
	// graph object just wants the feature values on the edges
	const ScoreComponentCollection& scores = hypo->GetScoreBreakdown();
	const ScoreComponentCollection& pscores = prev->GetScoreBreakdown();
	// NOTE(review): the sign is flipped here — presumably the hypergraph
	// consumer expects costs rather than model scores; confirm convention.
	for (unsigned int i = 0; i < scores.size(); ++i)
		edge->add_feature_values((scores[i] - pscores[i]) * -1.0);
}
/** Look up (or lazily create) the hypergraph node for a hypothesis.
 * \param i2hgnode hypothesis id -> node index map, updated on creation
 * \param hgNodeIdx out: index of the returned node
 * \return the existing node, or a freshly appended one on first sight
 */
hgmert::Hypergraph_Node* GetHGNode(
	const Hypothesis* hypo,
	std::map< int, int>* i2hgnode,
	hgmert::Hypergraph* hg,
	int* hgNodeIdx) {
	const int hypoId = hypo->GetId();
	std::map<int, int>::iterator existing = i2hgnode->find(hypoId);
	if (existing != i2hgnode->end()) {
		// already mapped: hand back the node created earlier
		*hgNodeIdx = existing->second;
		return hg->mutable_nodes(*hgNodeIdx);
	}
	// first time this hypothesis is seen: remember the next free index
	// and append a fresh node for it
	const int newIdx = hg->nodes_size();
	(*i2hgnode)[hypoId] = newIdx;
	*hgNodeIdx = newIdx;
	return hg->add_nodes();
}
/** Export the pruned search graph as a protobuf hypergraph (lattice MERT).
 * Node 0 is the goal node, node 1 the source node; every connected hypothesis
 * becomes a node and every expansion (including recombination arcs) an edge.
 * Complete hypotheses additionally get a unary "[X,1]" edge into the goal.
 */
void Manager::SerializeSearchGraphPB(
	long translationId,
	std::ostream& outputStream) const {
	using namespace hgmert;
	std::map < int, bool > connected;
	std::map < int, int > i2hgnode; // hypothesis id -> hypergraph node index
	std::vector< const Hypothesis *> connectedList;
	GetConnectedGraph(&connected, &connectedList);
	connected[ 0 ] = true; // the initial hypothesis is always in the graph
	Hypergraph hg;
	hg.set_is_sorted(false);
	// feature count read from the score breakdown of any complete hypothesis
	int num_feats = (*m_search->GetHypothesisStacks().back()->begin())->GetScoreBreakdown().size();
	hg.set_num_features(num_feats);
	StaticData::Instance().GetScoreIndexManager().SerializeFeatureNamesToPB(&hg);
	Hypergraph_Node* goal = hg.add_nodes();	// idx=0 goal node must have idx 0
	Hypergraph_Node* source = hg.add_nodes();	// idx=1
	i2hgnode[-1] = 1; // source node
	const std::vector < HypothesisStack* > &hypoStackColl = m_search->GetHypothesisStacks();
	// NOTE(review): finalStack is unused below — left in place to avoid
	// changing behaviour; confirm it can be removed.
	const HypothesisStack &finalStack = *hypoStackColl.back();
	for (std::vector < HypothesisStack* >::const_iterator iterStack = hypoStackColl.begin();
		 iterStack != hypoStackColl.end() ; ++iterStack)
	{
		const HypothesisStack &stack = **iterStack;
		HypothesisStack::const_iterator iterHypo;
		for (iterHypo = stack.begin() ; iterHypo != stack.end() ; ++iterHypo)
		{
			const Hypothesis *hypo = *iterHypo;
			bool is_goal = hypo->GetWordsBitmap().IsComplete();
			if (connected.find( hypo->GetId() ) != connected.end())
			{
				int headNodeIdx;
				Hypergraph_Node* headNode = GetHGNode(hypo, &i2hgnode, &hg, &headNodeIdx);
				if (is_goal) {
					// connect complete hypotheses to the goal node with a unary rule
					Hypergraph_Edge* ge = hg.add_edges();
					ge->set_head_node(0); // goal
					ge->add_tail_nodes(headNodeIdx);
					ge->mutable_rule()->add_trg_words("[X,1]");
				}
				// edge for the expansion that created this hypothesis
				Hypergraph_Edge* edge = hg.add_edges();
				SerializeEdgeInfo(hypo, edge);
				edge->set_head_node(headNodeIdx);
				const Hypothesis* prev = hypo->GetPrevHypo();
				int tailNodeIdx = 1; // source
				if (prev)
					tailNodeIdx = i2hgnode.find(prev->GetId())->second;
				edge->add_tail_nodes(tailNodeIdx);
				// one more edge per recombined (loser) hypothesis, sharing the head
				const ArcList *arcList = hypo->GetArcList();
				if (arcList != NULL)
				{
					ArcList::const_iterator iterArcList;
					for (iterArcList = arcList->begin() ; iterArcList != arcList->end() ; ++iterArcList)
					{
						const Hypothesis *loserHypo = *iterArcList;
						assert(connected[loserHypo->GetId()]);
						Hypergraph_Edge* edge = hg.add_edges();
						SerializeEdgeInfo(loserHypo, edge);
						edge->set_head_node(headNodeIdx);
						tailNodeIdx = i2hgnode.find(loserHypo->GetPrevHypo()->GetId())->second;
						edge->add_tail_nodes(tailNodeIdx);
					}
				} // end if arcList empty
			} // end if connected
		} // end for iterHypo
	} // end for iterStack
	hg.SerializeToOstream(&outputStream);
}
#endif
/** Write the pruned search graph (only hypotheses that can reach a complete
 * translation) for one sentence, one line per hypothesis / recombination arc.
 * First computes, for each connected hypothesis, the best forward score and
 * the forward pointer (id of the best following hypothesis), then prints each
 * hypothesis via OutputSearchGraph().
 */
void Manager::GetSearchGraph(long translationId, std::ostream &outputSearchGraphStream) const
{
	std::map < int, bool > connected;      // hypothesis id -> reachable from a complete hypothesis
	std::map < int, int > forward;         // hypothesis id -> id of best following hypothesis (-1 at goal)
	std::map < int, double > forwardScore; // hypothesis id -> best score from here to completion
	// *** find connected hypotheses ***
	std::vector< const Hypothesis *> connectedList;
	GetConnectedGraph(&connected, &connectedList);
	// ** compute best forward path for each hypothesis *** //
	// forward cost of hypotheses on final stack is 0
	const std::vector < HypothesisStack* > &hypoStackColl = m_search->GetHypothesisStacks();
	const HypothesisStack &finalStack = *hypoStackColl.back();
	HypothesisStack::const_iterator iterHypo;
	for (iterHypo = finalStack.begin() ; iterHypo != finalStack.end() ; ++iterHypo)
	{
		const Hypothesis *hypo = *iterHypo;
		forwardScore[ hypo->GetId() ] = 0.0f;
		forward[ hypo->GetId() ] = -1;
	}
	// compete for best forward score of previous hypothesis
	// (walk stacks from the last one backwards, excluding stack 0)
	std::vector < HypothesisStack* >::const_iterator iterStack;
	for (iterStack = --hypoStackColl.end() ; iterStack != hypoStackColl.begin() ; --iterStack)
	{
		const HypothesisStack &stack = **iterStack;
		HypothesisStack::const_iterator iterHypo;
		for (iterHypo = stack.begin() ; iterHypo != stack.end() ; ++iterHypo)
		{
			const Hypothesis *hypo = *iterHypo;
			if (connected.find( hypo->GetId() ) != connected.end())
			{
				// make a play for previous hypothesis
				const Hypothesis *prevHypo = hypo->GetPrevHypo();
				double fscore = forwardScore[ hypo->GetId() ] +
					hypo->GetScore() - prevHypo->GetScore();
				if (forwardScore.find( prevHypo->GetId() ) == forwardScore.end()
						|| forwardScore.find( prevHypo->GetId() )->second < fscore)
				{
					forwardScore[ prevHypo->GetId() ] = fscore;
					forward[ prevHypo->GetId() ] = hypo->GetId();
				}
				// all arcs also make a play
				const ArcList *arcList = hypo->GetArcList();
				if (arcList != NULL)
				{
					ArcList::const_iterator iterArcList;
					for (iterArcList = arcList->begin() ; iterArcList != arcList->end() ; ++iterArcList)
					{
						const Hypothesis *loserHypo = *iterArcList;
						// make a play
						const Hypothesis *loserPrevHypo = loserHypo->GetPrevHypo();
						double fscore = forwardScore[ hypo->GetId() ] +
							loserHypo->GetScore() - loserPrevHypo->GetScore();
						if (forwardScore.find( loserPrevHypo->GetId() ) == forwardScore.end()
								|| forwardScore.find( loserPrevHypo->GetId() )->second < fscore)
						{
							forwardScore[ loserPrevHypo->GetId() ] = fscore;
							forward[ loserPrevHypo->GetId() ] = loserHypo->GetId();
						}
					} // end for arc list
				} // end if arc list empty
			} // end if hypo connected
		} // end for hypo
	} // end for stack
	// *** output all connected hypotheses *** //
	connected[ 0 ] = true; // also print the initial hypothesis
	for (iterStack = hypoStackColl.begin() ; iterStack != hypoStackColl.end() ; ++iterStack)
	{
		const HypothesisStack &stack = **iterStack;
		HypothesisStack::const_iterator iterHypo;
		for (iterHypo = stack.begin() ; iterHypo != stack.end() ; ++iterHypo)
		{
			const Hypothesis *hypo = *iterHypo;
			if (connected.find( hypo->GetId() ) != connected.end())
			{
				OutputSearchGraph(translationId, outputSearchGraphStream, hypo, NULL, forward[ hypo->GetId() ], forwardScore[ hypo->GetId() ]);
				// recombination arcs are printed with the winner's forward info
				const ArcList *arcList = hypo->GetArcList();
				if (arcList != NULL)
				{
					ArcList::const_iterator iterArcList;
					for (iterArcList = arcList->begin() ; iterArcList != arcList->end() ; ++iterArcList)
					{
						const Hypothesis *loserHypo = *iterArcList;
						OutputSearchGraph(translationId, outputSearchGraphStream, loserHypo, hypo, forward[ hypo->GetId() ], forwardScore[ hypo->GetId() ]);
					}
				} // end if arcList empty
			} // end if connected
		} // end for iterHypo
	} // end for iterStack
}
/** For Lattice MBR: collect the winner-connected search graph together with,
 * for each hypothesis, its outgoing successors and an estimated total score
 * (score so far + best forward score to completion).
 * \param pConnected     out: hypothesis id -> true for each reachable hypothesis
 * \param pConnectedList out: reachable hypotheses in discovery order
 * \param pOutgoingHyps  out: hypothesis -> set of successors reached from it
 * \param pFwdBwdScores  out: estimated scores, parallel to *pConnectedList
 */
void Manager::GetForwardBackwardSearchGraph(std::map< int, bool >* pConnected,
	std::vector< const Hypothesis* >* pConnectedList, std::map < const Hypothesis*, set< const Hypothesis* > >* pOutgoingHyps, vector< float>* pFwdBwdScores) const
{
	std::map < int, bool > &connected = *pConnected;
	std::vector< const Hypothesis *>& connectedList = *pConnectedList;
	std::map < int, int > forward;         // hypothesis id -> id of best following hypothesis (-1 at goal)
	std::map < int, double > forwardScore; // hypothesis id -> best score from here to completion
	std::map < const Hypothesis*, set <const Hypothesis*> > & outgoingHyps = *pOutgoingHyps;
	vector< float> & estimatedScores = *pFwdBwdScores;
	// *** find connected hypotheses ***
	GetWinnerConnectedGraph(&connected, &connectedList);
	// ** compute best forward path for each hypothesis *** //
	// forward cost of hypotheses on final stack is 0
	const std::vector < HypothesisStack* > &hypoStackColl = m_search->GetHypothesisStacks();
	const HypothesisStack &finalStack = *hypoStackColl.back();
	HypothesisStack::const_iterator iterHypo;
	for (iterHypo = finalStack.begin() ; iterHypo != finalStack.end() ; ++iterHypo)
	{
		const Hypothesis *hypo = *iterHypo;
		forwardScore[ hypo->GetId() ] = 0.0f;
		forward[ hypo->GetId() ] = -1;
	}
	// compete for best forward score of previous hypothesis
	// (walk stacks from the last one backwards, excluding stack 0)
	std::vector < HypothesisStack* >::const_iterator iterStack;
	for (iterStack = --hypoStackColl.end() ; iterStack != hypoStackColl.begin() ; --iterStack)
	{
		const HypothesisStack &stack = **iterStack;
		HypothesisStack::const_iterator iterHypo;
		for (iterHypo = stack.begin() ; iterHypo != stack.end() ; ++iterHypo)
		{
			const Hypothesis *hypo = *iterHypo;
			if (connected.find( hypo->GetId() ) != connected.end())
			{
				// make a play for previous hypothesis
				const Hypothesis *prevHypo = hypo->GetPrevHypo();
				double fscore = forwardScore[ hypo->GetId() ] +
					hypo->GetScore() - prevHypo->GetScore();
				if (forwardScore.find( prevHypo->GetId() ) == forwardScore.end()
						|| forwardScore.find( prevHypo->GetId() )->second < fscore)
				{
					forwardScore[ prevHypo->GetId() ] = fscore;
					forward[ prevHypo->GetId() ] = hypo->GetId();
				}
				//store outgoing info
				outgoingHyps[prevHypo].insert(hypo);
				// all arcs also make a play
				const ArcList *arcList = hypo->GetArcList();
				if (arcList != NULL)
				{
					ArcList::const_iterator iterArcList;
					for (iterArcList = arcList->begin() ; iterArcList != arcList->end() ; ++iterArcList)
					{
						const Hypothesis *loserHypo = *iterArcList;
						// make a play
						const Hypothesis *loserPrevHypo = loserHypo->GetPrevHypo();
						double fscore = forwardScore[ hypo->GetId() ] +
							loserHypo->GetScore() - loserPrevHypo->GetScore();
						if (forwardScore.find( loserPrevHypo->GetId() ) == forwardScore.end()
								|| forwardScore.find( loserPrevHypo->GetId() )->second < fscore)
						{
							forwardScore[ loserPrevHypo->GetId() ] = fscore;
							forward[ loserPrevHypo->GetId() ] = loserHypo->GetId();
						}
						//store outgoing info
						// NOTE(review): the winner hypo (not loserHypo) is recorded as
						// the successor of loserPrevHypo — the loser is represented by
						// the winner it recombined into; confirm this is intended.
						outgoingHyps[loserPrevHypo].insert(hypo);
					} // end for arc list
				} // end if arc list empty
			} // end if hypo connected
		} // end for hypo
	} // end for stack
	// estimated score = score accumulated so far + best forward score
	for (std::vector< const Hypothesis *>::iterator it = connectedList.begin(); it != connectedList.end(); ++it) {
		float estimatedScore = (*it)->GetScore() + forwardScore[(*it)->GetId()];
		estimatedScores.push_back(estimatedScore);
	}
}
/** Best complete translation found by the search (may be NULL if none). */
const Hypothesis *Manager::GetBestHypothesis() const
{
	// simply delegate to the search implementation
	const Hypothesis *best = m_search->GetBestHypothesis();
	return best;
}
}

141
src/Manager.h Normal file
View File

@ -0,0 +1,141 @@
// $Id: Manager.h 2957 2010-03-08 15:28:40Z abarun $
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2006 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#ifndef moses_Manager_h
#define moses_Manager_h
#include <vector>
#include <list>
#include <ctime>
#include "InputType.h"
#include "Hypothesis.h"
#include "StaticData.h"
#include "TranslationOption.h"
#include "TranslationOptionCollection.h"
#include "TrellisPathList.h"
#include "SquareMatrix.h"
#include "WordsBitmap.h"
#include "Search.h"
#include "SearchCubePruning.h"
#if HAVE_CONFIG_H
#include "config.h"
#endif
namespace Moses
{
class TrellisPath;
class TranslationOptionCollection;
/** The Manager class implements a stack decoding algorithm.
 * Hypotheses are organized in stacks. One stack contains all hypotheses that have
* the same number of foreign words translated. The data structure for hypothesis
* stacks is the class HypothesisStack. The data structure for a hypothesis
* is the class Hypothesis.
*
* The main decoder loop in the function ProcessSentence() consists of the steps:
* - Create the list of possible translation options. In phrase-based decoding
* (and also the first mapping step in the factored model) is a phrase translation
* from the source to the target. Given a specific input sentence, only a limited
 * number of phrase translations can be applied. For efficient lookup of the
 * translation options later, these options are first collected in the function
* CreateTranslationOption (for more information check the class
* TranslationOptionCollection)
* - Create initial hypothesis: Hypothesis stack 0 contains only one empty hypothesis.
* - Going through stacks 0 ... (sentence_length-1):
* - The stack is pruned to the maximum size
* - Going through all hypotheses in the stack
* - Each hypothesis is expanded by ProcessOneHypothesis()
* - Expansion means applying a translation option to the hypothesis to create
* new hypotheses
* - What translation options may be applied depends on reordering limits and
* overlap with already translated words
 * - With an applicable translation option and a hypothesis at hand, a new
* hypothesis can be created in ExpandHypothesis()
 * - New hypotheses are either discarded (because they are too bad), added to
* the appropriate stack, or re-combined with existing hypotheses
**/
class Manager
{
	// non-copyable: default ctor, copy ctor and assignment are declared
	// private and left undefined
	Manager();
	Manager(Manager const&);
	void operator=(Manager const&);
protected:
	// data
	InputType const& m_source; /**< source sentence to be translated */
	TranslationOptionCollection *m_transOptColl; /**< pre-computed list of translation options for the phrases in this sentence */
	Search *m_search; /**< search implementation (chosen by SearchAlgorithm in the ctor) */
	HypothesisStack* actual_hypoStack; /**actual (full expanded) stack of hypotheses*/
	clock_t m_start; /**< starting time, used for logging */
	size_t interrupted_flag; /**< NOTE(review): presumably non-zero when search was aborted early — confirm */
	/** collect hypotheses reachable from the final stack (see Manager.cpp) */
	void GetConnectedGraph(
		std::map< int, bool >* pConnected,
		std::vector< const Hypothesis* >* pConnectedList) const;
	/** like GetConnectedGraph, but arcs contribute the loser's predecessor */
	void GetWinnerConnectedGraph(
		std::map< int, bool >* pConnected,
		std::vector< const Hypothesis* >* pConnectedList) const;
public:
	Manager(InputType const& source, SearchAlgorithm searchAlgorithm);
	~Manager();
	/** run the main decoding loop for m_source */
	void ProcessSentence();
	const Hypothesis *GetBestHypothesis() const;
	const Hypothesis *GetActualBestHypothesis() const;
	/** mine the search graph for an n-best list of translations */
	void CalcNBest(size_t count, TrellisPathList &ret,bool onlyDistinct=0) const;
	void PrintAllDerivations(long translationId) const;
	void printDivergentHypothesis(long translationId, const Hypothesis* hypo, const std::vector <const TargetPhrase*> & remainingPhrases, float remainingScore ) const;
	void printThisHypothesis(long translationId, const Hypothesis* hypo, const std::vector <const TargetPhrase* > & remainingPhrases, float remainingScore ) const;
	/** dump the search space as a word graph */
	void GetWordGraph(long translationId, std::ostream &outputWordGraphStream) const;
#ifdef HAVE_PROTOBUF
	/** export the pruned search graph as a protobuf hypergraph */
	void SerializeSearchGraphPB(long translationId, std::ostream& outputStream) const;
#endif
	/** dump the pruned search graph in text form */
	void GetSearchGraph(long translationId, std::ostream &outputSearchGraphStream) const;
	const InputType& GetSource() const {return m_source;}
	/***
	 * to be called after processing a sentence (which may consist of more than just calling ProcessSentence() )
	 */
	void CalcDecoderStatistics() const;
	void ResetSentenceStats(const InputType& source)
	{
		m_sentenceStats = std::auto_ptr<SentenceStats>(new SentenceStats(source));
	}
	SentenceStats& GetSentenceStats() const
	{
		return *m_sentenceStats;
	}
	/***
	 *For Lattice MBR
	 */
	void GetForwardBackwardSearchGraph(std::map< int, bool >* pConnected,
		std::vector< const Hypothesis* >* pConnectedList, std::map < const Hypothesis*, set < const Hypothesis* > >* pOutgoingHyps, vector< float>* pFwdBwdScores) const;
	// owned per-sentence statistics; reset by ResetSentenceStats()
	std::auto_ptr<SentenceStats> m_sentenceStats;
};
}
#endif

67
src/NGramCollection.cpp Normal file
View File

@ -0,0 +1,67 @@
// $Id: NGramCollection.cpp 1897 2008-10-08 23:51:26Z hieuhoang1972 $
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2006 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#include "NGramCollection.h"
#include "NGramNode.h"
namespace Moses
{
/** Destructor: release every NGramNode owned by the collection. */
NGramCollection::~NGramCollection()
{
	for (Collection::iterator iter = m_collection.begin(); iter != m_collection.end(); ++iter)
	{
		delete iter->second;
	}
}
/** Unimplemented stub: declared in the header but deliberately does nothing.
 * NOTE(review): callers presumably build n-grams via GetOrCreateNGram()
 * instead — confirm before relying on this method.
 */
void NGramCollection::Add(const Factor *factor, const NGramNode &ngramNode)
{
}
/** Return the node stored for factor, creating (and taking ownership of) a
 * new empty NGramNode if none exists yet.
 *
 * Uses a single map lookup: insert() reports whether the key was already
 * present, replacing the original find()-then-operator[] double lookup.
 */
NGramNode *NGramCollection::GetOrCreateNGram(const Factor *factor)
{
	std::pair<Collection::iterator, bool> result =
		m_collection.insert(std::make_pair(factor, static_cast<NGramNode*>(NULL)));
	if (result.second)
	{
		// key was absent: fill in the freshly inserted slot
		result.first->second = new NGramNode();
	}
	return result.first->second;
}
/** Find the node for factor; returns NULL when absent (never creates). */
NGramNode *NGramCollection::GetNGram(const Factor *factor)
{
	Collection::iterator iter = m_collection.find(factor);
	if (iter == m_collection.end())
		return NULL;
	return iter->second;
}
/** Const overload: find the node for factor; NULL when absent. */
const NGramNode *NGramCollection::GetNGram(const Factor *factor) const
{
	Collection::const_iterator iter = m_collection.find(factor);
	if (iter == m_collection.end())
		return NULL;
	return iter->second;
}
}

57
src/NGramCollection.h Normal file
View File

@ -0,0 +1,57 @@
// $Id: NGramCollection.h 2939 2010-02-24 11:15:44Z jfouet $
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2006 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#ifndef moses_NGramCollection_h
#define moses_NGramCollection_h
#include <map>
#include <vector>
#include "NGramNode.h"
namespace Moses
{
class Factor;
typedef std::vector<const Factor*> FactorVector;
/** Maps a Factor* to the NGramNode continuing the n-gram with that factor.
 * Owns its NGramNode values: they are deleted in the destructor.
 */
class NGramCollection
{
protected:
	typedef std::map<const Factor*, NGramNode*> Collection;
	Collection m_collection; // factor -> owned child node
	// unimplemented stub: empty body in the .cpp
	void Add(const Factor *factor, const NGramNode &ngramNode);
public:
	NGramCollection()
	{
	}
	~NGramCollection(); // deletes every stored NGramNode
	NGramNode *GetOrCreateNGram(const Factor *factor); // lookup, creating an empty node on miss
	NGramNode *GetNGram(const Factor *factor); // lookup only; NULL on miss
	const NGramNode *GetNGram(const Factor *factor) const;
};
}
#endif

26
src/NGramNode.cpp Normal file
View File

@ -0,0 +1,26 @@
#include "NGramNode.h"
#include "NGramCollection.h"
namespace Moses
{
/** Construct an empty node: zeroed scores, a fresh (owned) child collection,
 * and no root pointer yet.
 *
 * The original left m_score, m_logBackOff and m_rootNGram uninitialized, so
 * GetScore()/GetLogBackOff()/GetRootNGram() returned indeterminate values
 * before the corresponding setters ran; they now start from defined defaults.
 */
NGramNode::NGramNode()
	: m_score(0.0f)
	, m_logBackOff(0.0f)
	, m_map(new NGramCollection())
	, m_rootNGram(NULL)
{
}
/** Destructor: delete the owned child collection (which deletes its nodes).
 * NOTE(review): NGramNode declares no copy ctor/assignment, so copying an
 * instance would double-delete m_map — confirm instances are never copied.
 */
NGramNode::~NGramNode()
{
	delete m_map;
}
/** Const lookup of the child node for factor; delegates to the owned
 * collection and returns NULL on miss. */
const NGramNode *NGramNode::GetNGram(const Factor *factor) const
{
	return m_map->GetNGram(factor);
}
/** Lookup of the child node for factor; delegates to the owned collection
 * and returns NULL on miss. */
NGramNode *NGramNode::GetNGram(const Factor *factor)
{
	return m_map->GetNGram(factor);
}
}

79
src/NGramNode.h Normal file
View File

@ -0,0 +1,79 @@
// $Id: NGramNode.h 2939 2010-02-24 11:15:44Z jfouet $
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2006 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#ifndef moses_NGramNode_h
#define moses_NGramNode_h
#include "Factor.h"
namespace Moses
{
class NGramCollection;
// One node of an n-gram trie: carries the log-probability / log-backoff of
// the n-gram ending at this node plus the collection of its continuations.
class NGramNode
{
protected:
	// Conditional log-probability and log backoff weight for this n-gram.
	// NOTE(review): verify these are always set (SetScore/SetLogBackOff)
	// before the getters are called.
	float m_score, m_logBackOff;
	// Successor collection (next Factor -> child node); owned — allocated in
	// the constructor and deleted in the destructor (see NGramNode.cpp).
	// NOTE(review): no copy ctor/assignment declared, so copying a NGramNode
	// would double-delete m_map (Rule of Three) — confirm nodes are not copied.
	NGramCollection *m_map;
	// Back-pointer installed via SetRootNGram(); not owned.
	NGramNode *m_rootNGram;
public:
	NGramNode();
	~NGramNode();
	// Direct access to the successor collection.
	NGramCollection *GetNGramColl()
	{
		return m_map;
	}
	// Child-node lookup; forwards to m_map (see NGramNode.cpp).
	const NGramNode *GetNGram(const Factor *factor) const;
	NGramNode *GetNGram(const Factor *factor);
	const NGramNode *GetRootNGram() const
	{
		return m_rootNGram;
	}
	void SetRootNGram(NGramNode *rootNGram)
	{
		m_rootNGram = rootNGram;
	}
	float GetScore() const
	{
		return m_score;
	}
	float GetLogBackOff() const
	{
		return m_logBackOff;
	}
	void SetScore(float score)
	{
		m_score = score;
	}
	void SetLogBackOff(float logBackOff)
	{
		m_logBackOff = logBackOff;
	}
};
}
#endif

127
src/ObjectPool.h Normal file
View File

@ -0,0 +1,127 @@
// $Id: ObjectPool.h 2939 2010-02-24 11:15:44Z jfouet $
/* ---------------------------------------------------------------- */
/* Copyright 2005 (c) by RWTH Aachen - Lehrstuhl fuer Informatik VI */
/* Richard Zens */
/* ---------------------------------------------------------------- */
#ifndef moses_ObjectPool_h
#define moses_ObjectPool_h
#include <vector>
#include <deque>
#include <string>
#include <iostream>
#include <iterator>
#include "Util.h"
/***
* template class for pool of objects
* - useful if many small objects are frequently created and destroyed
* - allocates memory for N objects at a time
* - separates memory allocation from constructor/destructor calls
* - prevents memory leaks
*/
/** Pool of objects of type T.
 *  - useful if many small objects are frequently created and destroyed
 *  - allocates memory for N objects at a time (block size doubles per block)
 *  - separates memory allocation from constructor/destructor calls via
 *    placement new and explicit destructor invocation
 *  - prevents memory leaks (owns every block it malloc's)
 *
 *  Fix: freeObjects() previously called the non-existent member
 *  'this->free(*b)', which failed to compile as soon as the template was
 *  instantiated; it now routes each element through freeObject().
 */
template<typename T> class ObjectPool {
public:
	typedef T Object;
private:
	std::string name;                 // label used only in diagnostics
	size_t idx,dIdx,N;                // idx: next slot in block dIdx; N: first block size
	std::vector<Object*> data;        // malloc'ed blocks of raw Object storage
	std::vector<size_t> dataSize;     // capacity (in Objects) of each block
	std::deque<Object*> freeObj;      // returned objects awaiting reuse (not yet destroyed)
	int mode;
public:
	static const int cleanUpOnDestruction=1;
	static const int hasTrivialDestructor=2;
	// constructor arguments:
	// N: initial number of objects to allocate memory at a time
	// m & cleanUpOnDestruction = clean up objects in destructor
	// m & hasTrivialDestructor = the object type has a trivial destructor,
	//     i.e. no sub-object uses dynamically allocated memory
	//     note: not equivalent to empty destructor
	//     -> more efficient (destructor calls can be omitted),
	//     note: looks like memory leak, but is not
	ObjectPool(std::string name_="T",size_t N_=100000,int m=cleanUpOnDestruction)
		: name(name_),idx(0),dIdx(0),N(N_),mode(m) {allocate();}

	// main accesss functions:
	// get pointer to object via default or copy constructor
	Object* get() {return new (getPtr()) Object;}
	Object* get(const Object& x) {return new (getPtr()) Object(x);}

	// get pointer to uninitialized memory,
	// WARNING: use only if you know what you are doing !
	// useful for non-default constructors, you have to use placement new
	Object* getPtr() {
		// prefer a recycled slot; destroy the stale object before handing it out
		if(freeObj.size()) {
			Object* rv=freeObj.back();freeObj.pop_back();rv->~Object();return rv;}
		// current block exhausted -> advance to (or allocate) the next block
		if(idx==dataSize[dIdx]) {idx=0; if(++dIdx==data.size()) allocate();}
		return data[dIdx]+idx++;
	}

	// return object(s) to pool for reuse
	// note: objects are not destroyed here, but in 'getPtr'/'destroyObjects',
	//       otherwise 'destroyObjects' would have to check the freeObj-stack
	//       before each destructor call
	void freeObject(Object* x) {freeObj.push_back(x);}
	template<class fwiter> void freeObjects(fwiter b,fwiter e) {
		for(;b!=e;++b) freeObject(*b);}

	// destroy all objects, but do not free memory
	void reset() {destroyObjects();idx=0;dIdx=0;freeObj.clear();}

	// destroy all objects and free memory
	void cleanUp() {
		reset(); for(size_t i=0;i<data.size();++i) free(data[i]);
		data.clear();dataSize.clear();
	}

	~ObjectPool() {if(mode & cleanUpOnDestruction) cleanUp();}

	// dump pool statistics (block count/sizes, free list size) to 'out'
	void printInfo(std::ostream& out) const {
		out<<"OPOOL ("<<name<<") info: "<<data.size()<<" "<<dataSize.size()<<" "
			<<freeObj.size()<<"\n"<<idx<<" "<<dIdx<<" "<<N<<"\n";
		std::copy(dataSize.begin(),dataSize.end(),
							std::ostream_iterator<size_t>(out," "));
		out<<"\n\n";
	}
private:
	// run destructors on every live slot (skipped entirely for trivially
	// destructible types); slots past 'idx' in the last block were never
	// constructed and are not touched
	void destroyObjects() {
		if(mode & hasTrivialDestructor) return;
		for(size_t i=0;i<=dIdx;++i) {
			size_t lastJ= (i<dIdx ? dataSize[i] : idx);
			for(size_t j=0;j<lastJ;++j) (data[i]+j)->~Object();}
	}

	// allocate memory for a N objects, for follow-up allocations,
	// the block size is doubled every time
	// if allocation fails, block size is reduced by 1/4
	void allocate() {
		try {
			if(dataSize.empty()) dataSize.push_back(N);
			else dataSize.push_back(dataSize.back()*2);
			void *m=malloc(sizeof(Object)*dataSize.back());
			while(!m) {
				dataSize.back()=static_cast<size_t>(dataSize.back()*0.75);
				m=malloc(sizeof(Object)*dataSize.back());
			}
			data.push_back(static_cast<Object*>(m));
		}
		catch (const std::exception& e) {
			TRACE_ERR("caught std::exception: "<<e.what()
								<<" in ObjectPool::allocate(), name: "<<name<<", last size: "
								<<dataSize.back()<<"\n");
			TRACE_ERR("OPOOL info: "<<data.size()<<" "<<dataSize.size()<<" "
								<<freeObj.size()<<"\n"<<idx<<" "<<dIdx<<" "<<N<<"\n");
			std::copy(dataSize.begin(),dataSize.end(),
								std::ostream_iterator<size_t>(std::cerr," "));
			TRACE_ERR("\n");
			throw;
		}
	}
};
#endif

138
src/PCNTools.cpp Normal file
View File

@ -0,0 +1,138 @@
#include "PCNTools.h"
#include <iostream>
#include <cstdlib>
namespace PCN
{
// The two characters that matter when parsing PCN text: the single quote that
// delimits strings and the backslash used as an escape.  Both are kept in one
// std::string; quote/slash are references into it.
const std::string chars = "'\\";
const char& quote = chars[0];
const char& slash = chars[1];
// Bounds-checked character access: returns in[c], or 0 when c is out of range.
inline char get(const std::string& in, int c) {
	if (c >= 0 && c < (int)in.size()) {
		return in[(size_t)c];
	}
	return 0;
}
// Advance c past any run of spaces (bounds-safe because get() returns 0 at
// end of string).
inline void eatws(const std::string& in, int& c) {
	for (; get(in, c) == ' '; ++c) {
	}
}
// from 'foo' return foo
// Parses a single-quoted, backslash-escaped string starting at position c and
// advances c past the closing quote plus trailing spaces.  Returns the literal
// string "ERROR" (a sentinel, not an exception) if no opening quote is found.
std::string getEscapedString(const std::string& in, int &c)
{
	eatws(in,c);
	if (get(in,c++) != quote) return "ERROR";
	std::string res;
	char cur = 0;
	do {
		cur = get(in,c++);
		// escaped character: append the character following the backslash verbatim
		if (cur == slash) { res += get(in,c++); }
		// ordinary character (a bare quote here is the '' empty-string case)
		else if (cur != quote) { res += cur; }
	// peek: keep going until the *next* character is the closing quote or the
	// input is exhausted
	} while (get(in,c) != quote && (c < (int)in.size()));
	c++;	// consume the closing quote
	eatws(in,c);
	return res;
}
// Read one float token (terminated by ' ', ')' or ',') starting at c,
// advancing c past the token and any trailing spaces; basically atof.
float getFloat(const std::string& in, int &c)
{
	eatws(in, c);
	std::string token;
	while (c < (int)in.size()) {
		const char ch = get(in, c);
		if (ch == ' ' || ch == ')' || ch == ',') {
			break;
		}
		token += ch;
		++c;
	}
	eatws(in, c);
	return atof(token.c_str());
}
// Read one integer token (terminated by ' ', ')' or ',') starting at c,
// advancing c past the token and any trailing spaces; basically atoi.
int getInt(const std::string& in, int &c)
{
	eatws(in, c);
	std::string token;
	while (c < (int)in.size()) {
		const char ch = get(in, c);
		if (ch == ' ' || ch == ')' || ch == ',') {
			break;
		}
		token += ch;
		++c;
	}
	eatws(in, c);
	return atoi(token.c_str());
}
// parse ('foo', 0.23)
// Parses one confusion-network alternative: a parenthesized word plus one or
// more comma-separated scores.  On any syntax error a default-constructed
// (empty) CNAlt is returned after printing a diagnostic — parsing is not
// aborted by an exception.
CNAlt getCNAlt(const std::string& in, int &c)
{
	if (get(in,c++) != '(') { std::cerr << "PCN/PLF parse error: expected ( at start of cn alt block\n"; return CNAlt(); } // throw "expected (";
	std::string word = getEscapedString(in,c);
	if (get(in,c++) != ',') { std::cerr << "PCN/PLF parse error: expected , after string\n"; return CNAlt(); } // throw "expected , after string";
	// column increment defaults to 1 (plain confusion network)
	size_t cnNext = 1;
	std::vector<float> probs;
	probs.push_back(getFloat(in,c));
	// any further comma-separated numbers
	while (get(in,c) == ',') {
		c++;
		float val = getFloat(in,c);
		probs.push_back(val);
	}
	//if we read more than one prob, this was a lattice, last item was column increment
	if (probs.size()>1) {
		cnNext = static_cast<size_t>(probs.back());
		probs.pop_back();
		// NOTE(review): a negative float cast to size_t wraps to a huge value
		// and is not caught by this check — only 0 is rejected here.
		if (cnNext < 1) { ; std::cerr << "PCN/PLF parse error: bad link length at last element of cn alt block\n"; return CNAlt(); } //throw "bad link length"
	}
	if (get(in,c++) != ')') { std::cerr << "PCN/PLF parse error: expected ) at end of cn alt block\n"; return CNAlt(); } // throw "expected )";
	eatws(in,c);
	return CNAlt(std::pair<std::string, std::vector<float> >(word,probs), cnNext);
}
// parse (('foo', 0.23), ('bar', 0.77))
// Parses one confusion-network column: a parenthesized, comma-separated list
// of alternatives.  Returns an empty column when the opening '(' is missing;
// otherwise loops until the matching ')' (optionally preceded by a trailing
// comma) or until the input runs out.
CNCol getCNCol(const std::string& in, int &c) {
	CNCol res;
	if (get(in,c++) != '(') return res; // error
	eatws(in,c);
	while (1) {
		// past end of input: bail out (malformed input terminates here)
		if (c > (int)in.size()) { break; }
		// closing paren ends the column
		if (get(in,c) == ')') {
			c++;
			eatws(in,c);
			break;
		}
		// tolerate a trailing ",)" before the close
		if (get(in,c) == ',' && get(in,c+1) == ')') {
			c+=2;
			eatws(in,c);
			break;
		}
		// separator between alternatives
		if (get(in,c) == ',') { c++; eatws(in,c); }
		res.push_back(getCNAlt(in, c));
	}
	return res;
}
// parse ((('foo', 0.23), ('bar', 0.77)), (('a', 0.3), ('c', 0.7)))
// Top-level PCN/PLF entry point: parses a whole confusion network (a
// parenthesized list of columns).  Returns an empty CN when the input does
// not start with '('.
//
// Fix: the opening-paren check used unchecked 'in[c++]' while every other
// parser in this file uses the bounds-checked get(); with get() an empty
// input string is rejected cleanly instead of indexing at/past the end.
CN parsePCN(const std::string& in)
{
	CN res;
	int c = 0;
	if (get(in,c++) != '(') return res; // error
	while (1) {
		// past end of input: stop (malformed input terminates here)
		if (c > (int)in.size()) { break; }
		// closing paren ends the network
		if (get(in,c) == ')') {
			c++;
			eatws(in,c);
			break;
		}
		// tolerate a trailing ",)" before the close
		if (get(in,c) == ',' && get(in,c+1) == ')') {
			c+=2;
			eatws(in,c);
			break;
		}
		// separator between columns
		if (get(in,c) == ',') { c++; eatws(in,c); }
		res.push_back(getCNCol(in, c));
	}
	return res;
}
}

46
src/PCNTools.h Normal file
View File

@ -0,0 +1,46 @@
// $Id: StaticData.h 992 2006-11-21 23:06:30Z hieuhoang1972 $
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2006 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#ifndef moses_PCNTools
#define moses_PCNTools
#include <vector>
#include <string>
#include <utility>
#include <cstdlib>
/** A couple of utilities to read .pcn files. A python-compatible format
* for encoding confusion networks.
*/
namespace PCN {
	// One alternative in a confusion-network column:
	// ((word, scores), column-increment) — the increment is 1 for plain
	// confusion networks and >1 for lattice links that skip columns.
	typedef std::pair<std::pair<std::string, std::vector<float> >, size_t> CNAlt;
	// One column: all alternatives competing at the same position.
	typedef std::vector<CNAlt> CNCol;
	// A whole confusion network: the sequence of columns.
	typedef std::vector<CNCol> CN;
	/** Given a string ((('foo',0.1),('bar',0.9)),...) representation of a
	  * confusion net in PCN format, return a CN object
	  */
	CN parsePCN(const std::string& in);
};
#endif

546
src/PDTAimp.h Normal file
View File

@ -0,0 +1,546 @@
// $Id: PDTAimp.h 2939 2010-02-24 11:15:44Z jfouet $
// vim:tabstop=2
#ifndef moses_PDTAimp_h
#define moses_PDTAimp_h
#include "StaticData.h" // needed for factor splitter
namespace Moses
{
// Return true iff 'filePath' names an existing filesystem entry
// (i.e. stat(2) succeeds on it).
inline bool existsFile(const char* filePath) {
	struct stat st;
	return 0 == stat(filePath, &st);
}
// Computes log(exp(x) + exp(y)) in a numerically stable way, working from the
// smaller argument so the exp() cannot overflow.
//
// Fix: declared 'inline' — this is a header (PDTAimp.h), and a non-inline
// function definition here produces multiple-definition link errors as soon
// as the header is included from more than one translation unit (ODR).
inline double addLogScale(double x,double y)
{
	if(x>y) return addLogScale(y,x); else return x+std::log(1.0+std::exp(y-x));
}
// Plain-function wrapper around exp() so it can be passed to std::transform
// (a function pointer, where the overloaded std::exp would be ambiguous).
//
// Fix: declared 'inline' — non-inline definition in this header violates the
// ODR when PDTAimp.h is included from multiple translation units.
inline double Exp(double x)
{
	return std::exp(x);
}
/** Implementation backend ("pimpl") of PhraseDictionaryTreeAdaptor: wraps an
 *  on-disk PhraseDictionaryTree, converts its string candidates into scored
 *  TargetPhrase objects, and caches lookups per source phrase / per
 *  confusion-network range.
 */
class PDTAimp
{
	// only these classes are allowed to instantiate this class
	friend class PhraseDictionaryTreeAdaptor;

protected:
	// Owned state starts empty; m_dict is created later in Create().
	PDTAimp(PhraseDictionaryTreeAdaptor *p,unsigned nis)
		: m_languageModels(0),m_weightWP(0.0),m_dict(0),
		m_obj(p),useCache(1),m_numInputScores(nis),totalE(0),distinctE(0) {}

public:
	std::vector<float> m_weights;            // translation-model feature weights
	LMList const* m_languageModels;          // not owned; set in Create()
	float m_weightWP;                        // word-penalty weight
	std::vector<FactorType> m_input,m_output; // factor layout on each side
	PhraseDictionaryTree *m_dict;            // owned; deleted in the destructor
	typedef std::vector<TargetPhraseCollection const*> vTPC;
	mutable vTPC m_tgtColls;                 // every collection we allocated (for cleanup)
	typedef std::map<Phrase,TargetPhraseCollection const*> MapSrc2Tgt;
	mutable MapSrc2Tgt m_cache;              // source phrase -> candidates cache
	PhraseDictionaryTreeAdaptor *m_obj;      // owner; not deleted here
	int useCache;                            // enabled by default (1)
	std::vector<vTPC> m_rangeCache;          // [begin][end-1] cache for CN input
	unsigned m_numInputScores;               // #scores carried on CN/lattice links
	UniqueObjectManager<Phrase> uniqSrcPhr;  // interns source phrases
	size_t totalE,distinctE;                 // candidate statistics (for logging)
	std::vector<size_t> path1Best,pathExplored; // per-length path counters
	std::vector<double> pathCN;              // log-scale CN path counts

	// Releases caches and the dictionary; at verbosity >= 2 dumps candidate
	// and path statistics.
	// NOTE(review): the statistics lines divide by 0.01*totalE — if no lookup
	// ever happened (totalE==0) this divides by zero; confirm acceptable.
	~PDTAimp()
	{
		CleanUp();
		delete m_dict;
		if (StaticData::Instance().GetVerboseLevel() >= 2)
			{
				TRACE_ERR("tgt candidates stats:  total="<<totalE<<";  distinct="
									<<distinctE<<" ("<<distinctE/(0.01*totalE)<<");  duplicates="
									<<totalE-distinctE<<" ("<<(totalE-distinctE)/(0.01*totalE)
									<<")\n");
				TRACE_ERR("\npath statistics\n");
				if(path1Best.size())
					{
						TRACE_ERR("1-best:        ");
						std::copy(path1Best.begin()+1,path1Best.end(),
											std::ostream_iterator<size_t>(std::cerr," \t"));
						TRACE_ERR("\n");
					}
				if(pathCN.size())
					{
						TRACE_ERR("CN (full):     ");
						std::transform(pathCN.begin()+1
													 ,pathCN.end()
													 ,std::ostream_iterator<double>(std::cerr," \t")
													 ,Exp);
						TRACE_ERR("\n");
					}
				if(pathExplored.size())
					{
						TRACE_ERR("CN (explored): ");
						std::copy(pathExplored.begin()+1,pathExplored.end(),
											std::ostream_iterator<size_t>(std::cerr," \t"));
						TRACE_ERR("\n");
					}
			}
	}

	// Render the input-side factors of 'w' as a single string (no trailing
	// delimiter).
	void Factors2String(Word const& w,std::string& s) const
	{
		s=w.GetString(m_input,false);
	}

	// Drop all cached collections and interned phrases and let the dictionary
	// release its working memory.  NOTE(review): asserts m_dict — destructor
	// calls this, so destroying a PDTAimp on which Create() was never called
	// would trip the assert; confirm Create() is always called first.
	void CleanUp()
	{
		assert(m_dict);
		m_dict->FreeMemory();
		for(size_t i=0;i<m_tgtColls.size();++i) delete m_tgtColls[i];
		m_tgtColls.clear();
		m_cache.clear();
		m_rangeCache.clear();
		uniqSrcPhr.clear();
	}

	// Manually register a translation for 'source' (used for unknown words).
	// Precondition (asserted): no candidates exist for 'source' yet.
	void AddEquivPhrase(const Phrase &source, const TargetPhrase &targetPhrase)
	{
		cerr << "AddEquivPhrase(const Phrase &source, const TargetPhrase &targetPhrase)" << endl;
		assert(GetTargetPhraseCollection(source)==0);
		VERBOSE(2, "adding unk source phrase "<<source<<"\n");
		std::pair<MapSrc2Tgt::iterator,bool> p
			=m_cache.insert(std::make_pair(source,static_cast<TargetPhraseCollection const*>(0)));
		if(p.second || p.first->second==0)
			{
				TargetPhraseCollection *ptr=new TargetPhraseCollection;
				ptr->Add(new TargetPhrase(targetPhrase));
				p.first->second=ptr;
				m_tgtColls.push_back(ptr);
			}
		else VERBOSE(2, "WARNING: you added an already existing phrase!\n");
	}

	// Look up (and cache) the pruned candidate collection for a text-input
	// source phrase.  Returns 0 when the phrase is empty or has no candidates.
	TargetPhraseCollection const*
	GetTargetPhraseCollection(Phrase const &src) const
	{
		assert(m_dict);
		if(src.GetSize()==0) return 0;

		std::pair<MapSrc2Tgt::iterator,bool> piter;
		if(useCache)
			{
				// insert a placeholder; if one was already there, return it
				piter=m_cache.insert(std::make_pair(src,static_cast<TargetPhraseCollection const*>(0)));
				if(!piter.second) return piter.first->second;
			}
		else if (m_cache.size())
			{
				MapSrc2Tgt::const_iterator i=m_cache.find(src);
				return (i!=m_cache.end() ? i->second : 0);
			}

		std::vector<std::string> srcString(src.GetSize());
		// convert source Phrase into vector of strings
		for(size_t i=0;i<srcString.size();++i)
			{
				Factors2String(src.GetWord(i),srcString[i]);
			}

		// get target phrases in string representation
		std::vector<StringTgtCand> cands;
		std::vector<StringWordAlignmentCand> swacands;
		std::vector<StringWordAlignmentCand> twacands;
		//		m_dict->GetTargetCandidates(srcString,cands);
		m_dict->GetTargetCandidates(srcString,cands,swacands,twacands);
		if(cands.empty())
			{
				return 0;
			}

		std::vector<TargetPhrase> tCands;tCands.reserve(cands.size());
		std::vector<std::pair<float,size_t> > costs;costs.reserve(cands.size());

		// convert into TargetPhrases
		for(size_t i=0;i<cands.size();++i)
			{
				TargetPhrase targetPhrase(Output);

				StringTgtCand::first_type const& factorStrings=cands[i].first;
				StringTgtCand::second_type const& probVector=cands[i].second;
				//StringWordAlignmentCand::second_type const& swaVector=swacands[i].second;
				//StringWordAlignmentCand::second_type const& twaVector=twacands[i].second;

				// raw probabilities -> floored log scores
				std::vector<float> scoreVector(probVector.size());
				std::transform(probVector.begin(),probVector.end(),scoreVector.begin(),
											 TransformScore);
				std::transform(scoreVector.begin(),scoreVector.end(),scoreVector.begin(),
											 FloorScore);
				CreateTargetPhrase(targetPhrase,factorStrings,scoreVector,&src);
				//CreateTargetPhrase(targetPhrase,factorStrings,scoreVector,swaVector,twaVector,&src);
				costs.push_back(std::make_pair(-targetPhrase.GetFutureScore(),tCands.size()));
				tCands.push_back(targetPhrase);
			}

		TargetPhraseCollection *rv;
		rv=PruneTargetCandidates(tCands,costs);
		if(rv->IsEmpty())
			{
				delete rv;
				return 0;
			}
		else
			{
				if(useCache) piter.first->second=rv;
				m_tgtColls.push_back(rv);
				return rv;
			}
	}

	// Load (or build, on first use) the binary phrase table and remember the
	// weights/LMs needed to score its candidates.  Exits the process if the
	// binary table cannot be read.
	void Create(const std::vector<FactorType> &input
							, const std::vector<FactorType> &output
							, const std::string &filePath
							, const std::vector<float> &weight
							, const LMList &languageModels
							, float weightWP
							)
	{

		// set my members
		m_dict=new PhraseDictionaryTree(weight.size()-m_numInputScores);
		m_input=input;
		m_output=output;
		m_languageModels=&languageModels;
		m_weightWP=weightWP;
		m_weights=weight;

		std::string binFname=filePath+".binphr.idx";
		if(!existsFile(binFname.c_str())) {
			TRACE_ERR( "bin ttable does not exist -> create it\n");
			InputFileStream in(filePath);
			m_dict->Create(in,filePath);
		}
		TRACE_ERR( "reading bin ttable\n");
		//		m_dict->Read(filePath);
		bool res=m_dict->Read(filePath);
		if (!res) {
			stringstream strme;
			strme << "bin ttable was read in a wrong way\n";
			UserMessage::Add(strme.str());
			exit(1);
		}
	}

	typedef PhraseDictionaryTree::PrefixPtr PPtr;
	typedef unsigned short Position;
	typedef std::pair<Position,Position> Range;

	// One item on the confusion-network search stack: a prefix-tree position
	// plus the source range, accumulated link scores and source phrase that
	// led to it.
	struct State {
		PPtr ptr;
		Range range;
		std::vector<float> scores;
		Phrase src;

		State() : range(0,0),scores(0),src(Input) {}
		State(Position b,Position e,const PPtr& v,const std::vector<float>& sv=std::vector<float>(0))
			: ptr(v),range(b,e),scores(sv),src(Input) {}
		State(Range const& r,const PPtr& v,const std::vector<float>& sv=std::vector<float>(0))
			: ptr(v),range(r),scores(sv),src(Input) {}

		Position begin() const {return range.first;}
		Position end() const {return range.second;}
		std::vector<float> GetScores() const {return scores;}

		friend std::ostream& operator<<(std::ostream& out,State const& s) {
			out<<" R=("<<s.begin()<<","<<s.end()<<"),";
			for(std::vector<float>::const_iterator scoreIterator = s.GetScores().begin();scoreIterator<s.GetScores().end();scoreIterator++) {
				out<<", "<<*scoreIterator;
			}
			out<<")";
			return out;
		}
	};

	// Build a TargetPhrase from per-word factor strings and an already-logged
	// score vector; 'srcPtr' is recorded as the source phrase (may be 0).
	void CreateTargetPhrase(TargetPhrase& targetPhrase,
													StringTgtCand::first_type const& factorStrings,
													StringTgtCand::second_type const& scoreVector,
													Phrase const* srcPtr=0) const
	{
		FactorCollection &factorCollection = FactorCollection::Instance();

		for(size_t k=0;k<factorStrings.size();++k)
			{
				// split each word string into its output factors
				std::vector<std::string> factors=TokenizeMultiCharSeparator(*factorStrings[k],StaticData::Instance().GetFactorDelimiter());
				Word& w=targetPhrase.AddWord();
				for(size_t l=0;l<m_output.size();++l)
					w[m_output[l]]= factorCollection.AddFactor(Output, m_output[l], factors[l]);
			}
		targetPhrase.SetScore(m_obj->GetFeature(), scoreVector, m_weights, m_weightWP, *m_languageModels);
		targetPhrase.SetSourcePhrase(srcPtr);
		//		targetPhrase.CreateAlignmentInfo("???", "???", 44);
	}

	// Keep only the tableLimit best candidates (by future cost) and wrap them
	// in a newly allocated TargetPhraseCollection (caller takes ownership).
	TargetPhraseCollection* PruneTargetCandidates(std::vector<TargetPhrase> const & tCands,
																								std::vector<std::pair<float,size_t> >& costs) const
	{
		// convert into TargetPhraseCollection
		TargetPhraseCollection *rv=new TargetPhraseCollection;

		// set limit to tableLimit or actual size, whatever is smaller
		std::vector<std::pair<float,size_t> >::iterator nth =
			costs.begin() + ((m_obj->m_tableLimit>0 && // 0 indicates no limit
												m_obj->m_tableLimit < costs.size()) ?
											 m_obj->m_tableLimit : costs.size());

		// find the nth phrase according to future cost
		std::nth_element(costs.begin(),nth ,costs.end());

		// add n top phrases to the return list
		for(std::vector<std::pair<float,size_t> >::iterator
					it = costs.begin(); it != nth; ++it)
			rv->Add(new TargetPhrase(tCands[it->second]));

		return rv;
	}

	// POD for target phrase scores
	struct TScores {
		float total;                     // best total score seen for this phrase
		StringTgtCand::second_type trans; // its full score vector
		Phrase const* src;               // interned source phrase it came from

		TScores() : total(0.0),src(0) {}
	};

	// Explore every path through the confusion network 'src', look up target
	// candidates for each covered range, and fill m_rangeCache with the pruned
	// per-range collections.  Also maintains the path statistics reported by
	// the destructor.
	void CacheSource(ConfusionNet const& src)
	{
		assert(m_dict);
		const size_t srcSize=src.GetSize();

		std::vector<size_t> exploredPaths(srcSize+1,0);
		std::vector<double> exPathsD(srcSize+1,-1.0);

		// collect some statistics
		std::vector<size_t> cnDepths(srcSize,0);
		for(size_t i=0;i<srcSize;++i) cnDepths[i]=src[i].size();

		// log-scale count of all paths of each length (sum over start points)
		for(size_t len=1;len<=srcSize;++len)
			for(size_t i=0;i<=srcSize-len;++i)
				{
					double pd=0.0; for(size_t k=i;k<i+len;++k)	pd+=log(1.0*cnDepths[k]);
					exPathsD[len]=(exPathsD[len]>=0.0 ? addLogScale(pd,exPathsD[len]) : pd);
				}

		// update global statistics
		if(pathCN.size()<=srcSize) pathCN.resize(srcSize+1,-1.0);
		for(size_t len=1;len<=srcSize;++len)
			pathCN[len]=pathCN[len]>=0.0 ? addLogScale(pathCN[len],exPathsD[len]) : exPathsD[len];

		if(path1Best.size()<=srcSize) path1Best.resize(srcSize+1,0);
		for(size_t len=1;len<=srcSize;++len) path1Best[len]+=srcSize-len+1;


		if (StaticData::Instance().GetVerboseLevel() >= 2 && exPathsD.size())
			{
				TRACE_ERR("path stats for current CN: \nCN (full):     ");
				std::transform(exPathsD.begin()+1
											 ,exPathsD.end()
											 ,std::ostream_iterator<double>(std::cerr," ")
											 ,Exp);
				TRACE_ERR("\n");
			}

		typedef StringTgtCand::first_type sPhrase;
		typedef std::map<StringTgtCand::first_type,TScores> E2Costs;

		std::map<Range,E2Costs> cov2cand;
		std::vector<State> stack;
		// seed the stack with every single-position start state
		for(Position i=0 ; i < srcSize ; ++i)
			stack.push_back(State(i, i, m_dict->GetRoot(), std::vector<float>(m_numInputScores,0.0)));

		while(!stack.empty())
			{
				State curr(stack.back());
				stack.pop_back();

				assert(curr.end()<srcSize);
				const ConfusionNet::Column &currCol=src[curr.end()];
				// in a given column, loop over all possibilities
				for(size_t colidx=0;colidx<currCol.size();++colidx)
					{
						const Word& w=currCol[colidx].first; // w=the i^th possibility in column colidx
						std::string s;
						Factors2String(w,s);
						bool isEpsilon=(s=="" || s==EPSILON);

						//assert that we have the right number of link params in this CN option
						assert(currCol[colidx].second.size() >= m_numInputScores);

						// do not start with epsilon (except at first position)
						if(isEpsilon && curr.begin()==curr.end() && curr.begin()>0) continue;

						// At a given node in the prefix tree, look to see if w defines an edge to
						// another node (Extend).  Stay at the same node if w==EPSILON
						PPtr nextP = (isEpsilon ? curr.ptr : m_dict->Extend(curr.ptr,s));

						if(nextP) // w is a word that should be considered
							{
								Range newRange(curr.begin(),curr.end()+src.GetColumnIncrement(curr.end(),colidx));

								//add together the link scores from the current state and the new arc
								float inputScoreSum = 0;
								std::vector<float> newInputScores(m_numInputScores,0.0);
								if (m_numInputScores) {
									std::transform(currCol[colidx].second.begin(), currCol[colidx].second.end(),
																 curr.GetScores().begin(),
																 newInputScores.begin(),
																 std::plus<float>());

									//we need to sum up link weights (excluding realWordCount, which isn't in numLinkParams)
									//if the sum is too low, then we won't expand this.
									//TODO: dodgy! shouldn't we consider weights here? what about zero-weight params?
									inputScoreSum = std::accumulate(newInputScores.begin(),newInputScores.begin()+m_numInputScores,0.0);
								}

								Phrase newSrc(curr.src);
								if(!isEpsilon) newSrc.AddWord(w);
								if(newRange.second<srcSize && inputScoreSum>LOWEST_SCORE)
									{
										// if there is more room to grow, add a new state onto the queue
										// to be explored that represents [begin, curEnd+)
										stack.push_back(State(newRange,nextP,newInputScores));
										stack.back().src=newSrc;
									}

								std::vector<StringTgtCand> tcands;
								// now, look up the target candidates (aprx. TargetPhraseCollection) for
								// the current path through the CN
								m_dict->GetTargetCandidates(nextP,tcands);

								if(newRange.second>=exploredPaths.size()+newRange.first)
									exploredPaths.resize(newRange.second-newRange.first+1,0);
								++exploredPaths[newRange.second-newRange.first];

								totalE+=tcands.size();

								if(tcands.size())
									{
										E2Costs& e2costs=cov2cand[newRange];
										Phrase const* srcPtr=uniqSrcPhr(newSrc);
										for(size_t i=0;i<tcands.size();++i)
											{
												//put input scores in first - already logged, just drop in directly
												std::vector<float> nscores(newInputScores);

												//resize to include phrase table scores
												nscores.resize(m_numInputScores+tcands[i].second.size(),0.0f);

												//put in phrase table scores, logging as we insert
												std::transform(tcands[i].second.begin(),tcands[i].second.end(),nscores.begin() + m_numInputScores,TransformScore);

												assert(nscores.size()==m_weights.size());

												//tally up
												float score=std::inner_product(nscores.begin(), nscores.end(), m_weights.begin(), 0.0f);

												//count word penalty
												score-=tcands[i].first.size() * m_weightWP;

												// keep only the best-scoring derivation of each target phrase
												std::pair<E2Costs::iterator,bool> p=e2costs.insert(std::make_pair(tcands[i].first,TScores()));

												if(p.second) ++distinctE;

												TScores & scores=p.first->second;
												if(p.second || scores.total<score)
													{
														scores.total=score;
														scores.trans=nscores;
														scores.src=srcPtr;
													}
											}
									}
							}
					}
			} // end while(!stack.empty())


		if (StaticData::Instance().GetVerboseLevel() >= 2 && exploredPaths.size())
			{
				TRACE_ERR("CN (explored): ");
				std::copy(exploredPaths.begin()+1,exploredPaths.end(),
									std::ostream_iterator<size_t>(std::cerr," "));
				TRACE_ERR("\n");
			}

		if(pathExplored.size()<exploredPaths.size())
			pathExplored.resize(exploredPaths.size(),0);
		for(size_t len=1;len<=srcSize;++len)
			pathExplored[len]+=exploredPaths[len];


		m_rangeCache.resize(src.GetSize(),vTPC(src.GetSize(),0));

		// turn the per-range best scores into pruned TargetPhraseCollections
		for(std::map<Range,E2Costs>::const_iterator i=cov2cand.begin();i!=cov2cand.end();++i)
			{
				assert(i->first.first<m_rangeCache.size());
				assert(i->first.second>0);
				assert(static_cast<size_t>(i->first.second-1)<m_rangeCache[i->first.first].size());
				assert(m_rangeCache[i->first.first][i->first.second-1]==0);

				std::vector<TargetPhrase> tCands;tCands.reserve(i->second.size());
				std::vector<std::pair<float,size_t> > costs;costs.reserve(i->second.size());

				for(E2Costs::const_iterator j=i->second.begin();j!=i->second.end();++j)
					{
						TScores const & scores=j->second;
						TargetPhrase targetPhrase(Output);
						CreateTargetPhrase(targetPhrase,j->first,scores.trans,scores.src);
						costs.push_back(std::make_pair(-targetPhrase.GetFutureScore(),tCands.size()));
						tCands.push_back(targetPhrase);
						//std::cerr << i->first.first << "-" << i->first.second << ": " << targetPhrase << std::endl;
					}

				TargetPhraseCollection *rv=PruneTargetCandidates(tCands,costs);

				if(rv->IsEmpty())
					delete rv;
				else
					{
						m_rangeCache[i->first.first][i->first.second-1]=rv;
						m_tgtColls.push_back(rv);
					}
			}
		// free memory
		m_dict->FreeMemory();
	}

	size_t GetNumInputScores() const {return m_numInputScores;}
};
}
#endif

593
src/Parameter.cpp Normal file
View File

@ -0,0 +1,593 @@
// $Id: Parameter.cpp 2855 2010-02-03 19:46:35Z abarun $
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2006 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#include <iostream>
#include <iterator>
#include <fstream>
#include <sstream>
#include <algorithm>
#include "Parameter.h"
#include "Util.h"
#include "InputFileStream.h"
#include "UserMessage.h"
#if HAVE_CONFIG_H
#include "config.h"
#endif
using namespace std;
namespace Moses
{
/** define allowed parameters
 *
 * Fix: "output-factors" was registered twice; the second registration (with
 * the typo'd description "list if factors in the output") silently overwrote
 * the first correct one in m_description.  The duplicate has been removed.
 */
Parameter::Parameter()
{
	AddParam("beam-threshold", "b", "threshold for threshold pruning");
	AddParam("config", "f", "location of the configuration file");
	AddParam("drop-unknown", "du", "drop unknown words instead of copying them");
	AddParam("disable-discarding", "dd", "disable hypothesis discarding");
	AddParam("factor-delimiter", "fd", "specify a different factor delimiter than the default");
	AddParam("generation-file", "location and properties of the generation table");
	AddParam("global-lexical-file", "gl", "discriminatively trained global lexical translation model file");
	AddParam("input-factors", "list of factors in the input");
	AddParam("input-file", "i", "location of the input file to be translated");
	AddParam("inputtype", "text (0), confusion network (1), word lattice (2) (default = 0)");
	AddParam("labeled-n-best-list", "print out labels for each weight type in n-best list. default is true");
	AddParam("include-alignment-in-n-best", "include word alignment in the n-best list. default is false");
	AddParam("lmodel-file", "location and properties of the language models");
	AddParam("lmodel-dub", "dictionary upper bounds of language models");
	AddParam("lmstats", "L", "(1/0) compute LM backoff statistics for each translation hypothesis");
	AddParam("mapping", "description of decoding steps");
	AddParam("max-partial-trans-opt", "maximum number of partial translation options per input span (during mapping steps)");
	AddParam("max-trans-opt-per-coverage", "maximum number of translation options per input span (after applying mapping steps)");
	AddParam("max-phrase-length", "maximum phrase length (default 20)");
	AddParam("n-best-list", "file and size of n-best-list to be generated; specify - as the file in order to write to STDOUT");
	AddParam("n-best-factor", "factor to compute the maximum number of contenders (=factor*nbest-size). value 0 means infinity, i.e. no threshold. default is 0");
	AddParam("print-all-derivations", "to print all derivations in search graph");
	AddParam("output-factors", "list of factors in the output");
	AddParam("phrase-drop-allowed", "da", "if present, allow dropping of source words"); //da = drop any (word); see -du for comparison
	AddParam("report-all-factors", "report all factors in output, not just first");
	AddParam("report-all-factors-in-n-best", "Report all factors in n-best-lists. Default is false");
	AddParam("report-segmentation", "t", "report phrase segmentation in the output");
	AddParam("stack", "s", "maximum stack size for histogram pruning");
	AddParam("stack-diversity", "sd", "minimum number of hypothesis of each coverage in stack (default 0)");
	AddParam("translation-details", "T", "for each best translation hypothesis, print out details about what sourcce spans were used, dropped");
	AddParam("ttable-file", "location and properties of the translation tables");
	AddParam("ttable-limit", "ttl", "maximum number of translation table entries per input phrase");
	AddParam("translation-option-threshold", "tot", "threshold for translation options relative to best for input phrase");
	AddParam("early-discarding-threshold", "edt", "threshold for constructing hypotheses based on estimate cost");
	AddParam("verbose", "v", "verbosity level of the logging");
	AddParam("weight-d", "d", "weight(s) for distortion (reordering components)");
	AddParam("weight-generation", "g", "weight(s) for generation components");
	AddParam("weight-i", "I", "weight(s) for word insertion - used for parameters from confusion network and lattice input links");
	AddParam("weight-l", "lm", "weight(s) for language models");
	AddParam("weight-lex", "lex", "weight for global lexical model");
	AddParam("weight-t", "tm", "weights for translation model components");
	AddParam("weight-w", "w", "weight for word penalty");
	AddParam("weight-u", "u", "weight for unknown word penalty");
	AddParam("weight-e", "e", "weight for word deletion");
	AddParam("weight-file", "wf", "file containing labeled weights");
	AddParam("cache-path", "?");
	AddParam("distortion-limit", "dl", "distortion (reordering) limit in maximum number of words (0 = monotone, -1 = unlimited)");
	AddParam("monotone-at-punctuation", "mp", "do not reorder over punctuation");
	AddParam("distortion-file", "source factors (0 if table independent of source), target factors, location of the factorized/lexicalized reordering tables");
	AddParam("distortion", "configurations for each factorized/lexicalized reordering model.");
	AddParam("xml-input", "xi", "allows markup of input with desired translations and probabilities. values can be 'pass-through' (default), 'inclusive', 'exclusive', 'ignore'");
	AddParam("minimum-bayes-risk", "mbr", "use miminum Bayes risk to determine best translation");
	AddParam("lminimum-bayes-risk", "lmbr", "use lattice miminum Bayes risk to determine best translation");
	AddParam("mbr-size", "number of translation candidates considered in MBR decoding (default 200)");
	AddParam("mbr-scale", "scaling factor to convert log linear score probability in MBR decoding (default 1.0)");
	AddParam("lmbr-thetas", "theta(s) for lattice mbr calculation");
	AddParam("lmbr-pruning-factor", "average number of nodes/word wanted in pruned lattice");
	AddParam("lmbr-p", "unigram precision value for lattice mbr");
	AddParam("lmbr-r", "ngram precision decay value for lattice mbr");
	AddParam("lattice-hypo-set", "to use lattice as hypo set during lattice MBR");
	AddParam("use-persistent-cache", "cache translation options across sentences (default true)");
	AddParam("persistent-cache-size", "maximum size of cache for translation options (default 10,000 input phrases)");
	AddParam("recover-input-path", "r", "(conf net/word lattice only) - recover input path corresponding to the best translation");
	AddParam("output-word-graph", "owg", "Output stack info as word graph. Takes filename, 0=only hypos in stack, 1=stack + nbest hypos");
	AddParam("time-out", "seconds after which is interrupted (-1=no time-out, default is -1)");
	AddParam("output-search-graph", "osg", "Output connected hypotheses of search into specified filename");
	AddParam("output-search-graph-extended", "osgx", "Output connected hypotheses of search into specified filename, in extended format");
#ifdef HAVE_PROTOBUF
	AddParam("output-search-graph-pb", "pb", "Write phrase lattice to protocol buffer objects in the specified path.");
#endif
	AddParam("cube-pruning-pop-limit", "cbp", "How many hypotheses should be popped for each stack. (default = 1000)");
	AddParam("cube-pruning-diversity", "cbd", "How many hypotheses should be created for each coverage. (default = 0)");
	AddParam("search-algorithm", "Which search algorithm to use. 0=normal stack, 1=cube pruning, 2=cube growing. (default = 0)");
	AddParam("constraint", "Location of the file with target sentences to produce constraining the search");
	AddParam("use-alignment-info", "Use word-to-word alignment: actually it is only used to output the word-to-word alignment. Word-to-word alignments are taken from the phrase table if any. Default is false.");
	AddParam("print-alignment-info", "Output word-to-word alignment into the log file. Word-to-word alignments are takne from the phrase table if any. Default is false");
	AddParam("print-alignment-info-in-n-best", "Include word-to-word alignment in the n-best list. Word-to-word alignments are takne from the phrase table if any. Default is false");
	AddParam("link-param-count", "Number of parameters on word links when using confusion networks or lattices (default = 1)");
	AddParam("description", "Source language, target language, description");

	/*******************************Loading DPR model**********************************************/
	AddParam("DPR-file","DPR-file","Model file for the DPR model");
	AddParam("weight-DPR","weight-DPR","weight for the DPR model");
	AddParam("class-DPR","class-DPR","the number of orientations for the DPR model");
	/*******************************Loading DPR model**********************************************/
}
/** destructor: members (maps of strings) release themselves */
Parameter::~Parameter() {}
/** register a parameter name as valid and record its help text
 * (no abbreviation variant); called from the constructor */
void Parameter::AddParam(const string &paramName, const string &description)
{
	m_description[paramName] = description;
	m_valid[paramName] = true;
}
/** register a parameter under both its full name and an abbreviation;
 * the help text and the abbreviation mapping are stored against the
 * full name only, but both spellings are accepted on the command line */
void Parameter::AddParam(const string &paramName, const string &abbrevName, const string &description)
{
	m_description[paramName] = description;
	m_abbreviation[paramName] = abbrevName;
	m_valid[abbrevName] = true;
	m_valid[paramName] = true;
}
/** print descriptions of all parameters */
void Parameter::Explain() {
cerr << "Usage:" << endl;
for(PARAM_STRING::const_iterator iterParam = m_description.begin(); iterParam != m_description.end(); iterParam++)
{
const string paramName = iterParam->first;
const string paramDescription = iterParam->second;
cerr << "\t-" << paramName;
PARAM_STRING::const_iterator iterAbbr = m_abbreviation.find( paramName );
if ( iterAbbr != m_abbreviation.end() )
cerr << " (" << iterAbbr->second << ")";
cerr << ": " << paramDescription << endl;
}
}
/** check whether an item on the command line is a switch or a value;
 * a leading '-' followed by a non-digit is a switch, while '-' followed
 * by a digit is treated as a negative number, i.e. a value
 * \param token token on the command line to be checked **/
bool Parameter::isOption(const char* token) {
	if (token == NULL) return false;
	std::string str(token);
	// anything not starting with '-' is a plain value
	if (str.size() > 0 && str[0] != '-') return false;
	// "-x" style: a switch only when the second char is not a digit
	if (str.size() > 1 && str.find_first_not_of("0123456789", 1) == 1)
		return true;
	return false;
}
/** load all parameters as if the minimal command line
 * "executable -f filePath" had been given */
bool Parameter::LoadParam(const string &filePath)
{
	const char *fakeArgv[] = { "executable", "-f", filePath.c_str() };
	return LoadParam(3, (char**) fakeArgv);
}
/** load all parameters from the configuration file and the command line switches */
bool Parameter::LoadParam(int argc, char* argv[])
{
// config file (-f) arg mandatory
string configPath;
if ( (configPath = FindParam("-f", argc, argv)) == ""
&& (configPath = FindParam("-config", argc, argv)) == "")
{
PrintCredit();
UserMessage::Add("No configuration file was specified. Use -config or -f");
return false;
}
else
{
if (!ReadConfigFile(configPath))
{
UserMessage::Add("Could not read "+configPath);
return false;
}
}
// overwrite parameters with values from switches
for(PARAM_STRING::const_iterator iterParam = m_description.begin(); iterParam != m_description.end(); iterParam++)
{
const string paramName = iterParam->first;
OverwriteParam("-" + paramName, paramName, argc, argv);
}
// ... also shortcuts
for(PARAM_STRING::const_iterator iterParam = m_abbreviation.begin(); iterParam != m_abbreviation.end(); iterParam++)
{
const string paramName = iterParam->first;
const string paramShortName = iterParam->second;
OverwriteParam("-" + paramShortName, paramName, argc, argv);
}
// logging of parameters that were set in either config or switch
int verbose = 1;
if (m_setting.find("verbose") != m_setting.end() &&
m_setting["verbose"].size() > 0)
verbose = Scan<int>(m_setting["verbose"][0]);
if (verbose >= 1) { // only if verbose
TRACE_ERR( "Defined parameters (per moses.ini or switch):" << endl);
for(PARAM_MAP::const_iterator iterParam = m_setting.begin() ; iterParam != m_setting.end(); iterParam++) {
TRACE_ERR( "\t" << iterParam->first << ": ");
for ( size_t i = 0; i < iterParam->second.size(); i++ )
TRACE_ERR( iterParam->second[i] << " ");
TRACE_ERR( endl);
}
}
// check for illegal parameters
bool noErrorFlag = true;
for (int i = 0 ; i < argc ; i++)
{
if (isOption(argv[i]))
{
string paramSwitch = (string) argv[i];
string paramName = paramSwitch.substr(1);
if (m_valid.find(paramName) == m_valid.end())
{
UserMessage::Add("illegal switch: " + paramSwitch);
noErrorFlag = false;
}
}
}
// check if parameters make sense
return Validate() && noErrorFlag;
}
/** check that parameter settings make sense:
 * - a phrase table (ttable-file) must be given
 * - per-LM counts (lmodel-dub, weight-l) must match the number of LM files
 * - referenced table/input files must exist on disk
 * \return true if the configuration passes every check
 * NOTE: m_setting[key] default-inserts an empty entry when the key is
 * absent; that is harmless here since only .size() is inspected */
bool Parameter::Validate()
{
	bool noErrorFlag = true;

	// required parameters
	if (m_setting["ttable-file"].size() == 0)
	{
		UserMessage::Add("No phrase translation table (ttable-file)");
		noErrorFlag = false;
	}

	// if LM upper bounds are given, there must be one per LM file
	if (m_setting["lmodel-dub"].size() > 0)
	{
		if (m_setting["lmodel-file"].size() != m_setting["lmodel-dub"].size())
		{
			stringstream errorMsg("");
			errorMsg << "Config and parameters specify "
				<< static_cast<int>(m_setting["lmodel-file"].size())
				<< " language model files (lmodel-file), but "
				<< static_cast<int>(m_setting["lmodel-dub"].size())
				<< " LM upperbounds (lmodel-dub)"
				<< endl;
			UserMessage::Add(errorMsg.str());
			noErrorFlag = false;
		}
	}

	// exactly one LM weight (weight-l) per LM file
	if (m_setting["lmodel-file"].size() != m_setting["weight-l"].size())
	{
		stringstream errorMsg("");
		errorMsg << "Config and parameters specify "
			<< static_cast<int>(m_setting["lmodel-file"].size())
			<< " language model files (lmodel-file), but "
			<< static_cast<int>(m_setting["weight-l"].size())
			<< " weights (weight-l)";
		errorMsg << endl << "You might be giving '-lmodel-file TYPE FACTOR ORDER FILENAME' but you should be giving these four as a single argument, i.e. '-lmodel-file \"TYPE FACTOR ORDER FILENAME\"'";
		UserMessage::Add(errorMsg.str());
		noErrorFlag = false;
	}

	// do files exist?

	// phrase tables
	if (noErrorFlag)
	{
		std::vector<std::string> ext;
		// standard phrase table extension (i.e. full name has to be specified)
		// raw tables in either un compressed or compressed form
		ext.push_back("");
		ext.push_back(".gz");
		// alternative file extension for binary phrase table format:
		ext.push_back(".binphr.idx");
		noErrorFlag = FilesExist("ttable-file", 3,ext);
	}
	// language model
	// (existence check disabled — kept for reference)
	//	if (noErrorFlag)
	//		noErrorFlag = FilesExist("lmodel-file", 3);
	// input file
	if (noErrorFlag && m_setting["input-file"].size() == 1)
	{
		noErrorFlag = FileExists(m_setting["input-file"][0]);
	}
	// generation tables
	if (noErrorFlag)
	{
		std::vector<std::string> ext;
		//raw tables in either un compressed or compressed form
		ext.push_back("");
		ext.push_back(".gz");
		noErrorFlag = FilesExist("generation-file", 3, ext);
	}
	// distortion
	if (noErrorFlag)
	{
		std::vector<std::string> ext;
		//raw tables in either un compressed or compressed form
		ext.push_back("");
		ext.push_back(".gz");
		//prefix tree format
		ext.push_back(".binlexr.idx");
		noErrorFlag = FilesExist("distortion-file", 3, ext);
	}
	return noErrorFlag;
}
/** check that the file referenced by each value of a path-valued
 * parameter exists, probing a list of alternative extensions
 * \param paramName parameter whose values contain the paths
 * \param tokenizeIndex whitespace-token index of the path within each value
 * \param extensions suffixes appended in turn when probing for the file
 * \return true if the parameter is unset, or every path exists under
 * at least one extension; reports via UserMessage and returns false
 * on the first failure */
bool Parameter::FilesExist(const string &paramName, size_t tokenizeIndex,std::vector<std::string> const& extensions)
{
	typedef std::vector<std::string> StringVec;

	PARAM_MAP::const_iterator iterParam = m_setting.find(paramName);
	if (iterParam == m_setting.end())
	{ // no param. therefore nothing to check
		return true;
	}
	const StringVec &pathVec = (*iterParam).second;
	for (StringVec::const_iterator iter = pathVec.begin() ; iter != pathVec.end() ; ++iter)
	{
		StringVec vec = Tokenize(*iter);
		if (tokenizeIndex >= vec.size())
		{
			stringstream errorMsg("");
			// BUGFIX: message previously read "emtry"
			errorMsg << "Expected at least " << (tokenizeIndex+1) << " tokens per entry in '"
				<< paramName << "', but only found "
				<< vec.size();
			UserMessage::Add(errorMsg.str());
			return false;
		}
		const string &pathStr = vec[tokenizeIndex];

		// accept the file if it exists under any of the given extensions
		bool fileFound = false;
		for (size_t i = 0; i < extensions.size() && !fileFound; ++i)
		{
			fileFound = FileExists(pathStr + extensions[i]);
		}
		if (!fileFound)
		{
			stringstream errorMsg("");
			errorMsg << "File " << pathStr << " does not exist";
			UserMessage::Add(errorMsg.str());
			return false;
		}
	}
	return true;
}
/** scan the command line for a switch and return the token after it
 * \return the value following paramSwitch, or "" when the switch is
 * absent or is the last token (in which case an error is reported
 * but scanning continues) */
// TODO arg parsing like this does not belong in the library, it belongs
// in moses-cmd
string Parameter::FindParam(const string &paramSwitch, int argc, char* argv[])
{
	for (int pos = 0; pos < argc; pos++)
	{
		if (paramSwitch != string(argv[pos]))
			continue;
		if (pos + 1 < argc)
			return argv[pos + 1];
		stringstream errorMsg("");
		errorMsg << "Option " << paramSwitch << " requires a parameter!";
		UserMessage::Add(errorMsg.str());
		// TODO return some sort of error, not the empty string
	}
	return "";
}
/** update parameter settings with command line switches
 * \param paramSwitch (potentially short) name of switch
 * \param paramName full name of parameter
 * \param argc number of arguments on command line
 * \param argv values of paramters on command line */
void Parameter::OverwriteParam(const string &paramSwitch, const string &paramName, int argc, char* argv[])
{
	// locate the switch; nothing to do if it never occurs
	int valuePos = -1;
	for (int i = 0; i < argc; i++)
	{
		if (string(argv[i]) == paramSwitch)
		{
			valuePos = i + 1;
			break;
		}
	}
	if (valuePos < 0)
		return;

	m_setting[paramName]; // defines the parameter, important for boolean switches
	// consume value tokens up to the next switch, overwriting existing
	// entries in place and appending any extras
	for (int index = 0; valuePos < argc && !isOption(argv[valuePos]); ++index, ++valuePos)
	{
		if ((size_t)index < m_setting[paramName].size())
			m_setting[paramName][index] = argv[valuePos];
		else
			m_setting[paramName].push_back(argv[valuePos]);
	}
}
/** read parameters from a configuration file: a line of the form
 * "[name]" opens a parameter section, '#' starts a comment, and every
 * following non-empty line is appended as a value of that parameter
 * \return true (read errors are currently not detected) */
bool Parameter::ReadConfigFile( string filePath )
{
	InputFileStream inFile(filePath);
	string line, paramName;
	while(getline(inFile, line))
	{
		// comments
		size_t comPos = line.find_first_of("#");
		if (comPos != string::npos)
			line = line.substr(0, comPos);
		// trim leading and trailing spaces/tabs
		line = Trim(line);
		// BUGFIX: guard before indexing — line[0] on an empty string is
		// undefined behaviour pre-C++11 (blank/comment-only lines hit this)
		if (line.empty())
			continue;
		if (line[0]=='[')
		{ // new parameter
			for (size_t currPos = 0 ; currPos < line.size() ; currPos++)
			{
				if (line[currPos] == ']')
				{
					paramName = line.substr(1, currPos - 1);
					break;
				}
			}
		}
		else
		{ // add value to parameter
			m_setting[paramName].push_back(line);
		}
	}
	return true;
}
/** one contributor entry, used by Parameter::PrintCredit() */
struct Credit
{
	std::string name, contact, currentPursuits, areaResponsibility;

	/** store all fields; const references and a member-initializer list
	 * replace the pass-by-value copies the old constructor made */
	Credit(const std::string &name, const std::string &contact,
	       const std::string &currentPursuits, const std::string &areaResponsibility)
		: name(name)
		, contact(contact)
		, currentPursuits(currentPursuits)
		, areaResponsibility(areaResponsibility)
	{
	}

	/** order entries with an area of responsibility first, then by name */
	bool operator<(const Credit &other) const
	{
		if (areaResponsibility.size() != 0 && other.areaResponsibility.size() ==0)
			return true;
		if (areaResponsibility.size() == 0 && other.areaResponsibility.size() !=0)
			return false;
		return name < other.name;
	}
};

/** pretty-print one Credit entry, omitting empty fields */
std::ostream& operator<<(std::ostream &os, const Credit &credit)
{
	os << credit.name;
	if (credit.contact != "")
		os << "\n contact: " << credit.contact;
	if (credit.currentPursuits != "")
		os << "\n " << credit.currentPursuits;
	if (credit.areaResponsibility != "")
		os << "\n I'll answer question on: " << credit.areaResponsibility;
	os << std::endl;
	return os;
}
/** print the LGPL license banner, build date and the list of
 * contributors to stderr; called when no configuration file is given */
void Parameter::PrintCredit()
{
	// collect all contributors; Credit::operator< sorts people with an
	// area of responsibility first, then alphabetically by name
	vector<Credit> everyone;
	everyone.push_back(Credit("Nicola Bertoldi"
		, "911"
		, ""
		, "scripts & other stuff"));
	everyone.push_back(Credit("Ondrej Bojar"
		, ""
		, "czech this out!"
		, ""));
	everyone.push_back(Credit("Chris Callison-Burch"
		, "anytime, anywhere"
		, "international playboy"
		, ""));
	everyone.push_back(Credit("Alexandra Constantin"
		, ""
		, "eu sunt varza"
		, ""));
	everyone.push_back(Credit("Brooke Cowan"
		, "brooke@csail.mit.edu"
		, "if you're going to san francisco, be sure to wear a flower in your hair"
		, ""));
	everyone.push_back(Credit("Chris Dyer"
		, "can't. i'll be out driving my mustang"
		, "driving my mustang"
		, ""));
	everyone.push_back(Credit("Marcello Federico"
		, "federico at itc at it"
		, "Researcher at ITC-irst, Trento, Italy"
		, "IRST language model"));
	everyone.push_back(Credit("Evan Herbst"
		, "Small college in upstate New York"
		, ""
		, ""));
	everyone.push_back(Credit("Philipp Koehn"
		, "only between 2 and 4am"
		, ""
		, "Nothing fazes this dude"));
	everyone.push_back(Credit("Christine Moran"
		, "weird building at MIT"
		, ""
		, ""));
	everyone.push_back(Credit("Wade Shen"
		, "via morse code"
		, "buying another laptop"
		, ""));
	everyone.push_back(Credit("Richard Zens"
		, "richard at aachen dot de"
		, ""
		, "ambiguous source input, confusion networks, confusing source code"));
	everyone.push_back(Credit("Hieu Hoang", "http://www.hoang.co.uk/hieu/"
		, "phd student at Edinburgh Uni. Original Moses developer"
		, "general queries/ flames on Moses. Doing stuff on async factored translation, so anything on that as well"));

	sort(everyone.begin(), everyone.end());

	// license banner and build date
	cerr << "Moses - A beam search decoder for phrase-based statistical machine translation models" << endl
		<< "Copyright (C) 2006 University of Edinburgh" << endl << endl
		<< "This library is free software; you can redistribute it and/or" << endl
		<< "modify it under the terms of the GNU Lesser General Public" << endl
		<< "License as published by the Free Software Foundation; either" << endl
		<< "version 2.1 of the License, or (at your option) any later version." << endl << endl
		<< "This library is distributed in the hope that it will be useful," << endl
		<< "but WITHOUT ANY WARRANTY; without even the implied warranty of" << endl
		<< "MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU" << endl
		<< "Lesser General Public License for more details." << endl << endl
		<< "You should have received a copy of the GNU Lesser General Public" << endl
		<< "License along with this library; if not, write to the Free Software" << endl
		<< "Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA" << endl << endl
		<< "***********************************************************************" << endl << endl
		<< "Built on " << __DATE__ << endl << endl
		<< "CREDITS" << endl << endl;

	// stream the sorted entries via Credit's operator<<
	ostream_iterator<Credit> out(cerr, "\n");
	copy(everyone.begin(), everyone.end(), out);
	cerr << endl << endl;
}
}

Some files were not shown because too many files have changed in this diff Show More