mosesdecoder/moses/ChartHypothesisCollection.cpp

315 lines
10 KiB
C++

// $Id$
// vim:tabstop=2
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2010 Hieu Hoang
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#include <algorithm>
#include "StaticData.h"
#include "ChartHypothesisCollection.h"
#include "ChartHypothesis.h"
#include "ChartManager.h"
#include "HypergraphOutput.h"
#include "util/exception.hh"
using namespace std;
using namespace Moses;
namespace Moses
{
ChartHypothesisCollection::ChartHypothesisCollection()
{
const StaticData &staticData = StaticData::Instance();
m_beamWidth = staticData.GetBeamWidth();
m_maxHypoStackSize = staticData.GetMaxHypoStackSize();
m_nBestIsEnabled = staticData.IsNBestEnabled();
m_bestScore = -std::numeric_limits<float>::infinity();
}
ChartHypothesisCollection::~ChartHypothesisCollection()
{
HCType::iterator iter;
for (iter = m_hypos.begin() ; iter != m_hypos.end() ; ++iter) {
ChartHypothesis *hypo = *iter;
ChartHypothesis::Delete(hypo);
}
//RemoveAllInColl(m_hypos);
}
/** public function to add hypothesis to this collection.
* Returns false if equiv hypo exists in collection, otherwise returns true.
* Takes care of update arc list for n-best list creation.
* Will delete hypo if it exists - once this function is call don't delete hypothesis.
* \param hypo hypothesis to add
* \param manager pointer back to manager
*/
bool ChartHypothesisCollection::AddHypothesis(ChartHypothesis *hypo, ChartManager &manager)
{
if (hypo->GetTotalScore() == - std::numeric_limits<float>::infinity()) {
manager.GetSentenceStats().AddDiscarded();
VERBOSE(3,"discarded, -inf score" << std::endl);
ChartHypothesis::Delete(hypo);
return false;
}
if (hypo->GetTotalScore() < m_bestScore + m_beamWidth) {
// really bad score. don't bother adding hypo into collection
manager.GetSentenceStats().AddDiscarded();
VERBOSE(3,"discarded, too bad for stack" << std::endl);
ChartHypothesis::Delete(hypo);
return false;
}
// over threshold, try to add to collection
std::pair<HCType::iterator, bool> addRet = Add(hypo, manager);
// does it have the same state as an existing hypothesis?
if (addRet.second) {
// nothing found. add to collection
return true;
}
// equiv hypo exists, recombine with other hypo
HCType::iterator &iterExisting = addRet.first;
ChartHypothesis *hypoExisting = *iterExisting;
UTIL_THROW_IF2(iterExisting == m_hypos.end(),
"Adding a hypothesis should have returned a valid iterator");
//StaticData::Instance().GetSentenceStats().AddRecombination(*hypo, **iterExisting);
// found existing hypo with same target ending.
// keep the best 1
if (hypo->GetTotalScore() > hypoExisting->GetTotalScore()) {
// incoming hypo is better than the one we have
VERBOSE(3,"better than matching hyp " << hypoExisting->GetId() << ", recombining, ");
if (m_nBestIsEnabled) {
hypo->AddArc(hypoExisting);
Detach(iterExisting);
} else {
Remove(iterExisting);
}
bool added = Add(hypo, manager).second;
if (!added) {
iterExisting = m_hypos.find(hypo);
UTIL_THROW2("Offending hypo = " << **iterExisting);
}
return false;
} else {
// already storing the best hypo. discard current hypo
VERBOSE(3,"worse than matching hyp " << hypoExisting->GetId() << ", recombining" << std::endl)
if (m_nBestIsEnabled) {
hypoExisting->AddArc(hypo);
} else {
ChartHypothesis::Delete(hypo);
}
return false;
}
}
/** add hypothesis to stack. Prune if necessary.
* Returns false if equiv hypo exists in collection, otherwise returns true, and the iterator that points to the place where the hypo was added
* \param hypo hypothesis to add
* \param manager pointer back to manager
*/
pair<ChartHypothesisCollection::HCType::iterator, bool> ChartHypothesisCollection::Add(ChartHypothesis *hypo, ChartManager &manager)
{
std::pair<HCType::iterator, bool> ret = m_hypos.insert(hypo);
if (ret.second) {
// equiv hypo doesn't exists
VERBOSE(3,"added hyp to stack");
// Update best score, if this hypothesis is new best
if (hypo->GetTotalScore() > m_bestScore) {
VERBOSE(3,", best on stack");
m_bestScore = hypo->GetTotalScore();
}
// Prune only if stack is twice as big as needed (lazy pruning)
VERBOSE(3,", now size " << m_hypos.size());
if (m_hypos.size() > 2*m_maxHypoStackSize-1) {
PruneToSize(manager);
} else {
VERBOSE(3,std::endl);
}
}
return ret;
}
/** Remove hypothesis pointed to by iterator but DOES NOT delete the object.
* \param iter iterator to delete
*/
void ChartHypothesisCollection::Detach(const HCType::iterator &iter)
{
m_hypos.erase(iter);
}
/** destroy iterator AND hypothesis pointed to by iterator. If in an object pool, takes care of that too
*/
void ChartHypothesisCollection::Remove(const HCType::iterator &iter)
{
ChartHypothesis *h = *iter;
/*
stringstream strme("");
strme << h->GetOutputPhrase();
string toFind = "the goal of gene scientists is ";
size_t pos = toFind.find(strme.str());
if (pos == 0)
{
cerr << pos << " " << strme.str() << *h << endl;
cerr << *this << endl;
}
*/
Detach(iter);
ChartHypothesis::Delete(h);
}
/** prune number of hypo to a particular number of hypos, specified by m_maxHypoStackSize, according to score
* Don't prune of hypos have identical scores on the boundary, so occasionally number of hypo can remain above m_maxHypoStackSize.
* \param manager reference back to manager. Used for collecting stats
*/
void ChartHypothesisCollection::PruneToSize(ChartManager &manager)
{
if (m_maxHypoStackSize == 0) return; // no limit
if (GetSize() > m_maxHypoStackSize) { // ok, if not over the limit
priority_queue<float> bestScores;
// push all scores to a heap
// (but never push scores below m_bestScore+m_beamWidth)
HCType::iterator iter = m_hypos.begin();
float score = 0;
while (iter != m_hypos.end()) {
ChartHypothesis *hypo = *iter;
score = hypo->GetTotalScore();
if (score > m_bestScore+m_beamWidth) {
bestScores.push(score);
}
++iter;
}
// pop the top newSize scores (and ignore them, these are the scores of hyps that will remain)
// ensure to never pop beyond heap size
size_t minNewSizeHeapSize = m_maxHypoStackSize > bestScores.size() ? bestScores.size() : m_maxHypoStackSize;
for (size_t i = 1 ; i < minNewSizeHeapSize ; i++)
bestScores.pop();
// and remember the threshold
float scoreThreshold = bestScores.top();
// delete all hypos under score threshold
iter = m_hypos.begin();
while (iter != m_hypos.end()) {
ChartHypothesis *hypo = *iter;
float score = hypo->GetTotalScore();
if (score < scoreThreshold) {
HCType::iterator iterRemove = iter++;
Remove(iterRemove);
manager.GetSentenceStats().AddPruning();
} else {
++iter;
}
}
VERBOSE(3,", pruned to size " << m_hypos.size() << endl);
IFVERBOSE(3) {
TRACE_ERR("stack now contains: ");
for(iter = m_hypos.begin(); iter != m_hypos.end(); iter++) {
ChartHypothesis *hypo = *iter;
TRACE_ERR( hypo->GetId() << " (" << hypo->GetTotalScore() << ") ");
}
TRACE_ERR( endl);
}
// desperation pruning
if (m_hypos.size() > m_maxHypoStackSize * 2) {
std::vector<ChartHypothesis*> hyposOrdered;
// sort hypos
std::copy(m_hypos.begin(), m_hypos.end(), std::inserter(hyposOrdered, hyposOrdered.end()));
std::sort(hyposOrdered.begin(), hyposOrdered.end(), ChartHypothesisScoreOrderer());
//keep only |size|. delete the rest
std::vector<ChartHypothesis*>::iterator iter;
for (iter = hyposOrdered.begin() + (m_maxHypoStackSize * 2); iter != hyposOrdered.end(); ++iter) {
ChartHypothesis *hypo = *iter;
HCType::iterator iterFindHypo = m_hypos.find(hypo);
UTIL_THROW_IF2(iterFindHypo == m_hypos.end(),
"Adding a hypothesis should have returned a valid iterator");
Remove(iterFindHypo);
}
}
}
}
//! sort hypothses by descending score. Put these hypos into a vector m_hyposOrdered to be returned by function GetSortedHypotheses()
void ChartHypothesisCollection::SortHypotheses()
{
UTIL_THROW_IF2(!m_hyposOrdered.empty(), "Hypotheses already sorted");
if (!m_hypos.empty()) {
// done everything for this cell.
// sort
// put into vec
m_hyposOrdered.reserve(m_hypos.size());
std::copy(m_hypos.begin(), m_hypos.end(), back_inserter(m_hyposOrdered));
std::sort(m_hyposOrdered.begin(), m_hyposOrdered.end(), ChartHypothesisScoreOrderer());
}
}
//! Call CleanupArcList() for each main hypo in collection
void ChartHypothesisCollection::CleanupArcList()
{
HCType::iterator iter;
for (iter = m_hypos.begin() ; iter != m_hypos.end() ; ++iter) {
ChartHypothesis *mainHypo = *iter;
mainHypo->CleanupArcList();
}
}
/** Return all hypos, and all hypos in the arclist, in order to create the output searchgraph, ie. the hypergraph. The output is the debug hypo information.
* @todo this is a useful function. Make sure it outputs everything required, especially scores.
* \param translationId unique, contiguous id for the input sentence
* \param outputSearchGraphStream stream to output the info to
* \param reachable @todo don't know
*/
void ChartHypothesisCollection::WriteSearchGraph(const ChartSearchGraphWriter& writer, const std::map<unsigned, bool> &reachable) const
{
writer.WriteHypos(*this,reachable);
}
std::ostream& operator<<(std::ostream &out, const ChartHypothesisCollection &coll)
{
HypoList::const_iterator iterInside;
for (iterInside = coll.m_hyposOrdered.begin(); iterInside != coll.m_hyposOrdered.end(); ++iterInside) {
const ChartHypothesis &hypo = **iterInside;
out << hypo << endl;
}
return out;
}
} // namespace