2010-07-18 03:23:09 +04:00
// $Id$
2010-04-12 14:15:49 +04:00
// vim:tabstop=2
/***********************************************************************
Moses - factored phrase - based language decoder
Copyright ( C ) 2010 Hieu Hoang
2011-02-24 15:36:50 +03:00
2010-04-12 14:15:49 +04:00
This library is free software ; you can redistribute it and / or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation ; either
version 2.1 of the License , or ( at your option ) any later version .
2011-02-24 15:36:50 +03:00
2010-04-12 14:15:49 +04:00
This library is distributed in the hope that it will be useful ,
but WITHOUT ANY WARRANTY ; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE . See the GNU
Lesser General Public License for more details .
2011-02-24 15:36:50 +03:00
2010-04-12 14:15:49 +04:00
You should have received a copy of the GNU Lesser General Public
License along with this library ; if not , write to the Free Software
Foundation , Inc . , 51 Franklin Street , Fifth Floor , Boston , MA 02110 - 1301 USA
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
2010-04-08 21:16:10 +04:00
# include <algorithm>
2011-03-11 16:08:43 +03:00
# include "StaticData.h"
2010-04-08 21:16:10 +04:00
# include "ChartHypothesisCollection.h"
# include "ChartHypothesis.h"
# include "ChartManager.h"
2014-08-08 00:20:10 +04:00
# include "HypergraphOutput.h"
2014-01-13 18:37:05 +04:00
# include "util/exception.hh"
2010-04-08 21:16:10 +04:00
using namespace std ;
using namespace Moses ;
2011-03-11 16:08:43 +03:00
namespace Moses
2010-04-08 21:16:10 +04:00
{
2011-03-11 16:08:43 +03:00
/** Construct an empty collection.
 *  The beam width, the maximum stack size and the n-best switch are read
 *  once from the global StaticData instance. The best score starts at
 *  negative infinity, so any finite score compares greater than it.
 */
ChartHypothesisCollection::ChartHypothesisCollection()
{
  const StaticData &config = StaticData::Instance();

  // No hypothesis has been seen yet.
  m_bestScore = -std::numeric_limits<float>::infinity();

  // Settings consulted by AddHypothesis(), Add() and PruneToSize().
  m_nBestIsEnabled = config.IsNBestEnabled();
  m_maxHypoStackSize = config.GetMaxHypoStackSize();
  m_beamWidth = config.GetBeamWidth();
}
2011-03-11 16:08:43 +03:00
/** Destroy the collection and release every hypothesis still stored in it.
 *  Each stored pointer is handed to ChartHypothesis::Delete(); the
 *  RemoveAllInColl(m_hypos) helper is deliberately not used here.
 */
ChartHypothesisCollection::~ChartHypothesisCollection()
{
  HCType::iterator cur = m_hypos.begin();
  while (cur != m_hypos.end()) {
    ChartHypothesis *owned = *cur;
    ChartHypothesis::Delete(owned);
    ++cur;
  }
}
2011-02-24 15:36:50 +03:00
2013-05-29 21:16:15 +04:00
/** public function to add hypothesis to this collection.
2012-07-02 23:01:07 +04:00
* Returns false if equiv hypo exists in collection , otherwise returns true .
* Takes care of update arc list for n - best list creation .
2014-08-08 00:42:30 +04:00
* Will delete hypo if it exists - once this function is call don ' t delete hypothesis .
2012-07-02 23:01:07 +04:00
* \ param hypo hypothesis to add
* \ param manager pointer back to manager
*/
2011-03-11 16:08:43 +03:00
bool ChartHypothesisCollection : : AddHypothesis ( ChartHypothesis * hypo , ChartManager & manager )
2010-04-08 21:16:10 +04:00
{
2013-09-17 08:07:42 +04:00
if ( hypo - > GetTotalScore ( ) = = - std : : numeric_limits < float > : : infinity ( ) ) {
2013-09-27 12:35:24 +04:00
manager . GetSentenceStats ( ) . AddDiscarded ( ) ;
VERBOSE ( 3 , " discarded, -inf score " < < std : : endl ) ;
ChartHypothesis : : Delete ( hypo ) ;
return false ;
2013-09-17 08:07:42 +04:00
}
2013-09-18 06:45:19 +04:00
2011-02-24 15:36:50 +03:00
if ( hypo - > GetTotalScore ( ) < m_bestScore + m_beamWidth ) {
// really bad score. don't bother adding hypo into collection
manager . GetSentenceStats ( ) . AddDiscarded ( ) ;
VERBOSE ( 3 , " discarded, too bad for stack " < < std : : endl ) ;
2011-03-11 16:08:43 +03:00
ChartHypothesis : : Delete ( hypo ) ;
2011-02-24 15:36:50 +03:00
return false ;
}
// over threshold, try to add to collection
std : : pair < HCType : : iterator , bool > addRet = Add ( hypo , manager ) ;
2011-03-09 20:55:27 +03:00
// does it have the same state as an existing hypothesis?
2011-02-24 15:36:50 +03:00
if ( addRet . second ) {
// nothing found. add to collection
return true ;
}
// equiv hypo exists, recombine with other hypo
HCType : : iterator & iterExisting = addRet . first ;
2011-03-11 16:08:43 +03:00
ChartHypothesis * hypoExisting = * iterExisting ;
2013-11-23 00:27:46 +04:00
UTIL_THROW_IF2 ( iterExisting = = m_hypos . end ( ) ,
2014-01-15 19:42:02 +04:00
" Adding a hypothesis should have returned a valid iterator " ) ;
2011-02-24 15:36:50 +03:00
//StaticData::Instance().GetSentenceStats().AddRecombination(*hypo, **iterExisting);
// found existing hypo with same target ending.
// keep the best 1
if ( hypo - > GetTotalScore ( ) > hypoExisting - > GetTotalScore ( ) ) {
// incoming hypo is better than the one we have
VERBOSE ( 3 , " better than matching hyp " < < hypoExisting - > GetId ( ) < < " , recombining, " ) ;
if ( m_nBestIsEnabled ) {
hypo - > AddArc ( hypoExisting ) ;
Detach ( iterExisting ) ;
} else {
Remove ( iterExisting ) ;
}
bool added = Add ( hypo , manager ) . second ;
if ( ! added ) {
iterExisting = m_hypos . find ( hypo ) ;
2014-01-13 22:32:22 +04:00
UTIL_THROW2 ( " Offending hypo = " < < * * iterExisting ) ;
2011-02-24 15:36:50 +03:00
}
return false ;
} else {
// already storing the best hypo. discard current hypo
VERBOSE ( 3 , " worse than matching hyp " < < hypoExisting - > GetId ( ) < < " , recombining " < < std : : endl )
if ( m_nBestIsEnabled ) {
hypoExisting - > AddArc ( hypo ) ;
2013-05-29 21:16:15 +04:00
} else {
2011-03-11 16:08:43 +03:00
ChartHypothesis : : Delete ( hypo ) ;
2011-02-24 15:36:50 +03:00
}
return false ;
}
2010-04-08 21:16:10 +04:00
}
2012-07-02 23:01:07 +04:00
/** add hypothesis to stack. Prune if necessary.
* Returns false if equiv hypo exists in collection , otherwise returns true , and the iterator that points to the place where the hypo was added
* \ param hypo hypothesis to add
* \ param manager pointer back to manager
*/
2011-03-11 16:08:43 +03:00
pair < ChartHypothesisCollection : : HCType : : iterator , bool > ChartHypothesisCollection : : Add ( ChartHypothesis * hypo , ChartManager & manager )
2010-04-08 21:16:10 +04:00
{
2011-02-24 15:36:50 +03:00
std : : pair < HCType : : iterator , bool > ret = m_hypos . insert ( hypo ) ;
if ( ret . second ) {
// equiv hypo doesn't exists
VERBOSE ( 3 , " added hyp to stack " ) ;
// Update best score, if this hypothesis is new best
if ( hypo - > GetTotalScore ( ) > m_bestScore ) {
VERBOSE ( 3 , " , best on stack " ) ;
m_bestScore = hypo - > GetTotalScore ( ) ;
}
// Prune only if stack is twice as big as needed (lazy pruning)
VERBOSE ( 3 , " , now size " < < m_hypos . size ( ) ) ;
if ( m_hypos . size ( ) > 2 * m_maxHypoStackSize - 1 ) {
PruneToSize ( manager ) ;
} else {
VERBOSE ( 3 , std : : endl ) ;
}
}
return ret ;
2010-04-08 21:16:10 +04:00
}
2013-05-29 21:16:15 +04:00
/** Remove hypothesis pointed to by iterator but DOES NOT delete the object.
2012-07-02 23:01:07 +04:00
* \ param iter iterator to delete
*/
2011-03-11 16:08:43 +03:00
void ChartHypothesisCollection : : Detach ( const HCType : : iterator & iter )
2010-04-08 21:16:10 +04:00
{
2011-02-24 15:36:50 +03:00
m_hypos . erase ( iter ) ;
2010-04-08 21:16:10 +04:00
}
2012-07-02 23:01:07 +04:00
/** destroy iterator AND hypothesis pointed to by iterator. If in an object pool, takes care of that too
*/
2011-03-11 16:08:43 +03:00
void ChartHypothesisCollection : : Remove ( const HCType : : iterator & iter )
2010-04-08 21:16:10 +04:00
{
2011-03-11 16:08:43 +03:00
ChartHypothesis * h = * iter ;
2011-02-24 15:36:50 +03:00
/*
stringstream strme ( " " ) ;
strme < < h - > GetOutputPhrase ( ) ;
string toFind = " the goal of gene scientists is " ;
size_t pos = toFind . find ( strme . str ( ) ) ;
if ( pos = = 0 )
{
cerr < < pos < < " " < < strme . str ( ) < < * h < < endl ;
cerr < < * this < < endl ;
}
*/
Detach ( iter ) ;
2011-03-11 16:08:43 +03:00
ChartHypothesis : : Delete ( h ) ;
2010-04-08 21:16:10 +04:00
}
2012-07-02 23:01:07 +04:00
/** prune number of hypo to a particular number of hypos, specified by m_maxHypoStackSize, according to score
* Don ' t prune of hypos have identical scores on the boundary , so occasionally number of hypo can remain above m_maxHypoStackSize .
2012-07-03 21:11:53 +04:00
* \ param manager reference back to manager . Used for collecting stats
2012-07-02 23:01:07 +04:00
*/
2011-03-11 16:08:43 +03:00
void ChartHypothesisCollection : : PruneToSize ( ChartManager & manager )
2010-04-08 21:16:10 +04:00
{
2014-04-09 13:33:33 +04:00
if ( m_maxHypoStackSize = = 0 ) return ; // no limit
2011-02-24 15:36:50 +03:00
if ( GetSize ( ) > m_maxHypoStackSize ) { // ok, if not over the limit
priority_queue < float > bestScores ;
// push all scores to a heap
// (but never push scores below m_bestScore+m_beamWidth)
HCType : : iterator iter = m_hypos . begin ( ) ;
float score = 0 ;
while ( iter ! = m_hypos . end ( ) ) {
2011-03-11 16:08:43 +03:00
ChartHypothesis * hypo = * iter ;
2011-02-24 15:36:50 +03:00
score = hypo - > GetTotalScore ( ) ;
if ( score > m_bestScore + m_beamWidth ) {
bestScores . push ( score ) ;
}
+ + iter ;
}
// pop the top newSize scores (and ignore them, these are the scores of hyps that will remain)
// ensure to never pop beyond heap size
size_t minNewSizeHeapSize = m_maxHypoStackSize > bestScores . size ( ) ? bestScores . size ( ) : m_maxHypoStackSize ;
for ( size_t i = 1 ; i < minNewSizeHeapSize ; i + + )
bestScores . pop ( ) ;
// and remember the threshold
float scoreThreshold = bestScores . top ( ) ;
// delete all hypos under score threshold
iter = m_hypos . begin ( ) ;
while ( iter ! = m_hypos . end ( ) ) {
2011-03-11 16:08:43 +03:00
ChartHypothesis * hypo = * iter ;
2011-02-24 15:36:50 +03:00
float score = hypo - > GetTotalScore ( ) ;
if ( score < scoreThreshold ) {
HCType : : iterator iterRemove = iter + + ;
Remove ( iterRemove ) ;
manager . GetSentenceStats ( ) . AddPruning ( ) ;
} else {
+ + iter ;
}
}
VERBOSE ( 3 , " , pruned to size " < < m_hypos . size ( ) < < endl ) ;
IFVERBOSE ( 3 ) {
TRACE_ERR ( " stack now contains: " ) ;
for ( iter = m_hypos . begin ( ) ; iter ! = m_hypos . end ( ) ; iter + + ) {
2011-03-11 16:08:43 +03:00
ChartHypothesis * hypo = * iter ;
2011-02-24 15:36:50 +03:00
TRACE_ERR ( hypo - > GetId ( ) < < " ( " < < hypo - > GetTotalScore ( ) < < " ) " ) ;
}
TRACE_ERR ( endl ) ;
}
// desperation pruning
if ( m_hypos . size ( ) > m_maxHypoStackSize * 2 ) {
2011-03-11 16:08:43 +03:00
std : : vector < ChartHypothesis * > hyposOrdered ;
2011-02-24 15:36:50 +03:00
// sort hypos
std : : copy ( m_hypos . begin ( ) , m_hypos . end ( ) , std : : inserter ( hyposOrdered , hyposOrdered . end ( ) ) ) ;
std : : sort ( hyposOrdered . begin ( ) , hyposOrdered . end ( ) , ChartHypothesisScoreOrderer ( ) ) ;
//keep only |size|. delete the rest
2011-03-11 16:08:43 +03:00
std : : vector < ChartHypothesis * > : : iterator iter ;
2011-02-24 15:36:50 +03:00
for ( iter = hyposOrdered . begin ( ) + ( m_maxHypoStackSize * 2 ) ; iter ! = hyposOrdered . end ( ) ; + + iter ) {
2011-03-11 16:08:43 +03:00
ChartHypothesis * hypo = * iter ;
2011-02-24 15:36:50 +03:00
HCType : : iterator iterFindHypo = m_hypos . find ( hypo ) ;
2013-11-23 00:27:46 +04:00
UTIL_THROW_IF2 ( iterFindHypo = = m_hypos . end ( ) ,
2014-01-15 19:42:02 +04:00
" Adding a hypothesis should have returned a valid iterator " ) ;
2013-11-19 22:52:15 +04:00
2011-02-24 15:36:50 +03:00
Remove ( iterFindHypo ) ;
}
}
}
2010-04-08 21:16:10 +04:00
}
2012-07-02 23:01:07 +04:00
//! sort hypothses by descending score. Put these hypos into a vector m_hyposOrdered to be returned by function GetSortedHypotheses()
2011-03-11 16:08:43 +03:00
void ChartHypothesisCollection : : SortHypotheses ( )
2010-04-08 21:16:10 +04:00
{
2013-11-23 00:27:46 +04:00
UTIL_THROW_IF2 ( ! m_hyposOrdered . empty ( ) , " Hypotheses already sorted " ) ;
2011-02-24 15:36:50 +03:00
if ( ! m_hypos . empty ( ) ) {
// done everything for this cell.
// sort
// put into vec
m_hyposOrdered . reserve ( m_hypos . size ( ) ) ;
std : : copy ( m_hypos . begin ( ) , m_hypos . end ( ) , back_inserter ( m_hyposOrdered ) ) ;
std : : sort ( m_hyposOrdered . begin ( ) , m_hyposOrdered . end ( ) , ChartHypothesisScoreOrderer ( ) ) ;
}
2010-04-08 21:16:10 +04:00
}
2012-07-02 23:01:07 +04:00
//! Call CleanupArcList() for each main hypo in collection
2011-03-11 16:08:43 +03:00
void ChartHypothesisCollection : : CleanupArcList ( )
2011-02-24 15:36:50 +03:00
{
HCType : : iterator iter ;
for ( iter = m_hypos . begin ( ) ; iter ! = m_hypos . end ( ) ; + + iter ) {
2011-03-11 16:08:43 +03:00
ChartHypothesis * mainHypo = * iter ;
2011-02-24 15:36:50 +03:00
mainHypo - > CleanupArcList ( ) ;
}
2010-04-08 21:16:10 +04:00
}
2011-02-24 15:36:50 +03:00
2012-07-02 23:01:07 +04:00
/** Return all hypos, and all hypos in the arclist, in order to create the output searchgraph, ie. the hypergraph. The output is the debug hypo information.
* @ todo this is a useful function . Make sure it outputs everything required , especially scores .
* \ param translationId unique , contiguous id for the input sentence
* \ param outputSearchGraphStream stream to output the info to
* \ param reachable @ todo don ' t know
*/
2014-08-08 00:20:10 +04:00
void ChartHypothesisCollection : : WriteSearchGraph ( const ChartSearchGraphWriter & writer , const std : : map < unsigned , bool > & reachable ) const
2010-04-08 21:16:10 +04:00
{
2014-08-08 00:20:10 +04:00
writer . WriteHypos ( * this , reachable ) ;
2011-02-24 15:36:50 +03:00
}
2010-04-08 21:16:10 +04:00
2011-03-11 16:08:43 +03:00
std : : ostream & operator < < ( std : : ostream & out , const ChartHypothesisCollection & coll )
2011-02-24 15:36:50 +03:00
{
HypoList : : const_iterator iterInside ;
for ( iterInside = coll . m_hyposOrdered . begin ( ) ; iterInside ! = coll . m_hyposOrdered . end ( ) ; + + iterInside ) {
2011-03-11 16:08:43 +03:00
const ChartHypothesis & hypo = * * iterInside ;
2011-02-24 15:36:50 +03:00
out < < hypo < < endl ;
}
return out ;
2010-04-08 21:16:10 +04:00
}
2011-02-24 15:36:50 +03:00
2010-04-08 21:16:10 +04:00
} // namespace