2013-09-08 21:22:55 +04:00
# include <map>
2009-02-05 20:37:09 +03:00
# include "StaticData.h"
# include "WordLattice.h"
# include "PCNTools.h"
# include "Util.h"
# include "FloydWarshall.h"
2013-10-02 21:42:56 +04:00
# include "TranslationOptionCollectionLattice.h"
2013-10-02 23:02:05 +04:00
# include "TranslationOptionCollectionConfusionNet.h"
2013-06-05 03:50:24 +04:00
# include "moses/FF/InputFeature.h"
2009-02-05 20:37:09 +03:00
namespace Moses
{
2013-09-27 12:35:24 +04:00
// Construct an empty word lattice.
// Throws (through UTIL_THROW_IF2) when the address of the InputFeature
// instance compares equal to NULL.
// NOTE(review): the check takes the address of a reference returned by
// InputFeature::Instance(); whether that can ever be NULL depends on how
// Instance() is implemented, which is not visible here -- confirm.
WordLattice::WordLattice()
{
  UTIL_THROW_IF2(&InputFeature::Instance() == NULL,
                 " Input feature must be specified ");
}
2009-02-05 20:37:09 +03:00
size_t WordLattice : : GetColumnIncrement ( size_t i , size_t j ) const
{
2011-02-24 16:14:42 +03:00
return next_nodes [ i ] [ j ] ;
2009-02-05 20:37:09 +03:00
}
2011-02-24 16:14:42 +03:00
void WordLattice : : Print ( std : : ostream & out ) const
{
out < < " word lattice: " < < data . size ( ) < < " \n " ;
for ( size_t i = 0 ; i < data . size ( ) ; + + i ) {
out < < i < < " -- " ;
for ( size_t j = 0 ; j < data [ i ] . size ( ) ; + + j ) {
out < < " ( " < < data [ i ] [ j ] . first . ToString ( ) < < " , " ;
2013-09-08 21:22:55 +04:00
// dense
std : : vector < float > : : const_iterator iterDense ;
for ( iterDense = data [ i ] [ j ] . second . denseScores . begin ( ) ; iterDense < data [ i ] [ j ] . second . denseScores . end ( ) ; + + iterDense ) {
out < < " , " < < * iterDense ;
}
// sparse
std : : map < StringPiece , float > : : const_iterator iterSparse ;
for ( iterSparse = data [ i ] [ j ] . second . sparseScores . begin ( ) ; iterSparse ! = data [ i ] [ j ] . second . sparseScores . end ( ) ; + + iterSparse ) {
out < < " , " < < iterSparse - > first < < " = " < < iterSparse - > second ;
2011-02-24 16:14:42 +03:00
}
2013-09-08 21:22:55 +04:00
2011-02-24 16:14:42 +03:00
out < < GetColumnIncrement ( i , j ) < < " ) " ;
}
out < < " \n " ;
}
out < < " \n \n " ;
2009-02-05 20:37:09 +03:00
}
2011-01-25 23:08:29 +03:00
int WordLattice : : InitializeFromPCNDataType ( const PCN : : CN & cn , const std : : vector < FactorType > & factorOrder , const std : : string & debug_line )
2009-02-05 20:37:09 +03:00
{
2013-06-05 03:50:24 +04:00
const StaticData & staticData = StaticData : : Instance ( ) ;
2013-12-07 04:21:06 +04:00
const InputFeature & inputFeature = InputFeature : : Instance ( ) ;
size_t numInputScores = inputFeature . GetNumInputScores ( ) ;
size_t numRealWordCount = inputFeature . GetNumRealWordsInInput ( ) ;
2013-06-05 03:50:24 +04:00
size_t maxSizePhrase = StaticData : : Instance ( ) . GetMaxPhraseLength ( ) ;
2012-12-06 00:21:33 +04:00
bool addRealWordCount = ( numRealWordCount > 0 ) ;
2011-02-24 16:14:42 +03:00
//when we have one more weight than params, we add a word count feature
data . resize ( cn . size ( ) ) ;
next_nodes . resize ( cn . size ( ) ) ;
for ( size_t i = 0 ; i < cn . size ( ) ; + + i ) {
const PCN : : CNCol & col = cn [ i ] ;
if ( col . empty ( ) ) return false ;
data [ i ] . resize ( col . size ( ) ) ;
next_nodes [ i ] . resize ( col . size ( ) ) ;
for ( size_t j = 0 ; j < col . size ( ) ; + + j ) {
const PCN : : CNAlt & alt = col [ j ] ;
//check for correct number of link parameters
2013-09-06 01:34:22 +04:00
if ( alt . m_denseFeatures . size ( ) ! = numInputScores ) {
TRACE_ERR ( " ERROR: need " < < numInputScores < < " link parameters, found " < < alt . m_denseFeatures . size ( ) < < " while reading column " < < i < < " from " < < debug_line < < " \n " ) ;
2011-02-24 16:14:42 +03:00
return false ;
}
//check each element for bounds
std : : vector < float > : : const_iterator probsIterator ;
data [ i ] [ j ] . second = std : : vector < float > ( 0 ) ;
2013-09-06 01:34:22 +04:00
for ( probsIterator = alt . m_denseFeatures . begin ( ) ; probsIterator < alt . m_denseFeatures . end ( ) ; probsIterator + + ) {
2011-02-24 16:14:42 +03:00
IFVERBOSE ( 1 ) {
if ( * probsIterator < 0.0f ) {
TRACE_ERR ( " WARN: neg probability: " < < * probsIterator < < " \n " ) ;
//*probsIterator = 0.0f;
}
if ( * probsIterator > 1.0f ) {
TRACE_ERR ( " WARN: probability > 1: " < < * probsIterator < < " \n " ) ;
//*probsIterator = 1.0f;
}
}
2013-09-08 21:22:55 +04:00
float score = std : : max ( static_cast < float > ( log ( * probsIterator ) ) , LOWEST_SCORE ) ;
ScorePair & scorePair = data [ i ] [ j ] . second ;
scorePair . denseScores . push_back ( score ) ;
2011-02-24 16:14:42 +03:00
}
//store 'real' word count in last feature if we have one more weight than we do arc scores and not epsilon
if ( addRealWordCount ) {
//only add count if not epsilon
2013-09-06 01:34:22 +04:00
float value = ( alt . m_word = = " " | | alt . m_word = = EPSILON ) ? 0.0f : - 1.0f ;
2013-09-08 21:22:55 +04:00
data [ i ] [ j ] . second . denseScores . push_back ( value ) ;
2011-02-24 16:14:42 +03:00
}
2013-09-06 01:34:22 +04:00
String2Word ( alt . m_word , data [ i ] [ j ] . first , factorOrder ) ;
next_nodes [ i ] [ j ] = alt . m_next ;
2011-02-24 16:14:42 +03:00
if ( next_nodes [ i ] [ j ] > maxSizePhrase ) {
TRACE_ERR ( " ERROR: Jump length " < < next_nodes [ i ] [ j ] < < " in word lattice exceeds maximum phrase length " < < maxSizePhrase < < " . \n " ) ;
TRACE_ERR ( " ERROR: Increase max-phrase-length to process this lattice. \n " ) ;
return false ;
}
}
}
if ( ! cn . empty ( ) ) {
std : : vector < std : : vector < bool > > edges ( 0 ) ;
this - > GetAsEdgeMatrix ( edges ) ;
floyd_warshall ( edges , distances ) ;
IFVERBOSE ( 2 ) {
TRACE_ERR ( " Shortest paths: \n " ) ;
for ( size_t i = 0 ; i < edges . size ( ) ; + + i ) {
for ( size_t j = 0 ; j < edges . size ( ) ; + + j ) {
int d = distances [ i ] [ j ] ;
if ( d > 99999 ) {
d = - 1 ;
}
TRACE_ERR ( " \t " < < d ) ;
}
TRACE_ERR ( " \n " ) ;
}
}
}
return ! cn . empty ( ) ;
2009-02-05 20:37:09 +03:00
}
2011-01-25 23:08:29 +03:00
int WordLattice : : Read ( std : : istream & in , const std : : vector < FactorType > & factorOrder )
{
2011-02-24 16:14:42 +03:00
Clear ( ) ;
std : : string line ;
if ( ! getline ( in , line ) ) return 0 ;
std : : map < std : : string , std : : string > meta = ProcessAndStripSGML ( line ) ;
if ( meta . find ( " id " ) ! = meta . end ( ) ) {
this - > SetTranslationId ( atol ( meta [ " id " ] . c_str ( ) ) ) ;
}
PCN : : CN cn = PCN : : parsePCN ( line ) ;
return InitializeFromPCNDataType ( cn , factorOrder , line ) ;
2011-01-25 23:08:29 +03:00
}
2009-02-05 20:37:09 +03:00
void WordLattice : : GetAsEdgeMatrix ( std : : vector < std : : vector < bool > > & edges ) const
{
edges . resize ( data . size ( ) + 1 , std : : vector < bool > ( data . size ( ) + 1 , false ) ) ;
2011-02-24 16:14:42 +03:00
for ( size_t i = 0 ; i < data . size ( ) ; + + i ) {
for ( size_t j = 0 ; j < data [ i ] . size ( ) ; + + j ) {
2009-02-05 20:37:09 +03:00
edges [ i ] [ i + next_nodes [ i ] [ j ] ] = true ;
}
}
}
int WordLattice : : ComputeDistortionDistance ( const WordsRange & prev , const WordsRange & current ) const
{
2011-02-24 16:14:42 +03:00
int result ;
if ( prev . GetStartPos ( ) = = NOT_FOUND & & current . GetStartPos ( ) = = 0 ) {
result = 0 ;
VERBOSE ( 4 , " Word lattice distortion: monotonic initial step \n " ) ;
} else if ( prev . GetEndPos ( ) + 1 = = current . GetStartPos ( ) ) {
result = 0 ;
VERBOSE ( 4 , " Word lattice distortion: monotonic step from " < < prev . GetEndPos ( ) < < " to " < < current . GetStartPos ( ) < < " \n " ) ;
} else if ( prev . GetStartPos ( ) = = NOT_FOUND ) {
result = distances [ 0 ] [ current . GetStartPos ( ) ] ;
VERBOSE ( 4 , " Word lattice distortion: initial step from 0 to " < < current . GetStartPos ( ) < < " of length " < < result < < " \n " ) ;
if ( result < 0 | | result > 99999 ) {
TRACE_ERR ( " prev: " < < prev < < " \n current: " < < current < < " \n " ) ;
TRACE_ERR ( " A: got a weird distance from 0 to " < < ( current . GetStartPos ( ) + 1 ) < < " of " < < result < < " \n " ) ;
}
} else if ( prev . GetEndPos ( ) > current . GetStartPos ( ) ) {
result = distances [ current . GetStartPos ( ) ] [ prev . GetEndPos ( ) + 1 ] ;
VERBOSE ( 4 , " Word lattice distortion: backward step from " < < ( prev . GetEndPos ( ) + 1 ) < < " to " < < current . GetStartPos ( ) < < " of length " < < result < < " \n " ) ;
if ( result < 0 | | result > 99999 ) {
TRACE_ERR ( " prev: " < < prev < < " \n current: " < < current < < " \n " ) ;
TRACE_ERR ( " B: got a weird distance from " < < current . GetStartPos ( ) < < " to " < < prev . GetEndPos ( ) + 1 < < " of " < < result < < " \n " ) ;
}
} else {
result = distances [ prev . GetEndPos ( ) + 1 ] [ current . GetStartPos ( ) ] ;
VERBOSE ( 4 , " Word lattice distortion: forward step from " < < ( prev . GetEndPos ( ) + 1 ) < < " to " < < current . GetStartPos ( ) < < " of length " < < result < < " \n " ) ;
if ( result < 0 | | result > 99999 ) {
TRACE_ERR ( " prev: " < < prev < < " \n current: " < < current < < " \n " ) ;
TRACE_ERR ( " C: got a weird distance from " < < prev . GetEndPos ( ) + 1 < < " to " < < current . GetStartPos ( ) < < " of " < < result < < " \n " ) ;
}
}
return result ;
2009-02-05 20:37:09 +03:00
}
bool WordLattice : : CanIGetFromAToB ( size_t start , size_t end ) const
{
2011-02-24 16:14:42 +03:00
// std::cerr << "CanIgetFromAToB(" << start << "," << end << ")=" << distances[start][end] << std::endl;
return distances [ start ] [ end ] < 100000 ;
2009-02-05 20:37:09 +03:00
}
2013-10-02 21:42:56 +04:00
// Allocate the translation option collection for this lattice.
// A TranslationOptionCollectionConfusionNet is created when
// StaticData::GetUseLegacyPT() is true, otherwise a
// TranslationOptionCollectionLattice. The object is created with `new` and
// returned as a raw pointer; nothing in this function releases it.
TranslationOptionCollection*
WordLattice::CreateTranslationOptionCollection() const
{
  const StaticData &staticData = StaticData::Instance();
  const size_t maxNoTransOptPerCoverage = staticData.GetMaxNoTransOptPerCoverage();
  const float translationOptionThreshold = staticData.GetTranslationOptionThreshold();

  TranslationOptionCollection *rv = NULL;
  if (!staticData.GetUseLegacyPT()) {
    rv = new TranslationOptionCollectionLattice(*this, maxNoTransOptPerCoverage, translationOptionThreshold);
  } else {
    rv = new TranslationOptionCollectionConfusionNet(*this, maxNoTransOptPerCoverage, translationOptionThreshold);
  }

  assert(rv);
  return rv;
}
2009-02-05 20:37:09 +03:00
2013-10-02 23:02:05 +04:00
2013-10-02 20:43:59 +04:00
std : : ostream & operator < < ( std : : ostream & out , const WordLattice & obj )
{
2013-10-03 14:33:48 +04:00
out < < " next_nodes= " ;
for ( size_t i = 0 ; i < obj . next_nodes . size ( ) ; + + i ) {
out < < i < < " : " ;
const std : : vector < size_t > & inner = obj . next_nodes [ i ] ;
for ( size_t j = 0 ; j < inner . size ( ) ; + + j ) {
out < < inner [ j ] < < " " ;
}
}
out < < " distances= " ;
for ( size_t i = 0 ; i < obj . distances . size ( ) ; + + i ) {
out < < i < < " : " ;
const std : : vector < int > & inner = obj . distances [ i ] ;
for ( size_t j = 0 ; j < inner . size ( ) ; + + j ) {
out < < inner [ j ] < < " " ;
}
}
return out ;
2009-02-05 20:37:09 +03:00
}
2013-10-02 20:43:59 +04:00
} // namespace