2011-02-24 16:14:42 +03:00
// $Id$
2008-06-11 14:52:57 +04:00
// vim:tabstop=2
/***********************************************************************
Moses - factored phrase - based language decoder
Copyright ( C ) 2006 University of Edinburgh
This library is free software ; you can redistribute it and / or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation ; either
version 2.1 of the License , or ( at your option ) any later version .
This library is distributed in the hope that it will be useful ,
but WITHOUT ANY WARRANTY ; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE . See the GNU
Lesser General Public License for more details .
You should have received a copy of the GNU Lesser General Public
License along with this library ; if not , write to the Free Software
Foundation , Inc . , 51 Franklin Street , Fifth Floor , Boston , MA 02110 - 1301 USA
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
2011-11-18 16:07:41 +04:00
# include "util/check.hh"
2008-06-11 14:52:57 +04:00
# include <algorithm>
# include <sstream>
# include <string>
# include "memory.h"
# include "FactorCollection.h"
# include "Phrase.h"
# include "StaticData.h" // GetMaxNumFactors
2013-04-23 11:36:24 +04:00
# include "util/exception.hh"
2011-10-14 20:40:30 +04:00
# include "util/string_piece.hh"
# include "util/tokenize_piece.hh"
2008-06-11 14:52:57 +04:00
using namespace std ;
2008-10-09 03:51:26 +04:00
namespace Moses
{
2008-06-11 14:52:57 +04:00
2012-10-22 21:17:46 +04:00
/** Construct an empty phrase; the body performs no work. */
Phrase::Phrase()
{
}
2011-11-21 14:49:26 +04:00
/**
 * Construct an empty phrase and pre-allocate word storage.
 * @param reserveSize capacity requested from m_words via reserve()
 */
Phrase::Phrase(size_t reserveSize)
{
  m_words.reserve(reserveSize);
}
2011-11-21 14:49:26 +04:00
/**
 * Construct a phrase by copying each pointed-to word, in order.
 * @param mergeWords pointers to the words to add; every pointer is
 *                   dereferenced, so none may be null
 */
Phrase::Phrase(const vector<const Word*> &mergeWords)
{
  m_words.reserve(mergeWords.size());

  typedef vector<const Word*>::const_iterator WordPtrIter;
  for (WordPtrIter it = mergeWords.begin(); it != mergeWords.end(); ++it) {
    AddWord(**it);
  }
}
/** Destructor; the body performs no explicit cleanup. */
Phrase::~Phrase()
{
}
void Phrase : : MergeFactors ( const Phrase & copy )
{
2011-11-18 16:07:41 +04:00
CHECK ( GetSize ( ) = = copy . GetSize ( ) ) ;
2011-02-24 16:14:42 +03:00
size_t size = GetSize ( ) ;
2011-11-21 14:49:26 +04:00
const size_t maxNumFactors = MAX_NUM_FACTORS ;
2011-02-24 16:14:42 +03:00
for ( size_t currPos = 0 ; currPos < size ; currPos + + ) {
for ( unsigned int currFactor = 0 ; currFactor < maxNumFactors ; currFactor + + ) {
FactorType factorType = static_cast < FactorType > ( currFactor ) ;
const Factor * factor = copy . GetFactor ( currPos , factorType ) ;
if ( factor ! = NULL )
SetFactor ( currPos , factorType , factor ) ;
}
}
2008-06-11 14:52:57 +04:00
}
void Phrase : : MergeFactors ( const Phrase & copy , FactorType factorType )
{
2011-11-18 16:07:41 +04:00
CHECK ( GetSize ( ) = = copy . GetSize ( ) ) ;
2011-02-24 16:14:42 +03:00
for ( size_t currPos = 0 ; currPos < GetSize ( ) ; currPos + + )
SetFactor ( currPos , factorType , copy . GetFactor ( currPos , factorType ) ) ;
2008-06-11 14:52:57 +04:00
}
void Phrase : : MergeFactors ( const Phrase & copy , const std : : vector < FactorType > & factorVec )
{
2011-11-18 16:07:41 +04:00
CHECK ( GetSize ( ) = = copy . GetSize ( ) ) ;
2011-02-24 16:14:42 +03:00
for ( size_t currPos = 0 ; currPos < GetSize ( ) ; currPos + + )
for ( std : : vector < FactorType > : : const_iterator i = factorVec . begin ( ) ;
i ! = factorVec . end ( ) ; + + i ) {
SetFactor ( currPos , * i , copy . GetFactor ( currPos , * i ) ) ;
}
2008-06-11 14:52:57 +04:00
}
/**
 * Return a new phrase holding copies of the words in the inclusive range
 * [GetStartPos(), GetEndPos()] of `wordsRange`.
 * @param wordsRange positions (inclusive on both ends) to copy
 * @return the copied sub-phrase
 */
Phrase Phrase::GetSubString(const WordsRange &wordsRange) const
{
  Phrase result(wordsRange.GetNumWordsCovered());

  const size_t lastPos = wordsRange.GetEndPos();
  for (size_t pos = wordsRange.GetStartPos(); pos <= lastPos; ++pos) {
    result.AddWord() = GetWord(pos);
  }

  return result;
}
2011-06-11 02:45:17 +04:00
/**
 * Return a new phrase covering the inclusive range of `wordsRange` in
 * which each word carries only the requested factor; the other factors
 * are whatever a freshly added word holds.
 * @param wordsRange positions (inclusive on both ends) to copy
 * @param factorType the single factor type to carry over
 * @return the sub-phrase restricted to `factorType`
 */
Phrase Phrase::GetSubString(const WordsRange &wordsRange, FactorType factorType) const
{
  Phrase result(wordsRange.GetNumWordsCovered());

  const size_t lastPos = wordsRange.GetEndPos();
  for (size_t pos = wordsRange.GetStartPos(); pos <= lastPos; ++pos) {
    Word &added = result.AddWord();
    added.SetFactor(factorType, GetFactor(pos, factorType));
  }

  return result;
}
2008-06-11 14:52:57 +04:00
std : : string Phrase : : GetStringRep ( const vector < FactorType > factorsToPrint ) const
{
2011-02-24 16:14:42 +03:00
stringstream strme ;
for ( size_t pos = 0 ; pos < GetSize ( ) ; pos + + ) {
strme < < GetWord ( pos ) . GetString ( factorsToPrint , ( pos ! = GetSize ( ) - 1 ) ) ;
}
2008-06-11 14:52:57 +04:00
2011-02-24 16:14:42 +03:00
return strme . str ( ) ;
2008-06-11 14:52:57 +04:00
}
/**
 * Append a default-constructed word to the end of the phrase.
 * @return reference to the newly appended word
 */
Word &Phrase::AddWord()
{
  m_words.push_back(Word());
  Word &appended = m_words.back();
  return appended;
}
2011-02-24 16:14:42 +03:00
void Phrase : : Append ( const Phrase & endPhrase )
{
for ( size_t i = 0 ; i < endPhrase . GetSize ( ) ; i + + ) {
AddWord ( endPhrase . GetWord ( i ) ) ;
}
2008-06-11 14:52:57 +04:00
}
2010-04-08 21:16:10 +04:00
void Phrase : : PrependWord ( const Word & newWord )
{
2011-02-24 16:14:42 +03:00
AddWord ( ) ;
// shift
for ( size_t pos = GetSize ( ) - 1 ; pos > = 1 ; - - pos ) {
const Word & word = m_words [ pos - 1 ] ;
m_words [ pos ] = word ;
}
m_words [ 0 ] = newWord ;
2010-04-08 21:16:10 +04:00
}
2011-02-24 16:14:42 +03:00
2011-10-14 20:40:30 +04:00
// Append one word to this phrase for every token of phraseString and fill
// the word's factors from the token's delimiter-separated fields.
//
// factorOrder     - factor type assigned to the 1st, 2nd, ... field of a token
// phraseString    - the text to parse into words
// factorDelimiter - separator between the fields of a single token
//
// A token with fewer fields than factorOrder.size() is reported through
// TRACE_ERR and the process is terminated with abort(). Fields beyond
// factorOrder.size() are never read, because the inner loop stops once
// index reaches factorOrder.size().
void Phrase : : CreateFromString ( const std : : vector < FactorType > & factorOrder , const StringPiece & phraseString , const StringPiece & factorDelimiter )
2008-06-11 14:52:57 +04:00
{
2011-02-24 16:14:42 +03:00
FactorCollection & factorCollection = FactorCollection : : Instance ( ) ;
2011-10-14 20:40:30 +04:00
// outer loop: one iteration per token of phraseString
// NOTE(review): the boolean template argument of TokenIter presumably controls whether empty tokens are skipped — confirm in util/tokenize_piece.hh
for ( util : : TokenIter < util : : AnyCharacter , true > word_it ( phraseString , util : : AnyCharacter ( " \t " ) ) ; word_it ; + + word_it ) {
2011-02-24 16:14:42 +03:00
Word & word = AddWord ( ) ;
2011-10-14 20:40:30 +04:00
// index counts how many fields of this token were stored into the word
size_t index = 0 ;
// inner loop: split the token on factorDelimiter, stopping early once every entry of factorOrder has been filled
for ( util : : TokenIter < util : : MultiCharacter , false > factor_it ( * word_it , util : : MultiCharacter ( factorDelimiter ) ) ;
factor_it & & ( index < factorOrder . size ( ) ) ;
+ + factor_it , + + index ) {
// store the factor object returned by the FactorCollection for this field
word [ factorOrder [ index ] ] = factorCollection . AddFactor ( * factor_it ) ;
2011-02-24 16:14:42 +03:00
}
2011-10-14 20:40:30 +04:00
// the inner loop ran out of fields before all factor types were filled
if ( index ! = factorOrder . size ( ) ) {
TRACE_ERR ( " [ERROR] Malformed input: ' " < < * word_it < < " ' " < < std : : endl
2011-06-29 12:27:43 +04:00
< < " In ' " < < phraseString < < " ' " < < endl
2011-02-24 16:14:42 +03:00
< < " Expected input to have words composed of " < < factorOrder . size ( ) < < " factor(s) (form FAC1|FAC2|...) " < < std : : endl
2011-10-14 20:40:30 +04:00
< < " but instead received input with " < < index < < " factor(s). \n " ) ;
2011-02-24 16:14:42 +03:00
abort ( ) ;
}
}
2008-06-11 14:52:57 +04:00
}
2013-04-23 11:36:24 +04:00
class NonTerminalParseException : public util : : Exception { } ;
2010-04-08 21:16:10 +04:00
void Phrase : : CreateFromStringNewFormat ( FactorDirection direction
2011-02-24 16:14:42 +03:00
, const std : : vector < FactorType > & factorOrder
2012-10-14 20:35:58 +04:00
, const StringPiece & phraseString
2011-02-24 19:17:38 +03:00
, const std : : string & /*factorDelimiter */
2011-02-24 16:14:42 +03:00
, Word & lhs )
2010-04-08 21:16:10 +04:00
{
2011-02-24 16:14:42 +03:00
// parse
2012-10-14 20:35:58 +04:00
vector < StringPiece > annotatedWordVector ;
for ( util : : TokenIter < util : : AnyCharacter , true > it ( phraseString , " \t " ) ; it ; + + it ) {
annotatedWordVector . push_back ( * it ) ;
}
2011-02-24 16:14:42 +03:00
// KOMMA|none ART|Def.Z NN|Neut.NotGen.Sg VVFIN|none
// to
// "KOMMA|none" "ART|Def.Z" "NN|Neut.NotGen.Sg" "VVFIN|none"
2011-02-28 14:41:08 +03:00
m_words . reserve ( annotatedWordVector . size ( ) - 1 ) ;
2011-02-24 16:14:42 +03:00
for ( size_t phrasePos = 0 ; phrasePos < annotatedWordVector . size ( ) - 1 ; phrasePos + + ) {
2012-10-14 20:35:58 +04:00
StringPiece & annotatedWord = annotatedWordVector [ phrasePos ] ;
2011-02-24 16:14:42 +03:00
bool isNonTerminal ;
2012-10-17 14:07:42 +04:00
if ( annotatedWord . size ( ) > = 2 & & * annotatedWord . data ( ) = = ' [ ' & & annotatedWord . data ( ) [ annotatedWord . size ( ) - 1 ] = = ' ] ' ) {
2011-02-24 16:14:42 +03:00
// non-term
isNonTerminal = true ;
2012-10-17 14:07:42 +04:00
size_t nextPos = annotatedWord . find ( ' [ ' , 1 ) ;
2013-04-23 11:36:24 +04:00
UTIL_THROW_IF ( nextPos = = string : : npos , NonTerminalParseException , " The string " < < annotatedWord < < " was parsed as a non-terminal but does not take the form [source][target]. " ) ;
2011-11-18 16:07:41 +04:00
CHECK ( nextPos ! = string : : npos ) ;
2011-02-24 16:14:42 +03:00
if ( direction = = Input )
annotatedWord = annotatedWord . substr ( 1 , nextPos - 2 ) ;
else
annotatedWord = annotatedWord . substr ( nextPos + 1 , annotatedWord . size ( ) - nextPos - 2 ) ;
} else {
isNonTerminal = false ;
}
Word & word = AddWord ( ) ;
word . CreateFromString ( direction , factorOrder , annotatedWord , isNonTerminal ) ;
}
// lhs
2012-10-17 14:07:42 +04:00
const StringPiece & annotatedWord = annotatedWordVector . back ( ) ;
2013-04-23 11:36:24 +04:00
UTIL_THROW_IF ( annotatedWord . size ( ) < 2 | | * annotatedWord . data ( ) ! = ' [ ' | | annotatedWord . data ( ) [ annotatedWord . size ( ) - 1 ] ! = ' ] ' , NonTerminalParseException , " The last entry should be a single non-terminal but was given as " < < annotatedWord ) ;
2012-10-17 14:07:42 +04:00
lhs . CreateFromString ( direction , factorOrder , annotatedWord . substr ( 1 , annotatedWord . size ( ) - 2 ) , true ) ;
assert ( lhs . IsNonTerminal ( ) ) ;
2010-04-08 21:16:10 +04:00
}
int Phrase : : Compare ( const Phrase & other ) const
{
2008-06-11 14:52:57 +04:00
# ifdef min
# undef min
# endif
2011-02-24 16:14:42 +03:00
size_t thisSize = GetSize ( )
, compareSize = other . GetSize ( ) ;
if ( thisSize ! = compareSize ) {
return ( thisSize < compareSize ) ? - 1 : 1 ;
}
for ( size_t pos = 0 ; pos < thisSize ; pos + + ) {
const Word & thisWord = GetWord ( pos )
, & otherWord = other . GetWord ( pos ) ;
int ret = Word : : Compare ( thisWord , otherWord ) ;
if ( ret ! = 0 )
return ret ;
}
return 0 ;
2010-02-03 13:23:32 +03:00
}
2008-06-11 14:52:57 +04:00
2011-02-24 16:14:42 +03:00
2008-06-11 14:52:57 +04:00
bool Phrase : : Contains ( const vector < vector < string > > & subPhraseVector
2011-02-24 16:14:42 +03:00
, const vector < FactorType > & inputFactor ) const
2008-06-11 14:52:57 +04:00
{
2011-02-24 16:14:42 +03:00
const size_t subSize = subPhraseVector . size ( )
, thisSize = GetSize ( ) ;
if ( subSize > thisSize )
return false ;
// try to match word-for-word
for ( size_t currStartPos = 0 ; currStartPos < ( thisSize - subSize + 1 ) ; currStartPos + + ) {
bool match = true ;
for ( size_t currFactorIndex = 0 ; currFactorIndex < inputFactor . size ( ) ; currFactorIndex + + ) {
FactorType factorType = inputFactor [ currFactorIndex ] ;
for ( size_t currSubPos = 0 ; currSubPos < subSize ; currSubPos + + ) {
size_t currThisPos = currSubPos + currStartPos ;
const string & subStr = subPhraseVector [ currSubPos ] [ currFactorIndex ]
, & thisStr = GetFactor ( currThisPos , factorType ) - > GetString ( ) ;
if ( subStr ! = thisStr ) {
match = false ;
break ;
}
}
if ( ! match )
break ;
}
if ( match )
return true ;
}
return false ;
2008-06-11 14:52:57 +04:00
}
bool Phrase : : IsCompatible ( const Phrase & inputPhrase ) const
{
2011-02-24 16:14:42 +03:00
if ( inputPhrase . GetSize ( ) ! = GetSize ( ) ) {
return false ;
}
const size_t size = GetSize ( ) ;
2011-11-21 14:49:26 +04:00
const size_t maxNumFactors = MAX_NUM_FACTORS ;
2011-02-24 16:14:42 +03:00
for ( size_t currPos = 0 ; currPos < size ; currPos + + ) {
for ( unsigned int currFactor = 0 ; currFactor < maxNumFactors ; currFactor + + ) {
FactorType factorType = static_cast < FactorType > ( currFactor ) ;
const Factor * thisFactor = GetFactor ( currPos , factorType )
, * inputFactor = inputPhrase . GetFactor ( currPos , factorType ) ;
if ( thisFactor ! = NULL & & inputFactor ! = NULL & & thisFactor ! = inputFactor )
return false ;
}
}
return true ;
2008-06-11 14:52:57 +04:00
}
bool Phrase : : IsCompatible ( const Phrase & inputPhrase , FactorType factorType ) const
{
2011-02-24 16:14:42 +03:00
if ( inputPhrase . GetSize ( ) ! = GetSize ( ) ) {
return false ;
}
for ( size_t currPos = 0 ; currPos < GetSize ( ) ; currPos + + ) {
if ( GetFactor ( currPos , factorType ) ! = inputPhrase . GetFactor ( currPos , factorType ) )
return false ;
}
return true ;
2008-06-11 14:52:57 +04:00
}
bool Phrase : : IsCompatible ( const Phrase & inputPhrase , const std : : vector < FactorType > & factorVec ) const
{
2011-02-24 16:14:42 +03:00
if ( inputPhrase . GetSize ( ) ! = GetSize ( ) ) {
return false ;
}
for ( size_t currPos = 0 ; currPos < GetSize ( ) ; currPos + + ) {
for ( std : : vector < FactorType > : : const_iterator i = factorVec . begin ( ) ;
i ! = factorVec . end ( ) ; + + i ) {
if ( GetFactor ( currPos , * i ) ! = inputPhrase . GetFactor ( currPos , * i ) )
return false ;
}
}
return true ;
2008-06-11 14:52:57 +04:00
}
2010-04-08 21:16:10 +04:00
size_t Phrase : : GetNumTerminals ( ) const
{
2011-02-24 16:14:42 +03:00
size_t ret = 0 ;
for ( size_t pos = 0 ; pos < GetSize ( ) ; + + pos ) {
if ( ! GetWord ( pos ) . IsNonTerminal ( ) )
ret + + ;
}
return ret ;
2010-04-08 21:16:10 +04:00
}
2011-02-24 16:14:42 +03:00
2008-06-11 14:52:57 +04:00
void Phrase : : InitializeMemPool ( )
{
}
void Phrase : : FinalizeMemPool ( )
{
}
// Macro supplied by a project header that is not visible in this file.
// NOTE(review): presumably expands to the definition of Phrase::ToString()
// built on the stream operator for Phrase — confirm against the macro.
TO_STRING_BODY ( Phrase ) ;
// friend
/**
 * Stream every word of the phrase, in order, using the stream operator
 * for Word; nothing else is written by this function itself.
 * @param out    destination stream
 * @param phrase phrase to print
 * @return `out`, to allow chaining
 */
ostream &operator<<(ostream &out, const Phrase &phrase)
{
  // (disabled) out << "(size " << phrase.GetSize() << ") ";
  size_t pos = 0;
  while (pos < phrase.GetSize()) {
    out << phrase.GetWord(pos);
    ++pos;
  }
  return out;
}
2008-10-09 03:51:26 +04:00
}