mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-12-26 21:42:19 +03:00
initial version of reordering zones and walls, may work
git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@1960 1f5c12ca-751b-0410-a591-d2e778427230
This commit is contained in:
parent
e13e45dc63
commit
a360b71426
@ -26,6 +26,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
#include "TypeDef.h"
|
||||
#include "Phrase.h"
|
||||
#include "TargetPhraseCollection.h"
|
||||
#include "ReorderingConstraint.h"
|
||||
|
||||
namespace Moses
|
||||
{
|
||||
@ -42,6 +43,7 @@ protected:
|
||||
long m_translationId; //< contiguous Id
|
||||
bool m_hasMetaData;
|
||||
long m_segId;
|
||||
ReorderingConstraint m_reorderingConstraint; /**< limits on reordering specified either by "-mp" switch or xml tags */
|
||||
|
||||
public:
|
||||
|
||||
@ -112,6 +114,12 @@ public:
|
||||
//! return the word at a particular position. Only valid for Sentence class. TODO - get rid of this fn
|
||||
virtual const Word& GetWord(size_t pos) const=0;
|
||||
|
||||
//! Returns the reordering constraints
|
||||
const ReorderingConstraint& GetReorderingConstraint() const
|
||||
{
|
||||
return m_reorderingConstraint;
|
||||
};
|
||||
|
||||
TO_STRING();
|
||||
|
||||
};
|
||||
|
@ -22,16 +22,59 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
|
||||
#include "ReorderingConstraint.h"
|
||||
#include "InputType.h"
|
||||
#include "Word.h"
|
||||
#include "StaticData.h"
|
||||
|
||||
namespace Moses
|
||||
{
|
||||
|
||||
void ReorderingConstraint::SetWall( const InputType& sentence )
|
||||
//! allocate memory for reordering walls
|
||||
void ReorderingConstraint::InitializeWalls(size_t size)
|
||||
{
|
||||
m_size = size;
|
||||
m_wall = (bool*) malloc(sizeof(bool) * size);
|
||||
m_localWall = (bool*) malloc(sizeof(bool) * size);
|
||||
|
||||
for (size_t pos = 0 ; pos < m_size ; pos++)
|
||||
{
|
||||
m_wall[pos] = false;
|
||||
m_localWall[pos] = false;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
//! set value at a particular position
|
||||
void ReorderingConstraint::SetWall( size_t pos, bool value )
|
||||
{
|
||||
VERBOSE(3,"SETTING reordering wall at position " << pos << std::endl);
|
||||
m_wall[pos] = value;
|
||||
m_active = true;
|
||||
}
|
||||
|
||||
//! has to be called to localized walls
|
||||
void ReorderingConstraint::FinalizeWalls()
|
||||
{
|
||||
for(size_t z = 0; z < m_zone.size(); z++ )
|
||||
{
|
||||
const size_t startZone = m_zone[z][0];
|
||||
const size_t endZone = m_zone[z][1];// note: wall after endZone is not local
|
||||
for( size_t pos = startZone; pos < endZone; pos++ )
|
||||
{
|
||||
if (m_wall[ pos ])
|
||||
{
|
||||
m_localWall[ pos ] = true;
|
||||
m_wall[ pos ] = false;
|
||||
VERBOSE(3,"SETTING local wall " << pos << std::endl);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//! set walls based on "-monotone-at-punctuation" flag
|
||||
void ReorderingConstraint::SetMonotoneAtPunctuation( const Phrase &sentence )
|
||||
{
|
||||
for( size_t i=0; i<sentence.GetSize(); i++ )
|
||||
{
|
||||
const Word& word = sentence.GetWord( i );
|
||||
const Word& word = sentence.GetWord(i);
|
||||
if (word[0]->GetString() == "," ||
|
||||
word[0]->GetString() == "." ||
|
||||
word[0]->GetString() == "!" ||
|
||||
@ -40,22 +83,161 @@ void ReorderingConstraint::SetWall( const InputType& sentence )
|
||||
word[0]->GetString() == ";" ||
|
||||
word[0]->GetString() == "\"")
|
||||
{
|
||||
// std::cerr << "SETTING reordering wall at position " << i << std::endl;
|
||||
SetValue( i, true );
|
||||
// set wall before and after punc, but not at sentence start, end
|
||||
if (i>0 && i<m_size-1) SetWall( i, true );
|
||||
if (i>1) SetWall( i-1, true );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
bool ReorderingConstraint::ContainsWall( size_t start, size_t end ) const
|
||||
//! set a reordering zone (once entered, need to finish)
|
||||
void ReorderingConstraint::SetZone( size_t startPos, size_t endPos )
|
||||
{
|
||||
for( size_t i=start; i<=end; i++ )
|
||||
VERBOSE(3,"SETTING zone " << startPos << "-" << endPos << std::endl);
|
||||
std::vector< size_t > newZone;
|
||||
newZone.push_back( startPos );
|
||||
newZone.push_back( endPos );
|
||||
m_zone.push_back( newZone );
|
||||
m_active = true;
|
||||
}
|
||||
|
||||
//! check if the current hypothesis extension violates reordering constraints
|
||||
bool ReorderingConstraint::Check( const WordsBitmap &bitmap, size_t startPos, size_t endPos ) const
|
||||
{
|
||||
// nothing to be checked, we are done
|
||||
if (! IsActive() ) return true;
|
||||
|
||||
VERBOSE(3,"CHECK " << bitmap << " " << startPos << "-" << endPos);
|
||||
|
||||
// check walls
|
||||
size_t firstGapPos = bitmap.GetFirstGapPos();
|
||||
// filling first gap -> no wall violation possible
|
||||
if (firstGapPos != startPos)
|
||||
{
|
||||
if ( GetWall( i ) ) {
|
||||
// std::cerr << "HITTING reordering wall at position " << i << std::endl;
|
||||
return true;
|
||||
// if there is a wall before the last word,
|
||||
// we created a gap while moving through wall
|
||||
// -> violation
|
||||
for( size_t pos = firstGapPos; pos < endPos; pos++ )
|
||||
{
|
||||
if( GetWall( pos ) )
|
||||
{
|
||||
VERBOSE(3," hitting wall " << pos << std::endl);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
return false;
|
||||
|
||||
// monotone -> no violation possible
|
||||
size_t lastPos = bitmap.GetLastPos();
|
||||
if ((lastPos == NOT_FOUND && startPos == 0) ||
|
||||
(firstGapPos > lastPos && firstGapPos == startPos))
|
||||
{
|
||||
VERBOSE(3," montone, fine." << std::endl);
|
||||
return true;
|
||||
}
|
||||
|
||||
// check zones
|
||||
for(size_t z = 0; z < m_zone.size(); z++ )
|
||||
{
|
||||
const size_t startZone = m_zone[z][0];
|
||||
const size_t endZone = m_zone[z][1];
|
||||
|
||||
// fine, if translation has not reached zone yet and phrase outside zone
|
||||
if (lastPos < startZone && ( endPos < startZone || startPos > endZone ) ) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// already completely translated zone, no violations possible
|
||||
if (firstGapPos > endZone)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
// some words are translated beyond the start
|
||||
// let's look closer if some are in the zone
|
||||
size_t numWordsInZoneTranslated = 0;
|
||||
if (lastPos >= startZone)
|
||||
{
|
||||
for(size_t pos = startZone; pos <= endZone; pos++ )
|
||||
{
|
||||
if( bitmap.GetValue( pos ) )
|
||||
{
|
||||
numWordsInZoneTranslated++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// all words in zone translated, no violation possible
|
||||
if (numWordsInZoneTranslated == endZone-startZone+1)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
// flag if this is an active zone
|
||||
bool activeZone = (numWordsInZoneTranslated > 0);
|
||||
|
||||
// fine, if zone completely untranslated and phrase outside zone
|
||||
if (!activeZone && ( endPos < startZone || startPos > endZone ) ) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// violation, if phrase completely outside active zone
|
||||
if (activeZone && ( endPos < startZone || startPos > endZone ) ) {
|
||||
VERBOSE(3," outside active zone" << std::endl);
|
||||
return false;
|
||||
}
|
||||
|
||||
// ok, this is what we know now:
|
||||
// * the phrase is in the zone (at least partially)
|
||||
// * either zone is already active, or it becomes active now
|
||||
|
||||
// let us check on phrases that are partially outside
|
||||
|
||||
// phrase overlaps at the beginning, always ok
|
||||
if (startPos <= startZone)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
// phrase goes beyond end, has to fill zone completely
|
||||
if (endPos > endZone)
|
||||
{
|
||||
if (endZone-startPos+1 < // num. words filled in by phrase
|
||||
endZone-startZone+1-numWordsInZoneTranslated) // num. untranslated
|
||||
{
|
||||
VERBOSE(3," overlap end, but not completing" << std::endl);
|
||||
return false;
|
||||
}
|
||||
else
|
||||
{
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
// now we are down to phrases that are completely inside the zone
|
||||
// we have to check local walls
|
||||
bool seenUntranslatedBeforeStartPos = false;
|
||||
for(size_t pos = startZone; pos < endZone && pos < endPos; pos++ )
|
||||
{
|
||||
// be careful when there is a gap before phrase
|
||||
if( !bitmap.GetValue( pos ) // untranslated word
|
||||
&& pos < startPos ) // before startPos
|
||||
{
|
||||
seenUntranslatedBeforeStartPos = true;
|
||||
}
|
||||
if( seenUntranslatedBeforeStartPos && GetLocalWall( pos ) )
|
||||
{
|
||||
VERBOSE(3," local wall violation" << std::endl);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// passed all checks for this zone, on to the next one
|
||||
}
|
||||
|
||||
// passed all checks, no violations
|
||||
VERBOSE(3," fine." << std::endl);
|
||||
return true;
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -29,6 +29,8 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
#include <cstring>
|
||||
#include <cmath>
|
||||
#include "TypeDef.h"
|
||||
#include "Word.h"
|
||||
#include "Phrase.h"
|
||||
|
||||
namespace Moses
|
||||
{
|
||||
@ -42,46 +44,52 @@ class ReorderingConstraint
|
||||
protected:
|
||||
// const size_t m_size; /**< number of words in sentence */
|
||||
size_t m_size; /**< number of words in sentence */
|
||||
bool *m_bitmap; /**< flag for each word if it is a wall */
|
||||
bool *m_wall; /**< flag for each word if it is a wall */
|
||||
bool *m_localWall; /**< flag for each word if it is a local wall */
|
||||
std::vector< std::vector< size_t > > m_zone; /** zones that limit reordering */
|
||||
bool m_active; /**< flag indicating, if there are any active constraints */
|
||||
|
||||
public:
|
||||
|
||||
//! create ReorderingConstraint of length size and initialise to zero
|
||||
ReorderingConstraint(size_t size)
|
||||
:m_size (size)
|
||||
{
|
||||
m_bitmap = (bool*) malloc(sizeof(bool) * size);
|
||||
ReorderingConstraint() :m_wall(NULL),m_localWall(NULL),m_active(false) {}
|
||||
|
||||
for (size_t pos = 0 ; pos < m_size ; pos++)
|
||||
{
|
||||
m_bitmap[pos] = false;
|
||||
}
|
||||
//! destructor
|
||||
~ReorderingConstraint()
|
||||
{
|
||||
if (m_wall != NULL) free(m_wall);
|
||||
if (m_localWall != NULL) free(m_localWall);
|
||||
}
|
||||
|
||||
~ReorderingConstraint()
|
||||
{
|
||||
free(m_bitmap);
|
||||
}
|
||||
//! allocate memory for the walls of a sentence of a given size
|
||||
void InitializeWalls(size_t size);
|
||||
|
||||
|
||||
//! whether a word has been translated at a particular position
|
||||
bool GetWall(size_t pos) const
|
||||
{
|
||||
return m_bitmap[pos];
|
||||
}
|
||||
//! changes walls in zones into local walls
|
||||
void FinalizeWalls();
|
||||
|
||||
//! set value at a particular position
|
||||
void SetValue( size_t pos, bool value )
|
||||
{
|
||||
m_bitmap[pos] = value;
|
||||
}
|
||||
void SetWall( size_t pos, bool value );
|
||||
|
||||
//! set the reordering wall based on the words in the sentence
|
||||
void SetWall( const InputType& sentence );
|
||||
//! whether there is a reordering wall at a particular position
|
||||
bool GetWall(size_t pos) const { return m_wall[pos]; }
|
||||
|
||||
//! checks if there is a wall in the interval [start,end]
|
||||
bool ContainsWall( size_t start, size_t end ) const;
|
||||
//! whether there is a local wall at a particular position
|
||||
bool GetLocalWall(size_t pos) const { return m_localWall[pos]; }
|
||||
|
||||
//! set a zone
|
||||
void SetZone( size_t startPos, size_t endPos );
|
||||
|
||||
//! returns the vector of zones
|
||||
std::vector< std::vector< size_t > > & GetZones() { return m_zone; }
|
||||
|
||||
//! set the reordering walls based on punctuation in the sentence
|
||||
void SetMonotoneAtPunctuation( const Phrase & sentence );
|
||||
|
||||
//! check if all constraints are fulfilled -> all fine
|
||||
bool Check( const WordsBitmap &bitmap, size_t start, size_t end ) const;
|
||||
|
||||
//! checks if reordering constraints will be enforced
|
||||
bool IsActive() const { return m_active; }
|
||||
};
|
||||
|
||||
}
|
||||
|
@ -62,13 +62,6 @@ SearchCubePruning::SearchCubePruning(const InputType &source, const TranslationO
|
||||
|
||||
m_hypoStackColl[ind] = sourceHypoColl;
|
||||
}
|
||||
|
||||
// set additional reordering constraints, if specified
|
||||
if (staticData.UseReorderingConstraint())
|
||||
{
|
||||
m_reorderingConstraint = new ReorderingConstraint( m_source.GetSize() );
|
||||
m_reorderingConstraint->SetWall( m_source );
|
||||
}
|
||||
}
|
||||
|
||||
SearchCubePruning::~SearchCubePruning()
|
||||
@ -239,24 +232,24 @@ bool SearchCubePruning::CheckDistortion(const WordsBitmap &hypoBitmap, const Wor
|
||||
// since we check for reordering limits, it's good to have that limit handy
|
||||
int maxDistortion = StaticData::Instance().GetMaxDistortion();
|
||||
|
||||
// no limit of reordering: no prob
|
||||
// if there are reordering limits, make sure it is not violated
|
||||
// the coverage bitmap is handy here (and the position of the first gap)
|
||||
const size_t hypoFirstGapPos = hypoBitmap.GetFirstGapPos()
|
||||
, startPos = range.GetStartPos()
|
||||
, endPos = range.GetEndPos();
|
||||
|
||||
// if reordering constraints are used (--monotone-at-punctuation or xml), check if passes all
|
||||
if (! m_source.GetReorderingConstraint().Check( hypoBitmap, startPos, endPos ) )
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
// no limit of reordering: no problem
|
||||
if (maxDistortion < 0)
|
||||
{
|
||||
return true;
|
||||
}
|
||||
|
||||
// if there are reordering limits, make sure it is not violated
|
||||
// the coverage bitmap is handy here (and the position of the first gap)
|
||||
const size_t hypoFirstGapPos = hypoBitmap.GetFirstGapPos()
|
||||
, sourceSize = m_source.GetSize()
|
||||
, startPos = range.GetStartPos()
|
||||
, endPos = range.GetEndPos();
|
||||
|
||||
// MAIN LOOP. go through each possible hypo
|
||||
size_t maxSize = sourceSize - startPos;
|
||||
size_t maxSizePhrase = StaticData::Instance().GetMaxPhraseLength();
|
||||
maxSize = std::min(maxSize, maxSizePhrase);
|
||||
|
||||
bool leftMostEdge = (hypoFirstGapPos == startPos);
|
||||
// any length extension is okay if starting at left-most edge
|
||||
if (leftMostEdge)
|
||||
@ -264,37 +257,21 @@ bool SearchCubePruning::CheckDistortion(const WordsBitmap &hypoBitmap, const Wor
|
||||
return true;
|
||||
}
|
||||
// starting somewhere other than left-most edge, use caution
|
||||
else
|
||||
{
|
||||
// the basic idea is this: we would like to translate a phrase starting
|
||||
// from a position further right than the left-most open gap. The
|
||||
// distortion penalty for the following phrase will be computed relative
|
||||
// to the ending position of the current extension, so we ask now what
|
||||
// its maximum value will be (which will always be the value of the
|
||||
// hypothesis starting at the left-most edge). If this value is greater than
|
||||
// the distortion limit, we don't allow this extension to be made.
|
||||
WordsRange bestNextExtension(hypoFirstGapPos, hypoFirstGapPos);
|
||||
int required_distortion =
|
||||
m_source.ComputeDistortionDistance(range, bestNextExtension);
|
||||
|
||||
if (required_distortion > maxDistortion) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// if reordering walls are used (--monotone-at-punctuation), check here if
|
||||
// there is a wall between the beginning of the gap and the end
|
||||
// of this new phrase (jumping the wall).
|
||||
|
||||
if ( StaticData::Instance().UseReorderingConstraint() ) {
|
||||
if ( m_reorderingConstraint->ContainsWall( hypoFirstGapPos, endPos ) )
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
// the basic idea is this: we would like to translate a phrase starting
|
||||
// from a position further right than the left-most open gap. The
|
||||
// distortion penalty for the following phrase will be computed relative
|
||||
// to the ending position of the current extension, so we ask now what
|
||||
// its maximum value will be (which will always be the value of the
|
||||
// hypothesis starting at the left-most edge). If this value is greater than
|
||||
// the distortion limit, we don't allow this extension to be made.
|
||||
WordsRange bestNextExtension(hypoFirstGapPos, hypoFirstGapPos);
|
||||
int required_distortion =
|
||||
m_source.ComputeDistortionDistance(range, bestNextExtension);
|
||||
|
||||
if (required_distortion > maxDistortion) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
@ -332,7 +309,6 @@ void SearchCubePruning::PrintBitmapContainerGraph()
|
||||
{
|
||||
cerr << iterAccessor->first << endl;
|
||||
BitmapContainer &container = *iterAccessor->second;
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -4,7 +4,6 @@
|
||||
#include <vector>
|
||||
#include "Search.h"
|
||||
#include "HypothesisStackCubePruning.h"
|
||||
#include "ReorderingConstraint.h"
|
||||
|
||||
namespace Moses
|
||||
{
|
||||
@ -21,7 +20,6 @@ protected:
|
||||
TargetPhrase m_initialTargetPhrase; /**< used to seed 1st hypo */
|
||||
clock_t m_start; /**< used to track time spend on translation */
|
||||
const TranslationOptionCollection &m_transOptColl; /**< pre-computed list of translation options for the phrases in this sentence */
|
||||
ReorderingConstraint *m_reorderingConstraint; /**< positions in input sentence over which no reordering is allowed */
|
||||
|
||||
//! go thru all bitmaps in 1 stack & create backpointers to bitmaps in the stack
|
||||
void CreateForwardTodos(HypothesisStackCubePruning &stack);
|
||||
|
@ -34,13 +34,6 @@ SearchNormal::SearchNormal(const InputType &source, const TranslationOptionColle
|
||||
|
||||
m_hypoStackColl[ind] = sourceHypoColl;
|
||||
}
|
||||
|
||||
// set additional reordering constraints, if specified
|
||||
if (staticData.UseReorderingConstraint())
|
||||
{
|
||||
m_reorderingConstraint = new ReorderingConstraint( m_source.GetSize() );
|
||||
m_reorderingConstraint->SetWall( m_source );
|
||||
}
|
||||
}
|
||||
|
||||
SearchNormal::~SearchNormal()
|
||||
@ -129,7 +122,10 @@ void SearchNormal::ProcessOneHypothesis(const Hypothesis &hypothesis)
|
||||
|
||||
for (size_t endPos = startPos ; endPos < startPos + maxSize ; ++endPos)
|
||||
{
|
||||
if (!hypoBitmap.Overlap(WordsRange(startPos, endPos)))
|
||||
if (!hypoBitmap.Overlap(WordsRange(startPos, endPos)) || !m_source.GetReorderingConstraint().Check( hypoBitmap, startPos, endPos ) )
|
||||
{
|
||||
continue;
|
||||
}
|
||||
{
|
||||
//TODO: does this method include incompatible WordLattice hypotheses?
|
||||
ExpandAllHypotheses(hypothesis
|
||||
@ -147,7 +143,7 @@ void SearchNormal::ProcessOneHypothesis(const Hypothesis &hypothesis)
|
||||
const size_t hypoFirstGapPos = hypoBitmap.GetFirstGapPos()
|
||||
, sourceSize = m_source.GetSize();
|
||||
|
||||
// MAIN LOOP. go through each possible hypo
|
||||
// MAIN LOOP. go through each possible range
|
||||
for (size_t startPos = hypoFirstGapPos ; startPos < sourceSize ; ++startPos)
|
||||
{
|
||||
size_t maxSize = sourceSize - startPos;
|
||||
@ -178,6 +174,12 @@ void SearchNormal::ProcessOneHypothesis(const Hypothesis &hypothesis)
|
||||
|
||||
for (size_t endPos = startPos ; endPos < startPos + maxSize ; ++endPos)
|
||||
{
|
||||
// check if passes specified reordering constraints
|
||||
// (set with -monotone-at-punctuation or xml)
|
||||
if (!m_source.GetReorderingConstraint().Check( hypoBitmap, startPos, endPos ) )
|
||||
{
|
||||
continue;
|
||||
}
|
||||
// check for overlap
|
||||
WordsRange extRange(startPos, endPos);
|
||||
#ifdef DEBUGLATTICE
|
||||
@ -268,14 +270,8 @@ void SearchNormal::ProcessOneHypothesis(const Hypothesis &hypothesis)
|
||||
continue;
|
||||
}
|
||||
|
||||
// if reordering walls are used (--monotone-at-punctuation), check here if
|
||||
// there is a wall between the beginning of the gap and the end
|
||||
// of this new phrase (jumping the wall).
|
||||
if ( StaticData::Instance().UseReorderingConstraint() ) {
|
||||
if ( m_reorderingConstraint->ContainsWall( hypoFirstGapPos, endPos ) )
|
||||
continue;
|
||||
}
|
||||
|
||||
// everything is fine, we're good to go
|
||||
ExpandAllHypotheses(hypothesis
|
||||
,m_transOptColl.GetTranslationOptionList(extRange));
|
||||
|
||||
|
@ -5,7 +5,6 @@
|
||||
#include "Search.h"
|
||||
#include "HypothesisStackNormal.h"
|
||||
#include "TranslationOptionCollection.h"
|
||||
#include "ReorderingConstraint.h"
|
||||
#include "Timer.h"
|
||||
|
||||
namespace Moses
|
||||
@ -25,8 +24,6 @@ protected:
|
||||
size_t interrupted_flag;
|
||||
HypothesisStackNormal* actual_hypoStack; /**actual (full expanded) stack of hypotheses*/
|
||||
const TranslationOptionCollection &m_transOptColl; /**< pre-computed list of translation options for the phrases in this sentence */
|
||||
ReorderingConstraint *m_reorderingConstraint; /**< positions in input sentence over which no reordering is allowed */
|
||||
|
||||
|
||||
// functions for creating hypotheses
|
||||
void ProcessOneHypothesis(const Hypothesis &hypothesis);
|
||||
|
@ -36,16 +36,19 @@ int Sentence::Read(std::istream& in,const std::vector<FactorType>& factorOrder)
|
||||
|
||||
if (getline(in, line, '\n').eof())
|
||||
return 0;
|
||||
// remove extra spaces
|
||||
line = Trim(line);
|
||||
meta = ProcessAndStripSGML(line);
|
||||
|
||||
// if the sentence is specified as "<seg id=1> ... </seg>", extract id
|
||||
meta = ProcessAndStripSGML(line);
|
||||
if (meta.find("id") != meta.end()) { this->SetTranslationId(atol(meta["id"].c_str())); }
|
||||
|
||||
//parse XML markup in translation line
|
||||
// parse XML markup in translation line
|
||||
const StaticData &staticData = StaticData::Instance();
|
||||
std::vector<std::vector<XmlOption*> > xmlOptionsList(0);
|
||||
std::vector< size_t > xmlWalls;
|
||||
if (staticData.GetXmlInputType() != XmlPassThrough) {
|
||||
if (!ProcessAndStripXMLTags(line, xmlOptionsList)) {
|
||||
if (!ProcessAndStripXMLTags(line, xmlOptionsList, m_reorderingConstraint, xmlWalls )) {
|
||||
TRACE_ERR("Unable to parse XML in line " << line);
|
||||
abort();
|
||||
}
|
||||
@ -107,6 +110,21 @@ int Sentence::Read(std::istream& in,const std::vector<FactorType>& factorOrder)
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
m_reorderingConstraint.InitializeWalls( GetSize() );
|
||||
|
||||
// set reordering walls, if "-monotone-at-punctuation" is set
|
||||
if (staticData.UseReorderingConstraint())
|
||||
{
|
||||
m_reorderingConstraint.SetMonotoneAtPunctuation( GetSubString( WordsRange(0,GetSize()-1 ) ) );
|
||||
}
|
||||
|
||||
// set walls obtained from xml
|
||||
for(size_t i=0; i<xmlWalls.size(); i++)
|
||||
if( xmlWalls[i] < GetSize() ) // no buggy walls, please
|
||||
m_reorderingConstraint.SetWall( xmlWalls[i], true );
|
||||
m_reorderingConstraint.FinalizeWalls();
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
@ -86,7 +86,6 @@ class Sentence : public Phrase, public InputType
|
||||
//! populates vector argument with XML force translation options for the specific range passed
|
||||
void GetXmlTranslationOptions(std::vector <TranslationOption*> &list, size_t startPos, size_t endPos) const;
|
||||
|
||||
|
||||
int Read(std::istream& in,const std::vector<FactorType>& factorOrder);
|
||||
void Print(std::ostream& out) const;
|
||||
|
||||
|
@ -543,7 +543,7 @@ void TranslationOptionCollection::CreateTranslationOptionsForRange(
|
||||
*/
|
||||
bool TranslationOptionCollection::HasXmlOptionsOverlappingRange(size_t, size_t) const {
|
||||
return false;
|
||||
|
||||
//not implemented for base class
|
||||
}
|
||||
|
||||
/** Populates the current Collection with XML options exactly covering the range specified. Default implementation does nothing.
|
||||
|
@ -196,7 +196,6 @@ public:
|
||||
|
||||
//! converts bitmap into an integer ID: it consists of two parts: the first 16 bit are the pattern between the first gap and the last word-1, the second 16 bit are the number of filled positions. enforces a sentence length limit of 65535 and a max distortion of 16
|
||||
WordsBitmapID GetID() const {
|
||||
std::cerr << "GetID()\n";
|
||||
assert(m_size < (1<<16));
|
||||
|
||||
size_t start = GetFirstGapPos();
|
||||
|
@ -32,225 +32,374 @@
|
||||
namespace Moses
|
||||
{
|
||||
|
||||
std::string ParseXmlTagAttribute(const std::string& tag,const std::string& attributeName){
|
||||
string ParseXmlTagAttribute(const string& tag,const string& attributeName){
|
||||
/*TODO deal with unescaping \"*/
|
||||
string tagOpen = attributeName + "=\"";
|
||||
size_t contentsStart = tag.find(tagOpen);
|
||||
if (contentsStart == std::string::npos) return "";
|
||||
if (contentsStart == string::npos) return "";
|
||||
contentsStart += tagOpen.size();
|
||||
size_t contentsEnd = tag.find_first_of('"',contentsStart+1);
|
||||
if (contentsEnd == std::string::npos) {
|
||||
if (contentsEnd == string::npos) {
|
||||
TRACE_ERR("Malformed XML attribute: "<< tag);
|
||||
return "";
|
||||
}
|
||||
size_t possibleEnd;
|
||||
while (tag.at(contentsEnd-1) == '\\' && (possibleEnd = tag.find_first_of('"',contentsEnd+1)) != std::string::npos) {
|
||||
while (tag.at(contentsEnd-1) == '\\' && (possibleEnd = tag.find_first_of('"',contentsEnd+1)) != string::npos) {
|
||||
contentsEnd = possibleEnd;
|
||||
}
|
||||
return tag.substr(contentsStart,contentsEnd-contentsStart);
|
||||
}
|
||||
|
||||
std::string TrimXml(const std::string& str) {
|
||||
/**
|
||||
* Remove "<" and ">" from XML tag
|
||||
*
|
||||
* \param str xml token to be stripped
|
||||
*/
|
||||
string TrimXml(const string& str)
|
||||
{
|
||||
// too short to be xml token -> do nothing
|
||||
if (str.size() < 2) return str;
|
||||
if (str[0] == '<' && str[str.size() - 1] == '>') {
|
||||
|
||||
// strip first and last character
|
||||
if (str[0] == '<' && str[str.size() - 1] == '>')
|
||||
{
|
||||
return str.substr(1, str.size() - 2);
|
||||
} else { return str; }
|
||||
}
|
||||
// not an xml token -> do nothing
|
||||
else { return str; }
|
||||
}
|
||||
|
||||
bool isXmlTag(const std::string& tag)
|
||||
/**
|
||||
* Check if the token is an XML tag, i.e. starts with "<"
|
||||
*
|
||||
* \param tag token to be checked
|
||||
*/
|
||||
bool isXmlTag(const string& tag)
|
||||
{
|
||||
return tag[0] == '<';
|
||||
}
|
||||
|
||||
inline std::vector<std::string> TokenizeXml(const std::string& str)
|
||||
/**
|
||||
* Split up the input character string into tokens made up of
|
||||
* either XML tags or text.
|
||||
* example: this <b> is a </b> test .
|
||||
* => (this ), (<b>), ( is a ), (</b>), ( test .)
|
||||
*
|
||||
* \param str input string
|
||||
*/
|
||||
inline vector<string> TokenizeXml(const string& str)
|
||||
{
|
||||
std::string lbrack = "<";
|
||||
std::string rbrack = ">";
|
||||
std::vector<std::string> tokens;
|
||||
// Find first "non-delimiter".
|
||||
std::string::size_type cpos = 0;
|
||||
std::string::size_type lpos = 0;
|
||||
std::string::size_type rpos = 0;
|
||||
string lbrack = "<";
|
||||
string rbrack = ">";
|
||||
vector<string> tokens; // vector of tokens to be returned
|
||||
string::size_type cpos = 0; // current position in string
|
||||
string::size_type lpos = 0; // left start of xml tag
|
||||
string::size_type rpos = 0; // right end of xml tag
|
||||
|
||||
while (cpos != str.size()) {
|
||||
// walk through the string (loop over cpos)
|
||||
while (cpos != str.size())
|
||||
{
|
||||
// find the next opening "<" of an xml tag
|
||||
lpos = str.find_first_of(lbrack, cpos);
|
||||
if (lpos != std::string::npos) {
|
||||
if (lpos != string::npos)
|
||||
{
|
||||
// find the end of the xml tag
|
||||
rpos = str.find_first_of(rbrack, lpos);
|
||||
if (rpos == std::string::npos) {
|
||||
// sanity check: there has to be closing ">"
|
||||
if (rpos == string::npos)
|
||||
{
|
||||
TRACE_ERR("ERROR: malformed XML: " << str << endl);
|
||||
return tokens;
|
||||
}
|
||||
} else {
|
||||
}
|
||||
else // no more tags found
|
||||
{
|
||||
// add the rest as token
|
||||
tokens.push_back(str.substr(cpos));
|
||||
break;
|
||||
}
|
||||
|
||||
// add stuff before xml tag as token, if there is any
|
||||
if (lpos - cpos > 0)
|
||||
tokens.push_back(str.substr(cpos, lpos - cpos));
|
||||
|
||||
// add xml tag as token
|
||||
tokens.push_back(str.substr(lpos, rpos-lpos+1));
|
||||
cpos = rpos + 1;
|
||||
}
|
||||
return tokens;
|
||||
}
|
||||
|
||||
/**
|
||||
* Process a sentence with xml annotation
|
||||
* Xml tags may specify additional/replacing translation options
|
||||
* and reordering constraints
|
||||
*
|
||||
* \param line in: sentence, out: sentence without the xml
|
||||
* \param res vector with translation options specified by xml
|
||||
* \param reorderingConstraint reordering constraint zones specified by xml
|
||||
* \param walls reordering constraint walls specified by xml
|
||||
*/
|
||||
/*TODO: we'd only have to return a vector of XML options if we dropped linking. 2-d vector
|
||||
is so we can link things up afterwards. We can't create TranslationOptions as we
|
||||
parse because we don't have the completed source parsed until after this function
|
||||
removes all the markup from it (CreateFromString in Sentence::Read).
|
||||
*/
|
||||
bool ProcessAndStripXMLTags(std::string &line, std::vector<std::vector<XmlOption*> > &res) {
|
||||
bool ProcessAndStripXMLTags(string &line, vector<vector<XmlOption*> > &res, ReorderingConstraint &reorderingConstraint, vector< size_t > &walls ) {
|
||||
//parse XML markup in translation line
|
||||
|
||||
if (line.find_first_of('<') == std::string::npos) { return true; }
|
||||
// no xml tag? we're done.
|
||||
if (line.find_first_of('<') == string::npos) { return true; }
|
||||
|
||||
std::string rstr;
|
||||
std::vector<std::string> xmlTokens = TokenizeXml(line);
|
||||
std::string tagName = "";
|
||||
std::string tagContents = "";
|
||||
std::vector<std::string> altTexts;
|
||||
std::vector<std::string> altProbs;
|
||||
std::vector<XmlOption*> linkedOptions;
|
||||
size_t tagStart=0;
|
||||
size_t tagEnd=0;
|
||||
size_t curWord=0;
|
||||
int numUnary = 0;
|
||||
bool doClose = false;
|
||||
bool isLinked = false;
|
||||
const std::vector<FactorType> &outputFactorOrder = StaticData::Instance().GetOutputFactorOrder();
|
||||
const std::string &factorDelimiter = StaticData::Instance().GetFactorDelimiter();
|
||||
// break up input into a vector of xml tags and text
|
||||
// example: (this), (<b>), (is a), (</b>), (test .)
|
||||
vector<string> xmlTokens = TokenizeXml(line);
|
||||
|
||||
// we need to store opened tags, until they are closed
|
||||
// tags are stored as triples (tagname, startpos, contents)
|
||||
typedef pair< string, pair< size_t, string > > OpenedTag;
|
||||
vector< OpenedTag > tagStack; // stack that contains active opened tags
|
||||
|
||||
string cleanLine; // return string (text without xml)
|
||||
vector<XmlOption*> linkedOptions;
|
||||
size_t wordPos = 0; // position in sentence (in terms of number of words)
|
||||
bool isLinked = false;
|
||||
const vector<FactorType> &outputFactorOrder = StaticData::Instance().GetOutputFactorOrder();
|
||||
const string &factorDelimiter = StaticData::Instance().GetFactorDelimiter();
|
||||
|
||||
// loop through the tokens
|
||||
for (size_t xmlTokenPos = 0 ; xmlTokenPos < xmlTokens.size() ; xmlTokenPos++)
|
||||
{
|
||||
// not a xml tag, but regular text (may contain many words)
|
||||
if(!isXmlTag(xmlTokens[xmlTokenPos]))
|
||||
{
|
||||
//phrase, not tag. token may contain many words
|
||||
rstr += xmlTokens[xmlTokenPos];
|
||||
curWord = Tokenize(rstr).size();
|
||||
// add a space at boundary, if necessary
|
||||
if (cleanLine.size()>0 &&
|
||||
cleanLine[cleanLine.size() - 1] != ' ' &&
|
||||
xmlTokens[xmlTokenPos][0] != ' ')
|
||||
{
|
||||
cleanLine += " ";
|
||||
}
|
||||
cleanLine += xmlTokens[xmlTokenPos]; // add to output
|
||||
wordPos = Tokenize(cleanLine).size(); // count all the words
|
||||
}
|
||||
|
||||
// process xml tag
|
||||
else
|
||||
{
|
||||
//tag data
|
||||
std::string tag = Trim(TrimXml(xmlTokens[xmlTokenPos]));
|
||||
// *** get essential information about tag ***
|
||||
|
||||
// strip extra boundary spaces and "<" and ">"
|
||||
string tag = Trim(TrimXml(xmlTokens[xmlTokenPos]));
|
||||
VERBOSE(3,"XML TAG IS: " << tag << std::endl);
|
||||
std::string::size_type endOfName = tag.find_first_of(' ');
|
||||
std::string nextTagName = tag;
|
||||
bool isUnary = tag[tag.size() - 1] == '/';
|
||||
bool isOpen = tag[0] != '/';
|
||||
if (endOfName != std::string::npos) {
|
||||
nextTagName = tag.substr(0,endOfName);
|
||||
tagContents = tag.substr(endOfName+1);
|
||||
}
|
||||
if (nextTagName == "linked") {
|
||||
//just set a flag, don't try to process
|
||||
if (tagName != "") {
|
||||
TRACE_ERR("ERROR: tried to start linked XML tag while \""<< tagName <<"\" tag still open: " << line << endl);
|
||||
return false;
|
||||
}
|
||||
if (linkedOptions.size()>0) {
|
||||
TRACE_ERR("ERROR: tried to start second linked XML tag while linked still open: " << line << endl);
|
||||
return false;
|
||||
}
|
||||
isLinked = true;
|
||||
isOpen = false;
|
||||
}
|
||||
else if (nextTagName == "/linked") {
|
||||
isLinked = false;
|
||||
//can't be in an open tag when we stop linking
|
||||
if (tagName != "") {
|
||||
TRACE_ERR("ERROR: tried to close linked XML tag while \""<< tagName <<"\" tag still open: " << line << endl);
|
||||
return false;
|
||||
}
|
||||
res.push_back(linkedOptions);
|
||||
linkedOptions.clear();
|
||||
}
|
||||
else if (isOpen)
|
||||
|
||||
if (tag.size() == 0)
|
||||
{
|
||||
//this is an open tag
|
||||
tagName = nextTagName;
|
||||
altTexts = TokenizeMultiCharSeparator(ParseXmlTagAttribute(tagContents,"english"), "||");
|
||||
altProbs = TokenizeMultiCharSeparator(ParseXmlTagAttribute(tagContents,"prob"), "||");
|
||||
std::string span = ParseXmlTagAttribute(tagContents,"span");
|
||||
tagStart = curWord;
|
||||
if (isUnary) {
|
||||
numUnary++;
|
||||
if (span.empty()) {
|
||||
TRACE_ERR("ERROR: unary tags must have a span attribute: " << line << endl);
|
||||
return false;
|
||||
}
|
||||
std::vector<std::string> ij = Tokenize(span, ",");
|
||||
if (ij.size() != 2) {
|
||||
TRACE_ERR("ERROR: span tag must be of the form \"i,j\": " << line << endl);
|
||||
return false;
|
||||
}
|
||||
tagStart = atoi(ij[0].c_str());
|
||||
tagEnd = atoi(ij[1].c_str());
|
||||
if (tagEnd < tagStart) {
|
||||
TRACE_ERR("ERROR: span tag " << span << " invalid" << endl);
|
||||
return false;
|
||||
}
|
||||
doClose = true;
|
||||
VERBOSE(3,"XML TAG IS UNARY" << endl);
|
||||
}
|
||||
VERBOSE(3,"XML TAG NAME IS: '" << tagName << "'" << endl);
|
||||
VERBOSE(3,"XML TAG ENGLISH IS: '" << altTexts[0] << "'" << endl);
|
||||
VERBOSE(3,"XML TAG PROB IS: '" << altProbs[0] << "'" << endl);
|
||||
VERBOSE(3,"XML TAG STARTS AT WORD: " << tagStart << endl);
|
||||
if (altTexts.size() != altProbs.size()) {
|
||||
TRACE_ERR("ERROR: Unequal number of probabilities and translation alternatives: " << line << endl);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
else if ((nextTagName.size() == 0) || (nextTagName.at(0) != '/') || (nextTagName.substr(1) != tagName))
|
||||
{
|
||||
//mismatched tag, abort!
|
||||
TRACE_ERR("ERROR: tried to parse malformed XML with xml-input enabled: " << line << endl);
|
||||
TRACE_ERR("ERROR: empty tag name: " << line << endl);
|
||||
return false;
|
||||
}
|
||||
else {
|
||||
doClose = true;
|
||||
tagEnd = curWord-1; //size is inclusive
|
||||
}
|
||||
if (doClose) {
|
||||
VERBOSE(3,"XML END TAG IS: " << nextTagName.substr(1) << endl);
|
||||
VERBOSE(3,"XML TAG ENDS AT WORD: " << tagEnd << endl);
|
||||
//store translation options into members
|
||||
|
||||
if (StaticData::Instance().GetXmlInputType() != XmlIgnore) {
|
||||
for (size_t i=0; i<altTexts.size(); ++i) {
|
||||
//only store options if we aren't ignoring them
|
||||
//set default probability
|
||||
float probValue = 1;
|
||||
if (altProbs[i] != "") probValue = Scan<float>(altProbs[i]);
|
||||
//Convert from prob to log-prob
|
||||
float scoreValue = FloorScore(TransformScore(probValue));
|
||||
|
||||
WordsRange range(tagStart,tagEnd);
|
||||
TargetPhrase targetPhrase(Output);
|
||||
targetPhrase.CreateFromString(outputFactorOrder,altTexts[i],factorDelimiter);
|
||||
targetPhrase.SetScore(scoreValue);
|
||||
|
||||
XmlOption *option = new XmlOption(range,targetPhrase);
|
||||
assert(option);
|
||||
|
||||
if (isLinked) {
|
||||
//push all linked items as one column in our list of xmloptions
|
||||
linkedOptions.push_back(option);
|
||||
} else {
|
||||
//push one-item list (not linked to anything)
|
||||
std::vector<XmlOption*> optList(0);
|
||||
optList.push_back(option);
|
||||
res.push_back(optList);
|
||||
// check if unary (e.g., "<wall/>")
|
||||
bool isUnary = ( tag[tag.size() - 1] == '/' );
|
||||
|
||||
// check if opening tag (e.g. "<a>", not "</a>")
|
||||
bool isClosed = ( tag[0] == '/' );
|
||||
bool isOpen = !isClosed;
|
||||
|
||||
if (isClosed && isUnary)
|
||||
{
|
||||
TRACE_ERR("ERROR: can't have both closed and unary tag <" << tag << ">: " << line << endl);
|
||||
return false;
|
||||
}
|
||||
|
||||
if (isClosed)
|
||||
tag = tag.substr(1); // remove "/" at the beginning
|
||||
if (isUnary)
|
||||
tag = tag.substr(0,tag.size()-1); // remove "/" at the end
|
||||
|
||||
// find the tag name and contents
|
||||
string::size_type endOfName = tag.find_first_of(' ');
|
||||
string tagName = tag;
|
||||
string tagContent = "";
|
||||
if (endOfName != string::npos) {
|
||||
tagName = tag.substr(0,endOfName);
|
||||
tagContent = tag.substr(endOfName+1);
|
||||
}
|
||||
|
||||
// *** process new tag ***
|
||||
|
||||
if (isOpen || isUnary)
|
||||
{
|
||||
// special case: linked tag turns on linked flag
|
||||
if (tagName == "linked")
|
||||
{
|
||||
if (isLinked)
|
||||
{
|
||||
TRACE_ERR("ERROR: second linked tag opened before first one closed: " << line << endl);
|
||||
return false;
|
||||
}
|
||||
isLinked = true;
|
||||
}
|
||||
// put the tag on the tag stack
|
||||
OpenedTag openedTag = make_pair( tagName, make_pair( wordPos, tagContent ) );
|
||||
tagStack.push_back( openedTag );
|
||||
VERBOSE(3,"XML TAG " << tagName << " (" << tagContent << ") added to stack, now size " << tagStack.size() << endl);
|
||||
}
|
||||
|
||||
// *** process completed tag ***
|
||||
|
||||
if (isClosed || isUnary)
|
||||
{
|
||||
// pop last opened tag from stack;
|
||||
if (tagStack.size() == 0)
|
||||
{
|
||||
TRACE_ERR("ERROR: tag " << tagName << " closed, but not opened" << ":" << line << endl);
|
||||
return false;
|
||||
}
|
||||
OpenedTag openedTag = tagStack.back();
|
||||
tagStack.pop_back();
|
||||
|
||||
// tag names have to match
|
||||
if (openedTag.first != tagName)
|
||||
{
|
||||
TRACE_ERR("ERROR: tag " << openedTag.first << " closed by tag " << tagName << ": " << line << endl );
|
||||
return false;
|
||||
}
|
||||
|
||||
// assemble remaining information about tag
|
||||
size_t startPos = openedTag.second.first;
|
||||
string tagContent = openedTag.second.second;
|
||||
size_t endPos = wordPos;
|
||||
|
||||
// span attribute overwrites position
|
||||
string span = ParseXmlTagAttribute(tagContent,"span");
|
||||
if (! span.empty())
|
||||
{
|
||||
vector<string> ij = Tokenize(span, ",");
|
||||
if (ij.size() != 1 && ij.size() != 2) {
|
||||
TRACE_ERR("ERROR: span attribute must be of the form \"i,j\" or \"i\": " << line << endl);
|
||||
return false;
|
||||
}
|
||||
startPos = atoi(ij[0].c_str());
|
||||
if (ij.size() == 1) endPos = startPos;
|
||||
else endPos = atoi(ij[1].c_str()) + 1;
|
||||
}
|
||||
|
||||
VERBOSE(3,"XML TAG " << tagName << " (" << tagContent << ") spanning " << startPos << " to " << (endPos-1) << " complete, commence processing" << endl);
|
||||
// special tag: <linked>
|
||||
if (tagName == "linked")
|
||||
{
|
||||
isLinked = false;
|
||||
}
|
||||
|
||||
// special tag: wall
|
||||
if (tagName == "wall")
|
||||
{
|
||||
size_t start = (startPos == 0) ? 0 : startPos-1;
|
||||
for(size_t pos = start; pos < endPos; pos++)
|
||||
walls.push_back( pos );
|
||||
}
|
||||
|
||||
// special tag: zone
|
||||
else if (tagName == "zone")
|
||||
{
|
||||
if (startPos >= endPos)
|
||||
{
|
||||
TRACE_ERR("ERROR: zone must span at least one word: " << line << endl);
|
||||
return false;
|
||||
}
|
||||
reorderingConstraint.SetZone( startPos, endPos-1 );
|
||||
}
|
||||
|
||||
// default: opening tag that specifies translation options
|
||||
else
|
||||
{
|
||||
if (startPos >= endPos)
|
||||
{
|
||||
TRACE_ERR("ERROR: tag " << tagName << " must span at least one word: " << line << endl);
|
||||
return false;
|
||||
}
|
||||
|
||||
// specified translations -> vector of phrases
|
||||
// multiple translations may be specified, separated by "||"
|
||||
vector<string> altTexts = TokenizeMultiCharSeparator(ParseXmlTagAttribute(tagContent,"translation"), "||");
|
||||
if( altTexts.size() == 1 && altTexts[0] == "" )
|
||||
altTexts.pop_back(); // happens when nothing specified
|
||||
// deal with legacy annotations: "translation" was called "english"
|
||||
vector<string> moreAltTexts = TokenizeMultiCharSeparator(ParseXmlTagAttribute(tagContent,"english"), "||");
|
||||
if (moreAltTexts.size()>1 || moreAltTexts[0] != "")
|
||||
{
|
||||
for(vector<string>::iterator translation=moreAltTexts.begin();
|
||||
translation != moreAltTexts.end();
|
||||
translation++)
|
||||
{
|
||||
string t = *translation;
|
||||
altTexts.push_back( t );
|
||||
}
|
||||
}
|
||||
|
||||
// specified probabilities for the translations -> vector of probs
|
||||
vector<string> altProbs = TokenizeMultiCharSeparator(ParseXmlTagAttribute(tagContent,"prob"), "||");
|
||||
if( altProbs.size() == 1 && altProbs[0] == "" )
|
||||
altProbs.pop_back(); // happens when nothing specified
|
||||
|
||||
// report what we have processed so far
|
||||
VERBOSE(3,"XML TAG NAME IS: '" << tagName << "'" << endl);
|
||||
VERBOSE(3,"XML TAG TRANSLATION IS: '" << altTexts[0] << "'" << endl);
|
||||
VERBOSE(3,"XML TAG PROB IS: '" << altProbs[0] << "'" << endl);
|
||||
VERBOSE(3,"XML TAG SPAN IS: " << startPos << "-" << (endPos-1) << endl);
|
||||
if (altProbs.size() > 0 && altTexts.size() != altProbs.size()) {
|
||||
TRACE_ERR("ERROR: Unequal number of probabilities and translation alternatives: " << line << endl);
|
||||
return false;
|
||||
}
|
||||
|
||||
// store translation options into members
|
||||
if (StaticData::Instance().GetXmlInputType() != XmlIgnore) {
|
||||
// only store options if we aren't ignoring them
|
||||
for (size_t i=0; i<altTexts.size(); ++i) {
|
||||
// set default probability
|
||||
float probValue = 1;
|
||||
if (altProbs.size() > 0) probValue = Scan<float>(altProbs[i]);
|
||||
// convert from prob to log-prob
|
||||
float scoreValue = FloorScore(TransformScore(probValue));
|
||||
|
||||
WordsRange range(startPos,endPos-1); // span covered by phrase
|
||||
TargetPhrase targetPhrase(Output);
|
||||
targetPhrase.CreateFromString(outputFactorOrder,altTexts[i],factorDelimiter);
|
||||
targetPhrase.SetScore(scoreValue);
|
||||
|
||||
XmlOption *option = new XmlOption(range,targetPhrase);
|
||||
assert(option);
|
||||
|
||||
if (isLinked)
|
||||
{
|
||||
// push all linked items as one column in our list of xmloptions
|
||||
linkedOptions.push_back(option);
|
||||
}
|
||||
else
|
||||
{
|
||||
// push one-item list (not linked to anything)
|
||||
vector<XmlOption*> optList(0);
|
||||
optList.push_back(option);
|
||||
res.push_back(optList);
|
||||
}
|
||||
}
|
||||
altTexts.clear();
|
||||
altProbs.clear();
|
||||
}
|
||||
}
|
||||
tagName= "";
|
||||
tagContents = "";
|
||||
altTexts.clear();
|
||||
altProbs.clear();
|
||||
doClose = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
line = rstr;
|
||||
// we are done. check if there are tags that are still open
|
||||
if (tagStack.size() > 0)
|
||||
{
|
||||
TRACE_ERR("ERROR: some opened tags were never closed: " << line << endl);
|
||||
return false;
|
||||
}
|
||||
|
||||
// return de-xml'ed sentence in line
|
||||
line = cleanLine;
|
||||
return true;
|
||||
}
|
||||
|
||||
|
@ -4,6 +4,7 @@
|
||||
#include <string>
|
||||
#include "WordsRange.h"
|
||||
#include "TargetPhrase.h"
|
||||
#include "ReorderingConstraint.h"
|
||||
|
||||
namespace Moses
|
||||
{
|
||||
@ -22,7 +23,7 @@ struct XmlOption {
|
||||
|
||||
};
|
||||
|
||||
bool ProcessAndStripXMLTags(std::string &line,std::vector<std::vector<XmlOption*> > &res);
|
||||
bool ProcessAndStripXMLTags(std::string &line,std::vector<std::vector<XmlOption*> > &res, ReorderingConstraint &reorderingConstraint, std::vector< size_t > &walls );
|
||||
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user