initial version of reordering zones and walls, may work

git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@1960 1f5c12ca-751b-0410-a591-d2e778427230
This commit is contained in:
phkoehn 2008-12-15 12:52:38 +00:00
parent e13e45dc63
commit a360b71426
13 changed files with 599 additions and 268 deletions
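For orientation, a hedged sketch of the kind of marked-up input this change is meant to handle: the tag names (wall, zone, linked) and attributes (translation, english, prob, span) are the ones parsed below, while the sentence itself is only illustrative.

er sagte <wall/> dass <zone> das <np translation="house" prob="0.8"> Haus </np> klein ist </zone> .

A <wall/> blocks reordering across its position, a <zone> may be reordered internally but has to be completed before translation continues outside of it, and walls inside zones become local walls that only apply within that zone. With -monotone-at-punctuation, walls are additionally placed around punctuation marks such as "," and ".".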

View File

@ -26,6 +26,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "TypeDef.h"
#include "Phrase.h"
#include "TargetPhraseCollection.h"
#include "ReorderingConstraint.h"
namespace Moses
{
@ -42,6 +43,7 @@ protected:
long m_translationId; //< contiguous Id
bool m_hasMetaData;
long m_segId;
ReorderingConstraint m_reorderingConstraint; /**< limits on reordering specified either by "-mp" switch or xml tags */
public:
@ -112,6 +114,12 @@ public:
//! return substring at a particular position. Only valid for Sentence class. TODO - get rid of this fn
virtual const Word& GetWord(size_t pos) const=0;
//! Returns the reordering constraints
const ReorderingConstraint& GetReorderingConstraint() const
{
return m_reorderingConstraint;
};
TO_STRING();
};

View File

@ -22,16 +22,59 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "ReorderingConstraint.h"
#include "InputType.h"
#include "Word.h"
#include "StaticData.h"
namespace Moses
{
void ReorderingConstraint::SetWall( const InputType& sentence )
//! allocate memory for reordering walls
void ReorderingConstraint::InitializeWalls(size_t size)
{
m_size = size;
m_wall = (bool*) malloc(sizeof(bool) * size);
m_localWall = (bool*) malloc(sizeof(bool) * size);
for (size_t pos = 0 ; pos < m_size ; pos++)
{
m_wall[pos] = false;
m_localWall[pos] = false;
}
}
//! set value at a particular position
void ReorderingConstraint::SetWall( size_t pos, bool value )
{
VERBOSE(3,"SETTING reordering wall at position " << pos << std::endl);
m_wall[pos] = value;
m_active = true;
}
//! has to be called to localize walls inside zones
void ReorderingConstraint::FinalizeWalls()
{
for(size_t z = 0; z < m_zone.size(); z++ )
{
const size_t startZone = m_zone[z][0];
const size_t endZone = m_zone[z][1];// note: wall after endZone is not local
for( size_t pos = startZone; pos < endZone; pos++ )
{
if (m_wall[ pos ])
{
m_localWall[ pos ] = true;
m_wall[ pos ] = false;
VERBOSE(3,"SETTING local wall " << pos << std::endl);
}
}
}
}
//! set walls based on "-monotone-at-punctuation" flag
void ReorderingConstraint::SetMonotoneAtPunctuation( const Phrase &sentence )
{
for( size_t i=0; i<sentence.GetSize(); i++ )
{
const Word& word = sentence.GetWord( i );
const Word& word = sentence.GetWord(i);
if (word[0]->GetString() == "," ||
word[0]->GetString() == "." ||
word[0]->GetString() == "!" ||
@ -40,22 +83,161 @@ void ReorderingConstraint::SetWall( const InputType& sentence )
word[0]->GetString() == ";" ||
word[0]->GetString() == "\"")
{
// std::cerr << "SETTING reordering wall at position " << i << std::endl;
SetValue( i, true );
// set wall before and after punc, but not at sentence start, end
if (i>0 && i<m_size-1) SetWall( i, true );
if (i>1) SetWall( i-1, true );
}
}
}
bool ReorderingConstraint::ContainsWall( size_t start, size_t end ) const
//! set a reordering zone (once entered, need to finish)
void ReorderingConstraint::SetZone( size_t startPos, size_t endPos )
{
for( size_t i=start; i<=end; i++ )
VERBOSE(3,"SETTING zone " << startPos << "-" << endPos << std::endl);
std::vector< size_t > newZone;
newZone.push_back( startPos );
newZone.push_back( endPos );
m_zone.push_back( newZone );
m_active = true;
}
//! check if the current hypothesis extension violates reordering constraints
bool ReorderingConstraint::Check( const WordsBitmap &bitmap, size_t startPos, size_t endPos ) const
{
// nothing to be checked, we are done
if (! IsActive() ) return true;
VERBOSE(3,"CHECK " << bitmap << " " << startPos << "-" << endPos);
// check walls
size_t firstGapPos = bitmap.GetFirstGapPos();
// filling first gap -> no wall violation possible
if (firstGapPos != startPos)
{
if ( GetWall( i ) ) {
// std::cerr << "HITTING reordering wall at position " << i << std::endl;
return true;
// if there is a wall between the first gap and the end of this phrase,
// we would be jumping across the wall while leaving a gap
// -> violation
for( size_t pos = firstGapPos; pos < endPos; pos++ )
{
if( GetWall( pos ) )
{
VERBOSE(3," hitting wall " << pos << std::endl);
return false;
}
}
}
return false;
// monotone -> no violation possible
size_t lastPos = bitmap.GetLastPos();
if ((lastPos == NOT_FOUND && startPos == 0) ||
(firstGapPos > lastPos && firstGapPos == startPos))
{
VERBOSE(3," montone, fine." << std::endl);
return true;
}
// check zones
for(size_t z = 0; z < m_zone.size(); z++ )
{
const size_t startZone = m_zone[z][0];
const size_t endZone = m_zone[z][1];
// fine, if translation has not reached zone yet and phrase outside zone
if (lastPos < startZone && ( endPos < startZone || startPos > endZone ) ) {
continue;
}
// already completely translated zone, no violations possible
if (firstGapPos > endZone)
{
continue;
}
// some words are translated beyond the start of the zone
// let's look closer if some are in the zone
size_t numWordsInZoneTranslated = 0;
if (lastPos >= startZone)
{
for(size_t pos = startZone; pos <= endZone; pos++ )
{
if( bitmap.GetValue( pos ) )
{
numWordsInZoneTranslated++;
}
}
}
// all words in zone translated, no violation possible
if (numWordsInZoneTranslated == endZone-startZone+1)
{
continue;
}
// flag if this is an active zone
bool activeZone = (numWordsInZoneTranslated > 0);
// fine, if zone completely untranslated and phrase outside zone
if (!activeZone && ( endPos < startZone || startPos > endZone ) ) {
continue;
}
// violation, if phrase completely outside active zone
if (activeZone && ( endPos < startZone || startPos > endZone ) ) {
VERBOSE(3," outside active zone" << std::endl);
return false;
}
// ok, this is what we know now:
// * the phrase is in the zone (at least partially)
// * either zone is already active, or it becomes active now
// let us check on phrases that are partially outside
// phrase overlaps at the beginning, always ok
if (startPos <= startZone)
{
continue;
}
// phrase goes beyond end, has to fill zone completely
if (endPos > endZone)
{
if (endZone-startPos+1 < // num. words filled in by phrase
endZone-startZone+1-numWordsInZoneTranslated) // num. untranslated
{
VERBOSE(3," overlap end, but not completing" << std::endl);
return false;
}
else
{
continue;
}
}
// now we are down to phrases that are completely inside the zone
// we have to check local walls
bool seenUntranslatedBeforeStartPos = false;
for(size_t pos = startZone; pos < endZone && pos < endPos; pos++ )
{
// be careful when there is a gap before phrase
if( !bitmap.GetValue( pos ) // untranslated word
&& pos < startPos ) // before startPos
{
seenUntranslatedBeforeStartPos = true;
}
if( seenUntranslatedBeforeStartPos && GetLocalWall( pos ) )
{
VERBOSE(3," local wall violation" << std::endl);
return false;
}
}
// passed all checks for this zone, on to the next one
}
// passed all checks, no violations
VERBOSE(3," fine." << std::endl);
return true;
}
}
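As a usage note (not part of the commit): a minimal sketch of how the new calls fit together, mirroring the sequence used in Sentence::Read and the search classes below; the WordsBitmap constructor and SetValue are assumed to behave as elsewhere in the decoder.

#include "ReorderingConstraint.h"
#include "WordsBitmap.h"

using namespace Moses;

void SketchReorderingConstraint()
{
  ReorderingConstraint constraint;
  constraint.InitializeWalls( 10 );    // 10-word sentence, all wall flags false
  constraint.SetWall( 4, true );       // wall at position 4
  constraint.SetZone( 6, 8 );          // words 6-8 form a reordering zone
  constraint.FinalizeWalls();          // walls inside zones become local walls

  WordsBitmap coverage( 10 );          // assumed: empty coverage bitmap
  for (size_t pos = 0; pos <= 2; pos++)
    coverage.SetValue( pos, true );    // words 0-2 already translated

  constraint.Check( coverage, 3, 4 );  // monotone continuation -> passes
  constraint.Check( coverage, 5, 5 );  // jumps the wall at 4, leaving a gap -> fails

  for (size_t pos = 3; pos <= 5; pos++)
    coverage.SetValue( pos, true );    // cover everything up to the zone
  coverage.SetValue( 7, true );        // enter the zone by translating word 7

  constraint.Check( coverage, 9, 9 );  // zone 6-8 entered but not finished -> fails
  constraint.Check( coverage, 6, 6 );  // stays inside the active zone -> passes
}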

View File

@ -29,6 +29,8 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include <cstring>
#include <cmath>
#include "TypeDef.h"
#include "Word.h"
#include "Phrase.h"
namespace Moses
{
@ -42,46 +44,52 @@ class ReorderingConstraint
protected:
// const size_t m_size; /**< number of words in sentence */
size_t m_size; /**< number of words in sentence */
bool *m_bitmap; /**< flag for each word if it is a wall */
bool *m_wall; /**< flag for each word if it is a wall */
bool *m_localWall; /**< flag for each word if it is a local wall */
std::vector< std::vector< size_t > > m_zone; /**< zones that limit reordering */
bool m_active; /**< flag indicating, if there are any active constraints */
public:
//! create ReorderingConstraint of length size and initialise to zero
ReorderingConstraint(size_t size)
:m_size (size)
{
m_bitmap = (bool*) malloc(sizeof(bool) * size);
ReorderingConstraint() :m_wall(NULL),m_localWall(NULL),m_active(false) {}
for (size_t pos = 0 ; pos < m_size ; pos++)
{
m_bitmap[pos] = false;
}
//! destructor
~ReorderingConstraint()
{
if (m_wall != NULL) free(m_wall);
if (m_localWall != NULL) free(m_localWall);
}
~ReorderingConstraint()
{
free(m_bitmap);
}
//! allocate memory for walls for a sentence of a given size
void InitializeWalls(size_t size);
//! whether a word has been translated at a particular position
bool GetWall(size_t pos) const
{
return m_bitmap[pos];
}
//! changes walls in zones into local walls
void FinalizeWalls();
//! set value at a particular position
void SetValue( size_t pos, bool value )
{
m_bitmap[pos] = value;
}
void SetWall( size_t pos, bool value );
//! set the reordering wall based on the words in the sentence
void SetWall( const InputType& sentence );
//! whether there is a wall at a particular position
bool GetWall(size_t pos) const { return m_wall[pos]; }
//! checks if there is a wall in the interval [start,end]
bool ContainsWall( size_t start, size_t end ) const;
//! whether there is a local wall at a particular position
bool GetLocalWall(size_t pos) const { return m_localWall[pos]; }
//! set a zone
void SetZone( size_t startPos, size_t endPos );
//! returns the vector of zones
std::vector< std::vector< size_t > > & GetZones() { return m_zone; }
//! set the reordering walls based on punctuation in the sentence
void SetMonotoneAtPunctuation( const Phrase & sentence );
//! check if all constraints are fulfilled -> all fine
bool Check( const WordsBitmap &bitmap, size_t start, size_t end ) const;
//! checks if reordering constraints will be enforced
bool IsActive() const { return m_active; }
};
}

View File

@ -62,13 +62,6 @@ SearchCubePruning::SearchCubePruning(const InputType &source, const TranslationO
m_hypoStackColl[ind] = sourceHypoColl;
}
// set additional reordering constraints, if specified
if (staticData.UseReorderingConstraint())
{
m_reorderingConstraint = new ReorderingConstraint( m_source.GetSize() );
m_reorderingConstraint->SetWall( m_source );
}
}
SearchCubePruning::~SearchCubePruning()
@ -239,24 +232,24 @@ bool SearchCubePruning::CheckDistortion(const WordsBitmap &hypoBitmap, const Wor
// since we check for reordering limits, its good to have that limit handy
int maxDistortion = StaticData::Instance().GetMaxDistortion();
// no limit of reordering: no prob
// if there are reordering limits, make sure it is not violated
// the coverage bitmap is handy here (and the position of the first gap)
const size_t hypoFirstGapPos = hypoBitmap.GetFirstGapPos()
, startPos = range.GetStartPos()
, endPos = range.GetEndPos();
// if reordering constraints are used (--monotone-at-punctuation or xml), check that this extension satisfies them all
if (! m_source.GetReorderingConstraint().Check( hypoBitmap, startPos, endPos ) )
{
return false;
}
// no limit of reordering: no problem
if (maxDistortion < 0)
{
return true;
}
// if there are reordering limits, make sure it is not violated
// the coverage bitmap is handy here (and the position of the first gap)
const size_t hypoFirstGapPos = hypoBitmap.GetFirstGapPos()
, sourceSize = m_source.GetSize()
, startPos = range.GetStartPos()
, endPos = range.GetEndPos();
// MAIN LOOP. go through each possible hypo
size_t maxSize = sourceSize - startPos;
size_t maxSizePhrase = StaticData::Instance().GetMaxPhraseLength();
maxSize = std::min(maxSize, maxSizePhrase);
bool leftMostEdge = (hypoFirstGapPos == startPos);
// any length extension is okay if starting at left-most edge
if (leftMostEdge)
@ -264,37 +257,21 @@ bool SearchCubePruning::CheckDistortion(const WordsBitmap &hypoBitmap, const Wor
return true;
}
// starting somewhere other than left-most edge, use caution
else
{
// the basic idea is this: we would like to translate a phrase starting
// from a position further right than the left-most open gap. The
// distortion penalty for the following phrase will be computed relative
// to the ending position of the current extension, so we ask now what
// its maximum value will be (which will always be the value of the
// hypothesis starting at the left-most edge). If this value is greater than
// the distortion limit, we don't allow this extension to be made.
WordsRange bestNextExtension(hypoFirstGapPos, hypoFirstGapPos);
int required_distortion =
m_source.ComputeDistortionDistance(range, bestNextExtension);
if (required_distortion > maxDistortion) {
return false;
}
// if reordering walls are used (--monotone-at-punctuation), check here if
// there is a wall between the beginning of the gap and the end
// of this new phrase (jumping the wall).
if ( StaticData::Instance().UseReorderingConstraint() ) {
if ( m_reorderingConstraint->ContainsWall( hypoFirstGapPos, endPos ) )
return false;
}
return true;
// the basic idea is this: we would like to translate a phrase starting
// from a position further right than the left-most open gap. The
// distortion penalty for the following phrase will be computed relative
// to the ending position of the current extension, so we ask now what
// its maximum value will be (which will always be the value of the
// hypothesis starting at the left-most edge). If this value is greater than
// the distortion limit, we don't allow this extension to be made.
WordsRange bestNextExtension(hypoFirstGapPos, hypoFirstGapPos);
int required_distortion =
m_source.ComputeDistortionDistance(range, bestNextExtension);
if (required_distortion > maxDistortion) {
return false;
}
return false;
return true;
}
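To make the comment above concrete (a hedged example, assuming ComputeDistortionDistance returns the absolute jump |end(current range) - start(next range) + 1|, as elsewhere in Moses): if the left-most gap sits at position 2 and the candidate phrase covers positions 8-9, the cheapest follow-up extension starts back at position 2, so required_distortion = |9 - 2 + 1| = 8; with a distortion limit of 6 the extension is rejected here rather than producing a hypothesis that can never be completed within the limit.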
/**
@ -332,7 +309,6 @@ void SearchCubePruning::PrintBitmapContainerGraph()
{
cerr << iterAccessor->first << endl;
BitmapContainer &container = *iterAccessor->second;
}
}

View File

@ -4,7 +4,6 @@
#include <vector>
#include "Search.h"
#include "HypothesisStackCubePruning.h"
#include "ReorderingConstraint.h"
namespace Moses
{
@ -21,7 +20,6 @@ protected:
TargetPhrase m_initialTargetPhrase; /**< used to seed 1st hypo */
clock_t m_start; /**< used to track time spend on translation */
const TranslationOptionCollection &m_transOptColl; /**< pre-computed list of translation options for the phrases in this sentence */
ReorderingConstraint *m_reorderingConstraint; /**< positions in input sentence over which no reordering is allowed */
//! go thru all bitmaps in 1 stack & create backpointers to bitmaps in the stack
void CreateForwardTodos(HypothesisStackCubePruning &stack);

View File

@ -34,13 +34,6 @@ SearchNormal::SearchNormal(const InputType &source, const TranslationOptionColle
m_hypoStackColl[ind] = sourceHypoColl;
}
// set additional reordering constraints, if specified
if (staticData.UseReorderingConstraint())
{
m_reorderingConstraint = new ReorderingConstraint( m_source.GetSize() );
m_reorderingConstraint->SetWall( m_source );
}
}
SearchNormal::~SearchNormal()
@ -129,7 +122,10 @@ void SearchNormal::ProcessOneHypothesis(const Hypothesis &hypothesis)
for (size_t endPos = startPos ; endPos < startPos + maxSize ; ++endPos)
{
if (!hypoBitmap.Overlap(WordsRange(startPos, endPos)))
if (!hypoBitmap.Overlap(WordsRange(startPos, endPos)) || !m_source.GetReorderingConstraint().Check( hypoBitmap, startPos, endPos ) )
{
continue;
}
{
//TODO: does this method include incompatible WordLattice hypotheses?
ExpandAllHypotheses(hypothesis
@ -147,7 +143,7 @@ void SearchNormal::ProcessOneHypothesis(const Hypothesis &hypothesis)
const size_t hypoFirstGapPos = hypoBitmap.GetFirstGapPos()
, sourceSize = m_source.GetSize();
// MAIN LOOP. go through each possible hypo
// MAIN LOOP. go through each possible range
for (size_t startPos = hypoFirstGapPos ; startPos < sourceSize ; ++startPos)
{
size_t maxSize = sourceSize - startPos;
@ -178,6 +174,12 @@ void SearchNormal::ProcessOneHypothesis(const Hypothesis &hypothesis)
for (size_t endPos = startPos ; endPos < startPos + maxSize ; ++endPos)
{
// check if passes specified reordering constraints
// (set with -monotone-at-punctuation or xml)
if (!m_source.GetReorderingConstraint().Check( hypoBitmap, startPos, endPos ) )
{
continue;
}
// check for overlap
WordsRange extRange(startPos, endPos);
#ifdef DEBUGLATTICE
@ -268,14 +270,8 @@ void SearchNormal::ProcessOneHypothesis(const Hypothesis &hypothesis)
continue;
}
// if reordering walls are used (--monotone-at-punctuation), check here if
// there is a wall between the beginning of the gap and the end
// of this new phrase (jumping the wall).
if ( StaticData::Instance().UseReorderingConstraint() ) {
if ( m_reorderingConstraint->ContainsWall( hypoFirstGapPos, endPos ) )
continue;
}
// everything is fine, we're good to go
ExpandAllHypotheses(hypothesis
,m_transOptColl.GetTranslationOptionList(extRange));

View File

@ -5,7 +5,6 @@
#include "Search.h"
#include "HypothesisStackNormal.h"
#include "TranslationOptionCollection.h"
#include "ReorderingConstraint.h"
#include "Timer.h"
namespace Moses
@ -25,8 +24,6 @@ protected:
size_t interrupted_flag;
HypothesisStackNormal* actual_hypoStack; /**actual (full expanded) stack of hypotheses*/
const TranslationOptionCollection &m_transOptColl; /**< pre-computed list of translation options for the phrases in this sentence */
ReorderingConstraint *m_reorderingConstraint; /**< positions in input sentence over which no reordering is allowed */
// functions for creating hypotheses
void ProcessOneHypothesis(const Hypothesis &hypothesis);

View File

@ -36,16 +36,19 @@ int Sentence::Read(std::istream& in,const std::vector<FactorType>& factorOrder)
if (getline(in, line, '\n').eof())
return 0;
// remove extra spaces
line = Trim(line);
meta = ProcessAndStripSGML(line);
// if the sentence is specified as "<seg id=1> ... </seg>", extract the id
meta = ProcessAndStripSGML(line);
if (meta.find("id") != meta.end()) { this->SetTranslationId(atol(meta["id"].c_str())); }
//parse XML markup in translation line
// parse XML markup in translation line
const StaticData &staticData = StaticData::Instance();
std::vector<std::vector<XmlOption*> > xmlOptionsList(0);
std::vector< size_t > xmlWalls;
if (staticData.GetXmlInputType() != XmlPassThrough) {
if (!ProcessAndStripXMLTags(line, xmlOptionsList)) {
if (!ProcessAndStripXMLTags(line, xmlOptionsList, m_reorderingConstraint, xmlWalls )) {
TRACE_ERR("Unable to parse XML in line " << line);
abort();
}
@ -107,6 +110,21 @@ int Sentence::Read(std::istream& in,const std::vector<FactorType>& factorOrder)
}
}
m_reorderingConstraint.InitializeWalls( GetSize() );
// set reordering walls, if "-monotone-at-punctuation" is set
if (staticData.UseReorderingConstraint())
{
m_reorderingConstraint.SetMonotoneAtPunctuation( GetSubString( WordsRange(0,GetSize()-1 ) ) );
}
// set walls obtained from xml
for(size_t i=0; i<xmlWalls.size(); i++)
if( xmlWalls[i] < GetSize() ) // no buggy walls, please
m_reorderingConstraint.SetWall( xmlWalls[i], true );
m_reorderingConstraint.FinalizeWalls();
return 1;
}

View File

@ -86,7 +86,6 @@ class Sentence : public Phrase, public InputType
//! populates vector argument with XML force translation options for the specific range passed
void GetXmlTranslationOptions(std::vector <TranslationOption*> &list, size_t startPos, size_t endPos) const;
int Read(std::istream& in,const std::vector<FactorType>& factorOrder);
void Print(std::ostream& out) const;

View File

@ -543,7 +543,7 @@ void TranslationOptionCollection::CreateTranslationOptionsForRange(
*/
bool TranslationOptionCollection::HasXmlOptionsOverlappingRange(size_t, size_t) const {
return false;
//not implemented for base class
}
/** Populates the current Collection with XML options exactly covering the range specified. Default implementation does nothing.

View File

@ -196,7 +196,6 @@ public:
//! converts bitmap into an integer ID: it consists of two parts: the first 16 bit are the pattern between the first gap and the last word-1, the second 16 bit are the number of filled positions. enforces a sentence length limit of 65535 and a max distortion of 16
WordsBitmapID GetID() const {
std::cerr << "GetID()\n";
assert(m_size < (1<<16));
size_t start = GetFirstGapPos();

View File

@ -32,225 +32,374 @@
namespace Moses
{
std::string ParseXmlTagAttribute(const std::string& tag,const std::string& attributeName){
string ParseXmlTagAttribute(const string& tag,const string& attributeName){
/*TODO deal with unescaping \"*/
string tagOpen = attributeName + "=\"";
size_t contentsStart = tag.find(tagOpen);
if (contentsStart == std::string::npos) return "";
if (contentsStart == string::npos) return "";
contentsStart += tagOpen.size();
size_t contentsEnd = tag.find_first_of('"',contentsStart+1);
if (contentsEnd == std::string::npos) {
if (contentsEnd == string::npos) {
TRACE_ERR("Malformed XML attribute: "<< tag);
return "";
}
size_t possibleEnd;
while (tag.at(contentsEnd-1) == '\\' && (possibleEnd = tag.find_first_of('"',contentsEnd+1)) != std::string::npos) {
while (tag.at(contentsEnd-1) == '\\' && (possibleEnd = tag.find_first_of('"',contentsEnd+1)) != string::npos) {
contentsEnd = possibleEnd;
}
return tag.substr(contentsStart,contentsEnd-contentsStart);
}
std::string TrimXml(const std::string& str) {
/**
* Remove "<" and ">" from XML tag
*
* \param str xml token to be stripped
*/
string TrimXml(const string& str)
{
// too short to be xml token -> do nothing
if (str.size() < 2) return str;
if (str[0] == '<' && str[str.size() - 1] == '>') {
// strip first and last character
if (str[0] == '<' && str[str.size() - 1] == '>')
{
return str.substr(1, str.size() - 2);
} else { return str; }
}
// not an xml token -> do nothing
else { return str; }
}
bool isXmlTag(const std::string& tag)
/**
* Check if the token is an XML tag, i.e. starts with "<"
*
* \param tag token to be checked
*/
bool isXmlTag(const string& tag)
{
return tag[0] == '<';
}
inline std::vector<std::string> TokenizeXml(const std::string& str)
/**
* Split up the input character string into tokens made up of
* either XML tags or text.
* example: this <b> is a </b> test .
* => (this ), (<b>), ( is a ), (</b>), ( test .)
*
* \param str input string
*/
inline vector<string> TokenizeXml(const string& str)
{
std::string lbrack = "<";
std::string rbrack = ">";
std::vector<std::string> tokens;
// Find first "non-delimiter".
std::string::size_type cpos = 0;
std::string::size_type lpos = 0;
std::string::size_type rpos = 0;
string lbrack = "<";
string rbrack = ">";
vector<string> tokens; // vector of tokens to be returned
string::size_type cpos = 0; // current position in string
string::size_type lpos = 0; // left start of xml tag
string::size_type rpos = 0; // right end of xml tag
while (cpos != str.size()) {
// walk through the string (loop over cpos)
while (cpos != str.size())
{
// find the next opening "<" of an xml tag
lpos = str.find_first_of(lbrack, cpos);
if (lpos != std::string::npos) {
if (lpos != string::npos)
{
// find the end of the xml tag
rpos = str.find_first_of(rbrack, lpos);
if (rpos == std::string::npos) {
// sanity check: there has to be a closing ">"
if (rpos == string::npos)
{
TRACE_ERR("ERROR: malformed XML: " << str << endl);
return tokens;
}
} else {
}
else // no more tags found
{
// add the rest as token
tokens.push_back(str.substr(cpos));
break;
}
// add stuff before xml tag as token, if there is any
if (lpos - cpos > 0)
tokens.push_back(str.substr(cpos, lpos - cpos));
// add xml tag as token
tokens.push_back(str.substr(lpos, rpos-lpos+1));
cpos = rpos + 1;
}
return tokens;
}
/**
* Process a sentence with xml annotation
* Xml tags may specify additional/replacing translation options
* and reordering constraints
*
* \param line in: sentence, out: sentence without the xml
* \param res vector with translation options specified by xml
* \param reorderingConstraint reordering constraint zones specified by xml
* \param walls reordering constraint walls specified by xml
*/
/*TODO: we'd only have to return a vector of XML options if we dropped linking. 2-d vector
is so we can link things up afterwards. We can't create TranslationOptions as we
parse because we don't have the completed source parsed until after this function
removes all the markup from it (CreateFromString in Sentence::Read).
*/
bool ProcessAndStripXMLTags(std::string &line, std::vector<std::vector<XmlOption*> > &res) {
bool ProcessAndStripXMLTags(string &line, vector<vector<XmlOption*> > &res, ReorderingConstraint &reorderingConstraint, vector< size_t > &walls ) {
//parse XML markup in translation line
if (line.find_first_of('<') == std::string::npos) { return true; }
// no xml tag? we're done.
if (line.find_first_of('<') == string::npos) { return true; }
std::string rstr;
std::vector<std::string> xmlTokens = TokenizeXml(line);
std::string tagName = "";
std::string tagContents = "";
std::vector<std::string> altTexts;
std::vector<std::string> altProbs;
std::vector<XmlOption*> linkedOptions;
size_t tagStart=0;
size_t tagEnd=0;
size_t curWord=0;
int numUnary = 0;
bool doClose = false;
bool isLinked = false;
const std::vector<FactorType> &outputFactorOrder = StaticData::Instance().GetOutputFactorOrder();
const std::string &factorDelimiter = StaticData::Instance().GetFactorDelimiter();
// break up input into a vector of xml tags and text
// example: (this), (<b>), (is a), (</b>), (test .)
vector<string> xmlTokens = TokenizeXml(line);
// we need to store opened tags, until they are closed
// tags are stored as triples (tagname, startpos, contents)
typedef pair< string, pair< size_t, string > > OpenedTag;
vector< OpenedTag > tagStack; // stack that contains active opened tags
string cleanLine; // return string (text without xml)
vector<XmlOption*> linkedOptions;
size_t wordPos = 0; // position in sentence (in terms of number of words)
bool isLinked = false;
const vector<FactorType> &outputFactorOrder = StaticData::Instance().GetOutputFactorOrder();
const string &factorDelimiter = StaticData::Instance().GetFactorDelimiter();
// loop through the tokens
for (size_t xmlTokenPos = 0 ; xmlTokenPos < xmlTokens.size() ; xmlTokenPos++)
{
// not a xml tag, but regular text (may contain many words)
if(!isXmlTag(xmlTokens[xmlTokenPos]))
{
//phrase, not tag. token may contain many words
rstr += xmlTokens[xmlTokenPos];
curWord = Tokenize(rstr).size();
// add a space at boundary, if necessary
if (cleanLine.size()>0 &&
cleanLine[cleanLine.size() - 1] != ' ' &&
xmlTokens[xmlTokenPos][0] != ' ')
{
cleanLine += " ";
}
cleanLine += xmlTokens[xmlTokenPos]; // add to output
wordPos = Tokenize(cleanLine).size(); // count all the words
}
// process xml tag
else
{
//tag data
std::string tag = Trim(TrimXml(xmlTokens[xmlTokenPos]));
// *** get essential information about tag ***
// strip extra boundary spaces and "<" and ">"
string tag = Trim(TrimXml(xmlTokens[xmlTokenPos]));
VERBOSE(3,"XML TAG IS: " << tag << std::endl);
std::string::size_type endOfName = tag.find_first_of(' ');
std::string nextTagName = tag;
bool isUnary = tag[tag.size() - 1] == '/';
bool isOpen = tag[0] != '/';
if (endOfName != std::string::npos) {
nextTagName = tag.substr(0,endOfName);
tagContents = tag.substr(endOfName+1);
}
if (nextTagName == "linked") {
//just set a flag, don't try to process
if (tagName != "") {
TRACE_ERR("ERROR: tried to start linked XML tag while \""<< tagName <<"\" tag still open: " << line << endl);
return false;
}
if (linkedOptions.size()>0) {
TRACE_ERR("ERROR: tried to start second linked XML tag while linked still open: " << line << endl);
return false;
}
isLinked = true;
isOpen = false;
}
else if (nextTagName == "/linked") {
isLinked = false;
//can't be in an open tag when we stop linking
if (tagName != "") {
TRACE_ERR("ERROR: tried to close linked XML tag while \""<< tagName <<"\" tag still open: " << line << endl);
return false;
}
res.push_back(linkedOptions);
linkedOptions.clear();
}
else if (isOpen)
if (tag.size() == 0)
{
//this is an open tag
tagName = nextTagName;
altTexts = TokenizeMultiCharSeparator(ParseXmlTagAttribute(tagContents,"english"), "||");
altProbs = TokenizeMultiCharSeparator(ParseXmlTagAttribute(tagContents,"prob"), "||");
std::string span = ParseXmlTagAttribute(tagContents,"span");
tagStart = curWord;
if (isUnary) {
numUnary++;
if (span.empty()) {
TRACE_ERR("ERROR: unary tags must have a span attribute: " << line << endl);
return false;
}
std::vector<std::string> ij = Tokenize(span, ",");
if (ij.size() != 2) {
TRACE_ERR("ERROR: span tag must be of the form \"i,j\": " << line << endl);
return false;
}
tagStart = atoi(ij[0].c_str());
tagEnd = atoi(ij[1].c_str());
if (tagEnd < tagStart) {
TRACE_ERR("ERROR: span tag " << span << " invalid" << endl);
return false;
}
doClose = true;
VERBOSE(3,"XML TAG IS UNARY" << endl);
}
VERBOSE(3,"XML TAG NAME IS: '" << tagName << "'" << endl);
VERBOSE(3,"XML TAG ENGLISH IS: '" << altTexts[0] << "'" << endl);
VERBOSE(3,"XML TAG PROB IS: '" << altProbs[0] << "'" << endl);
VERBOSE(3,"XML TAG STARTS AT WORD: " << tagStart << endl);
if (altTexts.size() != altProbs.size()) {
TRACE_ERR("ERROR: Unequal number of probabilities and translation alternatives: " << line << endl);
return false;
}
}
else if ((nextTagName.size() == 0) || (nextTagName.at(0) != '/') || (nextTagName.substr(1) != tagName))
{
//mismatched tag, abort!
TRACE_ERR("ERROR: tried to parse malformed XML with xml-input enabled: " << line << endl);
TRACE_ERR("ERROR: empty tag name: " << line << endl);
return false;
}
else {
doClose = true;
tagEnd = curWord-1; //size is inclusive
}
if (doClose) {
VERBOSE(3,"XML END TAG IS: " << nextTagName.substr(1) << endl);
VERBOSE(3,"XML TAG ENDS AT WORD: " << tagEnd << endl);
//store translation options into members
if (StaticData::Instance().GetXmlInputType() != XmlIgnore) {
for (size_t i=0; i<altTexts.size(); ++i) {
//only store options if we aren't ignoring them
//set default probability
float probValue = 1;
if (altProbs[i] != "") probValue = Scan<float>(altProbs[i]);
//Convert from prob to log-prob
float scoreValue = FloorScore(TransformScore(probValue));
WordsRange range(tagStart,tagEnd);
TargetPhrase targetPhrase(Output);
targetPhrase.CreateFromString(outputFactorOrder,altTexts[i],factorDelimiter);
targetPhrase.SetScore(scoreValue);
XmlOption *option = new XmlOption(range,targetPhrase);
assert(option);
if (isLinked) {
//puch all linked items as one column in our list of xmloptions
linkedOptions.push_back(option);
} else {
//push one-item list (not linked to anything)
std::vector<XmlOption*> optList(0);
optList.push_back(option);
res.push_back(optList);
// check if unary (e.g., "<wall/>")
bool isUnary = ( tag[tag.size() - 1] == '/' );
// check if opening tag (e.g. "<a>", not "</a>")g
bool isClosed = ( tag[0] == '/' );
bool isOpen = !isClosed;
if (isClosed && isUnary)
{
TRACE_ERR("ERROR: can't have both closed and unary tag <" << tag << ">: " << line << endl);
return false;
}
if (isClosed)
tag = tag.substr(1); // remove "/" at the beginning
if (isUnary)
tag = tag.substr(0,tag.size()-1); // remove "/" at the end
// find the tag name and contents
string::size_type endOfName = tag.find_first_of(' ');
string tagName = tag;
string tagContent = "";
if (endOfName != string::npos) {
tagName = tag.substr(0,endOfName);
tagContent = tag.substr(endOfName+1);
}
// *** process new tag ***
if (isOpen || isUnary)
{
// special case: linked tag turns on linked flag
if (tagName == "linked")
{
if (isLinked)
{
TRACE_ERR("ERROR: second linked tag opened before first one closed: " << line << endl);
return false;
}
isLinked = true;
}
// put the tag on the tag stack
OpenedTag openedTag = make_pair( tagName, make_pair( wordPos, tagContent ) );
tagStack.push_back( openedTag );
VERBOSE(3,"XML TAG " << tagName << " (" << tagContent << ") added to stack, now size " << tagStack.size() << endl);
}
// *** process completed tag ***
if (isClosed || isUnary)
{
// pop last opened tag from stack;
if (tagStack.size() == 0)
{
TRACE_ERR("ERROR: tag " << tagName << " closed, but not opened" << ":" << line << endl);
return false;
}
OpenedTag openedTag = tagStack.back();
tagStack.pop_back();
// tag names have to match
if (openedTag.first != tagName)
{
TRACE_ERR("ERROR: tag " << openedTag.first << " closed by tag " << tagName << ": " << line << endl );
return false;
}
// assemble remaining information about tag
size_t startPos = openedTag.second.first;
string tagContent = openedTag.second.second;
size_t endPos = wordPos;
// span attribute overwrites position
string span = ParseXmlTagAttribute(tagContent,"span");
if (! span.empty())
{
vector<string> ij = Tokenize(span, ",");
if (ij.size() != 1 && ij.size() != 2) {
TRACE_ERR("ERROR: span attribute must be of the form \"i,j\" or \"i\": " << line << endl);
return false;
}
startPos = atoi(ij[0].c_str());
if (ij.size() == 1) endPos = startPos;
else endPos = atoi(ij[1].c_str()) + 1;
}
VERBOSE(3,"XML TAG " << tagName << " (" << tagContent << ") spanning " << startPos << " to " << (endPos-1) << " complete, commence processing" << endl);
// special tag: <linked>
if (tagName == "linked")
{
isLinked = false;
}
// special tag: wall
if (tagName == "wall")
{
size_t start = (startPos == 0) ? 0 : startPos-1;
for(size_t pos = start; pos < endPos; pos++)
walls.push_back( pos );
}
// special tag: zone
else if (tagName == "zone")
{
if (startPos >= endPos)
{
TRACE_ERR("ERROR: zone must span at least one word: " << line << endl);
return false;
}
reorderingConstraint.SetZone( startPos, endPos-1 );
}
// default: opening tag that specifies translation options
else
{
if (startPos >= endPos)
{
TRACE_ERR("ERROR: tag " << tagName << " must span at least one word: " << line << endl);
return false;
}
// specified translations -> vector of phrases
// multiple translations may be specified, separated by "||"
vector<string> altTexts = TokenizeMultiCharSeparator(ParseXmlTagAttribute(tagContent,"translation"), "||");
if( altTexts.size() == 1 && altTexts[0] == "" )
altTexts.pop_back(); // happens when nothing specified
// deal with legacy annotations: "translation" was called "english"
vector<string> moreAltTexts = TokenizeMultiCharSeparator(ParseXmlTagAttribute(tagContent,"english"), "||");
if (moreAltTexts.size()>1 || moreAltTexts[0] != "")
{
for(vector<string>::iterator translation=moreAltTexts.begin();
translation != moreAltTexts.end();
translation++)
{
string t = *translation;
altTexts.push_back( t );
}
}
// specified probabilities for the translations -> vector of probs
vector<string> altProbs = TokenizeMultiCharSeparator(ParseXmlTagAttribute(tagContent,"prob"), "||");
if( altProbs.size() == 1 && altProbs[0] == "" )
altProbs.pop_back(); // happens when nothing specified
// report what we have processed so far
VERBOSE(3,"XML TAG NAME IS: '" << tagName << "'" << endl);
VERBOSE(3,"XML TAG TRANSLATION IS: '" << altTexts[0] << "'" << endl);
VERBOSE(3,"XML TAG PROB IS: '" << altProbs[0] << "'" << endl);
VERBOSE(3,"XML TAG SPAN IS: " << startPos << "-" << (endPos-1) << endl);
if (altProbs.size() > 0 && altTexts.size() != altProbs.size()) {
TRACE_ERR("ERROR: Unequal number of probabilities and translation alternatives: " << line << endl);
return false;
}
// store translation options into members
if (StaticData::Instance().GetXmlInputType() != XmlIgnore) {
// only store options if we aren't ignoring them
for (size_t i=0; i<altTexts.size(); ++i) {
// set default probability
float probValue = 1;
if (altProbs.size() > 0) probValue = Scan<float>(altProbs[i]);
// convert from prob to log-prob
float scoreValue = FloorScore(TransformScore(probValue));
WordsRange range(startPos,endPos-1); // span covered by phrase
TargetPhrase targetPhrase(Output);
targetPhrase.CreateFromString(outputFactorOrder,altTexts[i],factorDelimiter);
targetPhrase.SetScore(scoreValue);
XmlOption *option = new XmlOption(range,targetPhrase);
assert(option);
if (isLinked)
{
// push all linked items as one column in our list of xmloptions
linkedOptions.push_back(option);
}
else
{
// push one-item list (not linked to anything)
vector<XmlOption*> optList(0);
optList.push_back(option);
res.push_back(optList);
}
}
altTexts.clear();
altProbs.clear();
}
}
tagName= "";
tagContents = "";
altTexts.clear();
altProbs.clear();
doClose = false;
}
}
}
line = rstr;
// we are done. check if there are tags that are still open
if (tagStack.size() > 0)
{
TRACE_ERR("ERROR: some opened tags were never closed: " << line << endl);
return false;
}
// return de-xml'ed sentence in line
line = cleanLine;
return true;
}
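As an illustration (not part of the commit): a sketch of what the rewritten parser produces for one marked-up line, assuming a fully initialized decoder (StaticData) and that the declarations above live in XmlOption.h; the tag name np, the sentence, and the probability are invented for the example.

#include <string>
#include <vector>
#include "XmlOption.h"   // assumed header name for the declarations above

using namespace std;
using namespace Moses;

void SketchXmlParsing()
{
  string line = "er sagte <wall/> dass das <np translation=\"house\" prob=\"0.8\"> Haus </np> klein ist";
  vector< vector<XmlOption*> > options;
  ReorderingConstraint constraint;
  vector< size_t > walls;

  if (ProcessAndStripXMLTags( line, options, constraint, walls ))
  {
    // line    -> roughly "er sagte dass das Haus klein ist" (xml removed)
    // walls   -> { 1 }, i.e. the wall is recorded on the word before <wall/>
    // options -> one XmlOption covering source word 4 ("Haus"),
    //            translating it as "house" with probability 0.8 (stored as a log score)
  }
}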

View File

@ -4,6 +4,7 @@
#include <string>
#include "WordsRange.h"
#include "TargetPhrase.h"
#include "ReorderingConstraint.h"
namespace Moses
{
@ -22,7 +23,7 @@ struct XmlOption {
};
bool ProcessAndStripXMLTags(std::string &line,std::vector<std::vector<XmlOption*> > &res);
bool ProcessAndStripXMLTags(std::string &line,std::vector<std::vector<XmlOption*> > &res, ReorderingConstraint &reorderingConstraint, std::vector< size_t > &walls );
}