initial version of reordering zones and walls, may work

git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@1960 1f5c12ca-751b-0410-a591-d2e778427230
2024-12-27 14:05:29 +03:00 · 2008-12-15 12:52:38 +00:00 · 2008-12-15 12:52:38 +00:00 · a360b71426
commit a360b71426
parent e13e45dc63
13 changed files with 599 additions and 268 deletions
--- a/moses/src/InputType.h
+++ b/moses/src/InputType.h
@ -26,6 +26,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 #include "TypeDef.h"
 #include "Phrase.h"
 #include "TargetPhraseCollection.h"
+#include "ReorderingConstraint.h"

 namespace Moses
 {
@ -42,6 +43,7 @@ protected:
 	long m_translationId; 	//< contiguous Id
 	bool m_hasMetaData;
 	long m_segId;
+	ReorderingConstraint m_reorderingConstraint; /**< limits on reordering specified either by "-mp" switch or xml tags */
 
 public:

@ -112,6 +114,12 @@ public:
 	//! return substring at a particular position. Only valid for Sentence class. TODO - get rid of this fn
 	virtual const Word& GetWord(size_t pos) const=0;

+	//! Returns the reordering constraint (walls and zones) attached to this input
+	const ReorderingConstraint& GetReorderingConstraint() const { return m_reorderingConstraint; }
+
 	TO_STRING();
 	
 };
--- a/moses/src/ReorderingConstraint.cpp
+++ b/moses/src/ReorderingConstraint.cpp
@ -22,16 +22,59 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA

 #include "ReorderingConstraint.h"
 #include "InputType.h"
-#include "Word.h"
+#include "StaticData.h"

 namespace Moses
 {

-void ReorderingConstraint::SetWall( const InputType& sentence )
+/**
+ * Allocate the wall and local-wall arrays for a sentence of the given
+ * length and clear every flag.
+ *
+ * Arrays left over from an earlier call are released first, so calling
+ * this function again does not leak memory.
+ *
+ * \param size number of words in the sentence
+ */
+void ReorderingConstraint::InitializeWalls(size_t size)
+{
+	// release arrays from a previous initialization (free(NULL) is a no-op,
+	// and the constructor starts both pointers out as NULL)
+	free(m_wall);
+	free(m_localWall);
+
+	m_size = size;
+	m_wall      = (bool*) malloc(sizeof(bool) * size);
+	m_localWall = (bool*) malloc(sizeof(bool) * size);
+
+	for (size_t pos = 0 ; pos < m_size ; pos++)
+	{
+		m_wall[pos] = false;
+		m_localWall[pos] = false;
+	}
+}
+
+
+/**
+ * Set the wall flag at a word position and mark the constraint as active.
+ *
+ * \param pos word position whose wall flag is written
+ * \param value new state of the flag
+ */
+void ReorderingConstraint::SetWall( size_t pos, bool value )
+{
+	VERBOSE(3,"SETTING reordering wall at position " << pos << std::endl);
+	m_active = true;
+	m_wall[pos] = value;
+}
+
+/**
+ * Turn walls that lie inside a zone into local walls.
+ * Has to be called after all walls and zones have been set.
+ *
+ * A wall at a position in [zone start, zone end) is moved from m_wall to
+ * m_localWall; the wall at the last position of a zone stays a regular wall.
+ * Zone positions at or beyond the sentence length are skipped, so a zone
+ * with an out-of-range end cannot index past the wall arrays.
+ */
+void ReorderingConstraint::FinalizeWalls()
+{
+	for(size_t z = 0; z < m_zone.size(); z++ )
+	{
+		const size_t startZone = m_zone[z][0];
+		const size_t endZone = m_zone[z][1];// note: wall after endZone is not local
+		// zone bounds are not validated when set: never step past the arrays
+		const size_t stopPos = (endZone < m_size) ? endZone : m_size;
+		for( size_t pos = startZone; pos < stopPos; pos++ )
+		{
+			if (m_wall[ pos ])
+			{
+				m_localWall[ pos ] = true;
+				m_wall[ pos ] = false;
+				VERBOSE(3,"SETTING local wall " << pos << std::endl);
+			}
+		}
+	}
+}
+
+//! set walls based on "-monotone-at-punctuation" flag
+void ReorderingConstraint::SetMonotoneAtPunctuation( const Phrase &sentence )
 {
 	for( size_t i=0; i<sentence.GetSize(); i++ )
 	{
-		const Word& word = sentence.GetWord( i );
+		const Word& word = sentence.GetWord(i);
 		if (word[0]->GetString() == "," ||
 		    word[0]->GetString() == "." ||
 		    word[0]->GetString() == "!" ||
@ -40,22 +83,161 @@ void ReorderingConstraint::SetWall( const InputType& sentence )
 		    word[0]->GetString() == ";" ||
 		    word[0]->GetString() == "\"")
 		{
-			// std::cerr << "SETTING reordering wall at position " << i << std::endl;
-			SetValue( i, true );
+			// set wall before and after punc, but not at sentence start, end
+			if (i>0 && i<m_size-1) SetWall( i, true );
+			if (i>1)               SetWall( i-1, true );
 		}
 	}
 }

-bool ReorderingConstraint::ContainsWall( size_t start, size_t end ) const
+//! set a reordering zone from startPos to endPos, inclusive (once entered, need to finish)
+void ReorderingConstraint::SetZone( size_t startPos, size_t endPos )
 {
-	for( size_t i=start; i<=end; i++ )
+	VERBOSE(3,"SETTING zone " << startPos << "-" << endPos << std::endl);
+	std::vector< size_t > newZone;
+	newZone.push_back( startPos );
+	newZone.push_back( endPos );
+	m_zone.push_back( newZone );
+	m_active = true;
+}
+
+//! check the current hypothesis extension against the reordering constraints (returns true if none is violated)
+bool ReorderingConstraint::Check( const WordsBitmap &bitmap, size_t startPos, size_t endPos ) const
+{
+	// nothing to be checked, we are done
+	if (! IsActive() ) return true;
+
+	VERBOSE(3,"CHECK " << bitmap << " " << startPos << "-" << endPos);
+
+	// check walls
+	size_t firstGapPos = bitmap.GetFirstGapPos();
+	// filling first gap -> no wall violation possible
+	if (firstGapPos != startPos)
 	{
-		if ( GetWall( i ) ) {
-			// std::cerr << "HITTING reordering wall at position " << i << std::endl;
+		// if there is a wall before the last word,
+		// we created a gap while moving through wall
+		// -> violation
+		for( size_t pos = firstGapPos; pos < endPos; pos++ )
+		{
+			if( GetWall( pos ) )
+			{
+				VERBOSE(3," hitting wall " << pos << std::endl);
+				return false;
+			}
+		}
+	}
+
+	// monotone -> no violation possible
+	size_t lastPos = bitmap.GetLastPos();
+	if ((lastPos == NOT_FOUND && startPos == 0) || 
+	    (firstGapPos > lastPos && firstGapPos == startPos))
+	{
+		VERBOSE(3," montone, fine." << std::endl);
 		return true;
 	}
+
+	 // check zones
+	for(size_t z = 0; z < m_zone.size(); z++ )
+	{
+		const size_t startZone = m_zone[z][0];
+		const size_t endZone = m_zone[z][1];
+
+		// fine, if translation has not reached zone yet and phrase outside zone
+		if (lastPos < startZone && ( endPos < startZone || startPos > endZone ) ) {
+			continue;
 		}
+
+		// already completely translated zone, no violations possible
+		if (firstGapPos > endZone)
+		{ 
+			continue;
+		}
+
+		// some words are translated beyond the start
+		// let's look closer if some are in the zone
+		size_t numWordsInZoneTranslated = 0;
+		if (lastPos >= startZone)
+		{
+			for(size_t pos = startZone; pos <= endZone; pos++ )
+			{
+				if( bitmap.GetValue( pos ) )
+				{
+					numWordsInZoneTranslated++;
+				}
+			}
+		}
+
+		// all words in zone translated, no violation possible
+		if (numWordsInZoneTranslated == endZone-startZone+1)
+		{
+			continue;
+		}
+
+		// flag if this is an active zone
+		bool activeZone = (numWordsInZoneTranslated > 0);
+		
+		// fine, if zone completely untranslated and phrase outside zone
+		if (!activeZone && ( endPos < startZone || startPos > endZone ) ) {
+			continue;
+		}
+
+		// violation, if phrase completely outside active zone
+		if (activeZone && ( endPos < startZone || startPos > endZone ) ) {
+			VERBOSE(3," outside active zone" << std::endl);
 			return false;
+		}
+
+		// ok, this is what we know now: 
+		// * the phrase is in the zone (at least partially)
+		// * either zone is already active, or it becomes active now
+
+		// let us check on phrases that are partially outside
+
+		// phrase overlaps at the beginning, always ok
+		if (startPos <= startZone)		
+		{
+			continue;
+		}
+
+		// phrase goes beyond end, has to fill zone completely
+		if (endPos > endZone) 
+		{
+			if (endZone-startPos+1 < // num. words filled in by phrase
+			    endZone-startZone+1-numWordsInZoneTranslated) // num. untranslated
+			{
+			  VERBOSE(3," overlap end, but not completing" << std::endl);
+				return false;
+			}
+			else
+			{
+				continue;
+			}
+		}
+
+		// now we are down to phrases that are completely inside the zone
+		// we have to check local walls
+		bool seenUntranslatedBeforeStartPos = false;
+		for(size_t pos = startZone; pos < endZone && pos < endPos; pos++ )
+		{
+			// be careful when there is a gap before phrase
+			if( !bitmap.GetValue( pos ) // untranslated word
+					&& pos < startPos )     // before startPos
+			{
+				seenUntranslatedBeforeStartPos = true;
+			}
+			if( seenUntranslatedBeforeStartPos && GetLocalWall( pos ) )
+			{
+				VERBOSE(3," local wall violation" << std::endl);
+				return false;
+			}
+		}
+	
+		// passed all checks for this zone, on to the next one
+	}
+	
+	// passed all checks, no violations
+	VERBOSE(3," fine." << std::endl);
+	return true;
 }

 }
--- a/moses/src/ReorderingConstraint.h
+++ b/moses/src/ReorderingConstraint.h
@ -29,6 +29,8 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 #include <cstring>
 #include <cmath>
 #include "TypeDef.h"
+#include "Word.h"
+#include "Phrase.h"

 namespace Moses
 {
@ -42,46 +44,52 @@ class ReorderingConstraint
 protected:
 	// const size_t m_size; /**< number of words in sentence */
 	size_t m_size; /**< number of words in sentence */
-	bool	*m_bitmap;	/**< flag for each word if it is a wall */
+	bool	*m_wall;	/**< flag for each word if it is a wall */
+	bool	*m_localWall;	/**< flag for each word if it is a local wall */
+	std::vector< std::vector< size_t > > m_zone; /** zones that limit reordering */
+	bool   m_active; /**< flag indicating, if there are any active constraints */

 public:

 	//! create ReorderingConstraint of length size and initialise to zero
-	ReorderingConstraint(size_t size)
-		:m_size	(size)
-	{
-		m_bitmap = (bool*) malloc(sizeof(bool) * size);
-
-		for (size_t pos = 0 ; pos < m_size ; pos++)
-		{
-			m_bitmap[pos] = false;
-		}
-	}
+	//! create an empty, inactive constraint; InitializeWalls() allocates the wall arrays
+	ReorderingConstraint() :m_size(0),m_wall(NULL),m_localWall(NULL),m_active(false) {}

+	//! destructor
 	~ReorderingConstraint() 
 	{ 
-		free(m_bitmap);
+		if (m_wall != NULL) free(m_wall); 
+		if (m_localWall != NULL) free(m_localWall); 
 	}

+	//! allocate memory for the walls of a sentence of a given size
+	void InitializeWalls(size_t size);

-	//! whether a word has been translated at a particular position
-	bool GetWall(size_t pos) const
-	{
-		return m_bitmap[pos];
-	}
+	//! changes walls in zones into local walls
+	void FinalizeWalls();

 	//! set value at a particular position
-	void SetValue( size_t pos, bool value )
-	{
-		m_bitmap[pos] = value;
-	}
+	void SetWall( size_t pos, bool value );

-	//! set the reordering wall based on the words in the sentence
-	void SetWall( const InputType& sentence );
+	//! whether there is a reordering wall at a particular position
+	bool GetWall(size_t pos) const { return m_wall[pos]; }

-	//! checks if there is a wall in the interval [start,end]
-	bool ContainsWall( size_t start, size_t end ) const;
+	//! whether there is a local wall (a wall inside a zone) at a particular position
+	bool GetLocalWall(size_t pos) const { return m_localWall[pos]; }

+	//! set a zone
+	void SetZone( size_t startPos, size_t endPos );
+
+	//! returns the vector of zones
+  std::vector< std::vector< size_t > > & GetZones() { return m_zone; }
+
+	//! set the reordering walls based on punctuation in the sentence
+	void SetMonotoneAtPunctuation( const Phrase & sentence );
+
+	//! check if all constraints are fulfilled -> all fine
+	bool Check( const WordsBitmap &bitmap, size_t start, size_t end ) const;
+
+	//! checks if reordering constraints will be enforced
+	bool IsActive() const { return m_active; }
 };

 }
--- a/moses/src/SearchCubePruning.cpp
+++ b/moses/src/SearchCubePruning.cpp
@ -62,13 +62,6 @@ SearchCubePruning::SearchCubePruning(const InputType &source, const TranslationO

 		m_hypoStackColl[ind] = sourceHypoColl;
 	}
-
-	// set additional reordering constraints, if specified
-	if (staticData.UseReorderingConstraint())
-	{
-		m_reorderingConstraint = new ReorderingConstraint( m_source.GetSize() );
-		m_reorderingConstraint->SetWall( m_source );
-	}
 }

 SearchCubePruning::~SearchCubePruning()
@ -239,24 +232,24 @@ bool SearchCubePruning::CheckDistortion(const WordsBitmap &hypoBitmap, const Wor
 	// since we check for reordering limits, its good to have that limit handy
 	int maxDistortion = StaticData::Instance().GetMaxDistortion();

-	// no limit of reordering: no prob
+	// if there are reordering limits, make sure it is not violated
+	// the coverage bitmap is handy here (and the position of the first gap)
+	const size_t	hypoFirstGapPos	= hypoBitmap.GetFirstGapPos()
+							, startPos				= range.GetStartPos()
+							, endPos					= range.GetEndPos();
+
+	// if reordering constraints are used (--monotone-at-punctuation or xml), check if passes all
+	if (! m_source.GetReorderingConstraint().Check( hypoBitmap, startPos, endPos ) )
+	{
+		return false;
+	}
+
+	// no limit of reordering: no problem
 	if (maxDistortion < 0)
 	{	
 		return true;
 	}

-	// if there are reordering limits, make sure it is not violated
-	// the coverage bitmap is handy here (and the position of the first gap)
-	const size_t	hypoFirstGapPos	= hypoBitmap.GetFirstGapPos()
-							, sourceSize			= m_source.GetSize()
-							, startPos				= range.GetStartPos()
-							, endPos					= range.GetEndPos();
-
-	// MAIN LOOP. go through each possible hypo
-  size_t maxSize = sourceSize - startPos;
-  size_t maxSizePhrase = StaticData::Instance().GetMaxPhraseLength();
-  maxSize = std::min(maxSize, maxSizePhrase);
-
 	bool leftMostEdge = (hypoFirstGapPos == startPos);			
 	// any length extension is okay if starting at left-most edge
 	if (leftMostEdge)
@ -264,8 +257,6 @@ bool SearchCubePruning::CheckDistortion(const WordsBitmap &hypoBitmap, const Wor
 		return true;
 	}
 	// starting somewhere other than left-most edge, use caution
-	else
-	{
 	// the basic idea is this: we would like to translate a phrase starting
 	// from a position further right than the left-most open gap. The
 	// distortion penalty for the following phrase will be computed relative
@ -280,21 +271,7 @@ bool SearchCubePruning::CheckDistortion(const WordsBitmap &hypoBitmap, const Wor
 	if (required_distortion > maxDistortion) {
 		return false;
 	}
-
-		// if reordering walls are used (--monotone-at-punctuation), check here if 
-		// there is a wall between the beginning of the gap and the end
-		// of this new phrase (jumping the wall). 
-		
-		if ( StaticData::Instance().UseReorderingConstraint() ) {
-		  if ( m_reorderingConstraint->ContainsWall( hypoFirstGapPos, endPos ) )
-		    return false;
-		}
-		
 	return true;
-
-	}
-
-	return false;
 }

 /**
@ -332,7 +309,6 @@ void SearchCubePruning::PrintBitmapContainerGraph()
 	{
 		cerr << iterAccessor->first << endl;
 		BitmapContainer &container = *iterAccessor->second;
-
 	}

 }
--- a/moses/src/SearchCubePruning.h
+++ b/moses/src/SearchCubePruning.h
@ -4,7 +4,6 @@
 #include <vector>
 #include "Search.h"
 #include "HypothesisStackCubePruning.h"
-#include "ReorderingConstraint.h"

 namespace Moses
 {
@ -21,7 +20,6 @@ protected:
 	TargetPhrase m_initialTargetPhrase; /**< used to seed 1st hypo */
 	clock_t m_start; /**< used to track time spend on translation */
 	const TranslationOptionCollection &m_transOptColl; /**< pre-computed list of translation options for the phrases in this sentence */
-	ReorderingConstraint *m_reorderingConstraint; /**< positions in input sentence over which no reordering is allowed */

 	//! go thru all bitmaps in 1 stack & create backpointers to bitmaps in the stack
 	void CreateForwardTodos(HypothesisStackCubePruning &stack);
--- a/moses/src/SearchNormal.cpp
+++ b/moses/src/SearchNormal.cpp
@ -34,13 +34,6 @@ SearchNormal::SearchNormal(const InputType &source, const TranslationOptionColle

 		m_hypoStackColl[ind] = sourceHypoColl;
 	}
-
-	// set additional reordering constraints, if specified
-	if (staticData.UseReorderingConstraint())
-	{
-		m_reorderingConstraint = new ReorderingConstraint( m_source.GetSize() );
-		m_reorderingConstraint->SetWall( m_source );
-	}
 }

 SearchNormal::~SearchNormal()
@ -129,7 +122,10 @@ void SearchNormal::ProcessOneHypothesis(const Hypothesis &hypothesis)

 			for (size_t endPos = startPos ; endPos < startPos + maxSize ; ++endPos)
 			{
-				if (!hypoBitmap.Overlap(WordsRange(startPos, endPos)))
+				// skip ranges that overlap already translated words
+				// or that violate the specified reordering constraints
+				if (hypoBitmap.Overlap(WordsRange(startPos, endPos)) || !m_source.GetReorderingConstraint().Check( hypoBitmap, startPos, endPos ) )
+				{
+				    continue;
+				}
 				{
 					//TODO: does this method include incompatible WordLattice hypotheses?
 					ExpandAllHypotheses(hypothesis
@ -147,7 +143,7 @@ void SearchNormal::ProcessOneHypothesis(const Hypothesis &hypothesis)
 	const size_t	hypoFirstGapPos	= hypoBitmap.GetFirstGapPos()
 		, sourceSize			= m_source.GetSize();

-	// MAIN LOOP. go through each possible hypo
+	// MAIN LOOP. go through each possible range
 	for (size_t startPos = hypoFirstGapPos ; startPos < sourceSize ; ++startPos)
 	{
 		size_t maxSize = sourceSize - startPos;
@ -178,6 +174,12 @@ void SearchNormal::ProcessOneHypothesis(const Hypothesis &hypothesis)

 		for (size_t endPos = startPos ; endPos < startPos + maxSize ; ++endPos)
 		{
+			// check if passes specified reordering constraints
+			// (set with -monotone-at-punctuation or xml)
+			if (!m_source.GetReorderingConstraint().Check( hypoBitmap, startPos, endPos ) )
+			{
+				continue;
+			}
 			// check for overlap
 			WordsRange extRange(startPos, endPos);
 #ifdef DEBUGLATTICE
@ -268,14 +270,8 @@ void SearchNormal::ProcessOneHypothesis(const Hypothesis &hypothesis)
 					continue;
 				}

-				// if reordering walls are used (--monotone-at-punctuation), check here if 
-				// there is a wall between the beginning of the gap and the end
-				// of this new phrase (jumping the wall). 
-				if ( StaticData::Instance().UseReorderingConstraint() ) {
-				  if ( m_reorderingConstraint->ContainsWall( hypoFirstGapPos, endPos ) )
-				    continue;
-				}

+				// everything is fine, we're good to go
 				ExpandAllHypotheses(hypothesis
 						    ,m_transOptColl.GetTranslationOptionList(extRange));

--- a/moses/src/SearchNormal.h
+++ b/moses/src/SearchNormal.h
@ -5,7 +5,6 @@
 #include "Search.h"
 #include "HypothesisStackNormal.h"
 #include "TranslationOptionCollection.h"
-#include "ReorderingConstraint.h"
 #include "Timer.h"

 namespace Moses
@ -25,8 +24,6 @@ protected:
 	size_t interrupted_flag;
 	HypothesisStackNormal* actual_hypoStack; /**actual (full expanded) stack of hypotheses*/ 
 	const TranslationOptionCollection &m_transOptColl; /**< pre-computed list of translation options for the phrases in this sentence */
-	ReorderingConstraint *m_reorderingConstraint; /**< positions in input sentence over which no reordering is allowed */
-

 	// functions for creating hypotheses
 	void ProcessOneHypothesis(const Hypothesis &hypothesis);
--- a/moses/src/Sentence.cpp
+++ b/moses/src/Sentence.cpp
@ -36,16 +36,19 @@ int Sentence::Read(std::istream& in,const std::vector<FactorType>& factorOrder)

 	if (getline(in, line, '\n').eof())	
 			return 0;
+	// remove extra spaces
 	line = Trim(line);
-  meta = ProcessAndStripSGML(line);

+	// if sentences is specified as "<seg id=1> ... </seg>", extract id
+  meta = ProcessAndStripSGML(line);
 	if (meta.find("id") != meta.end()) { this->SetTranslationId(atol(meta["id"].c_str())); }
 	
-	//parse XML markup in translation line
+	// parse XML markup in translation line
 	const StaticData &staticData = StaticData::Instance();
 	std::vector<std::vector<XmlOption*> > xmlOptionsList(0);
+	std::vector< size_t > xmlWalls;
 	if (staticData.GetXmlInputType() != XmlPassThrough) {
-		if (!ProcessAndStripXMLTags(line, xmlOptionsList)) {
+		if (!ProcessAndStripXMLTags(line, xmlOptionsList, m_reorderingConstraint, xmlWalls )) {
 			TRACE_ERR("Unable to parse XML in line " << line);
 			abort();
 		}			
@ -107,6 +110,21 @@ int Sentence::Read(std::istream& in,const std::vector<FactorType>& factorOrder)
 		}
 		
 	}	
+
+	// allocate one wall flag per input word
+	m_reorderingConstraint.InitializeWalls( GetSize() );
+
+	// set reordering walls, if "-monotone-at-punctuation" is set
+	// (skipped for an empty sentence, where GetSize()-1 would wrap around)
+	if (staticData.UseReorderingConstraint() && GetSize() > 0)
+	{
+		m_reorderingConstraint.SetMonotoneAtPunctuation( GetSubString( WordsRange(0,GetSize()-1 ) ) );
+	}
+
+	// set walls obtained from xml
+	for(size_t i=0; i<xmlWalls.size(); i++)
+	{
+		if( xmlWalls[i] < GetSize() ) // no buggy walls, please
+			m_reorderingConstraint.SetWall( xmlWalls[i], true );
+	}
+
+	// walls inside zones become local walls
+	m_reorderingConstraint.FinalizeWalls();
+
 	return 1;
 }

--- a/moses/src/Sentence.h
+++ b/moses/src/Sentence.h
@ -86,7 +86,6 @@ class Sentence : public Phrase, public InputType
 	//! populates vector argument with XML force translation options for the specific range passed
 	void GetXmlTranslationOptions(std::vector <TranslationOption*> &list, size_t startPos, size_t endPos) const;

-
 	int Read(std::istream& in,const std::vector<FactorType>& factorOrder);
 	void Print(std::ostream& out) const;

--- a/moses/src/TranslationOptionCollection.cpp
+++ b/moses/src/TranslationOptionCollection.cpp
@ -543,7 +543,7 @@ void TranslationOptionCollection::CreateTranslationOptionsForRange(
 	 */
 	bool TranslationOptionCollection::HasXmlOptionsOverlappingRange(size_t, size_t) const {
 		return false;
-	
+		//not implemented for base class
 	}
 	
 	/** Populates the current Collection with XML options exactly covering the range specified. Default implementation does nothing.
--- a/moses/src/WordsBitmap.h
+++ b/moses/src/WordsBitmap.h
@ -196,7 +196,6 @@ public:

        //! converts bitmap into an integer ID: it consists of two parts: the first 16 bit are the pattern between the first gap and the last word-1, the second 16 bit are the number of filled positions. enforces a sentence length limit of 65535 and a max distortion of 16
        WordsBitmapID GetID() const {
-                std::cerr << "GetID()\n";
                assert(m_size < (1<<16));

                size_t start = GetFirstGapPos();
--- a/moses/src/XmlOption.cpp
+++ b/moses/src/XmlOption.cpp
@ -32,198 +32,338 @@
 namespace Moses 
 {

-std::string ParseXmlTagAttribute(const std::string& tag,const std::string& attributeName){
+string ParseXmlTagAttribute(const string& tag,const string& attributeName){
 	/*TODO deal with unescaping \"*/
 	string tagOpen = attributeName + "=\"";
 	size_t contentsStart = tag.find(tagOpen);
-	if (contentsStart == std::string::npos) return "";
+	if (contentsStart == string::npos) return "";
 	contentsStart += tagOpen.size();
 	size_t contentsEnd = tag.find_first_of('"',contentsStart+1);
-	if (contentsEnd == std::string::npos) {
+	if (contentsEnd == string::npos) {
 		TRACE_ERR("Malformed XML attribute: "<< tag);
 		return "";	
 	}
 	size_t possibleEnd;
-	while (tag.at(contentsEnd-1) == '\\' && (possibleEnd = tag.find_first_of('"',contentsEnd+1)) != std::string::npos) {
+	while (tag.at(contentsEnd-1) == '\\' && (possibleEnd = tag.find_first_of('"',contentsEnd+1)) != string::npos) {
 		contentsEnd = possibleEnd;
 	}
 	return tag.substr(contentsStart,contentsEnd-contentsStart);
 }

-std::string TrimXml(const std::string& str) {
+/**
+ * Remove "<" and ">" from XML tag
+ *
+ * \param str xml token to be stripped
+ */
+string TrimXml(const string& str) 
+{
+  // too short to be xml token -> do nothing
 	if (str.size() < 2) return str;
-	if (str[0] == '<' && str[str.size() - 1] == '>') {
+
+  // strip first and last character
+	if (str[0] == '<' && str[str.size() - 1] == '>') 
+	{
 		return str.substr(1, str.size() - 2);
-	} else { return str; }
+	} 
+  // not an xml token -> do nothing
+  else { return str; }
 }

-bool isXmlTag(const std::string& tag)
+/**
+ * Check if the token is an XML tag, i.e. starts with "<"
+ *
+ * \param tag token to be checked
+ */
+bool isXmlTag(const string& tag)
 {
 	return tag[0] == '<';
 }

-inline std::vector<std::string> TokenizeXml(const std::string& str)
+/**
+ * Split up the input character string into tokens made up of 
+ * either XML tags or text.
+ * example: this <b> is a </b> test .
+ *       => (this ), (<b>), ( is a ), (</b>), ( test .)
+ *
+ * \param str input string
+ */
+inline vector<string> TokenizeXml(const string& str)
 {
-	std::string lbrack = "<";
-	std::string rbrack = ">";
-	std::vector<std::string> tokens;
-	// Find first "non-delimiter".
-	std::string::size_type cpos = 0;
-	std::string::size_type lpos = 0;
-	std::string::size_type rpos = 0;
+	string lbrack = "<";
+	string rbrack = ">";
+	vector<string> tokens; // vector of tokens to be returned
+	string::size_type cpos = 0; // current position in string
+	string::size_type lpos = 0; // left start of xml tag
+	string::size_type rpos = 0; // right end of xml tag

-	while (cpos != str.size()) {
+  // walk through the string (loop over cpos)
+	while (cpos != str.size()) 
+	{
+    // find the next opening "<" of an xml tag
  	lpos = str.find_first_of(lbrack, cpos);
-		if (lpos != std::string::npos) {
+		if (lpos != string::npos) 
+		{
+			// find the end of the xml tag
 			rpos = str.find_first_of(rbrack, lpos);
-			if (rpos == std::string::npos) {
+			// sanity check: there has to be closing ">"
+			if (rpos == string::npos) 
+			{
 				TRACE_ERR("ERROR: malformed XML: " << str << endl);
 				return tokens;
 			}
-		} else {
+		} 
+		else // no more tags found
+		{
+			// add the rest as token
 			tokens.push_back(str.substr(cpos));
 			break;
 		}
+
+		// add stuff before xml tag as token, if there is any
 		if (lpos - cpos > 0)
 			tokens.push_back(str.substr(cpos, lpos - cpos));
+		
+		// add xml tag as token
 		tokens.push_back(str.substr(lpos, rpos-lpos+1));
 		cpos = rpos + 1;
 	}
 	return tokens;
 }

+/**
+ * Process a sentence with xml annotation
+ * Xml tags may specify additional/replacing translation options
+ * and reordering constraints
+ *
+ * \param line in: sentence, out: sentence without the xml
+ * \param res vector with translation options specified by xml
+ * \param reorderingConstraint reordering constraint zones specified by xml
+ * \param walls reordering constraint walls specified by xml
+ */
 /*TODO: we'd only have to return a vector of XML options if we dropped linking. 2-d vector
 is so we can link things up afterwards. We can't create TranslationOptions as we
 parse because we don't have the completed source parsed until after this function
 removes all the markup from it (CreateFromString in Sentence::Read).
 */
-bool ProcessAndStripXMLTags(std::string &line, std::vector<std::vector<XmlOption*> > &res) {
+bool ProcessAndStripXMLTags(string &line, vector<vector<XmlOption*> > &res, ReorderingConstraint &reorderingConstraint, vector< size_t > &walls ) {
 	//parse XML markup in translation line
 	
-	if (line.find_first_of('<') == std::string::npos) { return true; }
+	// no xml tag? we're done.
+	if (line.find_first_of('<') == string::npos) { return true; }
 	
-	std::string rstr;
-	std::vector<std::string> xmlTokens = TokenizeXml(line);
-	std::string tagName = "";
-	std::string tagContents = "";
-	std::vector<std::string> altTexts;
-	std::vector<std::string> altProbs;
-	std::vector<XmlOption*> linkedOptions;
-	size_t tagStart=0;
-	size_t tagEnd=0;
-	size_t curWord=0;
-	int numUnary = 0;
-	bool doClose = false;
+	// break up input into a vector of xml tags and text
+  // example: (this), (<b>), (is a), (</b>), (test .)
+	vector<string> xmlTokens = TokenizeXml(line);
+
+	// we need to store opened tags, until they are closed
+	// tags are stored as triples (tagname, startpos, contents)
+	typedef pair< string, pair< size_t, string > > OpenedTag;
+	vector< OpenedTag > tagStack; // stack that contains active opened tags
+
+	string cleanLine; // return string (text without xml)
+	vector<XmlOption*> linkedOptions;
+	size_t wordPos = 0; // position in sentence (in terms of number of words)
 	bool isLinked = false;
-	const std::vector<FactorType> &outputFactorOrder = StaticData::Instance().GetOutputFactorOrder();
-	const std::string &factorDelimiter = StaticData::Instance().GetFactorDelimiter();
+	const vector<FactorType> &outputFactorOrder = StaticData::Instance().GetOutputFactorOrder();
+	const string &factorDelimiter = StaticData::Instance().GetFactorDelimiter();

+  // loop through the tokens
 	for (size_t xmlTokenPos = 0 ; xmlTokenPos < xmlTokens.size() ; xmlTokenPos++)
 	{
+    // not a xml tag, but regular text (may contain many words)
 		if(!isXmlTag(xmlTokens[xmlTokenPos]))
 		{
-			//phrase, not tag. token may contain many words
-			rstr += xmlTokens[xmlTokenPos];
-			curWord = Tokenize(rstr).size();
+			// add a space at boundary, if necessary
+			if (cleanLine.size()>0 &&
+			    cleanLine[cleanLine.size() - 1] != ' ' &&
+			    xmlTokens[xmlTokenPos][0] != ' ')
+			{
+				cleanLine += " ";
 			}
+			cleanLine += xmlTokens[xmlTokenPos]; // add to output
+			wordPos = Tokenize(cleanLine).size(); // count all the words
+		}
+
+		// process xml tag
 		else
 		{
-			//tag data
-			std::string tag =  Trim(TrimXml(xmlTokens[xmlTokenPos]));
+			// *** get essential information about tag ***
+
+      // strip extra boundary spaces and "<" and ">"
+			string tag =  Trim(TrimXml(xmlTokens[xmlTokenPos]));
 			VERBOSE(3,"XML TAG IS: " << tag << std::endl);
-			std::string::size_type endOfName = tag.find_first_of(' ');
-			std::string nextTagName = tag;
-			bool isUnary = tag[tag.size() - 1] == '/';
-			bool isOpen = tag[0] != '/';
-			if (endOfName != std::string::npos) {
-				nextTagName = tag.substr(0,endOfName);
-				tagContents = tag.substr(endOfName+1);
-			}
-			if (nextTagName == "linked") {
-				//just set a flag, don't try to process
-				if (tagName != "") {
-					TRACE_ERR("ERROR: tried to start linked XML tag while \""<< tagName <<"\" tag still open: " << line << endl);
+
+			if (tag.size() == 0)
+			{
+				TRACE_ERR("ERROR: empty tag name: " << line << endl);
 				return false;
 			}
-				if (linkedOptions.size()>0) {
-					TRACE_ERR("ERROR: tried to start second linked XML tag while linked still open: " << line << endl);
+
+      // check if unary (e.g., "<wall/>")
+			bool isUnary = ( tag[tag.size() - 1] == '/' );
+
+			// check if opening tag (e.g. "<a>", not "</a>")
+			bool isClosed = ( tag[0] == '/' );
+			bool isOpen = !isClosed;
+
+			if (isClosed && isUnary)
+			{
+				TRACE_ERR("ERROR: can't have both closed and unary tag <" << tag << ">: " << line << endl);
+				return false;
+			}
+
+			if (isClosed)
+				tag = tag.substr(1); // remove "/" at the beginning
+			if (isUnary)
+				tag = tag.substr(0,tag.size()-1); // remove "/" at the end
+
+      // find the tag name and contents
+			string::size_type endOfName = tag.find_first_of(' ');
+			string tagName = tag;
+			string tagContent = "";
+			if (endOfName != string::npos) {
+				tagName = tag.substr(0,endOfName);
+				tagContent = tag.substr(endOfName+1);
+			}
+
+			// *** process new tag ***
+
+			if (isOpen || isUnary)
+			{
+				// special case: linked tag turns on linked flag
+				if (tagName == "linked")
+				{
+					if (isLinked)
+					{
+						TRACE_ERR("ERROR: second linked tag opened before first one closed: " << line << endl);
 						return false;
 					}
 					isLinked = true;
-				isOpen = false;
 				}
-			else if (nextTagName == "/linked") {
-				isLinked = false;
-				//can't be in an open tag when we stop linking
-				if (tagName != "") {
-					TRACE_ERR("ERROR: tried to close linked XML tag while \""<< tagName <<"\" tag still open: " << line << endl);
-					return false;
+				// put the tag on the tag stack
+				OpenedTag openedTag = make_pair( tagName, make_pair( wordPos, tagContent ) );
+				tagStack.push_back( openedTag );
+				VERBOSE(3,"XML TAG " << tagName << " (" << tagContent << ") added to stack, now size " << tagStack.size() << endl);
 			}
-				res.push_back(linkedOptions);
-				linkedOptions.clear();
-			}
-			else if (isOpen)
+
+			// *** process completed tag ***
+
+			if (isClosed || isUnary)
 			{
-				//this is an open tag
-				tagName = nextTagName;
-				altTexts = TokenizeMultiCharSeparator(ParseXmlTagAttribute(tagContents,"english"), "||");
-				altProbs = TokenizeMultiCharSeparator(ParseXmlTagAttribute(tagContents,"prob"), "||");
-				std::string span = ParseXmlTagAttribute(tagContents,"span");
-				tagStart = curWord;
-				if (isUnary) {
-					numUnary++;
-					if (span.empty()) {
-						TRACE_ERR("ERROR: unary tags must have a span attribute: " << line << endl);
+				// pop last opened tag from stack;
+				if (tagStack.size() == 0)
+				{
+					TRACE_ERR("ERROR: tag " << tagName << " closed, but not opened" << ":" << line << endl);
 					return false;
 				}
-					std::vector<std::string> ij = Tokenize(span, ",");
-					if (ij.size() != 2) {
-						TRACE_ERR("ERROR: span tag must be of the form \"i,j\": " << line << endl);
+				OpenedTag openedTag = tagStack.back();
+				tagStack.pop_back();
+				
+				// tag names have to match
+				if (openedTag.first != tagName)
+				{
+					TRACE_ERR("ERROR: tag " << openedTag.first << " closed by tag " << tagName << ": " << line << endl );
 					return false;
 				}
-					tagStart = atoi(ij[0].c_str());
-					tagEnd = atoi(ij[1].c_str());
-					if (tagEnd < tagStart) {
-						TRACE_ERR("ERROR: span tag " << span << " invalid" << endl);
+				 
+				// assemble remaining information about tag
+				size_t startPos = openedTag.second.first;
+				string tagContent = openedTag.second.second;
+				size_t endPos = wordPos;
+
+				// span attribute overwrites position
+				string span = ParseXmlTagAttribute(tagContent,"span");
+				if (! span.empty()) 
+				{
+					vector<string> ij = Tokenize(span, ",");
+					if (ij.size() != 1 && ij.size() != 2) {
+						TRACE_ERR("ERROR: span attribute must be of the form \"i,j\" or \"i\": " << line << endl);
 						return false;
 					}
-					doClose = true;
-					VERBOSE(3,"XML TAG IS UNARY" << endl);
+					startPos = atoi(ij[0].c_str());
+					if (ij.size() == 1) endPos = startPos;
+					else endPos = atoi(ij[1].c_str()) + 1;
 				}
+
+				VERBOSE(3,"XML TAG " << tagName << " (" << tagContent << ") spanning " << startPos << " to " << (endPos-1) << " complete, commence processing" << endl);
+				// special tag: <linked>
+				if (tagName == "linked")
+				{
+					isLinked = false;
+				}
+
+				// special tag: wall
+				if (tagName == "wall")
+				{
+					size_t start = (startPos == 0) ? 0 : startPos-1;
+					for(size_t pos = start; pos < endPos; pos++)
+						walls.push_back( pos );
+				}
+
+				// special tag: zone
+				else if (tagName == "zone")
+				{
+					if (startPos >= endPos)
+					{
+						TRACE_ERR("ERROR: zone must span at least one word: " << line << endl);
+						return false;
+					}
+					reorderingConstraint.SetZone( startPos, endPos-1 );
+				}
+
+				// default: opening tag that specifies translation options
+				else
+				{
+					if (startPos >= endPos)
+					{
+						TRACE_ERR("ERROR: tag " << tagName << " must span at least one word: " << line << endl);
+						return false;
+					}
+
+					// specified translations -> vector of phrases
+					// multiple translations may be specified, separated by "||"
+					vector<string> altTexts = TokenizeMultiCharSeparator(ParseXmlTagAttribute(tagContent,"translation"), "||");
+					if( altTexts.size() == 1 && altTexts[0] == "" )
+						altTexts.pop_back(); // happens when nothing specified
+					// deal with legacy annotations: "translation" was called "english"
+					vector<string> moreAltTexts = TokenizeMultiCharSeparator(ParseXmlTagAttribute(tagContent,"english"), "||");
+					if (moreAltTexts.size()>1 || moreAltTexts[0] != "")
+					{
+						for(vector<string>::iterator translation=moreAltTexts.begin(); 
+					 	   translation != moreAltTexts.end(); 
+								translation++)
+						{
+							string t = *translation;
+							altTexts.push_back( t );
+						}
+					}
+
+					// specified probabilities for the translations -> vector of probs
+					vector<string> altProbs = TokenizeMultiCharSeparator(ParseXmlTagAttribute(tagContent,"prob"), "||");
+					if( altProbs.size() == 1 && altProbs[0] == "" )
+						altProbs.pop_back(); // happens when nothing specified
+
+					// report what we have processed so far
 					VERBOSE(3,"XML TAG NAME IS: '" << tagName << "'" << endl);
-				VERBOSE(3,"XML TAG ENGLISH IS: '" << altTexts[0] << "'" << endl);
+					VERBOSE(3,"XML TAG TRANSLATION IS: '" << altTexts[0] << "'" << endl);
 					VERBOSE(3,"XML TAG PROB IS: '" << altProbs[0] << "'" << endl);
-				VERBOSE(3,"XML TAG STARTS AT WORD: " << tagStart << endl);					
-				if (altTexts.size() != altProbs.size()) {
+					VERBOSE(3,"XML TAG SPAN IS: " << startPos << "-" << (endPos-1) << endl);					
+					if (altProbs.size() > 0 && altTexts.size() != altProbs.size()) {
 						TRACE_ERR("ERROR: Unequal number of probabilities and translation alternatives: " << line << endl);
 						return false;
 					}
-			}
-			else if ((nextTagName.size() == 0) || (nextTagName.at(0) != '/') || (nextTagName.substr(1) != tagName)) 
-			{
-				//mismatched tag, abort!
-				TRACE_ERR("ERROR: tried to parse malformed XML with xml-input enabled: " << line << endl);
-				return false;
-			}
-			else {
-				doClose = true;
-				tagEnd = curWord-1; //size is inclusive
-			}
-			if (doClose) {
-				VERBOSE(3,"XML END TAG IS: " << nextTagName.substr(1) << endl);
-				VERBOSE(3,"XML TAG ENDS AT WORD: " << tagEnd << endl);
-				//store translation options into members

+					// store translation options into members
 					if (StaticData::Instance().GetXmlInputType() != XmlIgnore) {
+						// only store options if we aren't ignoring them
 						for (size_t i=0; i<altTexts.size(); ++i) {
-						//only store options if we aren't ignoring them
-						//set default probability
+							// set default probability
 							float probValue = 1;
-						if (altProbs[i] != "") probValue = Scan<float>(altProbs[i]);
-						//Convert from prob to log-prob
+							if (altProbs.size() > 0) probValue = Scan<float>(altProbs[i]);
+							// convert from prob to log-prob
 							float scoreValue = FloorScore(TransformScore(probValue));
 						
-						WordsRange range(tagStart,tagEnd);
+							WordsRange range(startPos,endPos-1); // span covered by phrase
 							TargetPhrase targetPhrase(Output);
 							targetPhrase.CreateFromString(outputFactorOrder,altTexts[i],factorDelimiter);
 							targetPhrase.SetScore(scoreValue);
@ -231,26 +371,35 @@ bool ProcessAndStripXMLTags(std::string &line, std::vector<std::vector<XmlOption
 							XmlOption *option = new XmlOption(range,targetPhrase);
 							assert(option);
 						
-						if (isLinked) {
-							//puch all linked items as one column in our list of xmloptions
+							if (isLinked) 
+							{
+								// push all linked items as one column in our list of xmloptions
 								linkedOptions.push_back(option);
-						} else {
-							//push one-item list (not linked to anything)
-							std::vector<XmlOption*> optList(0);
+							} 
+							else 
+							{
+								// push one-item list (not linked to anything)
+								vector<XmlOption*> optList(0);
 								optList.push_back(option);
 								res.push_back(optList);
 							}
 						}
-				}
-				tagName= "";
-				tagContents = "";
 						altTexts.clear();
 						altProbs.clear();
-				doClose = false;
 					}
 				}
 			}
-	line = rstr;
+		}
+	}
+	// we are done. check if there are tags that are still open
+	if (tagStack.size() > 0)
+	{
+		TRACE_ERR("ERROR: some opened tags were never closed: " << line << endl);
+		return false;
+	}
+
+	// return de-xml'ed sentence in line
+	line = cleanLine;
 	return true;
 }

--- a/moses/src/XmlOption.h
+++ b/moses/src/XmlOption.h
@ -4,6 +4,7 @@
 #include <string>
 #include "WordsRange.h"
 #include "TargetPhrase.h"
+#include "ReorderingConstraint.h"

 namespace Moses
 {
@ -22,7 +23,7 @@ struct XmlOption {

 };

-bool ProcessAndStripXMLTags(std::string &line,std::vector<std::vector<XmlOption*> > &res);
+bool ProcessAndStripXMLTags(std::string &line,std::vector<std::vector<XmlOption*> > &res, ReorderingConstraint &reorderingConstraint, std::vector< size_t > &walls );

 }