Roll out mixed syntax

2024-12-26 05:14:36 +03:00 · 2014-08-29 15:31:47 +01:00 · 2014-08-29 15:31:47 +01:00 · 73f1d259a1
commit 73f1d259a1
parent be9b3cb1c6
35 changed files with 13200 additions and 0 deletions
--- a/phrase-extract/extract-mixed-syntax/AlignedSentence.cpp
+++ b/phrase-extract/extract-mixed-syntax/AlignedSentence.cpp
@ -0,0 +1,194 @@
+/*
+ * AlignedSentence.cpp
+ *
+ *  Created on: 18 Feb 2014
+ *      Author: s0565741
+ */
+
+#include <sstream>
+#include "moses/Util.h"
+#include "AlignedSentence.h"
+#include "Parameter.h"
+
+using namespace std;
+
+
+/////////////////////////////////////////////////////////////////////////////////
+/**
+ * Build a sentence pair from whitespace-tokenized source and target text
+ * and a line of "s-t" alignment points.
+ */
+AlignedSentence::AlignedSentence(int lineNum,
+			const std::string &source,
+			const std::string &target,
+			const std::string &alignment)
+:m_lineNum(lineNum)
+{
+	// Both word vectors must exist before alignment points are resolved,
+	// because PopulateAlignment() indexes into m_source and m_target.
+	PopulateWordVec(m_source, source);
+	PopulateWordVec(m_target, target);
+
+	PopulateAlignment(alignment);
+}
+
+/**
+ * Hands both word vectors to Moses::RemoveAllInColl; the Word objects in
+ * them were allocated with new in PopulateWordVec().
+ */
+AlignedSentence::~AlignedSentence()
+{
+	Moses::RemoveAllInColl(m_source);
+	Moses::RemoveAllInColl(m_target);
+}
+
+/**
+ * Tokenize `line` and fill `vec` with one newly allocated Word per token.
+ * Each Word is constructed from its zero-based position and its token text.
+ * `vec` is resized to the token count; entries already in it are not freed here.
+ */
+void AlignedSentence::PopulateWordVec(Phrase &vec, const std::string &line)
+{
+	std::vector<string> tokens;
+	Moses::Tokenize(tokens, line);
+
+	const size_t numTokens = tokens.size();
+	vec.resize(numTokens);
+
+	for (size_t pos = 0; pos < numTokens; ++pos) {
+		vec[pos] = new Word(pos, tokens[pos]);
+	}
+}
+
+/**
+ * Parse a line of alignment points ("s-t s-t ...") and link the referenced
+ * source and target words to each other in both directions.
+ *
+ * m_source and m_target must already be populated. A point that does not
+ * consist of exactly two numbers, or that refers to a position outside
+ * either sentence, is reported on stderr and aborts the program. The checks
+ * are explicit (not assert-based) so that they stay active when the code is
+ * compiled with NDEBUG.
+ */
+void AlignedSentence::PopulateAlignment(const std::string &line)
+{
+	vector<string> alignStr;
+	Moses::Tokenize(alignStr, line);
+
+	const int numSource = m_source.size();
+	const int numTarget = m_target.size();
+
+	for (size_t i = 0; i < alignStr.size(); ++i) {
+		vector<int> alignPair;
+		Moses::Tokenize(alignPair, alignStr[i], "-");
+		if (alignPair.size() != 2) {
+		  cerr << "ERROR0:malformed alignment point " << alignStr[i] << endl;
+		  cerr << "AlignedSentence=" << Debug() << endl;
+		  abort();
+		}
+
+		int sourcePos = alignPair[0];
+		int targetPos = alignPair[1];
+
+		if (sourcePos < 0 || sourcePos >= numSource) {
+		  cerr << "ERROR1:AlignedSentence=" << Debug() << endl;
+		  cerr << "m_source=" << m_source.size() << endl;
+		  abort();
+		}
+		if (targetPos < 0 || targetPos >= numTarget) {
+		  cerr << "ERROR2:AlignedSentence=" << Debug() << endl;
+		  cerr << "m_target=" << m_target.size() << endl;
+		  abort();
+		}
+
+		Word *sourceWord = m_source[sourcePos];
+		Word *targetWord = m_target[targetPos];
+
+		// alignment links are stored on both words
+		sourceWord->AddAlignment(targetWord);
+		targetWord->AddAlignment(sourceWord);
+	}
+}
+
+/**
+ * Multi-line dump of the line number, both phrases and the consistent
+ * phrases collected so far.
+ */
+std::string AlignedSentence::Debug() const
+{
+	stringstream strm;
+
+	strm << "m_lineNum:" << m_lineNum << endl;
+	strm << "m_source:" << m_source.Debug() << endl;
+	strm << "m_target:" << m_target.Debug() << endl;
+
+	strm << "consistent phrases:" << endl;
+	strm << m_consistentPhrases.Debug() << endl;
+
+	return strm.str();
+}
+
+/**
+ * For every source position, return the size of that word's alignment
+ * index set (i.e. how many alignment entries it has).
+ */
+std::vector<int> AlignedSentence::GetSourceAlignmentCount() const
+{
+	const size_t numWords = m_source.size();
+
+	vector<int> counts;
+	counts.reserve(numWords);
+
+	for (size_t pos = 0; pos < numWords; ++pos) {
+		counts.push_back(m_source[pos]->GetAlignmentIndex().size());
+	}
+	return counts;
+}
+
+/**
+ * Extract the consistent phrase pairs, then label each of them with the
+ * Hiero non-terminal via ConsistentPhrases::AddHieroNonTerms().
+ */
+void AlignedSentence::Create(const Parameter &params)
+{
+	CreateConsistentPhrases(params);
+
+	m_consistentPhrases.AddHieroNonTerms(params);
+}
+
+/**
+ * Enumerate the phrase pairs that are consistent with the word alignment
+ * and register each one in m_consistentPhrases.
+ *
+ * For every target span of at most params.maxSpan words, the smallest
+ * source span covering all aligned source words is computed. The span is
+ * rejected if it is unaligned, if its width is outside
+ * [params.minSpan, params.maxSpan], or if one of its source words is also
+ * aligned to a target word outside the target span. Accepted spans are then
+ * widened over neighbouring unaligned source words, and every resulting
+ * (source span, target span) combination is added.
+ *
+ * m_source, m_target and their alignments must be populated beforehand.
+ */
+void AlignedSentence::CreateConsistentPhrases(const Parameter &params)
+{
+  int countT = m_target.size();
+  int countS = m_source.size();
+
+  m_consistentPhrases.Initialize(countS);
+
+  // Per-source-word alignment counts are not modified in this function,
+  // so compute them once and take a fresh copy for every target span.
+  const vector<int> sourceAlignCount = GetSourceAlignmentCount();
+
+  // check alignments for target phrase startT...endT
+  for(int lengthT=1;
+      lengthT <= params.maxSpan && lengthT <= countT;
+      lengthT++) {
+    for(int startT=0; startT < countT-(lengthT-1); startT++) {
+
+      // inclusive end of the target span
+      int endT = startT + lengthT - 1;
+
+      // find aligned source words
+      // first: find minimum and maximum source word
+      int minS = countS; // sentinel: above every valid source index
+      int maxS = -1;
+
+      // usedS[si] ends up as the number of alignments of source word si
+      // that point OUTSIDE the current target span
+      vector< int > usedS = sourceAlignCount;
+      for(int ti=startT; ti<=endT; ti++) {
+        const Word &word = *m_target[ti];
+        const std::set<int> &alignment = word.GetAlignmentIndex();
+
+        std::set<int>::const_iterator iterAlign;
+        for(iterAlign = alignment.begin(); iterAlign != alignment.end(); ++iterAlign) {
+          int si = *iterAlign;
+          if (si<minS) {
+            minS = si;
+          }
+          if (si>maxS) {
+            maxS = si;
+          }
+          usedS[ si ]--;
+        }
+      }
+
+      // unaligned phrases are not allowed
+      if( maxS == -1 )
+        continue;
+
+      // source phrase has to be within limits
+      size_t width = maxS - minS + 1;
+
+      if( width < params.minSpan )
+        continue;
+
+      if( width > params.maxSpan )
+        continue;
+
+      // check if source words are aligned to out of bound target words
+      bool out_of_bounds = false;
+      for(int si=minS; si<=maxS && !out_of_bounds; si++)
+        if (usedS[si]>0) {
+          out_of_bounds = true;
+        }
+
+      // if out of bound, you gotta go
+      if (out_of_bounds)
+        continue;
+
+      // done with all the checks, lets go over all consistent phrase pairs
+      // start point of source phrase may retreat over unaligned
+      for(int startS=minS;
+          (startS>=0 &&
+           startS>maxS - params.maxSpan && // within length limit
+           (startS==minS || m_source[startS]->GetAlignment().size()==0)); // unaligned
+          startS--) {
+        // end point of source phrase may advance over unaligned
+        for(int endS=maxS;
+            (endS<countS && endS<startS + params.maxSpan && // within length limit
+             (endS==maxS || m_source[endS]->GetAlignment().size()==0)); // unaligned
+            endS++) {
+
+          // take note that this is a valid phrase alignment
+          m_consistentPhrases.Add(startS, endS, startT, endT, params);
+        }
+      }
+    }
+  }
+}
--- a/phrase-extract/extract-mixed-syntax/AlignedSentence.h
+++ b/phrase-extract/extract-mixed-syntax/AlignedSentence.h
@ -0,0 +1,51 @@
+/*
+ * AlignedSentence.h
+ *
+ *  Created on: 18 Feb 2014
+ *      Author: s0565741
+ */
+#pragma once
+
+#include <string>
+#include <set>
+#include "ConsistentPhrases.h"
+#include "Phrase.h"
+#include "moses/TypeDef.h"
+
+class Parameter;
+
+/**
+ * One sentence pair with its word alignment and the consistent phrase
+ * pairs extracted from it.
+ *
+ * The Word objects held in m_source and m_target are allocated by
+ * PopulateWordVec() and handed to Moses::RemoveAllInColl in the destructor.
+ * NOTE(review): no copy constructor/assignment is declared, so copying an
+ * instance would share those Word pointers between two owners — confirm
+ * instances are never copied.
+ */
+class AlignedSentence {
+public:
+	// Only records the line number; no words or alignments are populated.
+	// Used by AlignedSentenceSyntax, which populates them in its Create().
+	AlignedSentence(int lineNum)
+	:m_lineNum(lineNum)
+	{}
+
+	// Tokenizes source and target and links the words according to `alignment`.
+	AlignedSentence(int lineNum,
+			const std::string &source,
+			const std::string &target,
+			const std::string &alignment);
+	virtual ~AlignedSentence();
+
+	// Extracts the consistent phrases and attaches non-terminal labels to them.
+	virtual void Create(const Parameter &params);
+
+	// Moses::Input selects the source phrase; any other direction the target.
+	const Phrase &GetPhrase(Moses::FactorDirection direction) const
+	{ return (direction == Moses::Input) ? m_source : m_target; }
+
+	const ConsistentPhrases &GetConsistentPhrases() const
+	{ return m_consistentPhrases; }
+
+	// Human-readable dump of the sentence pair and its consistent phrases.
+	virtual std::string Debug() const;
+
+	// Line number passed to the constructor (public data member).
+	int m_lineNum;
+protected:
+  Phrase m_source, m_target;
+  ConsistentPhrases m_consistentPhrases;
+
+  	// Fills m_consistentPhrases from the populated words and alignments.
+  	void CreateConsistentPhrases(const Parameter &params);
+
+	// Tokenizes `line` and fills `vec` with newly allocated Word objects.
+	void PopulateWordVec(Phrase &vec, const std::string &line);
+
+	// m_source and m_target MUST be populated before calling this
+	void PopulateAlignment(const std::string &line);
+
+	// Number of alignment entries for each source word, indexed by position.
+	std::vector<int> GetSourceAlignmentCount() const;
+};
+
+
--- a/phrase-extract/extract-mixed-syntax/AlignedSentenceSyntax.cpp
+++ b/phrase-extract/extract-mixed-syntax/AlignedSentenceSyntax.cpp
@ -0,0 +1,183 @@
+/*
+ * AlignedSentenceSyntax.cpp
+ *
+ *  Created on: 26 Feb 2014
+ *      Author: hieu
+ */
+
+#include <iostream>
+#include "AlignedSentenceSyntax.h"
+#include "Parameter.h"
+#include "pugixml.hpp"
+#include "moses/Util.h"
+
+using namespace std;
+
+/**
+ * Stores the three raw input lines. Nothing is parsed here: the words,
+ * trees and alignments are populated later by Create().
+ */
+AlignedSentenceSyntax::AlignedSentenceSyntax(int lineNum,
+		const std::string &source,
+		const std::string &target,
+		const std::string &alignment)
+	: AlignedSentence(lineNum)
+	, m_sourceStr(source)
+	, m_targetStr(target)
+	, m_alignmentStr(alignment)
+{
+}
+
+/** Nothing to release in this class itself. */
+AlignedSentenceSyntax::~AlignedSentenceSyntax()
+{
+}
+
+/**
+ * Fill `phrase` and `tree` for one side of the sentence pair.
+ *
+ * @param isSyntax        true if `line` is a parse tree in XML markup
+ * @param mixedSyntaxType 0 = syntax labels only; any other value also sets
+ *                        the Hiero label on the tree; 2 additionally calls
+ *                        SyntaxTree::AddToAll with it
+ * @param line            raw input line for this side (taken by value
+ *                        because it is wrapped in XML locally)
+ */
+void AlignedSentenceSyntax::Populate(bool isSyntax, int mixedSyntaxType, const Parameter &params,
+		string line, Phrase &phrase, SyntaxTree &tree)
+{
+	if (!isSyntax) {
+		// plain text side: words only, labelled with the Hiero non-terminal
+		PopulateWordVec(phrase, line);
+		tree.SetHieroLabel(params.hieroNonTerm);
+		return;
+	}
+
+	// parse tree side: wrap in a root element so the whole line is one span
+	line = "<xml><tree label=\"X\">" + line + "</tree></xml>";
+	XMLParse(phrase, tree, line, params);
+
+	if (mixedSyntaxType == 0) {
+		return;
+	}
+
+	// mixed syntax. Always add [X] where there isn't 1
+	tree.SetHieroLabel(params.hieroNonTerm);
+	if (mixedSyntaxType == 2) {
+		tree.AddToAll(params.hieroNonTerm);
+	}
+}
+
+/**
+ * Populate both sides from the stored input lines, link the alignments,
+ * extract the consistent phrases and attach the non-terminal labels found
+ * in the source and target trees.
+ */
+void AlignedSentenceSyntax::Create(const Parameter &params)
+{
+	// words and trees for each side
+	Populate(params.sourceSyntax, params.mixedSyntaxType, params,
+			m_sourceStr, m_source, m_sourceTree);
+	Populate(params.targetSyntax, params.mixedSyntaxType, params,
+			m_targetStr, m_target, m_targetTree);
+
+	// alignments need both word vectors, phrases need the alignments
+	PopulateAlignment(m_alignmentStr);
+	CreateConsistentPhrases(params);
+
+	// create labels
+	CreateNonTerms();
+}
+
+/**
+ * Replace reserved characters in `text` with entity references, in place.
+ * The ampersand is handled first so that the ampersands introduced by the
+ * later replacements are not escaped a second time.
+ */
+void Escape(string &text)
+{
+	static const char *const kReplacements[][2] = {
+		{ "&",  "&amp;"  },
+		{ "|",  "&#124;" },
+		{ "<",  "&lt;"   },
+		{ ">",  "&gt;"   },
+		{ "'",  "&apos;" },
+		{ "\"", "&quot;" },
+		{ "[",  "&#91;"  },
+		{ "]",  "&#93;"  }
+	};
+	const size_t numReplacements = sizeof(kReplacements) / sizeof(kReplacements[0]);
+
+	for (size_t i = 0; i < numReplacements; ++i) {
+		text = Moses::Replace(text, kReplacements[i][0], kReplacements[i][1]);
+	}
+}
+
+/**
+ * Walk the children of `parentNode` depth-first, appending the words found
+ * to `output` and registering every labelled span in `tree`.
+ *
+ * A child with a non-empty node name is recursed into, and its "label"
+ * attribute (if any) becomes the span label. The child's own value() is
+ * escaped, tokenized and appended as words. A span runs from the size of
+ * `output` before the child was processed to the last word appended.
+ */
+void AlignedSentenceSyntax::XMLParse(Phrase &output,
+		SyntaxTree &tree,
+		const pugi::xml_node &parentNode,
+		const Parameter &params)
+{
+	for (pugi::xml_node childNode = parentNode.first_child(); childNode; childNode = childNode.next_sibling())
+	{
+		string nodeName = childNode.name();
+
+		// span label
+		string label;
+		int startPos = output.size();
+
+		if (!nodeName.empty()) {
+			pugi::xml_attribute attribute = childNode.attribute("label");
+			label = attribute.as_string();
+
+			// recursively call this function. For proper recursive trees
+			XMLParse(output, tree, childNode, params);
+		}
+
+		// fill phrase vector
+		string text = childNode.value();
+		Escape(text);
+
+		std::vector<string> toks;
+		Moses::Tokenize(toks, text);
+
+		for (size_t i = 0; i < toks.size(); ++i) {
+			const string &tok = toks[i];
+			Word *word = new Word(output.size(), tok);
+			output.push_back(word);
+		}
+
+		// inclusive end of the span covered by this child
+		int endPos = output.size() - 1;
+
+		// fill syntax labels; only labelled spans are recorded
+		if (!label.empty()) {
+			label = "[" + label + "]";
+			tree.Add(startPos, endPos, label, params);
+		}
+	}
+
+}
+
+/**
+ * Parse `input` as an XML document and populate `output` and `tree` from
+ * the children of its top-level <xml> element.
+ *
+ * A parse failure is reported on stderr together with the sentence line
+ * number. Processing then continues with whatever pugixml managed to load,
+ * which is what happened silently before the warning was added.
+ */
+void AlignedSentenceSyntax::XMLParse(Phrase &output,
+		SyntaxTree &tree,
+		const std::string input,
+		const Parameter &params)
+{
+	pugi::xml_document doc;
+	pugi::xml_parse_result result = doc.load(input.c_str(),
+			pugi::parse_default | pugi::parse_comments);
+
+	if (!result) {
+		cerr << "WARNING: XML parse error on line " << m_lineNum
+			<< ": " << result.description() << endl;
+	}
+
+	pugi::xml_node topNode = doc.child("xml");
+	XMLParse(output, tree, topNode, params);
+}
+
+/**
+ * For every source span, look up its labels in the source tree and, for
+ * each consistent phrase on that span, the labels of its target span in
+ * the target tree; then attach all label combinations to the phrase.
+ */
+void AlignedSentenceSyntax::CreateNonTerms()
+{
+	typedef ConsistentPhrases::Coll::iterator PhraseIter;
+
+	for (int sourceStart = 0; sourceStart < m_source.size(); ++sourceStart) {
+		for (int sourceEnd = sourceStart; sourceEnd < m_source.size(); ++sourceEnd) {
+			ConsistentPhrases::Coll &phrases = m_consistentPhrases.GetColl(sourceStart, sourceEnd);
+			const SyntaxTree::Labels &sourceLabels = m_sourceTree.Find(sourceStart, sourceEnd);
+
+			for (PhraseIter iter = phrases.begin(); iter != phrases.end(); ++iter) {
+				ConsistentPhrase &cp = **iter;
+
+				// corners[2] and corners[3] hold the target start and end
+				const SyntaxTree::Labels &targetLabels = m_targetTree.Find(cp.corners[2], cp.corners[3]);
+
+				CreateNonTerms(cp, sourceLabels, targetLabels);
+			}
+		}
+	}
+}
+
+/**
+ * Add one non-terminal to `cp` for every (source label, target label)
+ * combination.
+ */
+void AlignedSentenceSyntax::CreateNonTerms(ConsistentPhrase &cp,
+		const SyntaxTree::Labels &sourceLabels,
+		const SyntaxTree::Labels &targetLabels)
+{
+	typedef SyntaxTree::Labels::const_iterator LabelIter;
+
+	for (LabelIter src = sourceLabels.begin(); src != sourceLabels.end(); ++src) {
+		for (LabelIter tgt = targetLabels.begin(); tgt != targetLabels.end(); ++tgt) {
+			const string &sourceLabel = *src;
+			const string &targetLabel = *tgt;
+			cp.AddNonTerms(sourceLabel, targetLabel);
+		}
+	}
+}
+
+
--- a/phrase-extract/extract-mixed-syntax/AlignedSentenceSyntax.h
+++ b/phrase-extract/extract-mixed-syntax/AlignedSentenceSyntax.h
@ -0,0 +1,46 @@
+/*
+ * AlignedSentenceSyntax.h
+ *
+ *  Created on: 26 Feb 2014
+ *      Author: hieu
+ */
+
+#pragma once
+
+#include "AlignedSentence.h"
+#include "SyntaxTree.h"
+#include "pugixml.hpp"
+
+/**
+ * AlignedSentence variant for input where the source and/or target side is
+ * a parse tree in XML markup. The constructor only stores the raw lines;
+ * all parsing happens in Create().
+ */
+class AlignedSentenceSyntax : public AlignedSentence
+{
+public:
+	AlignedSentenceSyntax(int lineNum,
+			const std::string &source,
+			const std::string &target,
+			const std::string &alignment);
+	virtual ~AlignedSentenceSyntax();
+
+	// Overrides AlignedSentence::Create: populates words, trees and
+	// alignments, extracts consistent phrases and labels them.
+	void Create(const Parameter &params);
+
+	//virtual std::string Debug() const;
+protected:
+	// Raw input lines kept from the constructor until Create() is called.
+	std::string m_sourceStr, m_targetStr, m_alignmentStr;
+	// Span labels for each side.
+	SyntaxTree m_sourceTree, m_targetTree;
+
+	// Parses `input` as XML and fills `output` and `tree` from it.
+	void XMLParse(Phrase &output,
+			SyntaxTree &tree,
+			const std::string input,
+			const Parameter &params);
+	// Recursive worker: processes the children of `parentNode`.
+	void XMLParse(Phrase &output,
+			SyntaxTree &tree,
+			const pugi::xml_node &parentNode,
+			const Parameter &params);
+	// Attaches tree labels to every consistent phrase.
+	void CreateNonTerms();
+	// Adds every source/target label combination to `cp`.
+	void CreateNonTerms(ConsistentPhrase &cp,
+			const SyntaxTree::Labels &sourceLabels,
+			const SyntaxTree::Labels &targetLabels);
+	// Fills `phrase` and `tree` for one side; `isSyntax` selects XML parsing
+	// over plain tokenization.
+	void Populate(bool isSyntax, int mixedSyntaxType, const Parameter &params,
+			std::string line, Phrase &phrase, SyntaxTree &tree);
+
+};
+
--- a/phrase-extract/extract-mixed-syntax/ConsistentPhrase.cpp
+++ b/phrase-extract/extract-mixed-syntax/ConsistentPhrase.cpp
@ -0,0 +1,66 @@
+/*
+ * ConsistentPhrase.cpp
+ *
+ *  Created on: 20 Feb 2014
+ *      Author: hieu
+ */
+#include <sstream>
+#include "ConsistentPhrase.h"
+#include "Word.h"
+#include "NonTerm.h"
+#include "Parameter.h"
+
+using namespace std;
+
+/**
+ * Records the four span boundaries in `corners`, in the order
+ * source start, source end, target start, target end, and creates the
+ * Hiero non-terminal with params.hieroNonTerm on both sides.
+ */
+ConsistentPhrase::ConsistentPhrase(
+		int sourceStart, int sourceEnd,
+		int targetStart, int targetEnd,
+		const Parameter &params)
+:corners(4)
+,m_hieroNonTerm(*this, params.hieroNonTerm, params.hieroNonTerm)
+{
+	const int bounds[4] = { sourceStart, sourceEnd, targetStart, targetEnd };
+	corners.assign(bounds, bounds + 4);
+}
+
+/** No manual cleanup is performed here. */
+ConsistentPhrase::~ConsistentPhrase()
+{
+}
+
+/** Orders phrases by comparing their `corners` vectors. */
+bool ConsistentPhrase::operator<(const ConsistentPhrase &other) const
+{
+  const std::vector<int> &mine = corners;
+  const std::vector<int> &theirs = other.corners;
+  return mine < theirs;
+}
+
+/** Append a non-terminal with the given source and target labels. */
+void ConsistentPhrase::AddNonTerms(const std::string &source,
+					const std::string &target)
+{
+	NonTerm nonTerm(*this, source, target);
+	m_nonTerms.push_back(nonTerm);
+}
+
+/**
+ * True if the target spans of this phrase and `other` share at least one
+ * position (both spans are inclusive).
+ */
+bool ConsistentPhrase::TargetOverlap(const ConsistentPhrase &other) const
+{
+	const bool otherEndsBefore = other.corners[3] < corners[2];
+	const bool otherStartsAfter = other.corners[2] > corners[3];
+
+	return !(otherEndsBefore || otherStartsAfter);
+}
+
+/**
+ * Formats the phrase as "[s0-s1][t0-t1]NT:" followed by the
+ * "source:target" labels of every non-terminal, with no separator between
+ * consecutive non-terminals.
+ */
+std::string ConsistentPhrase::Debug() const
+{
+  stringstream strm;
+
+  strm << "[" << corners[0] << "-" << corners[1];
+  strm << "][" << corners[2] << "-" << corners[3] << "]";
+  strm << "NT:";
+
+  for (NonTerms::const_iterator iter = m_nonTerms.begin(); iter != m_nonTerms.end(); ++iter) {
+	  strm << iter->GetLabel(Moses::Input) << ":" << iter->GetLabel(Moses::Output);
+  }
+
+  return strm.str();
+}
+
+
--- a/phrase-extract/extract-mixed-syntax/ConsistentPhrase.h
+++ b/phrase-extract/extract-mixed-syntax/ConsistentPhrase.h
@ -0,0 +1,51 @@
+/*
+ * ConsistentPhrase.h
+ *
+ *  Created on: 20 Feb 2014
+ *      Author: hieu
+ */
+
+#pragma once
+
+#include <cassert>
+#include <vector>
+#include <iostream>
+#include "moses/TypeDef.h"
+#include "NonTerm.h"
+
+/**
+ * A phrase pair that is consistent with the word alignment, identified by
+ * its source and target spans, together with the non-terminal labels that
+ * may stand for it.
+ */
+class ConsistentPhrase
+{
+public:
+	typedef std::vector<NonTerm> NonTerms;
+
+	// Span boundaries, all inclusive:
+	// [0] source start, [1] source end, [2] target start, [3] target end.
+	std::vector<int> corners;
+
+	ConsistentPhrase(const ConsistentPhrase &copy); // do not implement
+	ConsistentPhrase(int sourceStart, int sourceEnd,
+			int targetStart, int targetEnd,
+			const Parameter &params);
+
+	virtual ~ConsistentPhrase();
+
+	// Number of words in the source span (Moses::Input) or, for any other
+	// direction, in the target span.
+	int GetWidth(Moses::FactorDirection direction) const
+	{ return (direction == Moses::Input) ? corners[1] - corners[0] + 1 : corners[3] - corners[2] + 1; }
+
+
+	// Appends a non-terminal with the given source and target labels.
+	void AddNonTerms(const std::string &source,
+						const std::string &target);
+	const NonTerms &GetNonTerms() const
+	{ return m_nonTerms;}
+	// The non-terminal built from params.hieroNonTerm in the constructor.
+	const NonTerm &GetHieroNonTerm() const
+	{ return m_hieroNonTerm;}
+
+	// True if the target spans of this phrase and `other` intersect.
+	bool TargetOverlap(const ConsistentPhrase &other) const;
+
+  // Orders phrases by their `corners` vectors.
+  bool operator<(const ConsistentPhrase &other) const;
+
+  std::string Debug() const;
+
+protected:
+  NonTerms m_nonTerms;
+  NonTerm m_hieroNonTerm;
+};
+
--- a/phrase-extract/extract-mixed-syntax/ConsistentPhrases.cpp
+++ b/phrase-extract/extract-mixed-syntax/ConsistentPhrases.cpp
@ -0,0 +1,103 @@
+/*
+ * ConsistentPhrases.cpp
+ *
+ *  Created on: 20 Feb 2014
+ *      Author: hieu
+ */
+#include <sstream>
+#include <cassert>
+#include "ConsistentPhrases.h"
+#include "NonTerm.h"
+#include "Parameter.h"
+#include "moses/Util.h"
+
+using namespace std;
+
+/** Starts empty; Initialize() must size the table before Add() is used. */
+ConsistentPhrases::ConsistentPhrases() {}
+
+/**
+ * Passes every cell of the table to Moses::RemoveAllInColl; the phrases in
+ * the cells were allocated with new in Add().
+ */
+ConsistentPhrases::~ConsistentPhrases()
+{
+	for (size_t start = 0; start < m_coll.size(); ++start) {
+		std::vector<Coll> &row = m_coll[start];
+
+		for (size_t width = 0; width < row.size(); ++width) {
+			Moses::RemoveAllInColl(row[width]);
+		}
+	}
+}
+
+/**
+ * Size the table for a source sentence of `size` words: one row per source
+ * start position, each row holding one cell per possible span end.
+ */
+void ConsistentPhrases::Initialize(size_t size)
+{
+	m_coll.resize(size);
+
+	size_t sourceStart = 0;
+	while (sourceStart < size) {
+		// a span starting at sourceStart can end at size - sourceStart positions
+		m_coll[sourceStart].resize(size - sourceStart);
+		++sourceStart;
+	}
+}
+
+/**
+ * Allocate a ConsistentPhrase for the given spans and store it in the cell
+ * for its source span. The table must have been sized by Initialize().
+ */
+void ConsistentPhrases::Add(int sourceStart, int sourceEnd,
+		int targetStart, int targetEnd,
+		const Parameter &params)
+{
+  Coll &cell = m_coll[sourceStart][sourceEnd - sourceStart];
+
+  ConsistentPhrase *phrase = new ConsistentPhrase(sourceStart, sourceEnd,
+		  targetStart, targetEnd, params);
+
+  // the set is keyed by pointer, so a freshly allocated phrase is always new
+  pair<Coll::iterator, bool> inserted = cell.insert(phrase);
+  assert(inserted.second);
+}
+
+/** Read-only access to the phrases covering source span [sourceStart, sourceEnd]. */
+const ConsistentPhrases::Coll &ConsistentPhrases::GetColl(int sourceStart, int sourceEnd) const
+{
+	const int offset = sourceEnd - sourceStart;
+	return m_coll[sourceStart][offset];
+}
+
+/** Mutable access to the phrases covering source span [sourceStart, sourceEnd]. */
+ConsistentPhrases::Coll &ConsistentPhrases::GetColl(int sourceStart, int sourceEnd)
+{
+	const int offset = sourceEnd - sourceStart;
+	return m_coll[sourceStart][offset];
+}
+
+/** One line per stored phrase, row by row and cell by cell. */
+std::string ConsistentPhrases::Debug() const
+{
+	std::stringstream strm;
+
+	for (size_t start = 0; start < m_coll.size(); ++start) {
+		const std::vector<Coll> &row = m_coll[start];
+
+		for (size_t width = 0; width < row.size(); ++width) {
+			const Coll &cell = row[width];
+
+			for (Coll::const_iterator iter = cell.begin(); iter != cell.end(); ++iter) {
+				strm << (*iter)->Debug() << endl;
+			}
+		}
+	}
+
+	return strm.str();
+}
+
+/**
+ * Give every stored phrase a non-terminal whose source and target label
+ * are both params.hieroNonTerm.
+ */
+void ConsistentPhrases::AddHieroNonTerms(const Parameter &params)
+{
+	// add [X] labels everywhere
+	for (size_t start = 0; start < m_coll.size(); ++start) {
+		vector<Coll> &row = m_coll[start];
+
+		for (size_t width = 0; width < row.size(); ++width) {
+			Coll &cell = row[width];
+
+			for (Coll::iterator iter = cell.begin(); iter != cell.end(); ++iter) {
+				(*iter)->AddNonTerms(params.hieroNonTerm, params.hieroNonTerm);
+			}
+		}
+	}
+}
+
--- a/phrase-extract/extract-mixed-syntax/ConsistentPhrases.h
+++ b/phrase-extract/extract-mixed-syntax/ConsistentPhrases.h
@ -0,0 +1,40 @@
+/*
+ * ConsistentPhrases.h
+ *
+ *  Created on: 20 Feb 2014
+ *      Author: hieu
+ */
+#pragma once
+
+#include <set>
+#include <vector>
+#include <iostream>
+#include "ConsistentPhrase.h"
+
+class Word;
+class Parameter;
+
+/**
+ * Table of the consistent phrases of one sentence pair, indexed by source
+ * span. The ConsistentPhrase objects are allocated in Add() and handed to
+ * Moses::RemoveAllInColl in the destructor.
+ */
+class ConsistentPhrases {
+public:
+	// Phrases sharing one source span. The set is keyed by pointer value,
+	// not by ConsistentPhrase::operator<.
+	typedef std::set<ConsistentPhrase*> Coll;
+
+	ConsistentPhrases();
+	virtual ~ConsistentPhrases();
+
+	// Sizes the table for a source sentence of `size` words.
+	void Initialize(size_t size);
+
+	// Creates a phrase for the given spans and stores it under its source span.
+	void Add(int sourceStart, int sourceEnd,
+			int targetStart, int targetEnd,
+			const Parameter &params);
+
+	// Adds the params.hieroNonTerm label pair to every stored phrase.
+	void AddHieroNonTerms(const Parameter &params);
+
+	// Phrases whose source span is exactly [sourceStart, sourceEnd].
+	const Coll &GetColl(int sourceStart, int sourceEnd) const;
+	Coll &GetColl(int sourceStart, int sourceEnd);
+
+	std::string Debug() const;
+
+protected:
+	// Indexed as m_coll[sourceStart][sourceEnd - sourceStart].
+	std::vector< std::vector<Coll> > m_coll;
+};
+
--- a/phrase-extract/extract-mixed-syntax/InputFileStream.cpp
+++ b/phrase-extract/extract-mixed-syntax/InputFileStream.cpp
@ -0,0 +1,62 @@
+// $Id: InputFileStream.cpp 2780 2010-01-29 17:11:17Z bojar $
+
+/***********************************************************************
+ Moses - factored phrase-based language decoder
+ Copyright (C) 2006 University of Edinburgh
+ 
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ 
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ 
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ ***********************************************************************/
+
+#include "InputFileStream.h"
+#include "gzfilebuf.h"
+#include <iostream>
+
+using namespace std;
+
+namespace Moses
+{
+	/**
+	 * Open filePath for reading. A path longer than three characters that
+	 * ends in ".gz" is read through a gzfilebuf; anything else through a
+	 * std::filebuf. If the plain file cannot be opened, a message is
+	 * printed to stderr and the process exits with status 1.
+	 */
+	InputFileStream::InputFileStream(const std::string &filePath)
+	: std::istream(NULL)
+	, m_streambuf(NULL)
+	{
+		const size_t pathLen = filePath.size();
+		const bool isGzip = pathLen > 3 && filePath.substr(pathLen - 3, 3) == ".gz";
+
+		if (isGzip) {
+			m_streambuf = new gzfilebuf(filePath.c_str());
+		} else {
+			std::filebuf *fileBuf = new std::filebuf();
+			if (fileBuf->open(filePath.c_str(), std::ios::in) == NULL) {
+				cerr << "Can't read " << filePath.c_str() << endl;
+				exit(1);
+			}
+			m_streambuf = fileBuf;
+		}
+
+		this->init(m_streambuf);
+	}
+
+	/** Deletes the stream buffer created by the constructor. */
+	InputFileStream::~InputFileStream()
+	{
+		delete m_streambuf;
+		m_streambuf = NULL;
+	}
+
+	/** Does nothing; the buffer is released by the destructor. */
+	void InputFileStream::Close()
+	{
+	}
+
+}
+
--- a/phrase-extract/extract-mixed-syntax/InputFileStream.h
+++ b/phrase-extract/extract-mixed-syntax/InputFileStream.h
@ -0,0 +1,48 @@
+// $Id: InputFileStream.h 2939 2010-02-24 11:15:44Z jfouet $
+
+/***********************************************************************
+ Moses - factored phrase-based language decoder
+ Copyright (C) 2006 University of Edinburgh
+ 
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ 
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ 
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ ***********************************************************************/
+
+#ifndef moses_InputFileStream_h
+#define moses_InputFileStream_h
+
+#include <cstdlib>
+#include <fstream>
+#include <string>
+
+namespace Moses
+{
+	
+	/** Used in place of std::istream, can read zipped files if it ends in .gz
+	 */
+	class InputFileStream : public std::istream
+	{
+	protected:
+		// Buffer allocated by the constructor and deleted by the destructor.
+		std::streambuf *m_streambuf;
+	public:
+		
+		// Opens filePath for reading; see InputFileStream.cpp for how the
+		// buffer type is chosen and how open failures are handled.
+		InputFileStream(const std::string &filePath);
+		~InputFileStream();
+		
+		// Currently a no-op in InputFileStream.cpp.
+		void Close();
+	};
+	
+}
+
+#endif
--- a/phrase-extract/extract-mixed-syntax/Main.cpp
+++ b/phrase-extract/extract-mixed-syntax/Main.cpp
@ -0,0 +1,203 @@
+#include <iostream>
+#include <cstdlib>
+#include <boost/program_options.hpp>
+
+#include "Main.h"
+#include "InputFileStream.h"
+#include "OutputFileStream.h"
+#include "AlignedSentence.h"
+#include "AlignedSentenceSyntax.h"
+#include "Parameter.h"
+#include "Rules.h"
+
+using namespace std;
+
+// Global debug switch, off by default. NOTE(review): it is not read anywhere
+// in Main.cpp; presumably other translation units consult it — confirm.
+bool g_debug = false;
+
+/**
+ * Entry point of the mixed-syntax rule extractor.
+ *
+ * Positional arguments: target corpus, source corpus, alignment file and
+ * the path of the extract file to write. The inverse extract is written to
+ * the same path with ".inv" appended, and ".gz" is appended to both when
+ * --GZOutput is given. The remaining options are copied into `params`.
+ *
+ * The three input files are read in parallel, one line per sentence pair.
+ * For every pair the consistent phrases are built, rules are extracted and
+ * written to both extract files. A glue grammar is written afterwards if a
+ * path for it was supplied.
+ *
+ * Returns EXIT_FAILURE on a command line error or when the source or
+ * alignment file has fewer lines than the target file.
+ */
+int main(int argc, char** argv)
+{
+  cerr << "Starting" << endl;
+
+  Parameter params;
+
+  namespace po = boost::program_options;
+  po::options_description desc("Options");
+  desc.add_options()
+    ("help", "Print help messages")
+    ("MaxSpan", po::value<int>()->default_value(params.maxSpan), "Max (source) span of a rule. ie. number of words in the source")
+    ("MinSpan", po::value<int>()->default_value(params.minSpan), "Min (source) span of a rule.")
+    ("GlueGrammar", po::value<string>()->default_value(params.gluePath), "Output glue grammar to here")
+    ("SentenceOffset", po::value<long>()->default_value(params.sentenceOffset), "Starting sentence id. Not used")
+    ("GZOutput", "Compress extract files")
+    ("MaxNonTerm", po::value<int>()->default_value(params.maxNonTerm), "Maximum number of non-terms allowed per rule")
+    ("MaxHieroNonTerm", po::value<int>()->default_value(params.maxHieroNonTerm), "Maximum number of Hiero non-term. Usually, --MaxNonTerm is the normal constraint")
+    ("MinHoleSource", po::value<int>()->default_value(params.minHoleSource), "Minimum source span for a non-term.")
+
+    ("SourceSyntax", "Source sentence is a parse tree")
+    ("TargetSyntax", "Target sentence is a parse tree")
+    ("MixedSyntaxType", po::value<int>()->default_value(params.mixedSyntaxType), "Hieu's Mixed syntax type. 0(default)=no mixed syntax, 1=add [X] only if no syntactic label. 2=add [X] everywhere")
+    ("MultiLabel", po::value<int>()->default_value(params.multiLabel), "What to do with multiple labels on the same span. 0(default)=keep them all, 1=keep only top-most, 2=keep only bottom-most")
+    ("HieroSourceLHS", "Always use Hiero source LHS? Default = 0")
+    ("MaxSpanFreeNonTermSource", po::value<int>()->default_value(params.maxSpanFreeNonTermSource), "Max number of words covered by beginning/end NT. Default = 0 (no limit)")
+    ("NoNieceTerminal", "Don't extract rule if 1 of the non-term covers the same word as 1 of the terminals")
+    ("MaxScope", po::value<int>()->default_value(params.maxScope), "maximum scope (see Hopkins and Langmead (2010)). Default is HIGH")
+    ("MinScope", po::value<int>()->default_value(params.minScope), "min scope.")
+
+    ("SpanLength", "Property - span length of RHS each non-term")
+
+    ("NonTermContext", "Property - (source) left and right, inside and outside words of each non-term ")
+    ("NonTermContextTarget", "Property - (target) left and right, inside and outside words of each non-term")
+    ("NonTermContextFactor", po::value<int>()->default_value(params.nonTermContextFactor), "Factor to use for non-term context property.")
+
+    ("NumSourceFactors", po::value<int>()->default_value(params.numSourceFactors), "Number of source factors.")
+    ("NumTargetFactors", po::value<int>()->default_value(params.numTargetFactors), "Number of target factors.")
+
+    ("HieroNonTerm", po::value<string>()->default_value(params.hieroNonTerm), "Hiero non-terminal label, including bracket")
+    ("ScopeSpan", po::value<string>()->default_value(params.scopeSpanStr), "Min and max span for rules of each scope. Format is min,max:min,max...")
+
+    ("NonTermConsecSource", "Allow consecutive non-terms on the source side");
+
+
+  po::variables_map vm;
+  try
+  {
+    po::store(po::parse_command_line(argc, argv, desc),
+              vm); // can throw
+
+    /** --help option
+     */
+    if ( vm.count("help") || argc < 5 )
+    {
+      // four positional arguments are required: argv[4] is the extract path
+      std::cout << argv[0] << " target source alignment extract [options...]" << std::endl
+                << desc << std::endl;
+      return EXIT_SUCCESS;
+    }
+
+    po::notify(vm); // throws on error, so do after help in case
+                    // there are any problems
+  }
+  catch(po::error& e)
+  {
+    std::cerr << "ERROR: " << e.what() << std::endl << std::endl;
+    std::cerr << desc << std::endl;
+    return EXIT_FAILURE;
+  }
+
+  if (vm.count("MaxSpan")) params.maxSpan = vm["MaxSpan"].as<int>();
+  if (vm.count("MinSpan")) params.minSpan = vm["MinSpan"].as<int>();
+  if (vm.count("GZOutput")) params.gzOutput = true;
+  if (vm.count("GlueGrammar")) params.gluePath = vm["GlueGrammar"].as<string>();
+  if (vm.count("SentenceOffset")) params.sentenceOffset = vm["SentenceOffset"].as<long>();
+  if (vm.count("MaxNonTerm")) params.maxNonTerm = vm["MaxNonTerm"].as<int>();
+  if (vm.count("MaxHieroNonTerm")) params.maxHieroNonTerm = vm["MaxHieroNonTerm"].as<int>();
+  if (vm.count("MinHoleSource")) params.minHoleSource = vm["MinHoleSource"].as<int>();
+
+  if (vm.count("SourceSyntax")) params.sourceSyntax = true;
+  if (vm.count("TargetSyntax")) params.targetSyntax = true;
+  if (vm.count("MixedSyntaxType")) params.mixedSyntaxType = vm["MixedSyntaxType"].as<int>();
+  if (vm.count("MultiLabel")) params.multiLabel = vm["MultiLabel"].as<int>();
+  if (vm.count("HieroSourceLHS")) params.hieroSourceLHS = true;
+  if (vm.count("MaxSpanFreeNonTermSource")) params.maxSpanFreeNonTermSource = vm["MaxSpanFreeNonTermSource"].as<int>();
+  if (vm.count("NoNieceTerminal")) params.nieceTerminal = false;
+  if (vm.count("MaxScope")) params.maxScope = vm["MaxScope"].as<int>();
+  if (vm.count("MinScope")) params.minScope = vm["MinScope"].as<int>();
+
+  // properties
+  if (vm.count("SpanLength")) params.spanLength = true;
+  if (vm.count("NonTermContext")) params.nonTermContext = true;
+  if (vm.count("NonTermContextTarget")) params.nonTermContextTarget = true;
+  if (vm.count("NonTermContextFactor")) params.nonTermContextFactor = vm["NonTermContextFactor"].as<int>();
+
+  if (vm.count("NumSourceFactors")) params.numSourceFactors = vm["NumSourceFactors"].as<int>();
+  if (vm.count("NumTargetFactors")) params.numTargetFactors = vm["NumTargetFactors"].as<int>();
+
+  if (vm.count("HieroNonTerm")) params.hieroNonTerm = vm["HieroNonTerm"].as<string>();
+  if (vm.count("ScopeSpan")) {
+	  params.SetScopeSpan(vm["ScopeSpan"].as<string>());
+  }
+
+  if (vm.count("NonTermConsecSource")) params.nonTermConsecSource = true;
+
+  // input files;
+  string pathTarget = argv[1];
+  string pathSource = argv[2];
+  string pathAlignment = argv[3];
+
+  string pathExtract = argv[4];
+  string pathExtractInv = pathExtract + ".inv";
+  if (params.gzOutput) {
+	  pathExtract += ".gz";
+	  pathExtractInv += ".gz";
+  }
+
+  Moses::InputFileStream strmTarget(pathTarget);
+  Moses::InputFileStream strmSource(pathSource);
+  Moses::InputFileStream strmAlignment(pathAlignment);
+  Moses::OutputFileStream extractFile(pathExtract);
+  Moses::OutputFileStream extractInvFile(pathExtractInv);
+
+
+  // MAIN LOOP
+  int lineNum = 1;
+  string lineTarget, lineSource, lineAlignment;
+  while (getline(strmTarget, lineTarget)) {
+	  if (lineNum % 10000 == 0) {
+		  cerr << lineNum << " ";
+	  }
+
+	  // The stream is tested directly: assigning it to a bool needs an
+	  // implicit conversion that is not available from C++11 onwards.
+	  if (!getline(strmSource, lineSource)) {
+		  cerr << "ERROR: Couldn't read source, line " << lineNum << endl;
+		  return EXIT_FAILURE;
+	  }
+	  if (!getline(strmAlignment, lineAlignment)) {
+		  cerr << "ERROR: Couldn't read alignment, line " << lineNum << endl;
+		  return EXIT_FAILURE;
+	  }
+
+	  /*
+	  cerr << "lineTarget=" << lineTarget << endl;
+	  cerr << "lineSource=" << lineSource << endl;
+	  cerr << "lineAlignment=" << lineAlignment << endl;
+	  */
+
+	  AlignedSentence *alignedSentence;
+
+	  // any syntax on either side needs the tree-aware sentence class
+	  if (params.sourceSyntax || params.targetSyntax) {
+		  alignedSentence = new AlignedSentenceSyntax(lineNum, lineSource, lineTarget, lineAlignment);
+	  }
+	  else {
+		  alignedSentence = new AlignedSentence(lineNum, lineSource, lineTarget, lineAlignment);
+	  }
+
+	  alignedSentence->Create(params);
+	  //cerr << alignedSentence->Debug();
+
+	  Rules rules(*alignedSentence);
+	  rules.Extend(params);
+	  rules.Consolidate(params);
+	  //cerr << rules.Debug();
+
+	  rules.Output(extractFile, true, params);
+	  rules.Output(extractInvFile, false, params);
+
+	  delete alignedSentence;
+
+	  ++lineNum;
+  }
+
+  if (!params.gluePath.empty()) {
+	  Moses::OutputFileStream glueFile(params.gluePath);
+	  CreateGlueGrammar(glueFile);
+  }
+
+  cerr << "Finished" << endl;
+}
+
+/**
+ * Write the three fixed glue rules to glueFile, one per line: sentence
+ * start, sentence end, and the concatenation of [S] with [X].
+ */
+void CreateGlueGrammar(Moses::OutputFileStream &glueFile)
+{
+	glueFile << "<s> [X] ||| <s> [S] ||| 1 ||| ||| 0" << endl;
+	glueFile << "[X][S] </s> [X] ||| [X][S] </s> [S] ||| 1 ||| 0-0 ||| 0" << endl;
+	glueFile << "[X][S] [X][X] [X] ||| [X][S] [X][X] [S] ||| 2.718 ||| 0-0 1-1 ||| 0" << endl;
+}
--- a/phrase-extract/extract-mixed-syntax/Main.h
+++ b/phrase-extract/extract-mixed-syntax/Main.h
@ -0,0 +1,12 @@
+/*
+ * Main.h
+ *
+ *  Created on: 28 Feb 2014
+ *      Author: hieu
+ */
+#pragma once
+
+#include "OutputFileStream.h"
+
+// Writes the fixed set of glue-grammar rules to glueFile.
+void CreateGlueGrammar(Moses::OutputFileStream &glueFile);
+
--- a/phrase-extract/extract-mixed-syntax/Makefile
+++ b/phrase-extract/extract-mixed-syntax/Makefile
@ -0,0 +1,17 @@
+# 'all' and 'clean' are commands, not files: declare them phony so that a
+# stray file with either name cannot stop the rule from running.
+.PHONY: all clean
+
+# Default target: build the extractor binary.
+all: extract-mixed-syntax 
+
+# Remove object files and the binary.
+clean: 
+	rm -f *.o extract-mixed-syntax
+
+# Suffix rule: compile each .cpp into a .o of the same name.
+.cpp.o:
+	g++ -O4 -g -c -I../../../boost/include -I../../../ $<
+
+OBJECTS = AlignedSentence.o ConsistentPhrase.o ConsistentPhrases.o InputFileStream.o \
+	Main.o OutputFileStream.o Parameter.o Phrase.o Rule.o Rules.o RuleSymbol.o \
+	SyntaxTree.o Word.o NonTerm.o RulePhrase.o AlignedSentenceSyntax.o pugixml.o
+
+# Link step: needs zlib, boost_iostreams, boost_program_options and libmoses.
+extract-mixed-syntax: $(OBJECTS) 
+
+	g++ $(OBJECTS)   -L../../../boost/lib64 -L../../../lib -lz -lboost_iostreams-mt -lboost_program_options-mt -lmoses -o extract-mixed-syntax
+
+
--- a/phrase-extract/extract-mixed-syntax/NonTerm.cpp
+++ b/phrase-extract/extract-mixed-syntax/NonTerm.cpp
@ -0,0 +1,65 @@
+/*
+ * NonTerm.cpp
+ *
+ *  Created on: 22 Feb 2014
+ *      Author: hieu
+ */
+
+#include <sstream>
+#include "NonTerm.h"
+#include "Word.h"
+#include "ConsistentPhrase.h"
+#include "Parameter.h"
+
+using namespace std;
+
+// Builds a non-terminal over consistentPhrase with the given source and
+// target labels. Only the address of consistentPhrase is stored, so the
+// phrase must outlive this object.
+NonTerm::NonTerm(const ConsistentPhrase &consistentPhrase,
+				const std::string &source,
+				const std::string &target)
+	: m_consistentPhrase(&consistentPhrase),
+	  m_source(source),
+	  m_target(target)
+{
+}
+
+// Nothing to release: the consistent phrase is only referenced, not owned.
+NonTerm::~NonTerm()
+{
+}
+
+// Returns source label + target label + the consistent phrase's own
+// Debug() text, concatenated with no separators.
+std::string NonTerm::Debug() const
+{
+	std::string text(m_source);
+	text += m_target;
+	text += m_consistentPhrase->Debug();
+	return text;
+}
+
+// Writes the source label immediately followed by the target label.
+void NonTerm::Output(std::ostream &out) const
+{
+	out << m_source;
+	out << m_target;
+}
+
+// Writes only the label belonging to the requested side.
+void NonTerm::Output(std::ostream &out, Moses::FactorDirection direction) const
+{
+	const std::string &label = GetLabel(direction);
+	out << label;
+}
+
+// Returns the source label for Moses::Input, the target label otherwise.
+const std::string &NonTerm::GetLabel(Moses::FactorDirection direction) const
+{
+	if (direction == Moses::Input) {
+		return m_source;
+	}
+	return m_target;
+}
+
+// True when the label on the given side equals params.hieroNonTerm.
+bool NonTerm::IsHiero(Moses::FactorDirection direction, const Parameter &params) const
+{
+	return NonTerm::GetLabel(direction) == params.hieroNonTerm;
+}
+
+// True only when both the source and the target label are the hiero label.
+bool NonTerm::IsHiero(const Parameter &params) const
+{
+	if (!IsHiero(Moses::Input, params)) {
+		return false;
+	}
+	return IsHiero(Moses::Output, params);
+}
+
+// Width of the underlying consistent phrase on the given side.
+int NonTerm::GetWidth(Moses::FactorDirection direction) const
+{
+	const ConsistentPhrase &phrase = GetConsistentPhrase();
+	return phrase.GetWidth(direction);
+}
--- a/phrase-extract/extract-mixed-syntax/NonTerm.h
+++ b/phrase-extract/extract-mixed-syntax/NonTerm.h
@ -0,0 +1,47 @@
+/*
+ * NonTerm.h
+ *
+ *  Created on: 22 Feb 2014
+ *      Author: hieu
+ */
+#pragma once
+#include <string>
+#include "RuleSymbol.h"
+#include "moses/TypeDef.h"
+
+class ConsistentPhrase;
+class Parameter;
+
+// A non-terminal rule symbol: a consistent phrase plus one label for the
+// source side and one for the target side.
+class NonTerm : public RuleSymbol
+{
+public:
+
+	// Stores the address of consistentPhrase (not a copy) and copies the labels.
+	NonTerm(const ConsistentPhrase &consistentPhrase,
+			const std::string &source,
+			const std::string &target);
+	virtual ~NonTerm();
+
+	const ConsistentPhrase &GetConsistentPhrase() const
+	{ return *m_consistentPhrase; }
+
+	// Width of the consistent phrase on the given side.
+	int GetWidth(Moses::FactorDirection direction) const;
+
+	virtual bool IsNonTerm() const
+	{ return true; }
+
+	// Source label immediately followed by target label.
+	std::string GetString() const
+	{ return m_source + m_target; }
+
+	// Labels followed by the consistent phrase's Debug() text.
+	virtual std::string Debug() const;
+	// Writes both labels (source then target).
+	virtual void Output(std::ostream &out) const;
+	// Writes only the label for the given side.
+  void Output(std::ostream &out, Moses::FactorDirection direction) const;
+
+	// Source label for Moses::Input, target label otherwise.
+  const std::string &GetLabel(Moses::FactorDirection direction) const;
+	// True when the label on the given side equals params.hieroNonTerm.
+  bool IsHiero(Moses::FactorDirection direction, const Parameter &params) const;
+	// True when both sides carry the hiero label.
+  bool IsHiero(const Parameter &params) const;
+
+protected:
+	const ConsistentPhrase *m_consistentPhrase;	// not owned
+	std::string m_source, m_target;			// side labels
+};
+
--- a/phrase-extract/extract-mixed-syntax/OutputFileStream.cpp
+++ b/phrase-extract/extract-mixed-syntax/OutputFileStream.cpp
@ -0,0 +1,79 @@
+// $Id: OutputFileStream.cpp 2780 2010-01-29 17:11:17Z bojar $
+
+/***********************************************************************
+ Moses - factored phrase-based language decoder
+ Copyright (C) 2006 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ ***********************************************************************/
+
+#include <boost/iostreams/filter/gzip.hpp>
+#include "OutputFileStream.h"
+#include "gzfilebuf.h"
+
+using namespace std;
+
+namespace Moses
+{
+// Creates a stream with an empty filter chain and no file attached.
+OutputFileStream::OutputFileStream()
+  : boost::iostreams::filtering_ostream(),
+    m_outFile(NULL)
+{
+}
+
+// Creates the stream and tries to open filePath straight away.
+// The result of Open() is discarded here, so a failed open is not reported
+// to the caller by this constructor.
+OutputFileStream::OutputFileStream(const std::string &filePath)
+  : m_outFile(NULL)
+{
+  Open(filePath);
+}
+
+// Closes the underlying file (if any) before the stream goes away.
+OutputFileStream::~OutputFileStream()
+{
+  Close();
+}
+
+// Opens filePath for binary writing and attaches it to this stream.
+// If filePath ends in ".gz" a gzip compressor is pushed in front of the file.
+// Returns false (leaving no file attached) when the file cannot be opened,
+// true otherwise.
+bool OutputFileStream::Open(const std::string &filePath)
+{
+  m_outFile = new ofstream(filePath.c_str(), ios_base::out | ios_base::binary);
+  if (m_outFile->fail()) {
+    // Nothing has been pushed onto the filter chain yet. Close() pops the
+    // chain whenever m_outFile is non-NULL, so release the stream here and
+    // reset the pointer; otherwise the destructor would pop an empty chain
+    // after a failed open.
+    delete m_outFile;
+    m_outFile = NULL;
+    return false;
+  }
+
+  if (filePath.size() > 3 && filePath.substr(filePath.size() - 3, 3) == ".gz") {
+    this->push(boost::iostreams::gzip_compressor());
+  }
+  this->push(*m_outFile);
+
+  return true;
+}
+
+// Flushes pending output, detaches the file from the filter chain, then
+// closes and frees it. Does nothing when no file is attached.
+void OutputFileStream::Close()
+{
+  if (m_outFile != NULL) {
+    this->flush();
+    this->pop(); // file
+
+    m_outFile->close();
+    delete m_outFile;
+    m_outFile = NULL;
+  }
+}
+
+
+}
+
--- a/phrase-extract/extract-mixed-syntax/OutputFileStream.h
+++ b/phrase-extract/extract-mixed-syntax/OutputFileStream.h
@ -0,0 +1,50 @@
+// $Id: InputFileStream.h 2939 2010-02-24 11:15:44Z jfouet $
+
+/***********************************************************************
+ Moses - factored phrase-based language decoder
+ Copyright (C) 2006 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ ***********************************************************************/
+
+#pragma once
+
+#include <cstdlib>
+#include <fstream>
+#include <string>
+#include <iostream>
+#include <boost/iostreams/filtering_stream.hpp>
+
+namespace Moses
+{
+
+/** Used in place of std::ostream. Output is gzip-compressed when the file
+ *  path ends in .gz.
+ */
+class OutputFileStream : public boost::iostreams::filtering_ostream
+{
+protected:
+  // Underlying file; NULL when no file is attached.
+  std::ofstream *m_outFile;
+public:
+  // Creates a stream with no file attached.
+  OutputFileStream();
+
+  // Creates the stream and calls Open(filePath); the result is not reported.
+  OutputFileStream(const std::string &filePath);
+  // Calls Close().
+  virtual ~OutputFileStream();
+
+  // Opens filePath for writing; returns false if the file could not be opened.
+  bool Open(const std::string &filePath);
+  // Flushes and closes the attached file; does nothing when none is attached.
+  void Close();
+};
+
+}
+
--- a/phrase-extract/extract-mixed-syntax/Parameter.cpp
+++ b/phrase-extract/extract-mixed-syntax/Parameter.cpp
@ -0,0 +1,69 @@
+/*
+ * Parameter.cpp
+ *
+ *  Created on: 17 Feb 2014
+ *      Author: hieu
+ */
+#include "Parameter.h"
+#include "moses/Util.h"
+#include "util/exception.hh"
+
+using namespace std;
+
+// Sets every option to its default value. gluePath, scopeSpanStr and
+// scopeSpan are left default-constructed (empty).
+Parameter::Parameter()
+	// span and symbol limits
+	: maxSpan(10),
+	  minSpan(0),
+	  maxNonTerm(2),
+	  maxHieroNonTerm(999),
+	  maxSymbolsTarget(999),
+	  maxSymbolsSource(5),
+	  minHoleSource(2),
+	  sentenceOffset(0),
+	  // general switches
+	  nonTermConsecSource(false),
+	  requireAlignedWord(true),
+	  fractionalCounting(true),
+	  gzOutput(false),
+	  // labels / syntax
+	  hieroNonTerm("[X]"),
+	  sourceSyntax(false),
+	  targetSyntax(false),
+	  // mixed-syntax options
+	  mixedSyntaxType(0),
+	  multiLabel(0),
+	  nonTermConsecSourceMixed(true),
+	  hieroSourceLHS(false),
+	  maxSpanFreeNonTermSource(0),
+	  nieceTerminal(true),
+	  maxScope(UNDEFINED),
+	  minScope(0),
+	  // extra properties written with each rule
+	  spanLength(false),
+	  nonTermContext(false),
+	  nonTermContextTarget(false),
+	  nonTermContextFactor(0),
+	  // factors
+	  numSourceFactors(1),
+	  numTargetFactors(1)
+{
+}
+
+// No resources to release.
+Parameter::~Parameter()
+{
+}
+
+// Parses a "min,max:min,max:..." specification. The raw text is kept in
+// scopeSpanStr and one (min,max) pair per ':'-separated entry is appended
+// to scopeSpan. An entry that does not split into exactly two integers
+// triggers UTIL_THROW_IF2.
+void Parameter::SetScopeSpan(const std::string &str)
+{
+	scopeSpanStr = str;
+
+	vector<string> entries;
+	Moses::Tokenize(entries, str, ":");
+
+	vector<string>::const_iterator entry;
+	for (entry = entries.begin(); entry != entries.end(); ++entry) {
+		vector<int> bounds;
+		Moses::Tokenize<int>(bounds, *entry, ",");
+		UTIL_THROW_IF2(bounds.size() != 2, "Format is min,max:min,max... String is " << *entry);
+
+		scopeSpan.push_back(std::pair<int,int>(bounds[0], bounds[1]));
+	}
+}
--- a/phrase-extract/extract-mixed-syntax/Parameter.h
+++ b/phrase-extract/extract-mixed-syntax/Parameter.h
@ -0,0 +1,62 @@
+/*
+ * Parameter.h
+ *
+ *  Created on: 17 Feb 2014
+ *      Author: hieu
+ */
+#pragma once
+
+#include <string>
+#include <limits>
+#include <vector>
+
+#define UNDEFINED	std::numeric_limits<int>::max()
+
+// Plain bag of extraction options. Defaults are set in the constructor;
+// the value in brackets after each member is that default.
+class Parameter
+{
+public:
+  Parameter();
+  virtual ~Parameter();
+
+  int maxSpan;			// [10]
+  int minSpan;			// [0]
+  int maxNonTerm;		// [2] max non-terms in a rule's source side
+  int maxHieroNonTerm;		// [999] max non-terms whose labels are both hieroNonTerm
+  int maxSymbolsTarget;		// [999]
+  int maxSymbolsSource;		// [5] max symbols on a rule's source side
+  int minHoleSource;		// [2] min source width of a non-term
+
+  long sentenceOffset;		// [0]
+
+  bool nonTermConsecSource;	// [false] when false, adjacent source non-terms are restricted
+  bool requireAlignedWord;	// [true] rule must contain an aligned source terminal
+  bool fractionalCounting;	// [true]
+  bool gzOutput;		// [false]
+
+  std::string hieroNonTerm;	// ["[X]"] label treated as the hiero non-term
+  std::string gluePath;		// [empty] glue grammar is written here when non-empty
+
+  bool sourceSyntax, targetSyntax;	// [false, false]
+
+  int mixedSyntaxType, multiLabel;	// [0, 0]
+  bool nonTermConsecSourceMixed;	// [true]
+  bool hieroSourceLHS;			// [false]
+  int maxSpanFreeNonTermSource;		// [0] 0 disables; else max source width of a first/last non-term
+  bool nieceTerminal;			// [true] when false, a non-term may not cover a word that is also a rule terminal
+  int maxScope, minScope;		// [UNDEFINED, 0] bounds on rule scope
+
+  // properties
+  bool spanLength;		// [false] emit {{SpanLength ...}}
+  bool nonTermContext;		// [false] emit {{NonTermContext ...}}
+  bool nonTermContextTarget;	// [false] emit {{NonTermContextTarget ...}}
+  int nonTermContextFactor;	// [0] factor index used for the context words
+
+  int numSourceFactors, numTargetFactors;	// [1, 1]
+
+  std::string scopeSpanStr;			// raw text last given to SetScopeSpan()
+  std::vector<std::pair<int,int> > scopeSpan;	// (min,max) source width, indexed by scope
+
+  // Parses "min,max:min,max..." into scopeSpan (appending) and keeps the raw text.
+  void SetScopeSpan(const std::string &str);
+
+};
+
--- a/phrase-extract/extract-mixed-syntax/Phrase.cpp
+++ b/phrase-extract/extract-mixed-syntax/Phrase.cpp
@ -0,0 +1,14 @@
+#include <sstream>
+#include "Phrase.h"
+
+// Returns the Debug() text of every word, each followed by one space.
+std::string Phrase::Debug() const
+{
+	std::stringstream out;
+
+	for (const_iterator iter = begin(); iter != end(); ++iter) {
+		const Word *word = *iter;
+		out << word->Debug() << " ";
+	}
+
+	return out.str();
+}
--- a/phrase-extract/extract-mixed-syntax/Phrase.h
+++ b/phrase-extract/extract-mixed-syntax/Phrase.h
@ -0,0 +1,19 @@
+#pragma once
+
+#include <vector>
+#include "Word.h"
+
+// A sequence of terminals, held as raw Word pointers.
+// The class adds no destructor, so it never deletes the words it holds.
+class Phrase : public std::vector<Word*>
+{
+public:
+	// empty phrase
+	Phrase() {}
+
+	// phrase with 'size' slots, each value-initialised
+	Phrase(size_t size)
+		: std::vector<Word*>(size)
+	{
+	}
+
+	// Debug() text of every word, each followed by a space
+	std::string Debug() const;
+};
--- a/phrase-extract/extract-mixed-syntax/Rule.cpp
+++ b/phrase-extract/extract-mixed-syntax/Rule.cpp
@ -0,0 +1,637 @@
+/*
+ * Rule.cpp
+ *
+ *  Created on: 20 Feb 2014
+ *      Author: hieu
+ */
+
+#include <sstream>
+#include <algorithm>
+#include "Rule.h"
+#include "AlignedSentence.h"
+#include "ConsistentPhrase.h"
+#include "NonTerm.h"
+#include "Parameter.h"
+
+using namespace std;
+
+// Creates the initial rule for lhsNonTerm: no non-terms yet, so the source
+// side consists of terminals only. Only references to lhsNonTerm and
+// alignedSentence are kept; both must outlive the rule.
+//
+// Initialisers are listed in member-declaration order. m_count now starts at
+// 0 (it was previously left uninitialised) so that GetCount()/Output() never
+// read an indeterminate value before SetCount() is called.
+Rule::Rule(const NonTerm &lhsNonTerm, const AlignedSentence &alignedSentence)
+:m_lhs(lhsNonTerm)
+,m_alignedSentence(alignedSentence)
+,m_count(0)
+,m_isValid(true)
+,m_canRecurse(true)
+{
+	CreateSource();
+}
+
+// Extends 'copy' with one more non-term: the new rule has the same LHS and
+// sentence, the non-terms of 'copy' followed by nonTerm, and a freshly built
+// source side. Validity flags start out true again.
+//
+// Initialisers are listed in member-declaration order. m_count now starts at
+// 0 (it was previously left uninitialised) so that GetCount()/Output() never
+// read an indeterminate value before SetCount() is called.
+Rule::Rule(const Rule &copy, const NonTerm &nonTerm)
+:m_lhs(copy.m_lhs)
+,m_alignedSentence(copy.m_alignedSentence)
+,m_count(0)
+,m_nonterms(copy.m_nonterms)
+,m_isValid(true)
+,m_canRecurse(true)
+{
+	m_nonterms.push_back(&nonTerm);
+	CreateSource();
+}
+
+// Nothing to release: symbols and non-terms are referenced, not owned.
+Rule::~Rule()
+{
+}
+
+// The consistent phrase covered by the rule's left-hand side.
+const ConsistentPhrase &Rule::GetConsistentPhrase() const
+{
+	return m_lhs.GetConsistentPhrase();
+}
+
+// Builds m_source by walking the source span of the LHS phrase
+// (corners[0]..corners[1]). Positions covered by one of m_nonterms (kept in
+// source order) are replaced by that non-term; all others contribute the
+// sentence's terminal at that position.
+//
+// The original re-read m_nonterms[nonTermInd] into the cursor just before
+// advancing it; that store was immediately overwritten and has been removed.
+void Rule::CreateSource()
+{
+  const ConsistentPhrase &lhsPhrase = m_lhs.GetConsistentPhrase();
+
+  // next non-term still to be placed, or NULL once all are used
+  size_t nonTermInd = 0;
+  const NonTerm *nextNonTerm = m_nonterms.empty() ? NULL : m_nonterms[0];
+
+  for (int sourcePos = lhsPhrase.corners[0];
+		  sourcePos <= lhsPhrase.corners[1];
+		  ++sourcePos) {
+
+	  const RuleSymbol *ruleSymbol;
+	  if (nextNonTerm
+			  && nextNonTerm->GetConsistentPhrase().corners[0] <= sourcePos
+			  && sourcePos <= nextNonTerm->GetConsistentPhrase().corners[1]) {
+		  // replace words with non-term; jump to the last word it covers
+		  // (the loop increment then moves past it)
+		  ruleSymbol = nextNonTerm;
+		  sourcePos = nextNonTerm->GetConsistentPhrase().corners[1];
+
+		  // move to next non-term
+		  ++nonTermInd;
+		  nextNonTerm = (nonTermInd < m_nonterms.size()) ? m_nonterms[nonTermInd] : NULL;
+	  }
+	  else {
+		  // terminal
+		  ruleSymbol = m_alignedSentence.GetPhrase(Moses::Input)[sourcePos];
+	  }
+
+	  m_source.Add(ruleSymbol);
+  }
+}
+
+// First source position at which another non-term may begin.
+int Rule::GetNextSourcePosForNonTerm() const
+{
+	if (!m_nonterms.empty()) {
+		// the next non-term starts just to the right of the last one
+		const ConsistentPhrase &lastPhrase = m_nonterms.back()->GetConsistentPhrase();
+		return lastPhrase.corners[1] + 1;
+	}
+
+	// no non-terms so far: may start on the left corner of the LHS span
+	return m_lhs.GetConsistentPhrase().corners[0];
+}
+
+// Human-readable dump: source symbols ||| target symbols ||| alignment
+// pairs ||| LHS, using each symbol's Debug() text.
+std::string Rule::Debug() const
+{
+	stringstream out;
+
+	// source
+	const size_t numSource = m_source.GetSize();
+	for (size_t pos = 0; pos < numSource; ++pos) {
+		out << m_source[pos]->Debug() << " ";
+	}
+
+	// target
+	out << "||| ";
+	const size_t numTarget = m_target.GetSize();
+	for (size_t pos = 0; pos < numTarget; ++pos) {
+		out << m_target[pos]->Debug() << " ";
+	}
+
+	// alignment
+	out << "||| ";
+	for (Alignments::const_iterator iter = m_alignments.begin();
+			iter != m_alignments.end(); ++iter) {
+		out << iter->first << "-" << iter->second << " ";
+	}
+
+	// overall range
+	out << "||| LHS=" << m_lhs.Debug();
+
+	return out.str();
+}
+
+// Writes one extract line for this rule.
+//   forward == true : source LHS ||| target LHS ||| src-tgt alignments ||| count ||| properties
+//   forward == false: target LHS ||| source LHS ||| tgt-src alignments ||| count |||
+// Properties (SpanLength, NonTermContext, NonTermContextTarget) are written
+// only in the forward direction, only when enabled in params, and only when
+// the rule has at least one non-term.
+void Rule::Output(std::ostream &out, bool forward, const Parameter &params) const
+{
+  // pick which side is written first
+  const RulePhrase &firstSide = forward ? m_source : m_target;
+  const RulePhrase &secondSide = forward ? m_target : m_source;
+  const Moses::FactorDirection firstDir = forward ? Moses::Input : Moses::Output;
+  const Moses::FactorDirection secondDir = forward ? Moses::Output : Moses::Input;
+
+  firstSide.Output(out);
+  m_lhs.Output(out, firstDir);
+
+  out << " ||| ";
+
+  secondSide.Output(out);
+  m_lhs.Output(out, secondDir);
+
+  out << " ||| ";
+
+  // alignment, swapped for the inverse direction
+  for (Alignments::const_iterator iter = m_alignments.begin();
+		  iter != m_alignments.end(); ++iter) {
+	  const int left = forward ? iter->first : iter->second;
+	  const int right = forward ? iter->second : iter->first;
+	  out << left << "-" << right << " ";
+  }
+
+  out << "||| ";
+
+  // count
+  out << m_count;
+
+  out << " ||| ";
+
+  // properties: every one of them needs the forward direction and a non-term
+  if (!forward || m_nonterms.empty()) {
+	  return;
+  }
+
+  const size_t numNonTerms = m_nonterms.size();
+
+  // span length
+  if (params.spanLength) {
+	  out << "{{SpanLength ";
+	  for (size_t ind = 0; ind < numNonTerms; ++ind) {
+		  const ConsistentPhrase &cp = m_nonterms[ind]->GetConsistentPhrase();
+		  out << ind << "," << cp.GetWidth(Moses::Input) << "," << cp.GetWidth(Moses::Output) << " ";
+	  }
+	  out << "}} ";
+  }
+
+  // non-term context (source)
+  if (params.nonTermContext) {
+	  out << "{{NonTermContext ";
+	  const int factor = params.nonTermContextFactor;
+	  for (size_t ind = 0; ind < numNonTerms; ++ind) {
+		  NonTermContext(1, factor, ind, m_nonterms[ind]->GetConsistentPhrase(), out);
+	  }
+	  out << "}} ";
+  }
+
+  // non-term context (target)
+  if (params.nonTermContextTarget) {
+	  out << "{{NonTermContextTarget ";
+	  const int factor = params.nonTermContextFactor;
+	  for (size_t ind = 0; ind < numNonTerms; ++ind) {
+		  NonTermContext(2, factor, ind, m_nonterms[ind]->GetConsistentPhrase(), out);
+	  }
+	  out << "}} ";
+  }
+}
+
+// Writes the requested factor of word followed by a single space.
+void Rule::NonTermContextFactor(int factor, const Word &word, std::ostream &out) const
+{
+	out << word.GetString(factor);
+	out << " ";
+}
+
+// Writes the context of non-term number ntInd: its index, the word just
+// outside its left edge (or "<s>" at sentence start), its first word, its
+// last word, and the word just outside its right edge (or "</s>" at
+// sentence end). sourceTarget selects the side: 1 = source, 2 = target;
+// any other value aborts.
+void Rule::NonTermContext(int sourceTarget, int factor, size_t ntInd, const ConsistentPhrase &cp, std::ostream &out) const
+{
+	int startPos, endPos;
+	const Phrase *phrase;
+
+	switch (sourceTarget) {
+	case 1:
+		startPos = cp.corners[0];
+		endPos = cp.corners[1];
+		phrase = &m_alignedSentence.GetPhrase(Moses::Input);
+		break;
+	case 2:
+		startPos = cp.corners[2];
+		endPos = cp.corners[3];
+		phrase = &m_alignedSentence.GetPhrase(Moses::Output);
+		break;
+	default:
+		abort();
+	}
+
+	out << ntInd << " ";
+
+	// left outside
+	if (startPos != 0) {
+		NonTermContextFactor(factor, *phrase->at(startPos - 1), out);
+	}
+	else {
+		out << "<s> ";
+	}
+
+	// left inside
+	NonTermContextFactor(factor, *phrase->at(startPos), out);
+
+	// right inside
+	NonTermContextFactor(factor, *phrase->at(endPos), out);
+
+	// right outside
+	if (endPos != phrase->size() - 1) {
+		NonTermContextFactor(factor, *phrase->at(endPos + 1), out);
+	}
+	else {
+		out << "</s> ";
+	}
+}
+
+// Runs the source-side checks on this rule and records the outcome in
+// m_isValid (may this rule itself be kept?) and m_canRecurse (may it be
+// extended with further non-terms?). Most failed checks clear both flags and
+// return at once; a few clear only one flag and carry on, as noted below.
+void Rule::Prevalidate(const Parameter &params)
+{
+  // NOTE(review): this reference is not used; the 'cp' below shadows it.
+  const ConsistentPhrase &cp = m_lhs.GetConsistentPhrase();
+
+  // check number of source symbols in rule
+  // (only m_isValid is cleared and checking continues; m_canRecurse is left alone)
+  if (m_source.GetSize() > params.maxSymbolsSource) {
+	  m_isValid = false;
+  }
+
+  // check that last non-term added isn't too small
+  if (m_nonterms.size()) {
+	  const NonTerm &lastNonTerm = *m_nonterms.back();
+	  const ConsistentPhrase &cp = lastNonTerm.GetConsistentPhrase();
+
+	  int sourceWidth = cp.corners[1]  - cp.corners[0] + 1;
+	  if (sourceWidth < params.minHoleSource) {
+		  m_isValid = false;
+		  m_canRecurse = false;
+		  return;
+	  }
+  }
+
+  // check number of non-terms
+  int numNonTerms = 0;
+  int numHieroNonTerms = 0;
+  for (size_t i = 0; i < m_source.GetSize(); ++i) {
+	  const RuleSymbol *arc = m_source[i];
+	  if (arc->IsNonTerm()) {
+		  ++numNonTerms;
+		  const NonTerm &nonTerm = *static_cast<const NonTerm*>(arc);
+		  bool isHiero = nonTerm.IsHiero(params);
+		  if (isHiero) {
+			  ++numHieroNonTerms;
+		  }
+	  }
+  }
+
+  // reaching the limit stops recursion; exceeding it also invalidates the rule
+  if (numNonTerms >= params.maxNonTerm) {
+	  m_canRecurse = false;
+	  if (numNonTerms > params.maxNonTerm) {
+		  m_isValid = false;
+		  return;
+	  }
+  }
+
+  // same scheme for non-terms that are hiero on both sides
+  if (numHieroNonTerms >= params.maxHieroNonTerm) {
+	  m_canRecurse = false;
+	  if (numHieroNonTerms > params.maxHieroNonTerm) {
+		  m_isValid = false;
+		  return;
+	  }
+  }
+
+  // check if 2 consecutive non-terms in source
+  // (only the last two non-terms are compared)
+  if (!params.nonTermConsecSource && m_nonterms.size() >= 2) {
+	  const NonTerm &lastNonTerm = *m_nonterms.back();
+	  const NonTerm &secondLastNonTerm = *m_nonterms[m_nonterms.size() - 2];
+	  if (secondLastNonTerm.GetConsistentPhrase().corners[1] + 1 ==
+			  lastNonTerm.GetConsistentPhrase().corners[0]) {
+		  if (params.mixedSyntaxType == 0) {
+			  // ordinary hiero or syntax model
+			  m_isValid = false;
+			  m_canRecurse = false;
+			  return;
+		  }
+		  else {
+			  // Hieu's mixed syntax
+			  // (adjacent non-terms are rejected only if both are hiero on the source side)
+			  if (lastNonTerm.IsHiero(Moses::Input, params)
+				  && secondLastNonTerm.IsHiero(Moses::Input, params)) {
+				  m_isValid = false;
+				  m_canRecurse = false;
+				  return;
+			  }
+		  }
+
+	  }
+  }
+
+  //check to see if it overlaps with any other non-terms
+  // (the last non-term is tested against every earlier one, on the target side)
+  if (m_nonterms.size() >= 2) {
+	  const NonTerm &lastNonTerm = *m_nonterms.back();
+
+	  for (size_t i = 0; i < m_nonterms.size() - 1; ++i) {
+		  const NonTerm &otherNonTerm = *m_nonterms[i];
+		  bool overlap = lastNonTerm.GetConsistentPhrase().TargetOverlap(otherNonTerm.GetConsistentPhrase());
+
+		  if (overlap) {
+			  m_isValid = false;
+			  m_canRecurse = false;
+			  return;
+		  }
+	  }
+  }
+
+  // check that at least 1 word is aligned
+  // (only source terminals are considered; non-terms do not count)
+  if (params.requireAlignedWord) {
+	  bool ok = false;
+	  for (size_t i = 0; i < m_source.GetSize(); ++i) {
+		  const RuleSymbol &symbol = *m_source[i];
+		  if (!symbol.IsNonTerm()) {
+			  const Word &word = static_cast<const Word&>(symbol);
+			  if (word.GetAlignment().size()) {
+				  ok = true;
+				  break;
+			  }
+		  }
+	  }
+
+	  if (!ok) {
+		  m_isValid = false;
+		  m_canRecurse = false;
+		  return;
+	  }
+  }
+
+  // limit the source width of a non-term in first or last source position
+  // (a value of 0 disables the check)
+  if (params.maxSpanFreeNonTermSource) {
+	  const NonTerm *front = dynamic_cast<const NonTerm*>(m_source[0]);
+	  if (front) {
+		  int width = front->GetWidth(Moses::Input);
+		  if (width > params.maxSpanFreeNonTermSource) {
+			  m_isValid = false;
+			  m_canRecurse = false;
+			  return;
+		  }
+	  }
+
+	  const NonTerm *back = dynamic_cast<const NonTerm*>(m_source.Back());
+	  if (back) {
+		  int width = back->GetWidth(Moses::Input);
+		  if (width > params.maxSpanFreeNonTermSource) {
+			  m_isValid = false;
+			  m_canRecurse = false;
+			  return;
+		  }
+	  }
+  }
+
+  // reject rules where a non-term covers a word equal to one of the rule's own terminals
+  if (!params.nieceTerminal) {
+	  // collect terminal in a rule
+	  std::set<const Word*> terms;
+	  for (size_t i = 0; i < m_source.GetSize(); ++i) {
+		  const Word *word = dynamic_cast<const Word*>(m_source[i]);
+		  if (word) {
+			  terms.insert(word);
+		  }
+	  }
+
+	  // look in non-terms
+	  for (size_t i = 0; i < m_source.GetSize(); ++i) {
+		  const NonTerm *nonTerm = dynamic_cast<const NonTerm*>(m_source[i]);
+		  if (nonTerm) {
+			  const ConsistentPhrase &cp = nonTerm->GetConsistentPhrase();
+			  bool containTerm = ContainTerm(cp, terms);
+
+			  if (containTerm) {
+				  //cerr << "ruleSource=" << *ruleSource << " ";
+				  //cerr << "ntRange=" << ntRange << endl;
+
+				  // non-term contains 1 of the terms in the rule.
+				  m_isValid = false;
+				  m_canRecurse = false;
+				  return;
+			  }
+		  }
+	  }
+  }
+
+  // scope limits (skipped entirely when neither bound has been set)
+  if (params.maxScope != UNDEFINED || params.minScope > 0) {
+	  int scope = GetScope(params);
+	  if (scope > params.maxScope) {
+		  // scope of subsequent rules will be the same or increase
+		  // therefore can NOT recurse
+		  m_isValid = false;
+		  m_canRecurse = false;
+		  return;
+	  }
+
+	  if (scope < params.minScope) {
+		  // scope of subsequent rules may increase
+		  // therefore can recurse
+		  m_isValid = false;
+	  }
+  }
+
+  // min/max span per scope
+  // (scopeSpan is indexed by the rule's scope; each entry is a (min,max) source width)
+  if (params.scopeSpan.size()) {
+	  int scope = GetScope(params);
+	  if (scope >= params.scopeSpan.size()) {
+		  // no constraint on it. It's ok
+	  }
+	  else {
+		  const std::pair<int,int> &constraint = params.scopeSpan[scope];
+		  int sourceWidth = m_lhs.GetWidth(Moses::Input);
+		  if (sourceWidth < constraint.first || sourceWidth > constraint.second) {
+			  m_isValid = false;
+			  m_canRecurse = false;
+			  return;
+		  }
+	  }
+  }
+}
+
+// Computes the scope of the rule's source side: one point for a non-term in
+// first position, one for every pair of adjacent ambiguous symbols, and one
+// when the last symbol is ambiguous. From the second symbol onward a symbol
+// is ambiguous only if it is a non-term whose source label is the hiero
+// label.
+// NOTE(review): the first symbol counts as ambiguous for any non-term,
+// without the hiero-label test applied to later symbols - confirm intended.
+// (A simpler variant counting only a front and a back non-term used to sit
+// here as commented-out code.)
+int Rule::GetScope(const Parameter &params) const
+{
+	int scope = 0;
+
+	bool prevAmbiguous = m_source[0]->IsNonTerm();
+	if (prevAmbiguous) {
+		++scope;
+	}
+
+	const size_t numSymbols = m_source.GetSize();
+	for (size_t pos = 1; pos < numSymbols; ++pos) {
+		const RuleSymbol &symbol = *m_source[pos];
+
+		// mixed syntax: only hiero-labelled non-terms are ambiguous
+		const bool ambiguous = symbol.IsNonTerm()
+				&& static_cast<const NonTerm&>(symbol).IsHiero(Moses::Input, params);
+
+		if (prevAmbiguous && ambiguous) {
+			++scope;
+		}
+		prevAmbiguous = ambiguous;
+	}
+
+	if (prevAmbiguous) {
+		++scope;
+	}
+
+	return scope;
+}
+
+// Returns true if some element of coll compares equal to sought by string
+// content, i.e. sought->CompareString(*element) == 0. Elements are compared
+// by content, not by pointer, so this is a linear scan.
+//
+// The iterator and element type now follow the template parameter T; the
+// original hard-coded Word inside the template body, so it only compiled for
+// T = Word.
+template<typename T>
+bool Contains(const T *sought, const std::set<const T*> &coll)
+{
+	typename std::set<const T*>::const_iterator iter;
+	for (iter = coll.begin(); iter != coll.end(); ++iter) {
+		const T *found = *iter;
+		if (sought->CompareString(*found) == 0) {
+			return true;
+		}
+	}
+	return false;
+}
+
+// True if any source word inside cp's source span (corners[0]..corners[1])
+// matches, by string content, one of the words in terms.
+bool Rule::ContainTerm(const ConsistentPhrase &cp, const std::set<const Word*> &terms) const
+{
+	const Phrase &sourceSentence = m_alignedSentence.GetPhrase(Moses::Input);
+
+	const int lastPos = cp.corners[1];
+	int pos = cp.corners[0];
+	while (pos <= lastPos) {
+		// find same word in set
+		if (Contains(sourceSentence[pos], terms)) {
+			return true;
+		}
+		++pos;
+	}
+	return false;
+}
+
+// Ordering for std::sort: a comes before b when a's target start position
+// (corners[2]) is smaller. Only the start position is compared.
+bool CompareTargetNonTerms(const NonTerm *a, const NonTerm *b)
+{
+	const int startA = a->GetConsistentPhrase().corners[2];
+	const int startB = b->GetConsistentPhrase().corners[2];
+	return startA < startB;
+}
+
+// Builds m_target and then the alignments; does nothing for an invalid rule.
+// The non-terms are sorted by target start position, then the target span of
+// the LHS phrase (corners[2]..corners[3]) is walked: positions covered by a
+// non-term are replaced by it, all others contribute the sentence's target
+// terminal.
+//
+// The original re-read targetNonTerm[nonTermInd] into the cursor just before
+// advancing it; that store was immediately overwritten and has been removed.
+// params is not used by this function.
+void Rule::CreateTarget(const Parameter &params)
+{
+  if (!m_isValid) {
+	  return;
+  }
+
+  vector<const NonTerm*> targetNonTerm(m_nonterms);
+  std::sort(targetNonTerm.begin(), targetNonTerm.end(), CompareTargetNonTerms);
+
+  const ConsistentPhrase &lhsPhrase = m_lhs.GetConsistentPhrase();
+
+  // next non-term still to be placed, or NULL once all are used
+  size_t nonTermInd = 0;
+  const NonTerm *nextNonTerm = targetNonTerm.empty() ? NULL : targetNonTerm[0];
+
+  for (int targetPos = lhsPhrase.corners[2];
+		  targetPos <= lhsPhrase.corners[3];
+		  ++targetPos) {
+
+	  const RuleSymbol *ruleSymbol;
+	  if (nextNonTerm
+			  && nextNonTerm->GetConsistentPhrase().corners[2] <= targetPos
+			  && targetPos <= nextNonTerm->GetConsistentPhrase().corners[3]) {
+		  // replace words with non-term; jump to the last word it covers
+		  // (the loop increment then moves past it)
+		  ruleSymbol = nextNonTerm;
+		  targetPos = nextNonTerm->GetConsistentPhrase().corners[3];
+
+		  // move to next non-term
+		  ++nonTermInd;
+		  nextNonTerm = (nonTermInd < targetNonTerm.size()) ? targetNonTerm[nonTermInd] : NULL;
+	  }
+	  else {
+		  // terminal
+		  ruleSymbol = m_alignedSentence.GetPhrase(Moses::Output)[targetPos];
+	  }
+
+	  m_target.Add(ruleSymbol);
+  }
+
+  CreateAlignments();
+}
+
+
+// Fills m_alignments with (source index, target index) pairs, where the
+// indices are positions within m_source / m_target. A terminal is aligned to
+// each of its aligned target words; a non-term is aligned to itself, since
+// the same object appears on both sides.
+//
+// The unused locals sourceStart / targetStart of the original were removed.
+void Rule::CreateAlignments()
+{
+  for (size_t sourcePos = 0; sourcePos < m_source.GetSize(); ++sourcePos) {
+	  const RuleSymbol *symbol = m_source[sourcePos];
+	  if (!symbol->IsNonTerm()) {
+		  // terminals
+		  const Word &sourceWord = static_cast<const Word&>(*symbol);
+		  const std::set<const Word *> &targetWords = sourceWord.GetAlignment();
+		  CreateAlignments(sourcePos, targetWords);
+	  }
+	  else {
+		  // non-terms. same object in both source & target
+		  CreateAlignments(sourcePos, symbol);
+	  }
+  }
+}
+
+// Records one alignment point from sourcePos to each word in targetWords.
+void Rule::CreateAlignments(int sourcePos, const std::set<const Word *> &targetWords)
+{
+	for (std::set<const Word *>::const_iterator iter = targetWords.begin();
+			iter != targetWords.end(); ++iter) {
+		CreateAlignments(sourcePos, *iter);
+	}
+}
+
+// Finds targetSought (by pointer identity) in m_target and records the
+// alignment point (sourcePos, its index). Only the first match is recorded.
+// Throws the string literal "not found" if the symbol is not in m_target.
+void Rule::CreateAlignments(int sourcePos, const RuleSymbol *targetSought)
+{
+	// should be in target phrase
+	const size_t numTarget = m_target.GetSize();
+	size_t targetPos = 0;
+	while (targetPos < numTarget && m_target[targetPos] != targetSought) {
+		++targetPos;
+	}
+
+	if (targetPos == numTarget) {
+		throw "not found";
+	}
+
+	pair<int, int> alignPoint(sourcePos, targetPos);
+	m_alignments.insert(alignPoint);
+}
+
--- a/phrase-extract/extract-mixed-syntax/Rule.h
+++ b/phrase-extract/extract-mixed-syntax/Rule.h
@ -0,0 +1,90 @@
+/*
+ * Rule.h
+ *
+ *  Created on: 20 Feb 2014
+ *      Author: hieu
+ */
+#pragma once
+#include <vector>
+#include "Phrase.h"
+#include "RulePhrase.h"
+#include "moses/TypeDef.h"
+
+class ConsistentPhrase;
+class AlignedSentence;
+class NonTerm;
+class Parameter;
+
+
+// One candidate translation rule extracted from an aligned sentence: a
+// left-hand-side non-term, a source and a target symbol sequence, the
+// alignment between them, and a count.
+class Rule {
+public:
+	// (index in source phrase, index in target phrase)
+	typedef std::set<std::pair<int,int> > Alignments;
+
+	Rule(const Rule &copy); // do not implement
+
+	// original rule with no non-term
+	Rule(const NonTerm &lhsNonTerm, const AlignedSentence &alignedSentence);
+
+	// extend a rule, adding 1 new non-term
+	Rule(const Rule &copy, const NonTerm &nonTerm);
+
+	virtual ~Rule();
+
+	// set by Prevalidate(): may this rule itself be kept?
+	bool IsValid() const
+	{ return m_isValid; }
+
+	// set by Prevalidate(): may this rule be extended with another non-term?
+	bool CanRecurse() const
+	{ return m_canRecurse; }
+
+	const NonTerm &GetLHS() const
+	{ return m_lhs; }
+
+	// consistent phrase of the left-hand side
+	const ConsistentPhrase &GetConsistentPhrase() const;
+
+	// first source position where a further non-term may start
+	int GetNextSourcePosForNonTerm() const;
+
+	void SetCount(float count)
+	{ m_count = count; }
+	float GetCount() const
+	{ return m_count; }
+
+	const Alignments &GetAlignments() const
+	{ return m_alignments; }
+
+	std::string Debug() const;
+	// writes one extract line; forward == false swaps the two sides
+	void Output(std::ostream &out, bool forward, const Parameter &params) const;
+
+	// runs the source-side checks and sets the valid / can-recurse flags
+	void Prevalidate(const Parameter &params);
+	// builds the target side and the alignments; no-op for an invalid rule
+	void CreateTarget(const Parameter &params);
+
+	const RulePhrase &GetPhrase(Moses::FactorDirection direction) const
+	{ return (direction == Moses::Input) ? m_source : m_target; }
+
+protected:
+	// held by reference: both must outlive the rule
+	const NonTerm &m_lhs;
+	const AlignedSentence &m_alignedSentence;
+	RulePhrase m_source, m_target;
+	float m_count;
+
+	Alignments m_alignments;
+
+	// in source order
+	std::vector<const NonTerm*> m_nonterms;
+
+	bool m_isValid, m_canRecurse;
+
+	// builds m_source from the LHS span and m_nonterms
+	void CreateSource();
+	// fills m_alignments for every source symbol
+	void CreateAlignments();
+	// one alignment point per word in targetWords
+	void CreateAlignments(int sourcePos, const std::set<const Word *> &targetWords);
+	// one alignment point to the position of targetSought in m_target
+	void CreateAlignments(int sourcePos, const RuleSymbol *targetSought);
+
+	// does the source span of cp contain a word equal to one in terms?
+	bool ContainTerm(const ConsistentPhrase &cp, const std::set<const Word*> &terms) const;
+	// scope of the source side
+	int GetScope(const Parameter &params) const;
+
+	void NonTermContext(int sourceTarget, int factors, size_t ntInd, const ConsistentPhrase &cp, std::ostream &out) const;
+		// sourceTarget: 1 = source, 2 = target
+
+	// writes one factor of word followed by a space
+	void NonTermContextFactor(int factor, const Word &word, std::ostream &out) const;
+
+};
+
--- a/phrase-extract/extract-mixed-syntax/RulePhrase.cpp
+++ b/phrase-extract/extract-mixed-syntax/RulePhrase.cpp
@ -0,0 +1,50 @@
+/*
+ * RulePhrase.cpp
+ *
+ *  Created on: 26 Feb 2014
+ *      Author: hieu
+ */
+
+#include <sstream>
+#include "RulePhrase.h"
+#include "RuleSymbol.h"
+
+using namespace std;
+
+extern bool g_debug;
+
+// Three-way comparison. A shorter phrase sorts first (-1 / +1); phrases of
+// equal length are compared symbol by symbol and the first non-zero symbol
+// comparison is returned. Returns 0 when every symbol compares equal.
+int RulePhrase::Compare(const RulePhrase &other) const
+{
+	const size_t mySize = GetSize();
+	const size_t otherSize = other.GetSize();
+	if (mySize < otherSize) {
+		return -1;
+	}
+	if (mySize > otherSize) {
+		return +1;
+	}
+
+	for (size_t pos = 0; pos < m_coll.size(); ++pos) {
+		const int compare = m_coll[pos]->Compare(*other.m_coll[pos]);
+		if (compare != 0) {
+			return compare;
+		}
+	}
+
+	return 0;
+}
+
+// Writes every symbol in order, each followed by a single space.
+void RulePhrase::Output(std::ostream &out) const
+{
+	for (Coll::const_iterator iter = m_coll.begin(); iter != m_coll.end(); ++iter) {
+		(*iter)->Output(out);
+		out << " ";
+	}
+}
+
+// Same text as Output(), returned as a string.
+std::string RulePhrase::Debug() const
+{
+	std::stringstream buffer;
+	Output(buffer);
+	const std::string text = buffer.str();
+	return text;
+}
+
--- a/phrase-extract/extract-mixed-syntax/RulePhrase.h
+++ b/phrase-extract/extract-mixed-syntax/RulePhrase.h
@ -0,0 +1,49 @@
+/*
+ * RulePhrase.h
+ *
+ *  Created on: 26 Feb 2014
+ *      Author: hieu
+ */
+
+#ifndef RULEPHRASE_H_
+#define RULEPHRASE_H_
+
+#include <vector>
+#include <cstddef>
+#include <iostream>
+
+class RuleSymbol;
+
+// One side (source or target) of a rule: an ordered list of terminals and
+// non-terms. Symbols are stored as raw pointers; the class declares no
+// destructor, so it never deletes them.
+class RulePhrase
+{
+public:
+  typedef std::vector<const RuleSymbol*> Coll;
+
+  // the symbols, in order
+  Coll m_coll;
+
+  // appends symbol to the end of the phrase
+  void Add(const RuleSymbol *symbol) {
+    m_coll.push_back(symbol);
+  }
+
+  // number of symbols
+  size_t GetSize() const {
+    return m_coll.size();
+  }
+
+  // symbol at index (not bounds-checked)
+  const RuleSymbol* operator[](size_t index) const {
+    return m_coll[index];
+  }
+
+  // first / last symbol; the phrase must not be empty
+  const RuleSymbol* Front() const {
+    return m_coll.front();
+  }
+  const RuleSymbol* Back() const {
+    return m_coll.back();
+  }
+
+  // three-way comparison: by size first, then symbol by symbol
+  int Compare(const RulePhrase &other) const;
+
+  // writes each symbol followed by a space
+  void Output(std::ostream &out) const;
+  // same text as Output(), as a string
+  std::string Debug() const;
+};
+
+#endif /* RULEPHRASE_H_ */
--- a/phrase-extract/extract-mixed-syntax/RuleSymbol.cpp
+++ b/phrase-extract/extract-mixed-syntax/RuleSymbol.cpp
@ -0,0 +1,36 @@
+/*
+ * RuleSymbol.cpp
+ *
+ *  Created on: 21 Feb 2014
+ *      Author: hieu
+ */
+
+#include "RuleSymbol.h"
+
+using namespace std;
+
+// The base class holds no state, so there is nothing to initialise.
+RuleSymbol::RuleSymbol()
+{
+}
+
+// Nothing to release; defined out of line for the virtual destructor
+// declared in the header.
+RuleSymbol::~RuleSymbol()
+{
+}
+
+// Three-way comparison of two symbols; returns -1, 0 or +1.
+// Non-terminals order before terminals; symbols of the same kind are ordered
+// by their GetString() value.
+int RuleSymbol::Compare(const RuleSymbol &other) const
+{
+	const bool mineIsNonTerm = IsNonTerm();
+	const bool theirsIsNonTerm = other.IsNonTerm();
+	if (mineIsNonTerm != theirsIsNonTerm) {
+		return mineIsNonTerm ? -1 : +1;
+	}
+
+	const string mine = GetString();
+	const string theirs = other.GetString();
+
+	if (mine < theirs) {
+		return -1;
+	}
+	if (theirs < mine) {
+		return +1;
+	}
+	return 0;
+}
--- a/phrase-extract/extract-mixed-syntax/RuleSymbol.h
+++ b/phrase-extract/extract-mixed-syntax/RuleSymbol.h
@ -0,0 +1,31 @@
+/*
+ * RuleSymbol.h
+ *
+ *  Created on: 21 Feb 2014
+ *      Author: hieu
+ */
+
+#ifndef RULESYMBOL_H_
+#define RULESYMBOL_H_
+
+#include <iostream>
+#include <string>
+
+// base class - terminal or non-term
+// Abstract interface for anything that can appear in a RulePhrase.
+class RuleSymbol {
+public:
+	RuleSymbol();
+	virtual ~RuleSymbol();
+
+	// true for a non-terminal, false for a terminal
+	virtual bool IsNonTerm() const = 0;
+
+	// textual rendering of the symbol, as a string or written to a stream
+	virtual std::string Debug() const = 0;
+	virtual void Output(std::ostream &out) const = 0;
+
+	// string form of the symbol; Compare() uses it to order symbols of the same kind
+	virtual std::string GetString() const = 0;
+
+	// three-way comparison returning -1, 0 or +1; non-terminals order before
+	// terminals, otherwise the GetString() values are compared
+	int Compare(const RuleSymbol &other) const;
+
+};
+
+#endif /* RULESYMBOL_H_ */
--- a/phrase-extract/extract-mixed-syntax/Rules.cpp
+++ b/phrase-extract/extract-mixed-syntax/Rules.cpp
@ -0,0 +1,227 @@
+/*
+ * Rules.cpp
+ *
+ *  Created on: 20 Feb 2014
+ *      Author: hieu
+ */
+
+#include <sstream>
+#include "Rules.h"
+#include "ConsistentPhrase.h"
+#include "ConsistentPhrases.h"
+#include "AlignedSentence.h"
+#include "Rule.h"
+#include "Parameter.h"
+#include "moses/Util.h"
+
+using namespace std;
+
+extern bool g_debug;
+
+// Binds the rule collection to the sentence it will extract from. Only a
+// reference is kept, so `alignedSentence` must outlive this object.
+Rules::Rules(const AlignedSentence &alignedSentence)
+	: m_alignedSentence(alignedSentence)
+{
+	// both rule sets start empty; Extend() and Consolidate() fill them
+}
+
+// Releases the rules held in m_keepRules. m_mergeRules is only ever filled
+// with pointers taken from m_keepRules (see MergeRules), so it is not
+// released separately.
+Rules::~Rules()
+{
+	Moses::RemoveAllInColl(m_keepRules);
+}
+
+// Creates the initial rule(s) for one consistent phrase.
+// With params.hieroSourceLHS set, a single rule is built from the phrase's
+// hiero non-terminal; otherwise one rule is built per non-terminal of `cp`.
+void Rules::CreateRules(const ConsistentPhrase &cp,
+		const Parameter &params)
+{
+	if (params.hieroSourceLHS) {
+		CreateRule(cp.GetHieroNonTerm(), params);
+		return;
+	}
+
+	const ConsistentPhrase::NonTerms &labels = cp.GetNonTerms();
+	for (size_t idx = 0; idx < labels.size(); ++idx) {
+		CreateRule(labels[idx], params);
+	}
+}
+
+// Builds the initial rule for `nonTerm`, recursively extends it when allowed,
+// and keeps it only if it is valid.
+void Rules::CreateRule(const NonTerm &nonTerm,
+		const Parameter &params)
+{
+	Rule *seed = new Rule(nonTerm, m_alignedSentence);
+	seed->Prevalidate(params);
+	seed->CreateTarget(params);
+
+	// extensions are derived from the seed whether or not the seed is kept
+	if (seed->CanRecurse()) {
+		Extend(*seed, params);
+	}
+
+	if (!seed->IsValid()) {
+		delete seed;
+		return;
+	}
+
+	// ownership passes to m_keepRules
+	m_keepRules.insert(seed);
+}
+
+// Entry point of rule extraction: visits every source span [start, end] of
+// the sentence and creates rules for each consistent phrase stored for it.
+void Rules::Extend(const Parameter &params)
+{
+	const ConsistentPhrases &allCPS = m_alignedSentence.GetConsistentPhrases();
+	const size_t sourceLen = m_alignedSentence.GetPhrase(Moses::Input).size();
+
+	for (size_t start = 0; start < sourceLen; ++start) {
+		for (size_t end = start; end < sourceLen; ++end) {
+			const ConsistentPhrases::Coll &spanCPS = allCPS.GetColl(start, end);
+
+			for (ConsistentPhrases::Coll::const_iterator it = spanCPS.begin();
+					it != spanCPS.end(); ++it) {
+				CreateRules(**it, params);
+			}
+		}
+	}
+}
+
+// Tries to extend `rule` with one more non-terminal. Candidate sub-spans start
+// at the rule's next free source position and end no later than the rule's
+// own last source position (corners[1]).
+void Rules::Extend(const Rule &rule, const Parameter &params)
+{
+	const ConsistentPhrases &allCPS = m_alignedSentence.GetConsistentPhrases();
+	const int firstStart = rule.GetNextSourcePosForNonTerm();
+
+	const int ruleStart = rule.GetConsistentPhrase().corners[0];
+	const int ruleEnd = rule.GetConsistentPhrase().corners[1];
+
+	for (int start = firstStart; start <= ruleEnd; ++start) {
+		for (int end = start; end <= ruleEnd; ++end) {
+			// a single non-term must not cover the whole rule
+			const bool coversWholeRule = (start == ruleStart) && (end == ruleEnd);
+			if (!coversWholeRule) {
+				Extend(rule, allCPS.GetColl(start, end), params);
+			}
+		}
+	}
+}
+
+// Extends `rule` with every consistent phrase in `cps`.
+void Rules::Extend(const Rule &rule, const ConsistentPhrases::Coll &cps, const Parameter &params)
+{
+	for (ConsistentPhrases::Coll::const_iterator it = cps.begin(); it != cps.end(); ++it) {
+		Extend(rule, **it, params);
+	}
+}
+
+// For each non-terminal of `cp`, derives a new rule from `rule`, recursively
+// extends it when allowed, and keeps it only if it is valid.
+void Rules::Extend(const Rule &rule, const ConsistentPhrase &cp, const Parameter &params)
+{
+	const ConsistentPhrase::NonTerms &labels = cp.GetNonTerms();
+
+	for (size_t idx = 0; idx < labels.size(); ++idx) {
+		Rule *candidate = new Rule(rule, labels[idx]);
+		candidate->Prevalidate(params);
+		candidate->CreateTarget(params);
+
+		// recursion happens before the validity check, as in CreateRule
+		if (candidate->CanRecurse()) {
+			Extend(*candidate, params);
+		}
+
+		if (!candidate->IsValid()) {
+			delete candidate;
+			continue;
+		}
+
+		// ownership passes to m_keepRules
+		m_keepRules.insert(candidate);
+	}
+}
+
+// Returns a multi-line dump: a "m_keepRules:" header followed by the Debug()
+// text of every kept rule, one per line.
+std::string Rules::Debug() const
+{
+	ostringstream dump;
+	dump << "m_keepRules:" << '\n';
+
+	for (std::set<Rule*>::const_iterator it = m_keepRules.begin();
+			it != m_keepRules.end(); ++it) {
+		dump << (*it)->Debug() << '\n';
+	}
+
+	return dump.str();
+}
+
+// Writes every merged rule to `out`, one per line, in CompareRules order.
+// `forward` and `params` are passed through to Rule::Output.
+void Rules::Output(std::ostream &out, bool forward, const Parameter &params) const
+{
+	typedef std::set<Rule*, CompareRules> MergedSet;
+
+	for (MergedSet::const_iterator it = m_mergeRules.begin(); it != m_mergeRules.end(); ++it) {
+		(*it)->Output(out, forward, params);
+		out << endl;
+	}
+}
+
+// Assigns a count to every kept rule, then merges equivalent rules.
+// Without fractional counting each rule simply counts as 1.
+void Rules::Consolidate(const Parameter &params)
+{
+	if (!params.fractionalCounting) {
+		for (std::set<Rule*>::iterator it = m_keepRules.begin();
+				it != m_keepRules.end(); ++it) {
+			(*it)->SetCount(1);
+		}
+	}
+	else {
+		CalcFractionalCount();
+	}
+
+	MergeRules(params);
+}
+
+// Fills m_mergeRules from m_keepRules. When a rule compares equal (under
+// CompareRules) to one already inserted, its count is added to the existing
+// entry instead of inserting a duplicate.
+void Rules::MergeRules(const Parameter &params)
+{
+	typedef std::set<Rule*, CompareRules> MergedSet;
+
+	for (std::set<Rule*>::const_iterator it = m_keepRules.begin();
+			it != m_keepRules.end(); ++it) {
+		Rule *current = *it;
+
+		const pair<MergedSet::iterator, bool> outcome = m_mergeRules.insert(current);
+		if (outcome.second) {
+			// first rule of its kind, nothing to accumulate
+			continue;
+		}
+
+		// an equivalent rule is already present: fold this count into it
+		Rule &existing = **outcome.first;
+		float total = existing.GetCount() + current->GetCount();
+		existing.SetCount(total);
+	}
+}
+
+// Fractional counting: rules that share the same ConsistentPhrase object
+// split a total count of 1 equally between them.
+void Rules::CalcFractionalCount()
+{
+  typedef std::set<Rule*> RuleGroup;
+  typedef std::map<const ConsistentPhrase*, RuleGroup> GroupsByPhrase;
+  GroupsByPhrase groups;
+
+  // bucket the kept rules by the address of their consistent phrase
+  for (std::set<Rule*>::const_iterator it = m_keepRules.begin();
+       it != m_keepRules.end(); ++it) {
+    Rule *rule = *it;
+    const ConsistentPhrase &cp = rule->GetConsistentPhrase();
+    groups[&cp].insert(rule);
+  }
+
+  // every rule in a bucket of size N gets a count of 1/N
+  for (GroupsByPhrase::iterator group = groups.begin(); group != groups.end(); ++group) {
+    RuleGroup &members = group->second;
+    const float share = 1.0f / (float) members.size();
+
+    for (RuleGroup::iterator member = members.begin(); member != members.end(); ++member) {
+      (*member)->SetCount(share);
+    }
+  }
+}
+
+
--- a/phrase-extract/extract-mixed-syntax/Rules.h
+++ b/phrase-extract/extract-mixed-syntax/Rules.h
@ -0,0 +1,72 @@
+/*
+ * Rules.h
+ *
+ *  Created on: 20 Feb 2014
+ *      Author: hieu
+ */
+
+#pragma once
+
+#include <set>
+#include <iostream>
+#include "ConsistentPhrases.h"
+#include "Rule.h"
+
+extern bool g_debug;
+
+class AlignedSentence;
+class Parameter;
+
+// Strict weak ordering used by the merged rule set. Two rules are equivalent
+// when they agree on, in order of precedence: the Moses::Input phrase, the
+// Moses::Output phrase, the alignments, and the string of the left-hand side.
+struct CompareRules {
+	// Returns true when `a` orders before `b`.
+	// Declared const: the Compare requirement for ordered containers expects the
+	// comparison object to be callable when it is const.
+	bool operator()(const Rule *a, const Rule *b) const
+	{
+		int compare;
+
+		compare = a->GetPhrase(Moses::Input).Compare(b->GetPhrase(Moses::Input));
+		if (compare) return compare < 0;
+
+		compare = a->GetPhrase(Moses::Output).Compare(b->GetPhrase(Moses::Output));
+		if (compare) return compare < 0;
+
+		if (a->GetAlignments() != b->GetAlignments()) {
+			return a->GetAlignments() < b->GetAlignments();
+		}
+
+		// build each LHS string once; copies are taken so that no assumption is
+		// made about the lifetime of whatever GetString() returns
+		const std::string lhsA = a->GetLHS().GetString();
+		const std::string lhsB = b->GetLHS().GetString();
+		if (lhsA != lhsB) {
+			return lhsA < lhsB;
+		}
+
+		// equivalent under every criterion
+		return false;
+	}
+};
+
+// Extracts and stores all rules for one aligned sentence.
+// Usage visible in this class: Extend(params) creates the rules,
+// Consolidate(params) assigns counts and merges duplicates, Output() prints
+// the merged set.
+// NOTE(review): the destructor frees the rules in m_keepRules, but copying is
+// not disabled, so a copied Rules object would free the same rules twice.
+class Rules {
+public:
+	// keeps a reference only; alignedSentence must outlive this object
+	Rules(const AlignedSentence &alignedSentence);
+	virtual ~Rules();
+	// create rules for every source span of the sentence
+	void Extend(const Parameter &params);
+	// set rule counts (fractional or 1 each), then merge equivalent rules
+	void Consolidate(const Parameter &params);
+
+	// dump of m_keepRules
+	std::string Debug() const;
+	// write the merged rules, one per line
+	void Output(std::ostream &out, bool forward, const Parameter &params) const;
+
+protected:
+	const AlignedSentence &m_alignedSentence;
+	// every valid rule that was created, ordered by pointer value; freed in the destructor
+	std::set<Rule*> m_keepRules;
+	// rules from m_keepRules with equivalents (per CompareRules) collapsed; not freed separately
+	std::set<Rule*, CompareRules> m_mergeRules;
+
+	// recursive extension of an existing rule with further non-terms
+	void Extend(const Rule &rule, const Parameter &params);
+	void Extend(const Rule &rule, const ConsistentPhrases::Coll &cps, const Parameter &params);
+	void Extend(const Rule &rule, const ConsistentPhrase &cp, const Parameter &params);
+
+	// create original rules
+	void CreateRules(const ConsistentPhrase &cp,
+			const Parameter &params);
+	void CreateRule(const NonTerm &nonTerm,
+			const Parameter &params);
+
+	// fill m_mergeRules, summing the counts of equivalent rules
+	void MergeRules(const Parameter &params);
+	// rules sharing a ConsistentPhrase split a count of 1 equally
+	void CalcFractionalCount();
+
+};
+
--- a/phrase-extract/extract-mixed-syntax/SyntaxTree.cpp
+++ b/phrase-extract/extract-mixed-syntax/SyntaxTree.cpp
@ -0,0 +1,47 @@
+#include <cassert>
+#include <iostream>
+#include "SyntaxTree.h"
+#include "Parameter.h"
+
+using namespace std;
+
+// Registers `label` for the span [startPos, endPos].
+// When the span already has a label, params.multiLabel decides what happens:
+//   1 - the existing label is replaced by the new one
+//   2 - the new label is ignored
+//   any other value - the new label is appended
+void SyntaxTree::Add(int startPos, int endPos, const std::string &label, const Parameter &params)
+{
+	// operator[] creates an empty label list for a span seen for the first time
+	Labels &existing = m_coll[Range(startPos, endPos)];
+
+	if (!existing.empty()) {
+		if (params.multiLabel == 1) {
+			// replace: exactly one label is expected to be present
+			assert(existing.size() == 1);
+			existing.clear();
+		}
+		else if (params.multiLabel == 2) {
+			// keep the label that is already there
+			return;
+		}
+	}
+
+	existing.push_back(label);
+}
+
+// Appends `label` to the label list of every span already in the tree.
+void SyntaxTree::AddToAll(const std::string &label)
+{
+	for (Coll::iterator entry = m_coll.begin(); entry != m_coll.end(); ++entry) {
+		entry->second.push_back(label);
+	}
+}
+
+// Returns the labels stored for the span [startPos, endPos], or the default
+// labels when the span has no entry.
+const SyntaxTree::Labels &SyntaxTree::Find(int startPos, int endPos) const
+{
+	const Coll::const_iterator found = m_coll.find(Range(startPos, endPos));
+	if (found != m_coll.end()) {
+		return found->second;
+	}
+	return m_defaultLabels;
+}
--- a/phrase-extract/extract-mixed-syntax/SyntaxTree.h
+++ b/phrase-extract/extract-mixed-syntax/SyntaxTree.h
@ -0,0 +1,32 @@
+#pragma once 
+
+#include <vector>
+#include <map>
+#include <string>
+
+class Parameter;
+
+// Maps spans (start, end) to the list of syntactic labels attached to them,
+// with a separate list of default labels returned for spans without an entry.
+class SyntaxTree
+{
+public:
+  typedef std::pair<int, int> Range;
+  typedef std::vector<std::string> Labels;
+  typedef std::map<Range, Labels> Coll;
+
+  // Attach `label` to the span; params.multiLabel controls what happens when
+  // the span is already labelled.
+  void Add(int startPos, int endPos, const std::string &label, const Parameter &params);
+
+  // Append `label` to every span currently stored.
+  void AddToAll(const std::string &label);
+
+  // Labels of the span, or the default labels if the span is unknown.
+  const Labels &Find(int startPos, int endPos) const;
+
+  // Adds `label` to the default labels returned by Find() for unknown spans.
+  void SetHieroLabel(const std::string &label)
+  {
+    m_defaultLabels.push_back(label);
+  }
+
+protected:
+  Coll m_coll;
+  Labels m_defaultLabels;
+};
+
+
--- a/phrase-extract/extract-mixed-syntax/Word.cpp
+++ b/phrase-extract/extract-mixed-syntax/Word.cpp
@ -0,0 +1,68 @@
+/*
+ * Word.cpp
+ *
+ *  Created on: 18 Feb 2014
+ *      Author: s0565741
+ */
+#include <limits>
+#include "Word.h"
+#include "moses/Util.h"
+
+using namespace std;
+
+// Creates a word with its position and string; the alignment set starts empty.
+Word::Word(int pos, const std::string &str)
+	: m_pos(pos)
+	, m_str(str)
+{
+}
+
+// Nothing to release: the aligned words in m_alignment are not deleted here.
+Word::~Word()
+{
+}
+
+// Records that this word is aligned to `other`. The set ignores a pointer
+// that is already present.
+void Word::AddAlignment(const Word *other)
+{
+	m_alignment.insert(other);
+}
+
+// Returns the positions (GetPos()) of all words this word is aligned to.
+std::set<int> Word::GetAlignmentIndex() const
+{
+	std::set<int> positions;
+
+	for (std::set<const Word *>::const_iterator it = m_alignment.begin();
+			it != m_alignment.end(); ++it) {
+		positions.insert((*it)->GetPos());
+	}
+
+	return positions;
+}
+
+// Writes the word's string to `out`, with no separator added.
+void Word::Output(std::ostream &out) const
+{
+	out << m_str;
+}
+
+// Debug rendering: simply the word's string.
+std::string Word::Debug() const
+{
+	return m_str;
+}
+
+// Compares the two words' strings with std::string::compare: the result is
+// negative, zero or positive.
+int Word::CompareString(const Word &other) const
+{
+  const std::string &theirs = other.m_str;
+  return m_str.compare(theirs);
+}
+
+// Returns one factor of the word: the string is tokenized with "|" as the
+// delimiter and the piece at index `factor` is returned. An out-of-range
+// `factor` is caught by an assert only.
+std::string Word::GetString(int factor) const
+{
+  vector<string> factors;
+  Moses::Tokenize(factors, m_str, "|");
+
+  // same comparison the implicit int -> size_t conversion performs
+  assert(static_cast<size_t>(factor) < factors.size());
+  return factors[factor];
+}
+
+
--- a/phrase-extract/extract-mixed-syntax/Word.h
+++ b/phrase-extract/extract-mixed-syntax/Word.h
@ -0,0 +1,49 @@
+/*
+ * Word.h
+ *
+ *  Created on: 18 Feb 2014
+ *      Author: s0565741
+ */
+#pragma once
+
+#include <string>
+#include <set>
+#include "RuleSymbol.h"
+
+// A terminal symbol: one word of a sentence, together with its position and
+// the words it is aligned to.
+class Word : public RuleSymbol
+{
+public:
+	Word(const Word&); // do not implement
+	Word(int pos, const std::string &str);
+	virtual ~Word();
+
+	// a Word is always a terminal
+	virtual bool IsNonTerm() const {
+		return false;
+	}
+
+	// the whole word string
+	std::string GetString() const {
+		return m_str;
+	}
+
+	// a single factor of the word string (pieces separated by "|")
+	std::string GetString(int factor) const;
+
+	// position given at construction
+	int GetPos() const {
+		return m_pos;
+	}
+
+	// record an alignment link to `other`
+	void AddAlignment(const Word *other);
+
+	// the aligned words themselves
+	const std::set<const Word *> &GetAlignment() const {
+		return m_alignment;
+	}
+
+	// the positions of the aligned words
+	std::set<int> GetAlignmentIndex() const;
+
+	void Output(std::ostream &out) const;
+	std::string Debug() const;
+
+	// std::string::compare on the two word strings
+	int CompareString(const Word &other) const;
+
+protected:
+	int m_pos; // original position in sentence, NOT in lattice
+	std::string m_str;
+	std::set<const Word *> m_alignment;
+};
+
--- a/phrase-extract/extract-mixed-syntax/gzfilebuf.h
+++ b/phrase-extract/extract-mixed-syntax/gzfilebuf.h
@ -0,0 +1,81 @@
+#ifndef moses_gzfile_buf_h
+#define moses_gzfile_buf_h
+
+#include <streambuf>
+#include <zlib.h>
+#include <cstring>
+
+// Read-only stream buffer that decompresses a gzip file through zlib.
+// The internal buffer reserves its first sizeof(int) bytes as a putback area;
+// the rest is refilled from the file by underflow().
+// Writing and seeking are not supported and report failure.
+// NOTE(review): copying is not disabled; a copy would close the same gzFile
+// twice. Do not copy instances.
+class gzfilebuf : public std::streambuf {
+public:
+  // Opens `filename` for reading. The result of gzopen() is not checked here;
+  // gzread() rejects a null handle, so reads on a failed open report
+  // end-of-file.
+  gzfilebuf(const char *filename)
+  { _gzf = gzopen(filename, "rb");
+    setg (_buff+sizeof(int),     // beginning of putback area
+          _buff+sizeof(int),     // read position
+          _buff+sizeof(int));    // end position
+  }
+  // Closes the file if it was opened successfully.
+  ~gzfilebuf() { if (_gzf) gzclose(_gzf); }
+protected:
+  // Writing is unsupported: report failure. (The previous bare `throw;` has no
+  // active exception to rethrow and would call std::terminate.)
+  virtual int_type overflow (int_type /*c*/) {
+    return traits_type::eof();
+  }
+
+  // write multiple characters - unsupported, nothing is written
+  virtual
+  std::streamsize xsputn (const char* /*s*/,
+                          std::streamsize /*num*/) {
+    return 0;
+  }
+
+  // Seeking is unsupported: return the invalid position, which is what the
+  // base class does by default.
+  virtual std::streampos seekpos ( std::streampos /*sp*/, std::ios_base::openmode /*which*/ = std::ios_base::in | std::ios_base::out ){
+    return std::streampos(std::streamoff(-1));
+  }
+
+  // Makes at least one character available and returns it without consuming
+  // it; returns eof on end of file or read error.
+  virtual int_type underflow () {
+    // is read position before end of _buff?
+    if (gptr() < egptr()) {
+      return traits_type::to_int_type(*gptr());
+    }
+
+    /* process size of putback area
+     * - use number of characters read
+     * - but at most sizeof(int)
+     */
+    unsigned int numPutback = gptr() - eback();
+    if (numPutback > sizeof(int)) {
+      numPutback = sizeof(int);
+    }
+
+    /* copy the most recently read characters into the putback area
+     * (the first sizeof(int) bytes of _buff)
+     */
+    std::memmove (_buff+(sizeof(int)-numPutback), gptr()-numPutback,
+                  numPutback);
+
+    // read new characters
+    int num = gzread(_gzf, _buff+sizeof(int), _buffsize-sizeof(int));
+    if (num <= 0) {
+      // ERROR or EOF
+      return traits_type::eof();
+    }
+
+    // reset _buff pointers
+    setg (_buff+(sizeof(int)-numPutback),   // beginning of putback area
+          _buff+sizeof(int),                // read position
+          _buff+sizeof(int)+num);           // end of buffer
+
+    // return next character
+    return traits_type::to_int_type(*gptr());
+  }
+
+  // Reads up to `num` characters into `s` and returns how many were read.
+  // Everything goes through the get area: characters already buffered by
+  // underflow() are delivered first, so bulk reads and single-character reads
+  // can be mixed without skipping or reordering data. (Calling gzread()
+  // directly here would bypass the buffered characters, and its -1 error
+  // value is not a valid count.)
+  std::streamsize xsgetn (char* s,
+                          std::streamsize num) {
+    std::streamsize done = 0;
+    while (done < num) {
+      if (gptr() == egptr()
+          && traits_type::eq_int_type(underflow(), traits_type::eof())) {
+        break; // end of file or read error
+      }
+      const std::streamsize avail = egptr() - gptr();
+      const std::streamsize want = num - done;
+      const std::streamsize chunk = (avail < want) ? avail : want;
+      std::memcpy(s + done, gptr(), static_cast<size_t>(chunk));
+      gbump(static_cast<int>(chunk)); // chunk never exceeds _buffsize
+      done += chunk;
+    }
+    return done;
+  }
+
+private:
+  gzFile _gzf;
+  static const unsigned int _buffsize = 1024;
+  char _buff[_buffsize];
+};
+
+#endif
--- a/phrase-extract/extract-mixed-syntax/pugixml.cpp
+++ b/phrase-extract/extract-mixed-syntax/pugixml.cpp