update extract-mixed-syntax

This commit is contained in:
Hieu Hoang 2015-01-15 09:53:57 +00:00
parent 6d61db28fa
commit 6289b39fd8
33 changed files with 10603 additions and 9738 deletions

View File

@ -15,100 +15,99 @@ using namespace std;
/////////////////////////////////////////////////////////////////////////////////
AlignedSentence::AlignedSentence(int lineNum,
const std::string &source,
const std::string &target,
const std::string &alignment)
:m_lineNum(lineNum)
const std::string &source,
const std::string &target,
const std::string &alignment)
:m_lineNum(lineNum)
{
PopulateWordVec(m_source, source);
PopulateWordVec(m_target, target);
PopulateAlignment(alignment);
PopulateWordVec(m_source, source);
PopulateWordVec(m_target, target);
PopulateAlignment(alignment);
}
AlignedSentence::~AlignedSentence()
{
Moses::RemoveAllInColl(m_source);
Moses::RemoveAllInColl(m_target);
AlignedSentence::~AlignedSentence() {
Moses::RemoveAllInColl(m_source);
Moses::RemoveAllInColl(m_target);
}
void AlignedSentence::PopulateWordVec(Phrase &vec, const std::string &line)
{
std::vector<string> toks;
Moses::Tokenize(toks, line);
std::vector<string> toks;
Moses::Tokenize(toks, line);
vec.resize(toks.size());
for (size_t i = 0; i < vec.size(); ++i) {
const string &tok = toks[i];
Word *word = new Word(i, tok);
vec[i] = word;
}
vec.resize(toks.size());
for (size_t i = 0; i < vec.size(); ++i) {
const string &tok = toks[i];
Word *word = new Word(i, tok);
vec[i] = word;
}
}
void AlignedSentence::PopulateAlignment(const std::string &line)
{
vector<string> alignStr;
Moses::Tokenize(alignStr, line);
vector<string> alignStr;
Moses::Tokenize(alignStr, line);
for (size_t i = 0; i < alignStr.size(); ++i) {
vector<int> alignPair;
Moses::Tokenize(alignPair, alignStr[i], "-");
assert(alignPair.size() == 2);
for (size_t i = 0; i < alignStr.size(); ++i) {
vector<int> alignPair;
Moses::Tokenize(alignPair, alignStr[i], "-");
assert(alignPair.size() == 2);
int sourcePos = alignPair[0];
int targetPos = alignPair[1];
int sourcePos = alignPair[0];
int targetPos = alignPair[1];
if (sourcePos >= m_source.size()) {
cerr << "ERROR1:AlignedSentence=" << Debug() << endl;
cerr << "m_source=" << m_source.size() << endl;
abort();
}
assert(sourcePos < m_source.size());
assert(targetPos < m_target.size());
Word *sourceWord = m_source[sourcePos];
Word *targetWord = m_target[targetPos];
if (sourcePos >= m_source.size()) {
cerr << "ERROR1:AlignedSentence=" << Debug() << endl;
cerr << "m_source=" << m_source.size() << endl;
abort();
}
assert(sourcePos < m_source.size());
assert(targetPos < m_target.size());
Word *sourceWord = m_source[sourcePos];
Word *targetWord = m_target[targetPos];
sourceWord->AddAlignment(targetWord);
targetWord->AddAlignment(sourceWord);
}
sourceWord->AddAlignment(targetWord);
targetWord->AddAlignment(sourceWord);
}
}
std::string AlignedSentence::Debug() const
{
stringstream out;
out << "m_lineNum:";
out << m_lineNum;
out << endl;
out << "m_lineNum:";
out << m_lineNum;
out << endl;
out << "m_source:";
out << m_source.Debug();
out << endl;
out << "m_source:";
out << m_source.Debug();
out << endl;
out << "m_target:";
out << m_target.Debug();
out << endl;
out << "m_target:";
out << m_target.Debug();
out << endl;
out << "consistent phrases:" << endl;
out << m_consistentPhrases.Debug();
out << endl;
out << "consistent phrases:" << endl;
out << m_consistentPhrases.Debug();
out << endl;
return out.str();
return out.str();
}
std::vector<int> AlignedSentence::GetSourceAlignmentCount() const
{
vector<int> ret(m_source.size());
vector<int> ret(m_source.size());
for (size_t i = 0; i < m_source.size(); ++i) {
const Word &word = *m_source[i];
ret[i] = word.GetAlignmentIndex().size();
}
return ret;
for (size_t i = 0; i < m_source.size(); ++i) {
const Word &word = *m_source[i];
ret[i] = word.GetAlignmentIndex().size();
}
return ret;
}
void AlignedSentence::Create(const Parameter &params)
{
CreateConsistentPhrases(params);
m_consistentPhrases.AddHieroNonTerms(params);
CreateConsistentPhrases(params);
m_consistentPhrases.AddHieroNonTerms(params);
}
void AlignedSentence::CreateConsistentPhrases(const Parameter &params)
@ -120,76 +119,76 @@ void AlignedSentence::CreateConsistentPhrases(const Parameter &params)
// check alignments for target phrase startT...endT
for(int lengthT=1;
lengthT <= params.maxSpan && lengthT <= countT;
lengthT++) {
for(int startT=0; startT < countT-(lengthT-1); startT++) {
lengthT <= params.maxSpan && lengthT <= countT;
lengthT++) {
for(int startT=0; startT < countT-(lengthT-1); startT++) {
// that's nice to have
int endT = startT + lengthT - 1;
// that's nice to have
int endT = startT + lengthT - 1;
// find find aligned source words
// first: find minimum and maximum source word
int minS = 9999;
int maxS = -1;
vector< int > usedS = GetSourceAlignmentCount();
for(int ti=startT; ti<=endT; ti++) {
const Word &word = *m_target[ti];
const std::set<int> &alignment = word.GetAlignmentIndex();
// find find aligned source words
// first: find minimum and maximum source word
int minS = 9999;
int maxS = -1;
vector< int > usedS = GetSourceAlignmentCount();
for(int ti=startT; ti<=endT; ti++) {
const Word &word = *m_target[ti];
const std::set<int> &alignment = word.GetAlignmentIndex();
std::set<int>::const_iterator iterAlign;
for(iterAlign = alignment.begin(); iterAlign != alignment.end(); ++iterAlign) {
int si = *iterAlign;
if (si<minS) {
minS = si;
}
if (si>maxS) {
maxS = si;
}
usedS[ si ]--;
}
}
std::set<int>::const_iterator iterAlign;
for(iterAlign = alignment.begin(); iterAlign != alignment.end(); ++iterAlign) {
int si = *iterAlign;
if (si<minS) {
minS = si;
}
if (si>maxS) {
maxS = si;
}
usedS[ si ]--;
}
}
// unaligned phrases are not allowed
if( maxS == -1 )
continue;
// unaligned phrases are not allowed
if( maxS == -1 )
continue;
// source phrase has to be within limits
size_t width = maxS - minS + 1;
// source phrase has to be within limits
size_t width = maxS - minS + 1;
if( width < params.minSpan )
continue;
if( width < params.minSpan )
continue;
if( width > params.maxSpan )
continue;
if( width > params.maxSpan )
continue;
// check if source words are aligned to out of bound target words
bool out_of_bounds = false;
for(int si=minS; si<=maxS && !out_of_bounds; si++)
if (usedS[si]>0) {
out_of_bounds = true;
}
// check if source words are aligned to out of bound target words
bool out_of_bounds = false;
for(int si=minS; si<=maxS && !out_of_bounds; si++)
if (usedS[si]>0) {
out_of_bounds = true;
}
// if out of bound, you gotta go
if (out_of_bounds)
continue;
// if out of bound, you gotta go
if (out_of_bounds)
continue;
// done with all the checks, lets go over all consistent phrase pairs
// start point of source phrase may retreat over unaligned
for(int startS=minS;
(startS>=0 &&
startS>maxS - params.maxSpan && // within length limit
(startS==minS || m_source[startS]->GetAlignment().size()==0)); // unaligned
startS--) {
// end point of source phrase may advance over unaligned
for(int endS=maxS;
(endS<countS && endS<startS + params.maxSpan && // within length limit
(endS==maxS || m_source[endS]->GetAlignment().size()==0)); // unaligned
endS++) {
// done with all the checks, lets go over all consistent phrase pairs
// start point of source phrase may retreat over unaligned
for(int startS=minS;
(startS>=0 &&
startS>maxS - params.maxSpan && // within length limit
(startS==minS || m_source[startS]->GetAlignment().size()==0)); // unaligned
startS--) {
// end point of source phrase may advance over unaligned
for(int endS=maxS;
(endS<countS && endS<startS + params.maxSpan && // within length limit
(endS==maxS || m_source[endS]->GetAlignment().size()==0)); // unaligned
endS++) {
// take note that this is a valid phrase alignment
m_consistentPhrases.Add(startS, endS, startT, endT, params);
}
}
}
// take note that this is a valid phrase alignment
m_consistentPhrases.Add(startS, endS, startT, endT, params);
}
}
}
}
}

View File

@ -14,41 +14,38 @@
class Parameter;
class AlignedSentence
{
class AlignedSentence {
public:
AlignedSentence(int lineNum)
:m_lineNum(lineNum) {
}
AlignedSentence(int lineNum)
:m_lineNum(lineNum)
{}
AlignedSentence(int lineNum,
const std::string &source,
const std::string &target,
const std::string &alignment);
virtual ~AlignedSentence();
virtual void Create(const Parameter &params);
AlignedSentence(int lineNum,
const std::string &source,
const std::string &target,
const std::string &alignment);
virtual ~AlignedSentence();
virtual void Create(const Parameter &params);
const Phrase &GetPhrase(Moses::FactorDirection direction) const {
return (direction == Moses::Input) ? m_source : m_target;
}
const Phrase &GetPhrase(Moses::FactorDirection direction) const
{ return (direction == Moses::Input) ? m_source : m_target; }
const ConsistentPhrases &GetConsistentPhrases() const {
return m_consistentPhrases;
}
const ConsistentPhrases &GetConsistentPhrases() const
{ return m_consistentPhrases; }
virtual std::string Debug() const;
virtual std::string Debug() const;
int m_lineNum;
int m_lineNum;
protected:
Phrase m_source, m_target;
ConsistentPhrases m_consistentPhrases;
void CreateConsistentPhrases(const Parameter &params);
void PopulateWordVec(Phrase &vec, const std::string &line);
void CreateConsistentPhrases(const Parameter &params);
void PopulateWordVec(Phrase &vec, const std::string &line);
// m_source and m_target MUST be populated before calling this
void PopulateAlignment(const std::string &line);
std::vector<int> GetSourceAlignmentCount() const;
// m_source and m_target MUST be populated before calling this
void PopulateAlignment(const std::string &line);
std::vector<int> GetSourceAlignmentCount() const;
};

View File

@ -13,170 +13,171 @@
using namespace std;
AlignedSentenceSyntax::AlignedSentenceSyntax(int lineNum,
const std::string &source,
const std::string &target,
const std::string &alignment)
:AlignedSentence(lineNum)
,m_sourceStr(source)
,m_targetStr(target)
,m_alignmentStr(alignment)
const std::string &source,
const std::string &target,
const std::string &alignment)
:AlignedSentence(lineNum)
,m_sourceStr(source)
,m_targetStr(target)
,m_alignmentStr(alignment)
{
}
AlignedSentenceSyntax::~AlignedSentenceSyntax()
{
// TODO Auto-generated destructor stub
AlignedSentenceSyntax::~AlignedSentenceSyntax() {
// TODO Auto-generated destructor stub
}
void AlignedSentenceSyntax::Populate(bool isSyntax, int mixedSyntaxType, const Parameter &params,
string line, Phrase &phrase, SyntaxTree &tree)
string line, Phrase &phrase, SyntaxTree &tree)
{
// parse source and target string
if (isSyntax) {
line = "<xml><tree label=\"X\">" + line + "</tree></xml>";
XMLParse(phrase, tree, line, params);
// parse source and target string
if (isSyntax) {
line = "<xml><tree label=\"X\">" + line + "</tree></xml>";
XMLParse(phrase, tree, line, params);
if (mixedSyntaxType != 0) {
// mixed syntax. Always add [X] where there isn't 1
tree.SetHieroLabel(params.hieroNonTerm);
if (mixedSyntaxType == 2) {
tree.AddToAll(params.hieroNonTerm);
}
}
} else {
PopulateWordVec(phrase, line);
tree.SetHieroLabel(params.hieroNonTerm);
}
if (mixedSyntaxType != 0) {
// mixed syntax. Always add [X] where there isn't 1
tree.SetHieroLabel(params.hieroNonTerm);
if (mixedSyntaxType == 2) {
tree.AddToAll(params.hieroNonTerm);
}
}
}
else {
PopulateWordVec(phrase, line);
tree.SetHieroLabel(params.hieroNonTerm);
}
}
void AlignedSentenceSyntax::Create(const Parameter &params)
{
Populate(params.sourceSyntax, params.mixedSyntaxType, params, m_sourceStr,
m_source, m_sourceTree);
Populate(params.targetSyntax, params.mixedSyntaxType, params, m_targetStr,
m_target, m_targetTree);
Populate(params.sourceSyntax, params.mixedSyntaxType, params, m_sourceStr,
m_source, m_sourceTree);
Populate(params.targetSyntax, params.mixedSyntaxType, params, m_targetStr,
m_target, m_targetTree);
PopulateAlignment(m_alignmentStr);
CreateConsistentPhrases(params);
PopulateAlignment(m_alignmentStr);
CreateConsistentPhrases(params);
// create labels
CreateNonTerms();
// create labels
CreateNonTerms();
}
void Escape(string &text)
{
text = Moses::Replace(text, "&", "&amp;");
text = Moses::Replace(text, "|", "&#124;");
text = Moses::Replace(text, "<", "&lt;");
text = Moses::Replace(text, ">", "&gt;");
text = Moses::Replace(text, "'", "&apos;");
text = Moses::Replace(text, "\"", "&quot;");
text = Moses::Replace(text, "[", "&#91;");
text = Moses::Replace(text, "]", "&#93;");
text = Moses::Replace(text, "&", "&amp;");
text = Moses::Replace(text, "|", "&#124;");
text = Moses::Replace(text, "<", "&lt;");
text = Moses::Replace(text, ">", "&gt;");
text = Moses::Replace(text, "'", "&apos;");
text = Moses::Replace(text, "\"", "&quot;");
text = Moses::Replace(text, "[", "&#91;");
text = Moses::Replace(text, "]", "&#93;");
}
void AlignedSentenceSyntax::XMLParse(Phrase &output,
SyntaxTree &tree,
const pugi::xml_node &parentNode,
const Parameter &params)
SyntaxTree &tree,
const pugi::xml_node &parentNode,
const Parameter &params)
{
int childNum = 0;
for (pugi::xml_node childNode = parentNode.first_child(); childNode; childNode = childNode.next_sibling()) {
string nodeName = childNode.name();
int childNum = 0;
for (pugi::xml_node childNode = parentNode.first_child(); childNode; childNode = childNode.next_sibling())
{
string nodeName = childNode.name();
// span label
string label;
int startPos = output.size();
// span label
string label;
int startPos = output.size();
if (!nodeName.empty()) {
pugi::xml_attribute attribute = childNode.attribute("label");
label = attribute.as_string();
if (!nodeName.empty()) {
pugi::xml_attribute attribute = childNode.attribute("label");
label = attribute.as_string();
// recursively call this function. For proper recursive trees
XMLParse(output, tree, childNode, params);
// recursively call this function. For proper recursive trees
XMLParse(output, tree, childNode, params);
}
// fill phrase vector
string text = childNode.value();
Escape(text);
//cerr << childNum << " " << label << "=" << text << endl;
std::vector<string> toks;
Moses::Tokenize(toks, text);
for (size_t i = 0; i < toks.size(); ++i) {
const string &tok = toks[i];
Word *word = new Word(output.size(), tok);
output.push_back(word);
}
// is it a labelled span?
int endPos = output.size() - 1;
// fill syntax labels
if (!label.empty()) {
label = "[" + label + "]";
tree.Add(startPos, endPos, label, params);
}
++childNum;
}
// fill phrase vector
string text = childNode.value();
Escape(text);
//cerr << childNum << " " << label << "=" << text << endl;
std::vector<string> toks;
Moses::Tokenize(toks, text);
for (size_t i = 0; i < toks.size(); ++i) {
const string &tok = toks[i];
Word *word = new Word(output.size(), tok);
output.push_back(word);
}
// is it a labelled span?
int endPos = output.size() - 1;
// fill syntax labels
if (!label.empty()) {
label = "[" + label + "]";
tree.Add(startPos, endPos, label, params);
}
++childNum;
}
}
void AlignedSentenceSyntax::XMLParse(Phrase &output,
SyntaxTree &tree,
const std::string input,
const Parameter &params)
SyntaxTree &tree,
const std::string input,
const Parameter &params)
{
pugi::xml_document doc;
pugi::xml_parse_result result = doc.load(input.c_str(),
pugi::parse_default | pugi::parse_comments);
pugi::xml_document doc;
pugi::xml_parse_result result = doc.load(input.c_str(),
pugi::parse_default | pugi::parse_comments);
pugi::xml_node topNode = doc.child("xml");
XMLParse(output, tree, topNode, params);
pugi::xml_node topNode = doc.child("xml");
XMLParse(output, tree, topNode, params);
}
void AlignedSentenceSyntax::CreateNonTerms()
{
for (int sourceStart = 0; sourceStart < m_source.size(); ++sourceStart) {
for (int sourceEnd = sourceStart; sourceEnd < m_source.size(); ++sourceEnd) {
ConsistentPhrases::Coll &coll = m_consistentPhrases.GetColl(sourceStart, sourceEnd);
const SyntaxTree::Labels &sourceLabels = m_sourceTree.Find(sourceStart, sourceEnd);
for (int sourceStart = 0; sourceStart < m_source.size(); ++sourceStart) {
for (int sourceEnd = sourceStart; sourceEnd < m_source.size(); ++sourceEnd) {
ConsistentPhrases::Coll &coll = m_consistentPhrases.GetColl(sourceStart, sourceEnd);
const SyntaxTree::Labels &sourceLabels = m_sourceTree.Find(sourceStart, sourceEnd);
ConsistentPhrases::Coll::iterator iter;
for (iter = coll.begin(); iter != coll.end(); ++iter) {
ConsistentPhrase &cp = **iter;
ConsistentPhrases::Coll::iterator iter;
for (iter = coll.begin(); iter != coll.end(); ++iter) {
ConsistentPhrase &cp = **iter;
int targetStart = cp.corners[2];
int targetEnd = cp.corners[3];
const SyntaxTree::Labels &targetLabels = m_targetTree.Find(targetStart, targetEnd);
int targetStart = cp.corners[2];
int targetEnd = cp.corners[3];
const SyntaxTree::Labels &targetLabels = m_targetTree.Find(targetStart, targetEnd);
CreateNonTerms(cp, sourceLabels, targetLabels);
}
}
}
CreateNonTerms(cp, sourceLabels, targetLabels);
}
}
}
}
void AlignedSentenceSyntax::CreateNonTerms(ConsistentPhrase &cp,
const SyntaxTree::Labels &sourceLabels,
const SyntaxTree::Labels &targetLabels)
const SyntaxTree::Labels &sourceLabels,
const SyntaxTree::Labels &targetLabels)
{
SyntaxTree::Labels::const_iterator iterSource;
for (iterSource = sourceLabels.begin(); iterSource != sourceLabels.end(); ++iterSource) {
const string &sourceLabel = *iterSource;
SyntaxTree::Labels::const_iterator iterSource;
for (iterSource = sourceLabels.begin(); iterSource != sourceLabels.end(); ++iterSource) {
const string &sourceLabel = *iterSource;
SyntaxTree::Labels::const_iterator iterTarget;
for (iterTarget = targetLabels.begin(); iterTarget != targetLabels.end(); ++iterTarget) {
const string &targetLabel = *iterTarget;
cp.AddNonTerms(sourceLabel, targetLabel);
}
}
SyntaxTree::Labels::const_iterator iterTarget;
for (iterTarget = targetLabels.begin(); iterTarget != targetLabels.end(); ++iterTarget) {
const string &targetLabel = *iterTarget;
cp.AddNonTerms(sourceLabel, targetLabel);
}
}
}

View File

@ -14,33 +14,33 @@
class AlignedSentenceSyntax : public AlignedSentence
{
public:
AlignedSentenceSyntax(int lineNum,
const std::string &source,
const std::string &target,
const std::string &alignment);
virtual ~AlignedSentenceSyntax();
AlignedSentenceSyntax(int lineNum,
const std::string &source,
const std::string &target,
const std::string &alignment);
virtual ~AlignedSentenceSyntax();
void Create(const Parameter &params);
void Create(const Parameter &params);
//virtual std::string Debug() const;
//virtual std::string Debug() const;
protected:
std::string m_sourceStr, m_targetStr, m_alignmentStr;
SyntaxTree m_sourceTree, m_targetTree;
std::string m_sourceStr, m_targetStr, m_alignmentStr;
SyntaxTree m_sourceTree, m_targetTree;
void XMLParse(Phrase &output,
SyntaxTree &tree,
const std::string input,
const Parameter &params);
void XMLParse(Phrase &output,
SyntaxTree &tree,
const pugi::xml_node &parentNode,
const Parameter &params);
void CreateNonTerms();
void CreateNonTerms(ConsistentPhrase &cp,
const SyntaxTree::Labels &sourceLabels,
const SyntaxTree::Labels &targetLabels);
void Populate(bool isSyntax, int mixedSyntaxType, const Parameter &params,
std::string line, Phrase &phrase, SyntaxTree &tree);
void XMLParse(Phrase &output,
SyntaxTree &tree,
const std::string input,
const Parameter &params);
void XMLParse(Phrase &output,
SyntaxTree &tree,
const pugi::xml_node &parentNode,
const Parameter &params);
void CreateNonTerms();
void CreateNonTerms(ConsistentPhrase &cp,
const SyntaxTree::Labels &sourceLabels,
const SyntaxTree::Labels &targetLabels);
void Populate(bool isSyntax, int mixedSyntaxType, const Parameter &params,
std::string line, Phrase &phrase, SyntaxTree &tree);
};

View File

@ -13,21 +13,20 @@
using namespace std;
ConsistentPhrase::ConsistentPhrase(
int sourceStart, int sourceEnd,
int targetStart, int targetEnd,
const Parameter &params)
:corners(4)
,m_hieroNonTerm(*this, params.hieroNonTerm, params.hieroNonTerm)
int sourceStart, int sourceEnd,
int targetStart, int targetEnd,
const Parameter &params)
:corners(4)
,m_hieroNonTerm(*this, params.hieroNonTerm, params.hieroNonTerm)
{
corners[0] = sourceStart;
corners[1] = sourceEnd;
corners[2] = targetStart;
corners[3] = targetEnd;
corners[0] = sourceStart;
corners[1] = sourceEnd;
corners[2] = targetStart;
corners[3] = targetEnd;
}
ConsistentPhrase::~ConsistentPhrase()
{
// TODO Auto-generated destructor stub
ConsistentPhrase::~ConsistentPhrase() {
// TODO Auto-generated destructor stub
}
bool ConsistentPhrase::operator<(const ConsistentPhrase &other) const
@ -36,29 +35,29 @@ bool ConsistentPhrase::operator<(const ConsistentPhrase &other) const
}
void ConsistentPhrase::AddNonTerms(const std::string &source,
const std::string &target)
const std::string &target)
{
m_nonTerms.push_back(NonTerm(*this, source, target));
m_nonTerms.push_back(NonTerm(*this, source, target));
}
bool ConsistentPhrase::TargetOverlap(const ConsistentPhrase &other) const
{
if ( other.corners[3] < corners[2] || other.corners[2] > corners[3])
return false;
if ( other.corners[3] < corners[2] || other.corners[2] > corners[3])
return false;
return true;
return true;
}
std::string ConsistentPhrase::Debug() const
{
stringstream out;
out << "[" << corners[0] << "-" << corners[1]
<< "][" << corners[2] << "-" << corners[3] << "]";
<< "][" << corners[2] << "-" << corners[3] << "]";
out << "NT:";
for (size_t i = 0; i < m_nonTerms.size(); ++i) {
const NonTerm &nonTerm = m_nonTerms[i];
out << nonTerm.GetLabel(Moses::Input) << ":" << nonTerm.GetLabel(Moses::Output);
const NonTerm &nonTerm = m_nonTerms[i];
out << nonTerm.GetLabel(Moses::Input) << ":" << nonTerm.GetLabel(Moses::Output);
}
return out.str();

View File

@ -16,32 +16,29 @@
class ConsistentPhrase
{
public:
typedef std::vector<NonTerm> NonTerms;
typedef std::vector<NonTerm> NonTerms;
std::vector<int> corners;
std::vector<int> corners;
ConsistentPhrase(const ConsistentPhrase &copy); // do not implement
ConsistentPhrase(int sourceStart, int sourceEnd,
int targetStart, int targetEnd,
const Parameter &params);
ConsistentPhrase(const ConsistentPhrase &copy); // do not implement
ConsistentPhrase(int sourceStart, int sourceEnd,
int targetStart, int targetEnd,
const Parameter &params);
virtual ~ConsistentPhrase();
virtual ~ConsistentPhrase();
int GetWidth(Moses::FactorDirection direction) const {
return (direction == Moses::Input) ? corners[1] - corners[0] + 1 : corners[3] - corners[2] + 1;
}
int GetWidth(Moses::FactorDirection direction) const
{ return (direction == Moses::Input) ? corners[1] - corners[0] + 1 : corners[3] - corners[2] + 1; }
void AddNonTerms(const std::string &source,
const std::string &target);
const NonTerms &GetNonTerms() const {
return m_nonTerms;
}
const NonTerm &GetHieroNonTerm() const {
return m_hieroNonTerm;
}
void AddNonTerms(const std::string &source,
const std::string &target);
const NonTerms &GetNonTerms() const
{ return m_nonTerms;}
const NonTerm &GetHieroNonTerm() const
{ return m_hieroNonTerm;}
bool TargetOverlap(const ConsistentPhrase &other) const;
bool TargetOverlap(const ConsistentPhrase &other) const;
bool operator<(const ConsistentPhrase &other) const;

View File

@ -17,36 +17,35 @@ ConsistentPhrases::ConsistentPhrases()
{
}
ConsistentPhrases::~ConsistentPhrases()
{
for (int start = 0; start < m_coll.size(); ++start) {
std::vector<Coll> &allSourceStart = m_coll[start];
ConsistentPhrases::~ConsistentPhrases() {
for (int start = 0; start < m_coll.size(); ++start) {
std::vector<Coll> &allSourceStart = m_coll[start];
for (int size = 0; size < allSourceStart.size(); ++size) {
Coll &coll = allSourceStart[size];
Moses::RemoveAllInColl(coll);
}
}
for (int size = 0; size < allSourceStart.size(); ++size) {
Coll &coll = allSourceStart[size];
Moses::RemoveAllInColl(coll);
}
}
}
void ConsistentPhrases::Initialize(size_t size)
{
m_coll.resize(size);
m_coll.resize(size);
for (size_t sourceStart = 0; sourceStart < size; ++sourceStart) {
std::vector<Coll> &allSourceStart = m_coll[sourceStart];
allSourceStart.resize(size - sourceStart);
}
for (size_t sourceStart = 0; sourceStart < size; ++sourceStart) {
std::vector<Coll> &allSourceStart = m_coll[sourceStart];
allSourceStart.resize(size - sourceStart);
}
}
void ConsistentPhrases::Add(int sourceStart, int sourceEnd,
int targetStart, int targetEnd,
const Parameter &params)
int targetStart, int targetEnd,
const Parameter &params)
{
Coll &coll = m_coll[sourceStart][sourceEnd - sourceStart];
ConsistentPhrase *cp = new ConsistentPhrase(sourceStart, sourceEnd,
targetStart, targetEnd,
params);
targetStart, targetEnd,
params);
pair<Coll::iterator, bool> inserted = coll.insert(cp);
assert(inserted.second);
@ -54,51 +53,51 @@ void ConsistentPhrases::Add(int sourceStart, int sourceEnd,
const ConsistentPhrases::Coll &ConsistentPhrases::GetColl(int sourceStart, int sourceEnd) const
{
const std::vector<Coll> &allSourceStart = m_coll[sourceStart];
const Coll &ret = allSourceStart[sourceEnd - sourceStart];
return ret;
const std::vector<Coll> &allSourceStart = m_coll[sourceStart];
const Coll &ret = allSourceStart[sourceEnd - sourceStart];
return ret;
}
ConsistentPhrases::Coll &ConsistentPhrases::GetColl(int sourceStart, int sourceEnd)
{
std::vector<Coll> &allSourceStart = m_coll[sourceStart];
Coll &ret = allSourceStart[sourceEnd - sourceStart];
return ret;
std::vector<Coll> &allSourceStart = m_coll[sourceStart];
Coll &ret = allSourceStart[sourceEnd - sourceStart];
return ret;
}
std::string ConsistentPhrases::Debug() const
{
std::stringstream out;
for (int start = 0; start < m_coll.size(); ++start) {
const std::vector<Coll> &allSourceStart = m_coll[start];
std::stringstream out;
for (int start = 0; start < m_coll.size(); ++start) {
const std::vector<Coll> &allSourceStart = m_coll[start];
for (int size = 0; size < allSourceStart.size(); ++size) {
const Coll &coll = allSourceStart[size];
for (int size = 0; size < allSourceStart.size(); ++size) {
const Coll &coll = allSourceStart[size];
Coll::const_iterator iter;
for (iter = coll.begin(); iter != coll.end(); ++iter) {
const ConsistentPhrase &consistentPhrase = **iter;
out << consistentPhrase.Debug() << endl;
}
}
}
Coll::const_iterator iter;
for (iter = coll.begin(); iter != coll.end(); ++iter) {
const ConsistentPhrase &consistentPhrase = **iter;
out << consistentPhrase.Debug() << endl;
}
}
}
return out.str();
return out.str();
}
void ConsistentPhrases::AddHieroNonTerms(const Parameter &params)
{
// add [X] labels everywhere
for (int i = 0; i < m_coll.size(); ++i) {
vector<Coll> &inner = m_coll[i];
for (int j = 0; j < inner.size(); ++j) {
ConsistentPhrases::Coll &coll = inner[j];
ConsistentPhrases::Coll::iterator iter;
for (iter = coll.begin(); iter != coll.end(); ++iter) {
ConsistentPhrase &cp = **iter;
cp.AddNonTerms(params.hieroNonTerm, params.hieroNonTerm);
}
}
}
// add [X] labels everywhere
for (int i = 0; i < m_coll.size(); ++i) {
vector<Coll> &inner = m_coll[i];
for (int j = 0; j < inner.size(); ++j) {
ConsistentPhrases::Coll &coll = inner[j];
ConsistentPhrases::Coll::iterator iter;
for (iter = coll.begin(); iter != coll.end(); ++iter) {
ConsistentPhrase &cp = **iter;
cp.AddNonTerms(params.hieroNonTerm, params.hieroNonTerm);
}
}
}
}

View File

@ -14,28 +14,27 @@
class Word;
class Parameter;
class ConsistentPhrases
{
class ConsistentPhrases {
public:
typedef std::set<ConsistentPhrase*> Coll;
typedef std::set<ConsistentPhrase*> Coll;
ConsistentPhrases();
virtual ~ConsistentPhrases();
ConsistentPhrases();
virtual ~ConsistentPhrases();
void Initialize(size_t size);
void Initialize(size_t size);
void Add(int sourceStart, int sourceEnd,
int targetStart, int targetEnd,
const Parameter &params);
void Add(int sourceStart, int sourceEnd,
int targetStart, int targetEnd,
const Parameter &params);
void AddHieroNonTerms(const Parameter &params);
void AddHieroNonTerms(const Parameter &params);
const Coll &GetColl(int sourceStart, int sourceEnd) const;
Coll &GetColl(int sourceStart, int sourceEnd);
const Coll &GetColl(int sourceStart, int sourceEnd) const;
Coll &GetColl(int sourceStart, int sourceEnd);
std::string Debug() const;
std::string Debug() const;
protected:
std::vector< std::vector<Coll> > m_coll;
std::vector< std::vector<Coll> > m_coll;
};

View File

@ -3,17 +3,17 @@
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2006 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@ -27,35 +27,36 @@ using namespace std;
namespace Moses
{
InputFileStream::InputFileStream(const std::string &filePath)
: std::istream(NULL)
, m_streambuf(NULL)
{
if (filePath.size() > 3 &&
filePath.substr(filePath.size() - 3, 3) == ".gz") {
m_streambuf = new gzfilebuf(filePath.c_str());
} else {
std::filebuf* fb = new std::filebuf();
fb = fb->open(filePath.c_str(), std::ios::in);
if (! fb) {
cerr << "Can't read " << filePath.c_str() << endl;
exit(1);
}
m_streambuf = fb;
}
this->init(m_streambuf);
}
InputFileStream::~InputFileStream()
{
delete m_streambuf;
m_streambuf = NULL;
}
void InputFileStream::Close()
{
}
InputFileStream::InputFileStream(const std::string &filePath)
: std::istream(NULL)
, m_streambuf(NULL)
{
if (filePath.size() > 3 &&
filePath.substr(filePath.size() - 3, 3) == ".gz")
{
m_streambuf = new gzfilebuf(filePath.c_str());
} else {
std::filebuf* fb = new std::filebuf();
fb = fb->open(filePath.c_str(), std::ios::in);
if (! fb) {
cerr << "Can't read " << filePath.c_str() << endl;
exit(1);
}
m_streambuf = fb;
}
this->init(m_streambuf);
}
InputFileStream::~InputFileStream()
{
delete m_streambuf;
m_streambuf = NULL;
}
void InputFileStream::Close()
{
}
}

View File

@ -3,17 +3,17 @@
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2006 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@ -28,21 +28,21 @@
namespace Moses
{
/** Used in place of std::istream, can read zipped files if it ends in .gz
*/
class InputFileStream : public std::istream
{
protected:
std::streambuf *m_streambuf;
public:
InputFileStream(const std::string &filePath);
~InputFileStream();
void Close();
};
/** Used in place of std::istream, can read zipped files if it ends in .gz
*/
class InputFileStream : public std::istream
{
protected:
std::streambuf *m_streambuf;
public:
InputFileStream(const std::string &filePath);
~InputFileStream();
void Close();
};
}
#endif

View File

@ -23,59 +23,64 @@ int main(int argc, char** argv)
namespace po = boost::program_options;
po::options_description desc("Options");
desc.add_options()
("help", "Print help messages")
("MaxSpan", po::value<int>()->default_value(params.maxSpan), "Max (source) span of a rule. ie. number of words in the source")
("MinSpan", po::value<int>()->default_value(params.minSpan), "Min (source) span of a rule.")
("GlueGrammar", po::value<string>()->default_value(params.gluePath), "Output glue grammar to here")
("SentenceOffset", po::value<long>()->default_value(params.sentenceOffset), "Starting sentence id. Not used")
("GZOutput", "Compress extract files")
("MaxNonTerm", po::value<int>()->default_value(params.maxNonTerm), "Maximum number of non-terms allowed per rule")
("MaxHieroNonTerm", po::value<int>()->default_value(params.maxHieroNonTerm), "Maximum number of Hiero non-term. Usually, --MaxNonTerm is the normal constraint")
("MinHoleSource", po::value<int>()->default_value(params.minHoleSource), "Minimum source span for a non-term.")
("MinHoleSourceSyntax", po::value<int>()->default_value(params.minHoleSourceSyntax), "Minimum source span for a syntactic non-term (source or target).")
("help", "Print help messages")
("MaxSpan", po::value<int>()->default_value(params.maxSpan), "Max (source) span of a rule. ie. number of words in the source")
("MinSpan", po::value<int>()->default_value(params.minSpan), "Min (source) span of a rule.")
("GlueGrammar", po::value<string>()->default_value(params.gluePath), "Output glue grammar to here")
("SentenceOffset", po::value<long>()->default_value(params.sentenceOffset), "Starting sentence id. Not used")
("GZOutput", "Compress extract files")
("MaxNonTerm", po::value<int>()->default_value(params.maxNonTerm), "Maximum number of non-terms allowed per rule")
("MaxHieroNonTerm", po::value<int>()->default_value(params.maxHieroNonTerm), "Maximum number of Hiero non-term. Usually, --MaxNonTerm is the normal constraint")
("MinHoleSource", po::value<int>()->default_value(params.minHoleSource), "Minimum source span for a non-term.")
("MinHoleSourceSyntax", po::value<int>()->default_value(params.minHoleSourceSyntax), "Minimum source span for a syntactic non-term (source or target).")
("SourceSyntax", "Source sentence is a parse tree")
("TargetSyntax", "Target sentence is a parse tree")
("MixedSyntaxType", po::value<int>()->default_value(params.mixedSyntaxType), "Hieu's Mixed syntax type. 0(default)=no mixed syntax, 1=add [X] only if no syntactic label. 2=add [X] everywhere")
("MultiLabel", po::value<int>()->default_value(params.multiLabel), "What to do with multiple labels on the same span. 0(default)=keep them all, 1=keep only top-most, 2=keep only bottom-most")
("HieroSourceLHS", "Always use Hiero source LHS? Default = 0")
("MaxSpanFreeNonTermSource", po::value<int>()->default_value(params.maxSpanFreeNonTermSource), "Max number of words covered by beginning/end NT. Default = 0 (no limit)")
("NoNieceTerminal", "Don't extract rule if 1 of the non-term covers the same word as 1 of the terminals")
("MaxScope", po::value<int>()->default_value(params.maxScope), "maximum scope (see Hopkins and Langmead (2010)). Default is HIGH")
("MinScope", po::value<int>()->default_value(params.minScope), "min scope.")
("SourceSyntax", "Source sentence is a parse tree")
("TargetSyntax", "Target sentence is a parse tree")
("MixedSyntaxType", po::value<int>()->default_value(params.mixedSyntaxType), "Hieu's Mixed syntax type. 0(default)=no mixed syntax, 1=add [X] only if no syntactic label. 2=add [X] everywhere")
("MultiLabel", po::value<int>()->default_value(params.multiLabel), "What to do with multiple labels on the same span. 0(default)=keep them all, 1=keep only top-most, 2=keep only bottom-most")
("HieroSourceLHS", "Always use Hiero source LHS? Default = 0")
("MaxSpanFreeNonTermSource", po::value<int>()->default_value(params.maxSpanFreeNonTermSource), "Max number of words covered by beginning/end NT. Default = 0 (no limit)")
("NoNieceTerminal", "Don't extract rule if 1 of the non-term covers the same word as 1 of the terminals")
("MaxScope", po::value<int>()->default_value(params.maxScope), "maximum scope (see Hopkins and Langmead (2010)). Default is HIGH")
("MinScope", po::value<int>()->default_value(params.minScope), "min scope.")
("SpanLength", "Property - span length of RHS each non-term")
("SpanLength", "Property - span length of each LHS non-term")
("RuleLength", "Property - length of entire rule. Only for rules with NTs")
("NonTermContext", "Property - (source) left and right, inside and outside words of each non-term ")
("NonTermContextTarget", "Property - (target) left and right, inside and outside words of each non-term")
("NonTermContextFactor", po::value<int>()->default_value(params.nonTermContextFactor), "Factor to use for non-term context property.")
("NonTermContext", "Property - (source) left and right, inside and outside words of each non-term ")
("NonTermContextTarget", "Property - (target) left and right, inside and outside words of each non-term")
("NonTermContextFactor", po::value<int>()->default_value(params.nonTermContextFactor), "Factor to use for non-term context property.")
("NumSourceFactors", po::value<int>()->default_value(params.numSourceFactors), "Number of source factors.")
("NumTargetFactors", po::value<int>()->default_value(params.numTargetFactors), "Number of target factors.")
("NumSourceFactors", po::value<int>()->default_value(params.numSourceFactors), "Number of source factors.")
("NumTargetFactors", po::value<int>()->default_value(params.numTargetFactors), "Number of target factors.")
("HieroNonTerm", po::value<string>()->default_value(params.hieroNonTerm), "Hiero non-terminal label, including bracket")
("ScopeSpan", po::value<string>()->default_value(params.scopeSpanStr), "Min and max span for rules of each scope. Format is min,max:min,max...")
("HieroNonTerm", po::value<string>()->default_value(params.hieroNonTerm), "Hiero non-terminal label, including bracket")
("ScopeSpan", po::value<string>()->default_value(params.scopeSpanStr), "Min and max span for rules of each scope. Format is min,max:min,max...")
("NonTermConsecSource", "Allow consecutive non-terms on the source side")
("NonTermConsecSourceMixedSyntax", po::value<int>()->default_value(params.nonTermConsecSourceMixedSyntax), "In mixed syntax mode, what nt can be consecutive. 0=don't allow consec nt. 1(default)=hiero+syntax. 2=syntax+syntax. 3=always allow");
("NonTermConsecSource", "Allow consecutive non-terms on the source side")
("NonTermConsecSourceMixedSyntax", po::value<int>()->default_value(params.nonTermConsecSourceMixedSyntax), "In mixed syntax mode, what nt can be consecutive. 0=don't allow consec nt. 1(default)=hiero+syntax. 2=syntax+syntax. 3=always allow");
po::variables_map vm;
try {
try
{
po::store(po::parse_command_line(argc, argv, desc),
vm); // can throw
/** --help option
*/
if ( vm.count("help") || argc < 5 ) {
if ( vm.count("help") || argc < 5 )
{
std::cout << argv[0] << " target source alignment [options...]" << std::endl
<< desc << std::endl;
return EXIT_SUCCESS;
}
po::notify(vm); // throws on error, so do after help in case
// there are any problems
} catch(po::error& e) {
// there are any problems
}
catch(po::error& e)
{
std::cerr << "ERROR: " << e.what() << std::endl << std::endl;
std::cerr << desc << std::endl;
return EXIT_FAILURE;
@ -103,6 +108,7 @@ int main(int argc, char** argv)
// properties
if (vm.count("SpanLength")) params.spanLength = true;
if (vm.count("RuleLength")) params.ruleLength = true;
if (vm.count("NonTermContext")) params.nonTermContext = true;
if (vm.count("NonTermContextTarget")) params.nonTermContextTarget = true;
if (vm.count("NonTermContextFactor")) params.nonTermContextFactor = vm["NonTermContextFactor"].as<int>();
@ -112,7 +118,7 @@ int main(int argc, char** argv)
if (vm.count("HieroNonTerm")) params.hieroNonTerm = vm["HieroNonTerm"].as<string>();
if (vm.count("ScopeSpan")) {
params.SetScopeSpan(vm["ScopeSpan"].as<string>());
params.SetScopeSpan(vm["ScopeSpan"].as<string>());
}
if (vm.count("NonTermConsecSource")) params.nonTermConsecSource = true;
@ -127,8 +133,8 @@ int main(int argc, char** argv)
string pathExtract = argv[4];
string pathExtractInv = pathExtract + ".inv";
if (params.gzOutput) {
pathExtract += ".gz";
pathExtractInv += ".gz";
pathExtract += ".gz";
pathExtractInv += ".gz";
}
Moses::InputFileStream strmTarget(pathTarget);
@ -142,53 +148,54 @@ int main(int argc, char** argv)
int lineNum = 1;
string lineTarget, lineSource, lineAlignment;
while (getline(strmTarget, lineTarget)) {
if (lineNum % 10000 == 0) {
cerr << lineNum << " ";
}
if (lineNum % 10000 == 0) {
cerr << lineNum << " ";
}
bool success;
success = getline(strmSource, lineSource);
if (!success) {
throw "Couldn't read source";
}
success = getline(strmAlignment, lineAlignment);
if (!success) {
throw "Couldn't read alignment";
}
bool success;
success = getline(strmSource, lineSource);
if (!success) {
throw "Couldn't read source";
}
success = getline(strmAlignment, lineAlignment);
if (!success) {
throw "Couldn't read alignment";
}
/*
cerr << "lineTarget=" << lineTarget << endl;
cerr << "lineSource=" << lineSource << endl;
cerr << "lineAlignment=" << lineAlignment << endl;
*/
/*
cerr << "lineTarget=" << lineTarget << endl;
cerr << "lineSource=" << lineSource << endl;
cerr << "lineAlignment=" << lineAlignment << endl;
*/
AlignedSentence *alignedSentence;
AlignedSentence *alignedSentence;
if (params.sourceSyntax || params.targetSyntax) {
alignedSentence = new AlignedSentenceSyntax(lineNum, lineSource, lineTarget, lineAlignment);
} else {
alignedSentence = new AlignedSentence(lineNum, lineSource, lineTarget, lineAlignment);
}
if (params.sourceSyntax || params.targetSyntax) {
alignedSentence = new AlignedSentenceSyntax(lineNum, lineSource, lineTarget, lineAlignment);
}
else {
alignedSentence = new AlignedSentence(lineNum, lineSource, lineTarget, lineAlignment);
}
alignedSentence->Create(params);
//cerr << alignedSentence->Debug();
alignedSentence->Create(params);
//cerr << alignedSentence->Debug();
Rules rules(*alignedSentence);
rules.Extend(params);
rules.Consolidate(params);
//cerr << rules.Debug();
Rules rules(*alignedSentence);
rules.Extend(params);
rules.Consolidate(params);
//cerr << rules.Debug();
rules.Output(extractFile, true, params);
rules.Output(extractInvFile, false, params);
rules.Output(extractFile, true, params);
rules.Output(extractInvFile, false, params);
delete alignedSentence;
delete alignedSentence;
++lineNum;
++lineNum;
}
if (!params.gluePath.empty()) {
Moses::OutputFileStream glueFile(params.gluePath);
CreateGlueGrammar(glueFile);
Moses::OutputFileStream glueFile(params.gluePath);
CreateGlueGrammar(glueFile);
}
cerr << "Finished" << endl;
@ -196,8 +203,8 @@ int main(int argc, char** argv)
// Writes the fixed glue-grammar rules to glueFile, one rule per line, each
// terminated with endl: a rule beginning with <s>, a rule ending with </s>,
// and a rule combining [X][S] with [X][X]. The rule text is hard-coded and
// does not depend on any extraction parameter.
void CreateGlueGrammar(Moses::OutputFileStream &glueFile)
{
glueFile << "<s> [X] ||| <s> [S] ||| 1 ||| ||| 0" << endl
<< "[X][S] </s> [X] ||| [X][S] </s> [S] ||| 1 ||| 0-0 ||| 0" << endl
<< "[X][S] [X][X] [X] ||| [X][S] [X][X] [S] ||| 2.718 ||| 0-0 1-1 ||| 0" << endl;
glueFile << "<s> [X] ||| <s> [S] ||| 1 ||| ||| 0" << endl
<< "[X][S] </s> [X] ||| [X][S] </s> [S] ||| 1 ||| 0-0 ||| 0" << endl
<< "[X][S] [X][X] [X] ||| [X][S] [X][X] [S] ||| 2.718 ||| 0-0 1-1 ||| 0" << endl;
}

View File

@ -14,19 +14,18 @@
using namespace std;
NonTerm::NonTerm(const ConsistentPhrase &consistentPhrase,
const std::string &source,
const std::string &target)
:m_consistentPhrase(&consistentPhrase)
,m_source(source)
,m_target(target)
const std::string &source,
const std::string &target)
:m_consistentPhrase(&consistentPhrase)
,m_source(source)
,m_target(target)
{
// TODO Auto-generated constructor stub
// TODO Auto-generated constructor stub
}
NonTerm::~NonTerm()
{
// TODO Auto-generated destructor stub
NonTerm::~NonTerm() {
// TODO Auto-generated destructor stub
}
std::string NonTerm::Debug() const
@ -44,7 +43,7 @@ void NonTerm::Output(std::ostream &out) const
void NonTerm::Output(std::ostream &out, Moses::FactorDirection direction) const
{
out << GetLabel(direction);
out << GetLabel(direction);
}
const std::string &NonTerm::GetLabel(Moses::FactorDirection direction) const
@ -54,16 +53,14 @@ const std::string &NonTerm::GetLabel(Moses::FactorDirection direction) const
// Returns true when this non-terminal's label for the given direction is
// equal to params.hieroNonTerm.
bool NonTerm::IsHiero(Moses::FactorDirection direction, const Parameter &params) const
{
const std::string &label = NonTerm::GetLabel(direction);
return label == params.hieroNonTerm;
const std::string &label = NonTerm::GetLabel(direction);
return label == params.hieroNonTerm;
}
// Returns true only when the label matches params.hieroNonTerm for both
// Moses::Input and Moses::Output.
bool NonTerm::IsHiero(const Parameter &params) const
{
return IsHiero(Moses::Input, params) && IsHiero(Moses::Output, params);
return IsHiero(Moses::Input, params) && IsHiero(Moses::Output, params);
}
// Returns the width of the underlying consistent phrase in the given
// direction; simply forwards to ConsistentPhrase::GetWidth.
int NonTerm::GetWidth(Moses::FactorDirection direction) const
{
return GetConsistentPhrase().GetWidth(direction);
}
{ return GetConsistentPhrase().GetWidth(direction); }

View File

@ -16,27 +16,24 @@ class NonTerm : public RuleSymbol
{
public:
NonTerm(const ConsistentPhrase &consistentPhrase,
const std::string &source,
const std::string &target);
virtual ~NonTerm();
NonTerm(const ConsistentPhrase &consistentPhrase,
const std::string &source,
const std::string &target);
virtual ~NonTerm();
const ConsistentPhrase &GetConsistentPhrase() const {
return *m_consistentPhrase;
}
const ConsistentPhrase &GetConsistentPhrase() const
{ return *m_consistentPhrase; }
int GetWidth(Moses::FactorDirection direction) const;
int GetWidth(Moses::FactorDirection direction) const;
virtual bool IsNonTerm() const {
return true;
}
virtual bool IsNonTerm() const
{ return true; }
std::string GetString() const {
return m_source + m_target;
}
std::string GetString() const
{ return m_source + m_target; }
virtual std::string Debug() const;
virtual void Output(std::ostream &out) const;
virtual std::string Debug() const;
virtual void Output(std::ostream &out) const;
void Output(std::ostream &out, Moses::FactorDirection direction) const;
const std::string &GetLabel(Moses::FactorDirection direction) const;
@ -44,7 +41,7 @@ public:
bool IsHiero(const Parameter &params) const;
protected:
const ConsistentPhrase *m_consistentPhrase;
std::string m_source, m_target;
const ConsistentPhrase *m_consistentPhrase;
std::string m_source, m_target;
};

View File

@ -11,63 +11,63 @@
using namespace std;
Parameter::Parameter()
:maxSpan(10)
,minSpan(0)
,maxNonTerm(2)
,maxHieroNonTerm(999)
,maxSymbolsTarget(999)
,maxSymbolsSource(5)
,minHoleSource(2)
,minHoleSourceSyntax(1)
,sentenceOffset(0)
,nonTermConsecSource(false)
,requireAlignedWord(true)
,fractionalCounting(true)
,gzOutput(false)
:maxSpan(10)
,minSpan(0)
,maxNonTerm(2)
,maxHieroNonTerm(999)
,maxSymbolsTarget(999)
,maxSymbolsSource(5)
,minHoleSource(2)
,minHoleSourceSyntax(1)
,sentenceOffset(0)
,nonTermConsecSource(false)
,requireAlignedWord(true)
,fractionalCounting(true)
,gzOutput(false)
,hieroNonTerm("[X]")
,sourceSyntax(false)
,targetSyntax(false)
,hieroNonTerm("[X]")
,sourceSyntax(false)
,targetSyntax(false)
,mixedSyntaxType(0)
,multiLabel(0)
,nonTermConsecSourceMixed(true)
,hieroSourceLHS(false)
,maxSpanFreeNonTermSource(0)
,nieceTerminal(true)
,maxScope(UNDEFINED)
,minScope(0)
,mixedSyntaxType(0)
,multiLabel(0)
,nonTermConsecSourceMixed(true)
,hieroSourceLHS(false)
,maxSpanFreeNonTermSource(0)
,nieceTerminal(true)
,maxScope(UNDEFINED)
,minScope(0)
,spanLength(false)
,nonTermContext(false)
,nonTermContextTarget(false)
,nonTermContextFactor(0)
,spanLength(false)
,ruleLength(false)
,nonTermContext(false)
,nonTermContextTarget(false)
,nonTermContextFactor(0)
,numSourceFactors(1)
,numTargetFactors(1)
,numSourceFactors(1)
,numTargetFactors(1)
,nonTermConsecSourceMixedSyntax(1)
,nonTermConsecSourceMixedSyntax(1)
{}
Parameter::~Parameter()
{
// TODO Auto-generated destructor stub
Parameter::~Parameter() {
// TODO Auto-generated destructor stub
}
// Parses a scope-span specification of the form "min,max:min,max...".
// The raw string is stored in scopeSpanStr. Each ':'-separated entry is split
// on ',' into integers and appended to scopeSpan as a (first, second) pair.
// UTIL_THROW_IF2 is triggered when an entry does not contain exactly two
// values. Entries already in scopeSpan are not cleared before appending.
void Parameter::SetScopeSpan(const std::string &str)
{
scopeSpanStr = str;
vector<string> toks1;
Moses::Tokenize(toks1, str, ":");
scopeSpanStr = str;
vector<string> toks1;
Moses::Tokenize(toks1, str, ":");
for (size_t i = 0; i < toks1.size(); ++i) {
const string &tok1 = toks1[i];
for (size_t i = 0; i < toks1.size(); ++i) {
const string &tok1 = toks1[i];
vector<int> toks2;
Moses::Tokenize<int>(toks2, tok1, ",");
UTIL_THROW_IF2(toks2.size() != 2, "Format is min,max:min,max... String is " << tok1);
vector<int> toks2;
Moses::Tokenize<int>(toks2, tok1, ",");
UTIL_THROW_IF2(toks2.size() != 2, "Format is min,max:min,max... String is " << tok1);
std::pair<int,int> values(toks2[0], toks2[1]);
scopeSpan.push_back(values);
}
std::pair<int,int> values(toks2[0], toks2[1]);
scopeSpan.push_back(values);
}
}

View File

@ -48,6 +48,7 @@ public:
// properties
bool spanLength;
bool ruleLength;
bool nonTermContext;
bool nonTermContextTarget;
int nonTermContextFactor;

View File

@ -3,12 +3,12 @@
// Builds a debug string by concatenating Word::Debug() of every word in the
// phrase, each followed by a single space (so a non-empty result ends with a
// trailing space).
std::string Phrase::Debug() const
{
std::stringstream out;
std::stringstream out;
for (size_t i = 0; i < size(); ++i) {
Word &word = *at(i);
out << word.Debug() << " ";
}
for (size_t i = 0; i < size(); ++i) {
Word &word = *at(i);
out << word.Debug() << " ";
}
return out.str();
return out.str();
}

View File

@ -7,13 +7,13 @@
// A phrase represented directly as a vector of Word pointers.
// NOTE(review): the class adds no destructor of its own, so it does not free
// the pointed-to words itself; confirm who owns them at each use site.
class Phrase : public std::vector<Word*>
{
public:
// Creates an empty phrase.
Phrase() {
}
Phrase()
{}
// Creates a phrase with `size` value-initialised (null) Word pointers.
Phrase(size_t size)
:std::vector<Word*>(size) {
}
Phrase(size_t size)
:std::vector<Word*>(size)
{}
// Returns a space-separated debug rendering of the words.
std::string Debug() const;
std::string Debug() const;
};

View File

@ -0,0 +1,2 @@
#include "Property.h"

View File

@ -0,0 +1,14 @@
#pragma once
#include <string>
typedef std::string Property;
/*
class Property
{
public:
std::string str;
};
*/

File diff suppressed because it is too large Load Diff

View File

@ -8,6 +8,7 @@
#include <vector>
#include "Phrase.h"
#include "RulePhrase.h"
#include "Property.h"
#include "moses/TypeDef.h"
class ConsistentPhrase;
@ -16,83 +17,83 @@ class NonTerm;
class Parameter;
class Rule
{
class Rule {
public:
typedef std::set<std::pair<int,int> > Alignments;
typedef std::set<std::pair<int,int> > Alignments;
typedef std::vector<Property> Properties;
Rule(const Rule &copy); // do not implement
Rule(const Rule &copy); // do not implement
// original rule with no non-term
Rule(const NonTerm &lhsNonTerm, const AlignedSentence &alignedSentence);
// original rule with no non-term
Rule(const NonTerm &lhsNonTerm, const AlignedSentence &alignedSentence);
// extend a rule, adding 1 new non-term
Rule(const Rule &copy, const NonTerm &nonTerm);
// extend a rule, adding 1 new non-term
Rule(const Rule &copy, const NonTerm &nonTerm);
virtual ~Rule();
virtual ~Rule();
bool IsValid() const {
return m_isValid;
}
bool IsValid() const
{ return m_isValid; }
bool CanRecurse() const {
return m_canRecurse;
}
bool CanRecurse() const
{ return m_canRecurse; }
const NonTerm &GetLHS() const {
return m_lhs;
}
const NonTerm &GetLHS() const
{ return m_lhs; }
const ConsistentPhrase &GetConsistentPhrase() const;
const ConsistentPhrase &GetConsistentPhrase() const;
int GetNextSourcePosForNonTerm() const;
int GetNextSourcePosForNonTerm() const;
void SetCount(float count) {
m_count = count;
}
float GetCount() const {
return m_count;
}
void SetCount(float count)
{ m_count = count; }
float GetCount() const
{ return m_count; }
const Alignments &GetAlignments() const {
return m_alignments;
}
const Alignments &GetAlignments() const
{ return m_alignments; }
std::string Debug() const;
void Output(std::ostream &out, bool forward, const Parameter &params) const;
const Properties &GetProperties() const
{ return m_properties; }
void Prevalidate(const Parameter &params);
void CreateTarget(const Parameter &params);
std::string Debug() const;
void Output(std::ostream &out, bool forward) const;
const RulePhrase &GetPhrase(Moses::FactorDirection direction) const {
return (direction == Moses::Input) ? m_source : m_target;
}
void Prevalidate(const Parameter &params);
void CreateTarget(const Parameter &params);
void CreateProperties(const Parameter &params);
const RulePhrase &GetPhrase(Moses::FactorDirection direction) const
{ return (direction == Moses::Input) ? m_source : m_target; }
protected:
const NonTerm &m_lhs;
const AlignedSentence &m_alignedSentence;
RulePhrase m_source, m_target;
float m_count;
const NonTerm &m_lhs;
const AlignedSentence &m_alignedSentence;
RulePhrase m_source, m_target;
float m_count;
Alignments m_alignments;
Alignments m_alignments;
// in source order
std::vector<const NonTerm*> m_nonterms;
// in source order
std::vector<const NonTerm*> m_nonterms;
bool m_isValid, m_canRecurse;
bool m_isValid, m_canRecurse;
void CreateSource();
void CreateAlignments();
void CreateAlignments(int sourcePos, const std::set<const Word *> &targetWords);
void CreateAlignments(int sourcePos, const RuleSymbol *targetSought);
// should be in consistent order, for comparisons
Properties m_properties;
bool ContainTerm(const ConsistentPhrase &cp, const std::set<const Word*> &terms) const;
int GetScope(const Parameter &params) const;
void CreateSource();
void CreateAlignments();
void CreateAlignments(int sourcePos, const std::set<const Word *> &targetWords);
void CreateAlignments(int sourcePos, const RuleSymbol *targetSought);
void NonTermContext(int sourceTarget, int factors, size_t ntInd, const ConsistentPhrase &cp, std::ostream &out) const;
// sourceTarget: 1 = source, 2 = target
bool ContainTerm(const ConsistentPhrase &cp, const std::set<const Word*> &terms) const;
int GetScope(const Parameter &params) const;
void NonTermContextFactor(int factor, const Word &word, std::ostream &out) const;
void NonTermContext(int sourceTarget, int factors, size_t ntInd, const ConsistentPhrase &cp, std::ostream &out) const;
// sourceTarget: 1 = source, 2 = target
void NonTermContextFactor(int factor, const Word &word, std::ostream &out) const;
};

View File

@ -16,17 +16,17 @@ extern bool g_debug;
int RulePhrase::Compare(const RulePhrase &other) const
{
if (GetSize() != other.GetSize()) {
return GetSize() < other.GetSize() ? -1 : +1;
return GetSize() < other.GetSize() ? -1 : +1;
}
for (size_t i = 0; i < m_coll.size(); ++i) {
const RuleSymbol &symbol = *m_coll[i];
const RuleSymbol &otherSymbol = *other.m_coll[i];
int compare = symbol.Compare(otherSymbol);
const RuleSymbol &symbol = *m_coll[i];
const RuleSymbol &otherSymbol = *other.m_coll[i];
int compare = symbol.Compare(otherSymbol);
if (compare) {
return compare;
}
if (compare) {
return compare;
}
}
return 0;
@ -35,16 +35,16 @@ int RulePhrase::Compare(const RulePhrase &other) const
// Writes every symbol in m_coll to out in order, each followed by a single
// space (a non-empty phrase therefore ends with a trailing space).
void RulePhrase::Output(std::ostream &out) const
{
for (size_t i = 0; i < m_coll.size(); ++i) {
const RuleSymbol &symbol = *m_coll[i];
symbol.Output(out);
out << " ";
const RuleSymbol &symbol = *m_coll[i];
symbol.Output(out);
out << " ";
}
}
// Returns the same text that Output() would write, captured into a string.
std::string RulePhrase::Debug() const
{
std::stringstream out;
Output(out);
return out.str();
std::stringstream out;
Output(out);
return out.str();
}

View File

@ -21,12 +21,12 @@ public:
typedef std::vector<const RuleSymbol*> Coll;
Coll m_coll;
size_t GetSize() const {
return m_coll.size();
}
size_t GetSize() const
{ return m_coll.size(); }
void Add(const RuleSymbol *symbol) {
m_coll.push_back(symbol);
void Add(const RuleSymbol *symbol)
{
m_coll.push_back(symbol);
}
const RuleSymbol* operator[](size_t index) const {

View File

@ -9,29 +9,28 @@
using namespace std;
RuleSymbol::RuleSymbol()
{
// TODO Auto-generated constructor stub
RuleSymbol::RuleSymbol() {
// TODO Auto-generated constructor stub
}
RuleSymbol::~RuleSymbol()
{
// TODO Auto-generated destructor stub
RuleSymbol::~RuleSymbol() {
// TODO Auto-generated destructor stub
}
// Three-way comparison of two rule symbols.
// Returns -1, 0 or +1. When exactly one of the two symbols is a non-terminal,
// the non-terminal orders first (-1 if this symbol is the non-terminal).
// Otherwise the symbols are ordered by their GetString() values.
int RuleSymbol::Compare(const RuleSymbol &other) const
{
if (IsNonTerm() != other.IsNonTerm()) {
return IsNonTerm() ? -1 : +1;
}
if (IsNonTerm() != other.IsNonTerm()) {
return IsNonTerm() ? -1 : +1;
}
string str = GetString();
string otherStr = other.GetString();
string str = GetString();
string otherStr = other.GetString();
if (str == otherStr) {
return 0;
} else {
return (str < otherStr) ? -1 : +1;
}
if (str == otherStr) {
return 0;
}
else {
return (str < otherStr) ? -1 : +1;
}
}

View File

@ -12,20 +12,19 @@
#include <string>
// base class - terminal or non-term
class RuleSymbol
{
class RuleSymbol {
public:
RuleSymbol();
virtual ~RuleSymbol();
RuleSymbol();
virtual ~RuleSymbol();
virtual bool IsNonTerm() const = 0;
virtual bool IsNonTerm() const = 0;
virtual std::string Debug() const = 0;
virtual void Output(std::ostream &out) const = 0;
virtual std::string Debug() const = 0;
virtual void Output(std::ostream &out) const = 0;
virtual std::string GetString() const = 0;
virtual std::string GetString() const = 0;
int Compare(const RuleSymbol &other) const;
int Compare(const RuleSymbol &other) const;
};

View File

@ -19,177 +19,181 @@ using namespace std;
extern bool g_debug;
Rules::Rules(const AlignedSentence &alignedSentence)
:m_alignedSentence(alignedSentence)
:m_alignedSentence(alignedSentence)
{
}
Rules::~Rules()
{
Moses::RemoveAllInColl(m_keepRules);
Rules::~Rules() {
Moses::RemoveAllInColl(m_keepRules);
}
void Rules::CreateRules(const ConsistentPhrase &cp,
const Parameter &params)
const Parameter &params)
{
if (params.hieroSourceLHS) {
const NonTerm &nonTerm = cp.GetHieroNonTerm();
CreateRule(nonTerm, params);
} else {
const ConsistentPhrase::NonTerms &nonTerms = cp.GetNonTerms();
for (size_t i = 0; i < nonTerms.size(); ++i) {
const NonTerm &nonTerm = nonTerms[i];
CreateRule(nonTerm, params);
}
}
if (params.hieroSourceLHS) {
const NonTerm &nonTerm = cp.GetHieroNonTerm();
CreateRule(nonTerm, params);
}
else {
const ConsistentPhrase::NonTerms &nonTerms = cp.GetNonTerms();
for (size_t i = 0; i < nonTerms.size(); ++i) {
const NonTerm &nonTerm = nonTerms[i];
CreateRule(nonTerm, params);
}
}
}
void Rules::CreateRule(const NonTerm &nonTerm,
const Parameter &params)
const Parameter &params)
{
Rule *rule = new Rule(nonTerm, m_alignedSentence);
Rule *rule = new Rule(nonTerm, m_alignedSentence);
rule->Prevalidate(params);
rule->CreateTarget(params);
rule->Prevalidate(params);
rule->CreateTarget(params);
rule->CreateProperties(params);
if (rule->CanRecurse()) {
Extend(*rule, params);
}
if (rule->CanRecurse()) {
Extend(*rule, params);
}
if (rule->IsValid()) {
m_keepRules.insert(rule);
} else {
delete rule;
}
if (rule->IsValid()) {
m_keepRules.insert(rule);
}
else {
delete rule;
}
}
// Entry point of rule extraction for the whole sentence.
// Enumerates every source span [sourceStart, sourceEnd] of the input phrase
// and calls CreateRules() for each consistent phrase stored for that span.
void Rules::Extend(const Parameter &params)
{
const ConsistentPhrases &allCPS = m_alignedSentence.GetConsistentPhrases();
const ConsistentPhrases &allCPS = m_alignedSentence.GetConsistentPhrases();
size_t size = m_alignedSentence.GetPhrase(Moses::Input).size();
for (size_t sourceStart = 0; sourceStart < size; ++sourceStart) {
for (size_t sourceEnd = sourceStart; sourceEnd < size; ++sourceEnd) {
const ConsistentPhrases::Coll &cps = allCPS.GetColl(sourceStart, sourceEnd);
size_t size = m_alignedSentence.GetPhrase(Moses::Input).size();
for (size_t sourceStart = 0; sourceStart < size; ++sourceStart) {
for (size_t sourceEnd = sourceStart; sourceEnd < size; ++sourceEnd) {
const ConsistentPhrases::Coll &cps = allCPS.GetColl(sourceStart, sourceEnd);
ConsistentPhrases::Coll::const_iterator iter;
for (iter = cps.begin(); iter != cps.end(); ++iter) {
const ConsistentPhrase &cp = **iter;
CreateRules(cp, params);
}
}
}
ConsistentPhrases::Coll::const_iterator iter;
for (iter = cps.begin(); iter != cps.end(); ++iter) {
const ConsistentPhrase &cp = **iter;
CreateRules(cp, params);
}
}
}
}
// Tries to extend `rule` with one more non-terminal.
// Candidate source spans start at rule.GetNextSourcePosForNonTerm() and end
// no later than the rule's own end position (corners[1]). The span equal to
// the rule's full source range is skipped. Every remaining span's consistent
// phrases are handed to the (rule, cps, params) overload.
void Rules::Extend(const Rule &rule, const Parameter &params)
{
const ConsistentPhrases &allCPS = m_alignedSentence.GetConsistentPhrases();
int sourceMin = rule.GetNextSourcePosForNonTerm();
const ConsistentPhrases &allCPS = m_alignedSentence.GetConsistentPhrases();
int sourceMin = rule.GetNextSourcePosForNonTerm();
int ruleStart = rule.GetConsistentPhrase().corners[0];
int ruleEnd = rule.GetConsistentPhrase().corners[1];
int ruleStart = rule.GetConsistentPhrase().corners[0];
int ruleEnd = rule.GetConsistentPhrase().corners[1];
for (int sourceStart = sourceMin; sourceStart <= ruleEnd; ++sourceStart) {
for (int sourceEnd = sourceStart; sourceEnd <= ruleEnd; ++sourceEnd) {
if (sourceStart == ruleStart && sourceEnd == ruleEnd) {
// don't cover whole rule with 1 non-term
continue;
}
for (int sourceStart = sourceMin; sourceStart <= ruleEnd; ++sourceStart) {
for (int sourceEnd = sourceStart; sourceEnd <= ruleEnd; ++sourceEnd) {
if (sourceStart == ruleStart && sourceEnd == ruleEnd) {
// don't cover whole rule with 1 non-term
continue;
}
const ConsistentPhrases::Coll &cps = allCPS.GetColl(sourceStart, sourceEnd);
Extend(rule, cps, params);
}
}
const ConsistentPhrases::Coll &cps = allCPS.GetColl(sourceStart, sourceEnd);
Extend(rule, cps, params);
}
}
}
// Applies the (rule, cp, params) overload to every consistent phrase in cps.
void Rules::Extend(const Rule &rule, const ConsistentPhrases::Coll &cps, const Parameter &params)
{
ConsistentPhrases::Coll::const_iterator iter;
for (iter = cps.begin(); iter != cps.end(); ++iter) {
const ConsistentPhrase &cp = **iter;
Extend(rule, cp, params);
}
ConsistentPhrases::Coll::const_iterator iter;
for (iter = cps.begin(); iter != cps.end(); ++iter) {
const ConsistentPhrase &cp = **iter;
Extend(rule, cp, params);
}
}
// For each non-terminal label of consistent phrase `cp`, builds a new rule
// that is `rule` extended by that non-terminal. The new rule is prevalidated
// and its target side created. If it can recurse, it is extended further
// before its own validity is checked. A valid rule is stored in m_keepRules,
// which then holds the pointer; an invalid one is deleted immediately.
void Rules::Extend(const Rule &rule, const ConsistentPhrase &cp, const Parameter &params)
{
const ConsistentPhrase::NonTerms &nonTerms = cp.GetNonTerms();
for (size_t i = 0; i < nonTerms.size(); ++i) {
const NonTerm &nonTerm = nonTerms[i];
const ConsistentPhrase::NonTerms &nonTerms = cp.GetNonTerms();
for (size_t i = 0; i < nonTerms.size(); ++i) {
const NonTerm &nonTerm = nonTerms[i];
Rule *newRule = new Rule(rule, nonTerm);
newRule->Prevalidate(params);
newRule->CreateTarget(params);
Rule *newRule = new Rule(rule, nonTerm);
newRule->Prevalidate(params);
newRule->CreateTarget(params);
newRule->CreateProperties(params);
if (newRule->CanRecurse()) {
// recursively extend
Extend(*newRule, params);
}
if (newRule->CanRecurse()) {
// recursively extend
Extend(*newRule, params);
}
if (newRule->IsValid()) {
m_keepRules.insert(newRule);
} else {
delete newRule;
}
}
if (newRule->IsValid()) {
m_keepRules.insert(newRule);
}
else {
delete newRule;
}
}
}
// Returns a multi-line debug dump: the header "m_keepRules:" followed by
// Rule::Debug() of every rule in m_keepRules, one per line.
std::string Rules::Debug() const
{
stringstream out;
stringstream out;
std::set<Rule*>::const_iterator iter;
out << "m_keepRules:" << endl;
for (iter = m_keepRules.begin(); iter != m_keepRules.end(); ++iter) {
const Rule &rule = **iter;
out << rule.Debug() << endl;
}
std::set<Rule*>::const_iterator iter;
out << "m_keepRules:" << endl;
for (iter = m_keepRules.begin(); iter != m_keepRules.end(); ++iter) {
const Rule &rule = **iter;
out << rule.Debug() << endl;
}
return out.str();
return out.str();
}
// Writes every rule in m_mergeRules to out, one rule per line (each followed
// by endl). `forward` is passed through unchanged to Rule::Output.
void Rules::Output(std::ostream &out, bool forward, const Parameter &params) const
{
std::set<Rule*, CompareRules>::const_iterator iter;
for (iter = m_mergeRules.begin(); iter != m_mergeRules.end(); ++iter) {
const Rule &rule = **iter;
rule.Output(out, forward, params);
out << endl;
}
std::set<Rule*, CompareRules>::const_iterator iter;
for (iter = m_mergeRules.begin(); iter != m_mergeRules.end(); ++iter) {
const Rule &rule = **iter;
rule.Output(out, forward);
out << endl;
}
}
// Assigns a count to every kept rule and then merges the rules.
// With params.fractionalCounting set, counts come from CalcFractionalCount();
// otherwise every rule in m_keepRules gets a count of 1. MergeRules() is
// called afterwards in both cases.
void Rules::Consolidate(const Parameter &params)
{
if (params.fractionalCounting) {
CalcFractionalCount();
} else {
std::set<Rule*>::iterator iter;
for (iter = m_keepRules.begin(); iter != m_keepRules.end(); ++iter) {
Rule &rule = **iter;
rule.SetCount(1);
}
}
if (params.fractionalCounting) {
CalcFractionalCount();
}
else {
std::set<Rule*>::iterator iter;
for (iter = m_keepRules.begin(); iter != m_keepRules.end(); ++iter) {
Rule &rule = **iter;
rule.SetCount(1);
}
}
MergeRules(params);
MergeRules(params);
}
// Inserts every rule from m_keepRules into m_mergeRules, a set ordered by
// CompareRules. When an equivalent rule is already present, the new rule is
// not inserted; its count is added to the count of the rule already in the
// set. The pointers in m_keepRules are left untouched.
void Rules::MergeRules(const Parameter &params)
{
typedef std::set<Rule*, CompareRules> MergeRules;
typedef std::set<Rule*, CompareRules> MergeRules;
std::set<Rule*>::const_iterator iterOrig;
for (iterOrig = m_keepRules.begin(); iterOrig != m_keepRules.end(); ++iterOrig) {
Rule *origRule = *iterOrig;
std::set<Rule*>::const_iterator iterOrig;
for (iterOrig = m_keepRules.begin(); iterOrig != m_keepRules.end(); ++iterOrig) {
Rule *origRule = *iterOrig;
pair<MergeRules::iterator, bool> inserted = m_mergeRules.insert(origRule);
if (!inserted.second) {
// already there, just add count
Rule &rule = **inserted.first;
float newCount = rule.GetCount() + origRule->GetCount();
rule.SetCount(newCount);
}
}
pair<MergeRules::iterator, bool> inserted = m_mergeRules.insert(origRule);
if (!inserted.second) {
// already there, just add count
Rule &rule = **inserted.first;
float newCount = rule.GetCount() + origRule->GetCount();
rule.SetCount(newCount);
}
}
}
void Rules::CalcFractionalCount()
@ -201,22 +205,22 @@ void Rules::CalcFractionalCount()
// sort by source AND target ranges
std::set<Rule*>::const_iterator iter;
for (iter = m_keepRules.begin(); iter != m_keepRules.end(); ++iter) {
Rule *rule = *iter;
const ConsistentPhrase &cp = rule->GetConsistentPhrase();
RuleColl &ruleColl = allRules[&cp];
ruleColl.insert(rule);
Rule *rule = *iter;
const ConsistentPhrase &cp = rule->GetConsistentPhrase();
RuleColl &ruleColl = allRules[&cp];
ruleColl.insert(rule);
}
// fractional count
RuleByConsistentPhrase::iterator iterOuter;
for (iterOuter = allRules.begin(); iterOuter != allRules.end(); ++iterOuter) {
RuleColl &rules = iterOuter->second;
RuleColl &rules = iterOuter->second;
RuleColl::iterator iterInner;
for (iterInner = rules.begin(); iterInner != rules.end(); ++iterInner) {
Rule &rule = **iterInner;
rule.SetCount(1.0f / (float) rules.size());
}
RuleColl::iterator iterInner;
for (iterInner = rules.begin(); iterInner != rules.end(); ++iterInner) {
Rule &rule = **iterInner;
rule.SetCount(1.0f / (float) rules.size());
}
}
}

View File

@ -18,55 +18,59 @@ class AlignedSentence;
class Parameter;
// Ordering predicate over Rule pointers (compares the pointed-to rules, not
// the pointer values). Keys are tried in this order, and the first one that
// differs decides the result:
//   1. the Moses::Input phrase   (Phrase::Compare)
//   2. the Moses::Output phrase  (Phrase::Compare)
//   3. GetAlignments()
//   4. GetLHS().GetString()
//   5. GetProperties()           (present in only one of the two variants below)
// Returns false when all keys are equal.
// NOTE(review): each statement appears twice; this looks like the before/after
// sides of a diff shown together. Confirm whether the GetProperties tie-break
// belongs in the current version.
// NOTE(review): operator() is not const-qualified. Ordered containers may need
// to invoke the comparator through a const object -- consider adding `const`.
struct CompareRules {
bool operator()(const Rule *a, const Rule *b) {
int compare;
bool operator()(const Rule *a, const Rule *b)
{
int compare;
compare = a->GetPhrase(Moses::Input).Compare(b->GetPhrase(Moses::Input));
if (compare) return compare < 0;
compare = a->GetPhrase(Moses::Input).Compare(b->GetPhrase(Moses::Input));
if (compare) return compare < 0;
compare = a->GetPhrase(Moses::Output).Compare(b->GetPhrase(Moses::Output));
if (compare) return compare < 0;
compare = a->GetPhrase(Moses::Output).Compare(b->GetPhrase(Moses::Output));
if (compare) return compare < 0;
if (a->GetAlignments() != b->GetAlignments()) {
return a->GetAlignments() < b->GetAlignments();
}
if (a->GetAlignments() != b->GetAlignments()) {
return a->GetAlignments() < b->GetAlignments();
}
if (a->GetLHS().GetString() != b->GetLHS().GetString()) {
return a->GetLHS().GetString() < b->GetLHS().GetString();
}
if (a->GetLHS().GetString() != b->GetLHS().GetString()) {
return a->GetLHS().GetString() < b->GetLHS().GetString();
}
return false;
}
if (a->GetProperties() != b->GetProperties()) {
return a->GetProperties() < b->GetProperties();
}
return false;
}
};
// Holds the rules built for one AlignedSentence (kept by reference).
// Public entry points: Extend(), Consolidate(), Debug() and Output().
// NOTE(review): every declaration below appears twice; this looks like the
// before/after sides of a whitespace-only diff shown together.
class Rules
{
class Rules {
public:
Rules(const AlignedSentence &alignedSentence);
virtual ~Rules();
void Extend(const Parameter &params);
void Consolidate(const Parameter &params);
Rules(const AlignedSentence &alignedSentence);
virtual ~Rules();
void Extend(const Parameter &params);
void Consolidate(const Parameter &params);
// Debug() returns a string; Output() writes to the supplied stream.
std::string Debug() const;
void Output(std::ostream &out, bool forward, const Parameter &params) const;
std::string Debug() const;
void Output(std::ostream &out, bool forward, const Parameter &params) const;
protected:
// Stored as a reference: the AlignedSentence must outlive this object.
const AlignedSentence &m_alignedSentence;
// m_keepRules uses the default ordering for Rule* (i.e. by pointer value);
// m_mergeRules is ordered by CompareRules (i.e. by rule content).
std::set<Rule*> m_keepRules;
std::set<Rule*, CompareRules> m_mergeRules;
const AlignedSentence &m_alignedSentence;
std::set<Rule*> m_keepRules;
std::set<Rule*, CompareRules> m_mergeRules;
// Overloads of Extend taking progressively narrower inputs: a rule, a rule
// plus a collection of consistent phrases, a rule plus a single phrase.
void Extend(const Rule &rule, const Parameter &params);
void Extend(const Rule &rule, const ConsistentPhrases::Coll &cps, const Parameter &params);
void Extend(const Rule &rule, const ConsistentPhrase &cp, const Parameter &params);
void Extend(const Rule &rule, const Parameter &params);
void Extend(const Rule &rule, const ConsistentPhrases::Coll &cps, const Parameter &params);
void Extend(const Rule &rule, const ConsistentPhrase &cp, const Parameter &params);
// create original rules
void CreateRules(const ConsistentPhrase &cp,
const Parameter &params);
void CreateRule(const NonTerm &nonTerm,
const Parameter &params);
// create original rules
void CreateRules(const ConsistentPhrase &cp,
const Parameter &params);
void CreateRule(const NonTerm &nonTerm,
const Parameter &params);
// Helpers called from Consolidate().
void MergeRules(const Parameter &params);
void CalcFractionalCount();
void MergeRules(const Parameter &params);
void CalcFractionalCount();
};

View File

@ -7,40 +7,41 @@ using namespace std;
// Records `label` for the span Range(startPos, endPos) in m_coll.
// When the span already has at least one label, params.multiLabel decides:
//   1     -> the existing label is discarded and `label` replaces it
//            (asserts that exactly one label was stored);
//   2     -> `label` is ignored and the existing labels are kept;
//   other -> `label` is appended alongside the existing labels.
// NOTE(review): each statement appears twice; this looks like the before/after
// sides of a whitespace-only diff shown together.
void SyntaxTree::Add(int startPos, int endPos, const std::string &label, const Parameter &params)
{
//cerr << "add " << label << " to " << "[" << startPos << "-" << endPos << "]" << endl;
//cerr << "add " << label << " to " << "[" << startPos << "-" << endPos << "]" << endl;
// presumably operator[] creates an empty Labels entry when the range is not
// yet in m_coll (true if Coll is a std::map) -- TODO confirm Coll's type.
Range range(startPos, endPos);
Labels &labels = m_coll[range];
Range range(startPos, endPos);
Labels &labels = m_coll[range];
bool add = true;
if (labels.size()) {
if (params.multiLabel == 1) {
// delete the label in collection and add new
assert(labels.size() == 1);
labels.clear();
} else if (params.multiLabel == 2) {
// ignore this label
add = false;
}
}
bool add = true;
if (labels.size()) {
if (params.multiLabel == 1) {
// delete the label in collection and add new
assert(labels.size() == 1);
labels.clear();
}
else if (params.multiLabel == 2) {
// ignore this label
add = false;
}
}
if (add) {
labels.push_back(label);
}
if (add) {
labels.push_back(label);
}
}
// Appends `label` to the label list of every entry currently in m_coll.
// m_defaultLabels is not touched here.
// NOTE(review): the loop appears twice with identical text; this looks like
// both sides of a whitespace-only diff shown together.
void SyntaxTree::AddToAll(const std::string &label)
{
Coll::iterator iter;
for (iter = m_coll.begin(); iter != m_coll.end(); ++iter) {
Labels &labels = iter->second;
labels.push_back(label);
}
Coll::iterator iter;
for (iter = m_coll.begin(); iter != m_coll.end(); ++iter) {
Labels &labels = iter->second;
labels.push_back(label);
}
}
// Looks up the labels stored for Range(startPos, endPos).
// Returns a reference to the stored labels, or to m_defaultLabels when the
// range has no entry in m_coll. The returned reference points into this
// object, so it is only valid while this SyntaxTree is alive.
// NOTE(review): the three statements appear twice with identical text; this
// looks like both sides of a whitespace-only diff shown together.
const SyntaxTree::Labels &SyntaxTree::Find(int startPos, int endPos) const
{
Coll::const_iterator iter;
iter = m_coll.find(Range(startPos, endPos));
return (iter == m_coll.end()) ? m_defaultLabels : iter->second;
Coll::const_iterator iter;
iter = m_coll.find(Range(startPos, endPos));
return (iter == m_coll.end()) ? m_defaultLabels : iter->second;
}

View File

@ -1,4 +1,4 @@
#pragma once
#pragma once
#include <vector>
#include <map>
@ -19,7 +19,7 @@ public:
const Labels &Find(int startPos, int endPos) const;
// Appends `label` to m_defaultLabels.
// NOTE(review): the push_back line appears twice; this looks like both sides
// of a whitespace-only diff shown together, not an intentional double append.
void SetHieroLabel(const std::string &label) {
m_defaultLabels.push_back(label);
m_defaultLabels.push_back(label);
}

View File

@ -11,45 +11,44 @@
using namespace std;
// Constructs a word from its position and its surface string.
// Only m_pos and m_str are initialised explicitly; the body is empty.
// NOTE(review): the initialiser list and the TODO line appear twice; this
// looks like both sides of a whitespace-only diff shown together.
Word::Word(int pos, const std::string &str)
:m_pos(pos)
,m_str(str)
:m_pos(pos)
,m_str(str)
{
// TODO Auto-generated constructor stub
// TODO Auto-generated constructor stub
}
// Destructor with an empty body: nothing is released explicitly here.
// NOTE(review): the signature and TODO line appear twice (brace placement
// differs); this looks like both sides of a diff shown together.
Word::~Word()
{
// TODO Auto-generated destructor stub
Word::~Word() {
// TODO Auto-generated destructor stub
}
// Records `other` as aligned to this word by inserting the pointer into
// m_alignment. Only the pointer is stored; nothing is copied.
// NOTE(review): the insert line appears twice; this looks like both sides of
// a whitespace-only diff shown together.
void Word::AddAlignment(const Word *other)
{
m_alignment.insert(other);
m_alignment.insert(other);
}
// Returns the positions (GetPos()) of all words in m_alignment as a set of
// ints, built fresh on every call and returned by value.
// NOTE(review): each statement appears twice with identical text; this looks
// like both sides of a whitespace-only diff shown together.
std::set<int> Word::GetAlignmentIndex() const
{
std::set<int> ret;
std::set<int> ret;
std::set<const Word *>::const_iterator iter;
for (iter = m_alignment.begin(); iter != m_alignment.end(); ++iter) {
const Word &otherWord = **iter;
int otherPos = otherWord.GetPos();
ret.insert(otherPos);
}
std::set<const Word *>::const_iterator iter;
for (iter = m_alignment.begin(); iter != m_alignment.end(); ++iter) {
const Word &otherWord = **iter;
int otherPos = otherWord.GetPos();
ret.insert(otherPos);
}
return ret;
return ret;
}
// Writes the word's string (m_str) to `out`, with no separator or newline.
// NOTE(review): the write appears twice; this looks like both sides of a
// whitespace-only diff shown together, not an intentional double write.
void Word::Output(std::ostream &out) const
{
out << m_str;
out << m_str;
}
// Returns a copy of the word's string (m_str) for debugging output.
// NOTE(review): the return line appears twice; this looks like both sides of
// a whitespace-only diff shown together.
std::string Word::Debug() const
{
return m_str;
return m_str;
}
int Word::CompareString(const Word &other) const

View File

@ -14,40 +14,36 @@
// A rule symbol that reports IsNonTerm() == false. Stores its position, its
// string, and the set of words it is aligned to.
// NOTE(review): every member below appears twice (the inline accessors differ
// only in brace layout); this looks like both sides of a formatting-only diff
// shown together.
class Word : public RuleSymbol
{
public:
// Copy constructor is declared but deliberately left undefined.
Word(const Word&); // do not implement
Word(int pos, const std::string &str);
virtual ~Word();
Word(const Word&); // do not implement
Word(int pos, const std::string &str);
virtual ~Word();
virtual bool IsNonTerm() const {
return false;
}
virtual bool IsNonTerm() const
{ return false; }
// Returns m_str by value (a copy).
std::string GetString() const {
return m_str;
}
std::string GetString() const
{ return m_str; }
// Overload taking a factor index; its definition is not in this class body.
std::string GetString(int factor) const;
std::string GetString(int factor) const;
int GetPos() const {
return m_pos;
}
int GetPos() const
{ return m_pos; }
void AddAlignment(const Word *other);
void AddAlignment(const Word *other);
// Returns a reference to the internal set; valid while this Word is alive.
const std::set<const Word *> &GetAlignment() const {
return m_alignment;
}
const std::set<const Word *> &GetAlignment() const
{ return m_alignment; }
std::set<int> GetAlignmentIndex() const;
std::set<int> GetAlignmentIndex() const;
void Output(std::ostream &out) const;
std::string Debug() const;
void Output(std::ostream &out) const;
std::string Debug() const;
int CompareString(const Word &other) const;
int CompareString(const Word &other) const;
protected:
int m_pos; // original position in sentence, NOT in lattice
std::string m_str;
// Pointers to aligned words; stored as plain pointers (see AddAlignment).
std::set<const Word *> m_alignment;
int m_pos; // original position in sentence, NOT in lattice
std::string m_str;
std::set<const Word *> m_alignment;
};

View File

@ -5,77 +5,73 @@
#include <zlib.h>
#include <cstring>
class gzfilebuf : public std::streambuf
{
class gzfilebuf : public std::streambuf {
public:
// Opens `filename` for reading ("rb") with gzopen and sets up an empty get
// area: all three pointers are equal, so the first read goes to underflow().
// The get area starts sizeof(int) bytes into _buff, leaving room for putback.
// NOTE(review): the result of gzopen is not checked here; zlib documents a
// NULL return on failure -- confirm callers handle an unopened file.
// NOTE(review): the signature and gzopen line appear twice with different
// brace layout; this looks like both sides of a diff shown together.
gzfilebuf(const char *filename) {
_gzf = gzopen(filename, "rb");
gzfilebuf(const char *filename)
{ _gzf = gzopen(filename, "rb");
setg (_buff+sizeof(int), // beginning of putback area
_buff+sizeof(int), // read position
_buff+sizeof(int)); // end position
}
// Closes the gz handle opened in the constructor.
// NOTE(review): the destructor appears twice (multi-line and one-line form);
// this looks like both sides of a formatting-only diff shown together.
~gzfilebuf() {
gzclose(_gzf);
}
~gzfilebuf() { gzclose(_gzf); }
protected:
// Writing is not supported by this read-only buffer.
// NOTE(review): a bare `throw;` with no exception currently being handled
// calls std::terminate rather than raising a catchable exception -- confirm
// that aborting is the intended reaction to a write attempt.
// NOTE(review): `throw;` appears twice; this looks like both sides of a
// whitespace-only diff shown together.
virtual int_type overflow (int_type c) {
throw;
throw;
}
// write multiple characters
// Not supported by this read-only buffer.
// NOTE(review): a bare `throw;` with no exception currently being handled
// calls std::terminate rather than raising a catchable exception.
// NOTE(review): `throw;` appears twice; this looks like both sides of a
// whitespace-only diff shown together.
virtual
std::streamsize xsputn (const char* s,
std::streamsize num) {
throw;
throw;
}
// Seeking is not supported by this buffer.
// NOTE(review): a bare `throw;` with no exception currently being handled
// calls std::terminate rather than raising a catchable exception.
// NOTE(review): the signature appears twice (brace layout differs); this
// looks like both sides of a formatting-only diff shown together.
virtual std::streampos seekpos ( std::streampos sp, std::ios_base::openmode which = std::ios_base::in | std::ios_base::out ) {
throw;
virtual std::streampos seekpos ( std::streampos sp, std::ios_base::openmode which = std::ios_base::in | std::ios_base::out ){ throw;
}
//read one character
// Refills the get area from the gz file when it is exhausted and returns the
// next character without consuming it. Steps:
//   1. if unread characters remain, return the current one;
//   2. keep up to sizeof(int) already-read characters as a putback area at
//      the front of _buff;
//   3. gzread up to _buffsize - sizeof(int) new bytes after the putback area;
//   4. return EOF when gzread yields <= 0 (error and end-of-file are not
//      distinguished here), otherwise reset the get pointers and return the
//      first new character.
// NOTE(review): the comments say "four" characters, but the code uses
// sizeof(int), which is only four on platforms with a 4-byte int.
// NOTE(review): the whole body appears twice with identical text; this looks
// like both sides of a whitespace-only diff shown together.
virtual int_type underflow () {
// is read position before end of _buff?
if (gptr() < egptr()) {
return traits_type::to_int_type(*gptr());
}
/* process size of putback area
* - use number of characters read
* - but at most four
*/
unsigned int numPutback = gptr() - eback();
if (numPutback > sizeof(int)) {
numPutback = sizeof(int);
}
/* copy up to four characters previously read into
* the putback _buff (area of first four characters)
*/
std::memmove (_buff+(sizeof(int)-numPutback), gptr()-numPutback,
numPutback);
// read new characters
int num = gzread(_gzf, _buff+sizeof(int), _buffsize-sizeof(int));
if (num <= 0) {
// ERROR or EOF
return EOF;
}
// reset _buff pointers
setg (_buff+(sizeof(int)-numPutback), // beginning of putback area
_buff+sizeof(int), // read position
_buff+sizeof(int)+num); // end of buffer
// return next character
return traits_type::to_int_type(*gptr());
if (gptr() < egptr()) {
return traits_type::to_int_type(*gptr());
}
/* process size of putback area
* - use number of characters read
* - but at most four
*/
unsigned int numPutback = gptr() - eback();
if (numPutback > sizeof(int)) {
numPutback = sizeof(int);
}
/* copy up to four characters previously read into
* the putback _buff (area of first four characters)
*/
std::memmove (_buff+(sizeof(int)-numPutback), gptr()-numPutback,
numPutback);
// read new characters
int num = gzread(_gzf, _buff+sizeof(int), _buffsize-sizeof(int));
if (num <= 0) {
// ERROR or EOF
return EOF;
}
// reset _buff pointers
setg (_buff+(sizeof(int)-numPutback), // beginning of putback area
_buff+sizeof(int), // read position
_buff+sizeof(int)+num); // end of buffer
// return next character
return traits_type::to_int_type(*gptr());
}
// Bulk read: passes the request straight to gzread and returns its result.
// NOTE(review): this does not first hand out characters that underflow() has
// already placed in _buff, so mixing single-character and bulk reads may skip
// or reorder data -- confirm callers use only one style.
// NOTE(review): the raw gzread result is returned as the character count,
// including any negative error value -- confirm this is intended.
std::streamsize xsgetn (char* s,
std::streamsize num) {
return gzread(_gzf,s,num);
}
private:
gzFile _gzf;
static const unsigned int _buffsize = 1024;

File diff suppressed because it is too large Load Diff