Merge branch 'master' of github.com:moses-smt/mosesdecoder

This commit is contained in:
Hieu Hoang 2015-06-04 16:34:19 +04:00
commit 0e11919ffb
29 changed files with 421 additions and 782 deletions

View File

@ -20,60 +20,23 @@
#pragma once
#include <map>
#include <sstream>
#include <string>
#include <vector>
namespace MosesTraining
{
namespace MosesTraining {
class SyntaxNode
{
protected:
int m_start, m_end;
std::string m_label;
std::vector< SyntaxNode* > m_children;
SyntaxNode* m_parent;
float m_pcfgScore;
public:
struct SyntaxNode {
typedef std::map<std::string, std::string> AttributeMap;
AttributeMap attributes;
SyntaxNode(const std::string &label_, int start_, int end_)
: label(label_)
, start(start_)
, end(end_) {
}
SyntaxNode( int startPos, int endPos, std::string label )
:m_start(startPos)
,m_end(endPos)
,m_label(label)
,m_parent(0)
,m_pcfgScore(0.0f) {
}
int GetStart() const {
return m_start;
}
int GetEnd() const {
return m_end;
}
std::string GetLabel() const {
return m_label;
}
float GetPcfgScore() const {
return m_pcfgScore;
}
void SetPcfgScore(float score) {
m_pcfgScore = score;
}
SyntaxNode *GetParent() {
return m_parent;
}
void SetParent(SyntaxNode *parent) {
m_parent = parent;
}
void AddChild(SyntaxNode* child) {
m_children.push_back(child);
}
const std::vector< SyntaxNode* > &GetChildren() const {
return m_children;
}
std::string label;
int start;
int end;
AttributeMap attributes;
};
} // namespace MosesTraining

View File

@ -33,7 +33,6 @@ SyntaxNodeCollection::~SyntaxNodeCollection()
void SyntaxNodeCollection::Clear()
{
m_top = 0;
// loop through all m_nodes, delete them
for(size_t i=0; i<m_nodes.size(); i++) {
delete m_nodes[i];
@ -45,113 +44,32 @@ void SyntaxNodeCollection::Clear()
SyntaxNode *SyntaxNodeCollection::AddNode(int startPos, int endPos,
const std::string &label)
{
SyntaxNode* newNode = new SyntaxNode( startPos, endPos, label );
SyntaxNode* newNode = new SyntaxNode(label, startPos, endPos);
m_nodes.push_back( newNode );
m_index[ startPos ][ endPos ].push_back( newNode );
m_size = std::max(endPos+1, m_size);
m_numWords = std::max(endPos+1, m_numWords);
return newNode;
}
// Computes the split points of every span of two or more words that has at
// least one node.
//
// Returns one SplitPoints vector per such span, ordered by increasing span
// length and, within a length, by increasing start position. Each vector
// begins with the span's start position, followed by the position just past
// the end of each child span that was found, scanning left to right.
//
// If no child node starts at the current uncovered position, the scan of
// that span stops early, so the split points need not reach the span's end.
ParentNodes SyntaxNodeCollection::Parse()
{
ParentNodes parents;
// looping through all spans of size >= 2
for( int length=2; length<=m_size; length++ ) {
for( int startPos = 0; startPos <= m_size-length; startPos++ ) {
if (HasNode( startPos, startPos+length-1 )) {
// processing one (parent) span
//std::cerr << "# " << startPos << "-" << (startPos+length-1) << ":";
SplitPoints splitPoints;
splitPoints.push_back( startPos );
//std::cerr << " " << startPos;
// first is 1 only while searching for the first child: it makes midPos
// start at length-1, so the parent span itself is never picked as a child.
int first = 1;
// number of words of the parent span already covered by children
int covered = 0;
int found_somehing = 1; // break loop if nothing found
while( covered < length && found_somehing ) {
// find largest covering subspan (child)
// starting at last covered position
found_somehing = 0;
// Candidates are tried from the longest to the shortest. There is no
// explicit break: once covered is set to midPos, the loop condition
// midPos>covered fails after the decrement, so the loop ends by itself.
for( int midPos=length-first; midPos>covered; midPos-- ) {
if( HasNode( startPos+covered, startPos+midPos-1 ) ) {
covered = midPos;
// record the position just past the end of the child span
splitPoints.push_back( startPos+covered );
// std::cerr << " " << ( startPos+covered );
first = 0;
found_somehing = 1;
}
}
}
// std::cerr << std::endl;
parents.push_back( splitPoints );
}
}
}
return parents;
}
bool SyntaxNodeCollection::HasNode( int startPos, int endPos ) const
{
return GetNodes( startPos, endPos).size() > 0;
}
const std::vector< SyntaxNode* >& SyntaxNodeCollection::GetNodes( int startPos, int endPos ) const
const std::vector< SyntaxNode* >& SyntaxNodeCollection::GetNodes(
int startPos, int endPos ) const
{
SyntaxTreeIndexIterator startIndex = m_index.find( startPos );
NodeIndex::const_iterator startIndex = m_index.find( startPos );
if (startIndex == m_index.end() )
return m_emptyNode;
SyntaxTreeIndexIterator2 endIndex = startIndex->second.find( endPos );
InnerNodeIndex::const_iterator endIndex = startIndex->second.find( endPos );
if (endIndex == startIndex->second.end())
return m_emptyNode;
return endIndex->second;
}
void SyntaxNodeCollection::ConnectNodes()
{
typedef SyntaxTreeIndex2::const_reverse_iterator InnerIterator;
SyntaxNode *prev = 0;
// Iterate over all start indices from lowest to highest.
for (SyntaxTreeIndexIterator p = m_index.begin(); p != m_index.end(); ++p) {
const SyntaxTreeIndex2 &inner = p->second;
// Iterate over all end indices from highest to lowest.
for (InnerIterator q = inner.rbegin(); q != inner.rend(); ++q) {
const std::vector<SyntaxNode*> &nodes = q->second;
// Iterate over all nodes that cover the same span in order of tree
// depth, top-most first.
for (std::vector<SyntaxNode*>::const_reverse_iterator r = nodes.rbegin();
r != nodes.rend(); ++r) {
SyntaxNode *node = *r;
if (!prev) {
// node is the root.
m_top = node;
node->SetParent(0);
} else if (prev->GetStart() == node->GetStart()) {
// prev is the parent of node.
assert(prev->GetEnd() >= node->GetEnd());
node->SetParent(prev);
prev->AddChild(node);
} else {
// prev is a descendant of node's parent. The lowest common
// ancestor of prev and node will be node's parent.
SyntaxNode *ancestor = prev->GetParent();
while (ancestor->GetEnd() < node->GetEnd()) {
ancestor = ancestor->GetParent();
}
assert(ancestor);
node->SetParent(ancestor);
ancestor->AddChild(node);
}
prev = node;
}
}
}
}
std::auto_ptr<SyntaxTree> SyntaxNodeCollection::ExtractTree()
{
std::map<SyntaxNode *, SyntaxTree *> nodeToTree;
@ -163,14 +81,15 @@ std::auto_ptr<SyntaxTree> SyntaxNodeCollection::ExtractTree()
}
// Connect the SyntaxTrees.
typedef SyntaxTreeIndex2::const_reverse_iterator InnerIterator;
typedef NodeIndex::const_iterator OuterIterator;
typedef InnerNodeIndex::const_reverse_iterator InnerIterator;
SyntaxTree *root = 0;
SyntaxNode *prevNode = 0;
SyntaxTree *prevTree = 0;
// Iterate over all start indices from lowest to highest.
for (SyntaxTreeIndexIterator p = m_index.begin(); p != m_index.end(); ++p) {
const SyntaxTreeIndex2 &inner = p->second;
for (OuterIterator p = m_index.begin(); p != m_index.end(); ++p) {
const InnerNodeIndex &inner = p->second;
// Iterate over all end indices from highest to lowest.
for (InnerIterator q = inner.rbegin(); q != inner.rend(); ++q) {
const std::vector<SyntaxNode*> &nodes = q->second;
@ -184,16 +103,16 @@ std::auto_ptr<SyntaxTree> SyntaxNodeCollection::ExtractTree()
// node is the root.
root = tree;
tree->parent() = 0;
} else if (prevNode->GetStart() == node->GetStart()) {
} else if (prevNode->start == node->start) {
// prevNode is the parent of node.
assert(prevNode->GetEnd() >= node->GetEnd());
assert(prevNode->end >= node->end);
tree->parent() = prevTree;
prevTree->children().push_back(tree);
} else {
// prevNode is a descendant of node's parent. The lowest common
// ancestor of prevNode and node will be node's parent.
SyntaxTree *ancestor = prevTree->parent();
while (ancestor->value().GetEnd() < tree->value().GetEnd()) {
while (ancestor->value().end < tree->value().end) {
ancestor = ancestor->parent();
}
assert(ancestor);

View File

@ -31,49 +31,47 @@
namespace MosesTraining
{
typedef std::vector< int > SplitPoints;
typedef std::vector< SplitPoints > ParentNodes;
/** A collection of SyntaxNodes organized by start and end position.
*
*/
class SyntaxNodeCollection
{
protected:
std::vector< SyntaxNode* > m_nodes;
SyntaxNode* m_top;
typedef std::map< int, std::vector< SyntaxNode* > > SyntaxTreeIndex2;
typedef SyntaxTreeIndex2::const_iterator SyntaxTreeIndexIterator2;
typedef std::map< int, SyntaxTreeIndex2 > SyntaxTreeIndex;
typedef SyntaxTreeIndex::const_iterator SyntaxTreeIndexIterator;
SyntaxTreeIndex m_index;
int m_size;
std::vector< SyntaxNode* > m_emptyNode;
public:
SyntaxNodeCollection()
: m_top(0) // m_top doesn't get set unless ConnectNodes is called.
, m_size(0) {}
SyntaxNodeCollection() : m_numWords(0) {}
~SyntaxNodeCollection();
//! Construct and insert a new SyntaxNode.
SyntaxNode *AddNode( int startPos, int endPos, const std::string &label );
SyntaxNode *GetTop() {
return m_top;
}
ParentNodes Parse();
//! Return true iff there are one or more SyntaxNodes with the given span.
bool HasNode( int startPos, int endPos ) const;
//! Lookup the SyntaxNodes for a given span.
const std::vector< SyntaxNode* >& GetNodes( int startPos, int endPos ) const;
const std::vector< SyntaxNode* >& GetAllNodes() {
return m_nodes;
};
//! Get a vector of pointers to all SyntaxNodes (unordered).
const std::vector< SyntaxNode* >& GetAllNodes() { return m_nodes; };
size_t GetNumWords() const {
return m_size;
return m_numWords;
}
void ConnectNodes();
void Clear();
std::auto_ptr<SyntaxTree> ExtractTree();
private:
typedef std::map< int, std::vector< SyntaxNode* > > InnerNodeIndex;
typedef std::map< int, InnerNodeIndex > NodeIndex;
// Not copyable.
SyntaxNodeCollection(const SyntaxNodeCollection &);
SyntaxNodeCollection &operator=(const SyntaxNodeCollection &);
std::vector< SyntaxNode* > m_nodes;
NodeIndex m_index;
int m_numWords;
std::vector< SyntaxNode* > m_emptyNode;
};
} // namespace MosesTraining

View File

@ -398,10 +398,6 @@ bool ProcessAndStripXMLTags(string &line, SyntaxNodeCollection &nodeCollection,
string label = ParseXmlTagAttribute(tagContent,"label");
labelCollection.insert( label );
string pcfgString = ParseXmlTagAttribute(tagContent,"pcfg");
float pcfgScore = pcfgString == "" ? 0.0f
: std::atof(pcfgString.c_str());
// report what we have processed so far
if (0) {
cerr << "XML TAG NAME IS: '" << tagName << "'" << endl;
@ -409,7 +405,6 @@ bool ProcessAndStripXMLTags(string &line, SyntaxNodeCollection &nodeCollection,
cerr << "XML SPAN IS: " << startPos << "-" << (endPos-1) << endl;
}
SyntaxNode *node = nodeCollection.AddNode( startPos, endPos-1, label );
node->SetPcfgScore(pcfgScore);
ParseXmlTagAttributes(tagContent, node->attributes);
}
}
@ -424,7 +419,7 @@ bool ProcessAndStripXMLTags(string &line, SyntaxNodeCollection &nodeCollection,
const vector< SyntaxNode* >& topNodes = nodeCollection.GetNodes( 0, wordPos-1 );
for( vector< SyntaxNode* >::const_iterator node = topNodes.begin(); node != topNodes.end(); node++ ) {
SyntaxNode *n = *node;
const string &label = n->GetLabel();
const string &label = n->label;
if (topLabelCollection.find( label ) == topLabelCollection.end())
topLabelCollection[ label ] = 0;
topLabelCollection[ label ]++;

View File

@ -21,6 +21,7 @@
#include <algorithm>
#include <cassert>
#include <cstdlib>
#include <memory>
#include <stack>
@ -213,7 +214,7 @@ Node *AlignmentGraph::CopyParseTree(const SyntaxTree *root)
{
NodeType nodeType = (root->IsLeaf()) ? TARGET : TREE;
std::auto_ptr<Node> n(new Node(root->value().GetLabel(), nodeType));
std::auto_ptr<Node> n(new Node(root->value().label, nodeType));
if (nodeType == TREE) {
float score = 0.0f;

View File

@ -119,14 +119,6 @@ int ExtractGHKM::Main(int argc, char *argv[])
OpenOutputFileOrDie(options.unknownWordSoftMatchesFile, unknownWordSoftMatchesStream);
}
// Target label sets for producing glue grammar.
std::set<std::string> targetLabelSet;
std::map<std::string, int> targetTopLabelSet;
// Source label sets for producing glue grammar.
std::set<std::string> sourceLabelSet;
std::map<std::string, int> sourceTopLabelSet;
// Word count statistics for producing unknown word labels.
std::map<std::string, int> targetWordCount;
std::map<std::string, std::string> targetWordLabel;
@ -139,8 +131,8 @@ int ExtractGHKM::Main(int argc, char *argv[])
std::string sourceLine;
std::string alignmentLine;
Alignment alignment;
Syntax::XmlTreeParser targetXmlTreeParser(targetLabelSet, targetTopLabelSet);
Syntax::XmlTreeParser sourceXmlTreeParser(sourceLabelSet, sourceTopLabelSet);
Syntax::XmlTreeParser targetXmlTreeParser;
Syntax::XmlTreeParser sourceXmlTreeParser;
ScfgRuleWriter scfgWriter(fwdExtractStream, invExtractStream, options);
StsgRuleWriter stsgWriter(fwdExtractStream, invExtractStream, options);
size_t lineNum = options.sentenceOffset;
@ -194,7 +186,7 @@ int ExtractGHKM::Main(int argc, char *argv[])
}
Error(oss.str());
}
sourceTokens = sourceXmlTreeParser.GetWords();
sourceTokens = sourceXmlTreeParser.words();
}
// Read word alignments.
@ -240,7 +232,7 @@ int ExtractGHKM::Main(int argc, char *argv[])
// Initialize phrase orientation scoring object
PhraseOrientation phraseOrientation(sourceTokens.size(),
targetXmlTreeParser.GetWords().size(), alignment);
targetXmlTreeParser.words().size(), alignment);
// Write the rules, subject to scope pruning.
const std::vector<Node *> &targetNodes = graph.GetTargetNodes();
@ -272,7 +264,7 @@ int ExtractGHKM::Main(int argc, char *argv[])
// SCFG output.
ScfgRule *r = 0;
if (options.sourceLabels) {
r = new ScfgRule(**q, &sourceXmlTreeParser.GetNodeCollection());
r = new ScfgRule(**q, &sourceXmlTreeParser.node_collection());
} else {
r = new ScfgRule(**q);
}
@ -315,14 +307,14 @@ int ExtractGHKM::Main(int argc, char *argv[])
std::map<std::string,size_t> sourceLabels;
if (options.sourceLabels && !options.sourceLabelSetFile.empty()) {
sourceLabelSet.insert("XLHS"); // non-matching label (left-hand side)
sourceLabelSet.insert("XRHS"); // non-matching label (right-hand side)
sourceLabelSet.insert("TOPLABEL"); // as used in the glue grammar
sourceLabelSet.insert("SOMELABEL"); // as used in the glue grammar
std::set<std::string> extendedLabelSet = sourceXmlTreeParser.label_set();
extendedLabelSet.insert("XLHS"); // non-matching label (left-hand side)
extendedLabelSet.insert("XRHS"); // non-matching label (right-hand side)
extendedLabelSet.insert("TOPLABEL"); // as used in the glue grammar
extendedLabelSet.insert("SOMELABEL"); // as used in the glue grammar
size_t index = 0;
for (std::set<std::string>::const_iterator iter=sourceLabelSet.begin();
iter!=sourceLabelSet.end(); ++iter, ++index) {
for (std::set<std::string>::const_iterator iter=extendedLabelSet.begin();
iter!=extendedLabelSet.end(); ++iter, ++index) {
sourceLabels.insert(std::pair<std::string,size_t>(*iter,index));
}
WriteSourceLabelSet(sourceLabels, sourceLabelSetStream);
@ -332,14 +324,18 @@ int ExtractGHKM::Main(int argc, char *argv[])
std::map<std::string, int> strippedTargetTopLabelSet;
if (options.stripBitParLabels &&
(!options.glueGrammarFile.empty() || !options.unknownWordSoftMatchesFile.empty())) {
StripBitParLabels(targetLabelSet, targetTopLabelSet, strippedTargetLabelSet, strippedTargetTopLabelSet);
StripBitParLabels(targetXmlTreeParser.label_set(),
targetXmlTreeParser.top_label_set(),
strippedTargetLabelSet, strippedTargetTopLabelSet);
}
if (!options.glueGrammarFile.empty()) {
if (options.stripBitParLabels) {
WriteGlueGrammar(strippedTargetLabelSet, strippedTargetTopLabelSet, sourceLabels, options, glueGrammarStream);
} else {
WriteGlueGrammar(targetLabelSet, targetTopLabelSet, sourceLabels, options, glueGrammarStream);
WriteGlueGrammar(targetXmlTreeParser.label_set(),
targetXmlTreeParser.top_label_set(),
sourceLabels, options, glueGrammarStream);
}
}
@ -355,7 +351,8 @@ int ExtractGHKM::Main(int argc, char *argv[])
if (options.stripBitParLabels) {
WriteUnknownWordSoftMatches(strippedTargetLabelSet, unknownWordSoftMatchesStream);
} else {
WriteUnknownWordSoftMatches(targetLabelSet, unknownWordSoftMatchesStream);
WriteUnknownWordSoftMatches(targetXmlTreeParser.label_set(),
unknownWordSoftMatchesStream);
}
}
@ -816,7 +813,7 @@ void ExtractGHKM::CollectWordLabelCounts(
for (SyntaxTree::ConstLeafIterator p(root);
p != SyntaxTree::ConstLeafIterator(); ++p) {
const SyntaxTree &leaf = *p;
const std::string &word = leaf.value().GetLabel();
const std::string &word = leaf.value().label;
const SyntaxTree *ancestor = leaf.parent();
// If unary rule elimination is enabled and this word is at the end of a
// chain of unary rewrites, e.g.
@ -828,7 +825,7 @@ void ExtractGHKM::CollectWordLabelCounts(
ancestor->parent()->children().size() == 1) {
ancestor = ancestor->parent();
}
const std::string &label = ancestor->value().GetLabel();
const std::string &label = ancestor->value().label;
++wordCount[word];
wordLabel[word] = label;
}
@ -840,7 +837,7 @@ std::vector<std::string> ExtractGHKM::ReadTokens(const SyntaxTree &root) const
for (SyntaxTree::ConstLeafIterator p(root);
p != SyntaxTree::ConstLeafIterator(); ++p) {
const SyntaxTree &leaf = *p;
const std::string &word = leaf.value().GetLabel();
const std::string &word = leaf.value().label;
tokens.push_back(word);
}
return tokens;

View File

@ -144,7 +144,7 @@ void ScfgRule::PushSourceLabel(const SyntaxNodeCollection *sourceNodeCollection,
sourceNodeCollection->GetNodes(span.first,span.second);
if (!sourceLabels.empty()) {
// store the topmost matching label from the source syntax tree
m_sourceLabels.push_back(sourceLabels.back()->GetLabel());
m_sourceLabels.push_back(sourceLabels.back()->label);
}
} else {
// no matching source-side syntactic constituent: store nonMatchingLabel

View File

@ -110,6 +110,8 @@ void collectWordLabelCounts(SentenceAlignmentWithSyntax &sentence );
void writeGlueGrammar(const string &, RuleExtractionOptions &options, set< string > &targetLabelCollection, map< string, int > &targetTopLabelCollection);
void writeUnknownWordLabel(const string &);
double getPcfgScore(const SyntaxNode &);
int main(int argc, char* argv[])
{
@ -505,7 +507,7 @@ void ExtractTask::preprocessSourceHieroPhrase( int startT, int endT, int startS,
int labelI = labelIndex[ 2+holeCount+holeTotal ];
string label = m_options.sourceSyntax ?
m_sentence.sourceTree.GetNodes(currPos,hole.GetEnd(0))[ labelI ]->GetLabel() : "X";
m_sentence.sourceTree.GetNodes(currPos,hole.GetEnd(0))[ labelI ]->label : "X";
hole.SetLabel(label, 0);
currPos = hole.GetEnd(0);
@ -548,7 +550,7 @@ string ExtractTask::saveTargetHieroPhrase( int startT, int endT, int startS, int
int labelI = labelIndex[ 2+holeCount ];
string targetLabel;
if (m_options.targetSyntax) {
targetLabel = m_sentence.targetTree.GetNodes(currPos,hole.GetEnd(1))[labelI]->GetLabel();
targetLabel = m_sentence.targetTree.GetNodes(currPos,hole.GetEnd(1))[labelI]->label;
} else if (m_options.boundaryRules && (startS == 0 || endS == countS - 1)) {
targetLabel = "S";
} else {
@ -564,8 +566,7 @@ string ExtractTask::saveTargetHieroPhrase( int startT, int endT, int startS, int
}
if (m_options.pcfgScore) {
double score = m_sentence.targetTree.GetNodes(currPos,hole.GetEnd(1))[labelI]->GetPcfgScore();
logPCFGScore -= score;
logPCFGScore -= getPcfgScore(*m_sentence.targetTree.GetNodes(currPos,hole.GetEnd(1))[labelI]);
}
currPos = hole.GetEnd(1);
@ -674,7 +675,7 @@ void ExtractTask::saveHieroPhrase( int startT, int endT, int startS, int endS
// phrase labels
string targetLabel;
if (m_options.targetSyntax) {
targetLabel = m_sentence.targetTree.GetNodes(startT,endT)[labelIndex[0] ]->GetLabel();
targetLabel = m_sentence.targetTree.GetNodes(startT,endT)[labelIndex[0] ]->label;
} else if (m_options.boundaryRules && (startS == 0 || endS == countS - 1)) {
targetLabel = "S";
} else {
@ -682,14 +683,14 @@ void ExtractTask::saveHieroPhrase( int startT, int endT, int startS, int endS
}
string sourceLabel = m_options.sourceSyntax ?
m_sentence.sourceTree.GetNodes(startS,endS)[ labelIndex[1] ]->GetLabel() : "X";
m_sentence.sourceTree.GetNodes(startS,endS)[ labelIndex[1] ]->label : "X";
// create non-terms on the source side
preprocessSourceHieroPhrase(startT, endT, startS, endS, indexS, holeColl, labelIndex);
// target
if (m_options.pcfgScore) {
double logPCFGScore = m_sentence.targetTree.GetNodes(startT,endT)[labelIndex[0]]->GetPcfgScore();
double logPCFGScore = getPcfgScore(*m_sentence.targetTree.GetNodes(startT,endT)[labelIndex[0]]);
rule.target = saveTargetHieroPhrase(startT, endT, startS, endS, indexT, holeColl, labelIndex, logPCFGScore, countS)
+ " [" + targetLabel + "]";
rule.pcfgScore = std::exp(logPCFGScore);
@ -946,13 +947,13 @@ void ExtractTask::addRule( int startT, int endT, int startS, int endS, int count
// phrase labels
string targetLabel,sourceLabel;
if (m_options.targetSyntax && m_options.conditionOnTargetLhs) {
sourceLabel = targetLabel = m_sentence.targetTree.GetNodes(startT,endT)[0]->GetLabel();
sourceLabel = targetLabel = m_sentence.targetTree.GetNodes(startT,endT)[0]->label;
} else {
sourceLabel = m_options.sourceSyntax ?
m_sentence.sourceTree.GetNodes(startS,endS)[0]->GetLabel() : "X";
m_sentence.sourceTree.GetNodes(startS,endS)[0]->label : "X";
if (m_options.targetSyntax) {
targetLabel = m_sentence.targetTree.GetNodes(startT,endT)[0]->GetLabel();
targetLabel = m_sentence.targetTree.GetNodes(startT,endT)[0]->label;
} else if (m_options.boundaryRules && (startS == 0 || endS == countS - 1)) {
targetLabel = "S";
} else {
@ -973,7 +974,7 @@ void ExtractTask::addRule( int startT, int endT, int startS, int endS, int count
rule.target += "[" + targetLabel + "]";
if (m_options.pcfgScore) {
double logPCFGScore = m_sentence.targetTree.GetNodes(startT,endT)[0]->GetPcfgScore();
double logPCFGScore = getPcfgScore(*m_sentence.targetTree.GetNodes(startT,endT)[0]);
rule.pcfgScore = std::exp(logPCFGScore);
}
@ -1165,7 +1166,7 @@ void collectWordLabelCounts( SentenceAlignmentWithSyntax &sentence )
const vector< SyntaxNode* >& labels = sentence.targetTree.GetNodes(ti,ti);
if (labels.size() > 0) {
wordCount[ word ]++;
wordLabel[ word ] = labels[0]->GetLabel();
wordLabel[ word ] = labels[0]->label;
}
}
}
@ -1194,3 +1195,13 @@ void writeUnknownWordLabel(const string & fileName)
outFile.close();
}
double getPcfgScore(const SyntaxNode &node)
{
double score = 0.0f;
SyntaxNode::AttributeMap::const_iterator p = node.attributes.find("pcfg");
if (p != node.attributes.end()) {
score = std::atof(p->second.c_str());
}
return score;
}

View File

@ -126,9 +126,7 @@ void FilterRuleTable::ReadTestSet(
void FilterRuleTable::ReadTestSet(
std::istream &input, std::vector<boost::shared_ptr<SyntaxTree> > &sentences)
{
std::set<std::string> labelSet;
std::map<std::string, int> topLabelSet;
XmlTreeParser parser(labelSet, topLabelSet);
XmlTreeParser parser;
int lineNum = 0;
std::string line;
while (std::getline(input, line)) {

View File

@ -27,7 +27,7 @@ TreeTsgFilter::TreeTsgFilter(
TreeTsgFilter::IdTree *TreeTsgFilter::SyntaxTreeToIdTree(const SyntaxTree &s)
{
IdTree *t = new IdTree(m_testVocab.Insert(s.value().GetLabel()));
IdTree *t = new IdTree(m_testVocab.Insert(s.value().label));
const std::vector<SyntaxTree*> &sChildren = s.children();
std::vector<IdTree*> &tChildren = t->children();
tChildren.reserve(sChildren.size());

View File

@ -1,79 +0,0 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2012 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#pragma once
#ifndef PCFG_PCFG_TREE_H_
#define PCFG_PCFG_TREE_H_
#include <string>
#include "syntax_tree.h"
#include "xml_tree_writer.h"
namespace MosesTraining {
namespace Syntax {
namespace PCFG {
// Syntax tree node with a string label and a score.
//
// DerivedType is the concrete tree class; it is forwarded to SyntaxTreeBase.
// The score starts at 0.0 and can be changed with set_score().
template<typename DerivedType>
class PcfgTreeBase : public SyntaxTreeBase<std::string, DerivedType> {
 public:
  typedef std::string LabelType;
  typedef SyntaxTreeBase<LabelType, DerivedType> BaseType;

  // Creates a node with the given label and a score of 0.0.
  PcfgTreeBase(const LabelType &label)
      : BaseType(label)
      , score_(0.0) {
  }

  // Returns the score currently stored on this node.
  double score() const {
    return score_;
  }

  // Replaces the score stored on this node.
  void set_score(double newScore) {
    score_ = newScore;
  }

 private:
  double score_;
};
// Concrete tree type: a PcfgTreeBase whose derived type is PcfgTree itself.
class PcfgTree : public PcfgTreeBase<PcfgTree> {
 public:
  typedef PcfgTreeBase<PcfgTree> BaseType;

  // Creates a node with the given label; all other state comes from the
  // base class constructor.
  PcfgTree(const BaseType::LabelType &label)
      : BaseType(label) {
  }
};
// XmlOutputHandler specialisation for PcfgTree: supplies the label and the
// attributes of a PcfgTree node.
template<>
class XmlOutputHandler<PcfgTree> {
 public:
  typedef std::map<std::string, std::string> AttributeMap;

  // Copies the node's label into `label`.
  void GetLabel(const PcfgTree &tree, std::string &label) const {
    label = tree.label();
  }

  // Empties `attribute_map`, then adds a "pcfg" entry holding the node's
  // score formatted with default stream settings. Nodes whose score equals
  // 0.0 get no entry.
  void GetAttributes(const PcfgTree &tree, AttributeMap &attribute_map) const {
    attribute_map.clear();
    const double nodeScore = tree.score();
    if (nodeScore == 0.0) {
      return;
    }
    std::ostringstream formatted;
    formatted << nodeScore;
    attribute_map["pcfg"] = formatted.str();
  }
};
} // namespace PCFG
} // namespace Syntax
} // namespace MosesTraining
#endif

View File

@ -1,93 +0,0 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2012 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#pragma once
#ifndef PCFG_SYNTAX_TREE_H_
#define PCFG_SYNTAX_TREE_H_
#include <cassert>
#include <vector>
namespace MosesTraining {
namespace Syntax {
namespace PCFG {
// Base class for SyntaxTree, AgreementTree, and friends.
//
// A node holds a label of type T, a pointer to its parent and a vector of
// pointers to its children. DerivedType is the concrete tree class (CRTP),
// so parent() and children() hand out pointers of the derived type.
//
// Ownership: the destructor deletes every child, so a node owns its whole
// subtree. The parent pointer is never deleted. Because of that ownership,
// nodes are not copyable: a copy would share the child pointers and both
// copies would delete them.
template<typename T, typename DerivedType>
class SyntaxTreeBase {
 public:
  // Creates a leaf node (no children, no parent) with the given label.
  SyntaxTreeBase(const T &label)
      : label_(label)
      , children_()
      , parent_(0) {}

  // Creates a parentless node that takes ownership of `children`. The
  // children's own parent pointers are not modified.
  SyntaxTreeBase(const T &label, const std::vector<DerivedType *> &children)
      : label_(label)
      , children_(children)
      , parent_(0) {}

  // Deletes all children (and, through them, the whole subtree).
  virtual ~SyntaxTreeBase();

  const T &label() const { return label_; }

  const DerivedType *parent() const { return parent_; }
  DerivedType *parent() { return parent_; }

  const std::vector<DerivedType *> &children() const { return children_; }
  std::vector<DerivedType *> &children() { return children_; }

  void set_label(const T &label) { label_ = label; }
  void set_parent(DerivedType *parent) { parent_ = parent; }
  // Replaces the child list. The previous children are not deleted.
  void set_children(const std::vector<DerivedType *> &c) { children_ = c; }

  // True iff the node has no children.
  bool IsLeaf() const { return children_.empty(); }

  // True iff the node has exactly one child and that child is a leaf.
  bool IsPreterminal() const {
    return children_.size() == 1 && children_[0]->IsLeaf();
  }

  // Appends `child` and takes ownership of it. The child's parent pointer is
  // not updated here; call set_parent() on the child separately.
  void AddChild(DerivedType *child) { children_.push_back(child); }

 private:
  // Not copyable (declared, never defined): see the ownership note above.
  SyntaxTreeBase(const SyntaxTreeBase &);
  SyntaxTreeBase &operator=(const SyntaxTreeBase &);

  T label_;
  std::vector<DerivedType *> children_;
  DerivedType *parent_;
};

// Plain syntax tree whose nodes carry nothing but a label of type T.
template<typename T>
class SyntaxTree : public SyntaxTreeBase<T, SyntaxTree<T> > {
 public:
  typedef SyntaxTreeBase<T, SyntaxTree<T> > BaseType;

  SyntaxTree(const T &label) : BaseType(label) {}

  SyntaxTree(const T &label, const std::vector<SyntaxTree *> &children)
      : BaseType(label, children) {}
};

template<typename T, typename DerivedType>
SyntaxTreeBase<T, DerivedType>::~SyntaxTreeBase() {
  for (std::size_t i = 0; i < children_.size(); ++i) {
    delete children_[i];
  }
}
} // namespace PCFG
} // namespace Syntax
} // namespace MosesTraining
#endif

View File

@ -24,7 +24,6 @@
#include <string>
#include "syntax-common/numbered_set.h"
#include "syntax_tree.h"
namespace MosesTraining {
namespace Syntax {

View File

@ -1,89 +0,0 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2012 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#include "xml_tree_parser.h"
#include <cassert>
#include <vector>
#include "tables-core.h"
#include "XmlException.h"
#include "XmlTree.h"
#include "util/tokenize.hh"
#include "syntax-common/exception.h"
namespace MosesTraining {
namespace Syntax {
namespace PCFG {
XmlTreeParser::XmlTreeParser() {
}
std::auto_ptr<PcfgTree> XmlTreeParser::Parse(const std::string &line) {
m_line = line;
m_tree.Clear();
try {
if (!ProcessAndStripXMLTags(m_line, m_tree, m_labelSet, m_topLabelSet)) {
throw Exception("");
}
} catch (const XmlException &e) {
throw Exception(e.getMsg());
}
m_tree.ConnectNodes();
SyntaxNode *root = m_tree.GetTop();
if (!root) {
// There is no XML tree.
return std::auto_ptr<PcfgTree>();
}
m_words = util::tokenize(m_line);
return ConvertTree(*root, m_words);
}
// Converts a SyntaxNode tree to a Moses::PCFG::PcfgTree.
std::auto_ptr<PcfgTree> XmlTreeParser::ConvertTree(
const SyntaxNode &tree,
const std::vector<std::string> &words) {
std::auto_ptr<PcfgTree> root(new PcfgTree(tree.GetLabel()));
const std::vector<SyntaxNode*> &children = tree.GetChildren();
if (children.empty()) {
if (tree.GetStart() != tree.GetEnd()) {
std::ostringstream msg;
msg << "leaf node covers multiple words (" << tree.GetStart()
<< "-" << tree.GetEnd() << "): this is currently unsupported";
throw Exception(msg.str());
}
std::auto_ptr<PcfgTree> leaf(new PcfgTree(words[tree.GetStart()]));
leaf->set_parent(root.get());
root->AddChild(leaf.release());
} else {
for (std::vector<SyntaxNode*>::const_iterator p = children.begin();
p != children.end(); ++p) {
assert(*p);
std::auto_ptr<PcfgTree> child = ConvertTree(**p, words);
child->set_parent(root.get());
root->AddChild(child.release());
}
}
return root;
}
} // namespace PCFG
} // namespace Syntax
} // namespace MosesTraining

View File

@ -1,59 +0,0 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2012 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#pragma once
#ifndef PCFG_XML_TREE_PARSER_H_
#define PCFG_XML_TREE_PARSER_H_
#include <map>
#include <memory>
#include <set>
#include <string>
#include <vector>
#include "pcfg_tree.h"
#include "SyntaxNode.h"
#include "SyntaxNodeCollection.h"
namespace MosesTraining {
namespace Syntax {
namespace PCFG {
// Parses a string in Moses' XML parse tree format and returns a PcfgTree
// object.
//
// The parser keeps the state of the most recent Parse() call in its members.
class XmlTreeParser {
public:
XmlTreeParser();
// Parses one line and returns the resulting tree as an owning pointer.
// NOTE(review): the definition is not part of this header; presumably a
// null pointer means "no tree" and failures are reported by exception --
// confirm against the .cpp file.
std::auto_ptr<PcfgTree> Parse(const std::string &);
private:
// Builds a PcfgTree from a SyntaxNode; the string vector presumably holds
// the sentence's words (TODO confirm against the definition).
std::auto_ptr<PcfgTree> ConvertTree(const MosesTraining::SyntaxNode &,
const std::vector<std::string> &);
// The roles below are inferred from the member names and types only;
// verify them against the implementation.
// Presumably the set of labels seen while parsing.
std::set<std::string> m_labelSet;
// Presumably a count per label for top-level nodes.
std::map<std::string, int> m_topLabelSet;
// Presumably the line currently being parsed.
std::string m_line;
// Node collection; presumably filled from the parsed XML.
MosesTraining::SyntaxNodeCollection m_tree;
// Presumably the words of the parsed line.
std::vector<std::string> m_words;
};
} // namespace PCFG
} // namespace Syntax
} // namespace MosesTraining
#endif

View File

@ -1,135 +0,0 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2012 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#pragma once
#ifndef PCFG_XML_TREE_WRITER_H_
#define PCFG_XML_TREE_WRITER_H_
#include <cassert>
#include <map>
#include <memory>
#include <ostream>
#include <vector>
#include <string>
#include "XmlTree.h"
#include "syntax_tree.h"
namespace MosesTraining {
namespace Syntax {
namespace PCFG {
// Customization point used by XmlTreeWriter to obtain the label and the XML
// attributes of a node of type InputTree.
//
// Both member functions are only declared here.
// NOTE(review): presumably each tree type supplies its own definitions
// (e.g. as specializations) elsewhere - confirm.
template<typename InputTree>
class XmlOutputHandler {
public:
// Attribute name -> attribute value.
typedef std::map<std::string, std::string> AttributeMap;
// Stores the node's label in the string argument.
void GetLabel(const InputTree &, std::string &) const;
// Stores the node's attributes in the map argument.
void GetAttributes(const InputTree &, AttributeMap &) const;
};
// Writes a tree of type InputTree to an output stream as nested <tree>
// elements. Labels and attributes are obtained through the
// XmlOutputHandler<InputTree> base class.
template<typename InputTree>
class XmlTreeWriter : public XmlOutputHandler<InputTree> {
public:
typedef XmlOutputHandler<InputTree> Base;
// Writes the given (non-leaf) tree to the given stream.
void Write(const InputTree &, std::ostream &) const;
private:
// Returns a copy of the argument with XML special characters escaped.
std::string Escape(const std::string &) const;
};
// Writes `tree` to `out` as a <tree label="..." ...> ... </tree> element.
//
// The label is escaped; attribute names and values are written as supplied by
// Base::GetAttributes. Leaf children are written as escaped labels, non-leaf
// children are written recursively. A newline (with flush) follows the closing
// tag only when the tree has no parent. `tree` must not be a leaf.
template<typename InputTree>
void XmlTreeWriter<InputTree>::Write(const InputTree &tree,
                                     std::ostream &out) const {
  typedef typename Base::AttributeMap::const_iterator AttributeIter;
  typedef typename std::vector<InputTree *>::const_iterator ChildIter;

  assert(!tree.IsLeaf());

  // Opening tag: the label first, then the handler-supplied attributes.
  std::string text;
  Base::GetLabel(tree, text);
  out << "<tree label=\"" << Escape(text) << "\"";

  typename Base::AttributeMap attributes;
  Base::GetAttributes(tree, attributes);
  for (AttributeIter attr = attributes.begin(); attr != attributes.end();
       ++attr) {
    out << " " << attr->first << "=\"" << attr->second << "\"";
  }
  out << ">";

  // Children, each preceded by a single space.
  const std::vector<InputTree *> &kids = tree.children();
  for (ChildIter it = kids.begin(); it != kids.end(); ++it) {
    InputTree &kid = **it;
    if (!kid.IsLeaf()) {
      out << " ";
      Write(kid, out);
      continue;
    }
    // The same string object is reused for every label lookup.
    Base::GetLabel(kid, text);
    out << " " << Escape(text);
  }

  // Closing tag; only the root terminates the line.
  out << " </tree>";
  if (tree.parent() == 0) {
    out << std::endl;
  }
}
// Returns a copy of `s` in which each of the characters < > [ ] | & ' " is
// replaced by its entity or numeric character reference; every other
// character is copied unchanged.
template<typename InputTree>
std::string XmlTreeWriter<InputTree>::Escape(const std::string &s) const {
  std::string escaped;
  escaped.reserve(s.size());
  for (std::string::const_iterator c = s.begin(); c != s.end(); ++c) {
    switch (*c) {
    case '<':
      escaped += "&lt;";
      break;
    case '>':
      escaped += "&gt;";
      break;
    case '[':
      escaped += "&#91;";
      break;
    case ']':
      escaped += "&#93;";
      break;
    case '|':
      escaped += "&#124;";
      break;
    case '&':
      escaped += "&amp;";
      break;
    case '\'':
      escaped += "&apos;";
      break;
    case '"':
      escaped += "&quot;";
      break;
    default:
      escaped += *c;
      break;
    }
  }
  return escaped;
}
} // namespace PCFG
} // namespace Syntax
} // namespace MosesTraining
#endif

View File

@ -1 +1 @@
exe pcfg-extract : [ glob *.cc ] ..//pcfg-common ../..//boost_program_options : <include>.. ;
exe pcfg-extract : [ glob *.cc ] ..//syntax-common ..//pcfg-common ../..//boost_program_options : <include>.. ;

View File

@ -19,20 +19,6 @@
#include "pcfg_extract.h"
#include "options.h"
#include "rule_collection.h"
#include "rule_extractor.h"
#include "syntax-common/exception.h"
#include "pcfg-common/pcfg.h"
#include "pcfg-common/pcfg_tree.h"
#include "pcfg-common/syntax_tree.h"
#include "pcfg-common/typedef.h"
#include "pcfg-common/xml_tree_parser.h"
#include <boost/program_options.hpp>
#include <cassert>
#include <cstdlib>
#include <fstream>
@ -43,6 +29,20 @@
#include <string>
#include <vector>
#include <boost/program_options.hpp>
#include "syntax-common/exception.h"
#include "syntax-common/xml_tree_parser.h"
#include "SyntaxTree.h"
#include "pcfg-common/pcfg.h"
#include "pcfg-common/typedef.h"
#include "options.h"
#include "rule_collection.h"
#include "rule_extractor.h"
namespace MosesTraining
{
namespace Syntax
@ -63,7 +63,7 @@ int PcfgExtract::Main(int argc, char *argv[])
XmlTreeParser parser;
std::string line;
std::size_t line_num = 0;
std::auto_ptr<PcfgTree> tree;
std::auto_ptr<MosesTraining::SyntaxTree> tree;
while (std::getline(std::cin, line)) {
++line_num;
try {

View File

@ -19,8 +19,6 @@
#include "rule_extractor.h"
#include "pcfg-common/pcfg_tree.h"
namespace MosesTraining
{
namespace Syntax
@ -33,21 +31,21 @@ RuleExtractor::RuleExtractor(Vocabulary &non_term_vocab)
{
}
void RuleExtractor::Extract(const PcfgTree &tree, RuleCollection &rc) const
void RuleExtractor::Extract(const SyntaxTree &tree, RuleCollection &rc) const
{
if (tree.IsPreterminal() || tree.IsLeaf()) {
if (tree.IsLeaf() || tree.children()[0]->IsLeaf()) {
return;
}
std::size_t lhs = non_term_vocab_.Insert(tree.label());
std::size_t lhs = non_term_vocab_.Insert(tree.value().label);
std::vector<std::size_t> rhs;
const std::vector<PcfgTree *> &children = tree.children();
const std::vector<SyntaxTree *> &children = tree.children();
rhs.reserve(children.size());
for (std::vector<PcfgTree *>::const_iterator p(children.begin());
for (std::vector<SyntaxTree *>::const_iterator p(children.begin());
p != children.end(); ++p) {
const PcfgTree &child = **p;
rhs.push_back(non_term_vocab_.Insert(child.label()));
const SyntaxTree &child = **p;
rhs.push_back(non_term_vocab_.Insert(child.value().label));
Extract(child, rc);
}
rc.Add(lhs, rhs);

View File

@ -21,6 +21,8 @@
#ifndef PCFG_EXTRACT_RULE_EXTRACTOR_H_
#define PCFG_EXTRACT_RULE_EXTRACTOR_H_
#include "SyntaxTree.h"
#include "pcfg-common/typedef.h"
#include "rule_collection.h"
@ -32,14 +34,12 @@ namespace Syntax
namespace PCFG
{
class PcfgTree;
// Extracts PCFG rules from syntax trees and adds them to a RuleCollection.
class RuleExtractor
{
public:
RuleExtractor(Vocabulary &);
void Extract(const PcfgTree &, RuleCollection &) const;
void Extract(const MosesTraining::SyntaxTree &, RuleCollection &) const;
private:
Vocabulary &non_term_vocab_;
};

View File

@ -33,13 +33,14 @@
#include <boost/program_options.hpp>
#include "SyntaxTree.h"
#include "syntax-common/exception.h"
#include "syntax-common/xml_tree_parser.h"
#include "syntax-common/xml_tree_writer.h"
#include "pcfg-common/pcfg.h"
#include "pcfg-common/pcfg_tree.h"
#include "pcfg-common/syntax_tree.h"
#include "pcfg-common/typedef.h"
#include "pcfg-common/xml_tree_parser.h"
namespace MosesTraining
{
@ -66,14 +67,14 @@ int PcfgScore::Main(int argc, char *argv[])
// Score corpus according to PCFG.
TreeScorer scorer(pcfg, non_term_vocab);
XmlTreeParser parser;
XmlTreeWriter<PcfgTree> writer;
XmlTreeWriter writer(std::cout);
std::string line;
std::size_t line_num = 0;
std::auto_ptr<PcfgTree> tree;
std::auto_ptr<SyntaxTree> tree;
while (std::getline(std::cin, line)) {
++line_num;
try {
tree = parser.Parse(line);
tree = parser.Parse(line, true);
} catch (Exception &e) {
std::ostringstream msg;
msg << "line " << line_num << ": " << e.msg();
@ -93,7 +94,7 @@ int PcfgScore::Main(int argc, char *argv[])
std::cout << line << std::endl;
continue;
}
writer.Write(*tree, std::cout);
writer.Write(*tree);
}
return 0;

View File

@ -20,6 +20,7 @@
#include "tree_scorer.h"
#include <cassert>
#include <sstream>
namespace MosesTraining
{
@ -34,30 +35,41 @@ TreeScorer::TreeScorer(const Pcfg &pcfg, const Vocabulary &non_term_vocab)
{
}
bool TreeScorer::Score(PcfgTree &root) const
bool TreeScorer::Score(SyntaxTree &root)
{
if (root.IsPreterminal() || root.IsLeaf()) {
scores_.clear();
ZeroScores(root);
if (!CalcScores(root)) {
return false;
}
SetAttributes(root);
return true;
}
bool TreeScorer::CalcScores(SyntaxTree &root)
{
if (root.IsLeaf() || root.children()[0]->IsLeaf()) {
return true;
}
const std::vector<PcfgTree *> &children = root.children();
const std::vector<SyntaxTree *> &children = root.children();
double log_prob = 0.0;
std::vector<std::size_t> key;
key.reserve(children.size()+1);
key.push_back(non_term_vocab_.Lookup(root.label()));
key.push_back(non_term_vocab_.Lookup(root.value().label));
for (std::vector<PcfgTree *>::const_iterator p(children.begin());
for (std::vector<SyntaxTree *>::const_iterator p(children.begin());
p != children.end(); ++p) {
PcfgTree *child = *p;
SyntaxTree *child = *p;
assert(!child->IsLeaf());
key.push_back(non_term_vocab_.Lookup(child->label()));
if (!Score(*child)) {
key.push_back(non_term_vocab_.Lookup(child->value().label));
if (!CalcScores(*child)) {
return false;
}
if (!child->IsPreterminal()) {
log_prob += child->score();
if (!child->children()[0]->IsLeaf()) {
log_prob += scores_[child];
}
}
double rule_score;
@ -66,10 +78,42 @@ bool TreeScorer::Score(PcfgTree &root) const
return false;
}
log_prob += rule_score;
root.set_score(log_prob);
scores_[&root] = log_prob;
return true;
}
// Copies the scores held in scores_ into the "pcfg" attribute of the nodes in
// the subtree rooted at `root`.
//
// Terminals and preterminals are left untouched (a preterminal has the
// implicit score 0.0), and the recursion stops there. A node whose score is
// exactly 0.0 gets no attribute, but its children are still visited.
void TreeScorer::SetAttributes(SyntaxTree &root)
{
  // Nothing to do for a terminal, or for a preterminal (first child is a
  // leaf).
  if (root.IsLeaf() || root.children()[0]->IsLeaf()) {
    return;
  }

  const double nodeScore = scores_[&root];
  if (nodeScore != 0.0) {
    std::ostringstream formatted;
    formatted << nodeScore;
    root.value().attributes["pcfg"] = formatted.str();
  }

  const std::vector<SyntaxTree *> &kids = root.children();
  for (std::size_t i = 0; i < kids.size(); ++i) {
    SetAttributes(*kids[i]);
  }
}
// Records a score of zero in scores_ for `root` and for every node below it
// (pre-order traversal).
void TreeScorer::ZeroScores(SyntaxTree &root)
{
  scores_[&root] = 0.0f;

  const std::vector<SyntaxTree *> &kids = root.children();
  for (std::size_t i = 0; i < kids.size(); ++i) {
    ZeroScores(*kids[i]);
  }
}
} // namespace PCFG
} // namespace Syntax
} // namespace MosesTraining

View File

@ -21,8 +21,9 @@
#ifndef PCFG_SCORE_TREE_SCORER_H_
#define PCFG_SCORE_TREE_SCORER_H_
#include "SyntaxTree.h"
#include "pcfg-common/pcfg.h"
#include "pcfg-common/pcfg_tree.h"
#include "pcfg-common/typedef.h"
namespace MosesTraining
@ -39,11 +40,16 @@ public:
// Score tree according to PCFG. Returns false if unsuccessful (due to
// missing rule).
bool Score(PcfgTree &) const;
bool Score(SyntaxTree &);
private:
const Pcfg &pcfg_;
const Vocabulary &non_term_vocab_;
std::map<SyntaxTree *, double> scores_;
bool CalcScores(SyntaxTree &);
void SetAttributes(SyntaxTree &);
void ZeroScores(SyntaxTree &);
};
} // namespace PCFG

View File

@ -50,7 +50,7 @@ int main(int argc, char* argv[])
// output tree
// cerr << "BEFORE:" << endl << tree;
ParentNodes parents = tree.Parse();
ParentNodes parents = determineSplitPoints(tree);
// execute selected grammar relaxation schemes
if (leftBinarizeFlag)
@ -118,9 +118,9 @@ void store( SyntaxNodeCollection &tree, const vector< string > &words )
// output tree nodes
vector< SyntaxNode* > nodes = tree.GetAllNodes();
for( size_t i=0; i<nodes.size(); i++ ) {
cout << " <tree span=\"" << nodes[i]->GetStart()
<< "-" << nodes[i]->GetEnd()
<< "\" label=\"" << nodes[i]->GetLabel()
cout << " <tree span=\"" << nodes[i]->start
<< "-" << nodes[i]->end
<< "\" label=\"" << nodes[i]->label
<< "\"/>";
}
cout << endl;
@ -133,7 +133,7 @@ void LeftBinarize( SyntaxNodeCollection &tree, ParentNodes &parents )
if (point.size() > 3) {
const vector< SyntaxNode* >& topNodes
= tree.GetNodes( point[0], point[point.size()-1]-1);
string topLabel = topNodes[0]->GetLabel();
string topLabel = topNodes[0]->label;
for(size_t i=2; i<point.size()-1; i++) {
// cerr << "LeftBin " << point[0] << "-" << (point[point.size()-1]-1) << ": " << point[0] << "-" << point[i]-1 << " ^" << topLabel << endl;
@ -151,7 +151,7 @@ void RightBinarize( SyntaxNodeCollection &tree, ParentNodes &parents )
int endPoint = point[point.size()-1]-1;
const vector< SyntaxNode* >& topNodes
= tree.GetNodes( point[0], endPoint);
string topLabel = topNodes[0]->GetLabel();
string topLabel = topNodes[0]->label;
for(size_t i=1; i<point.size()-2; i++) {
// cerr << "RightBin " << point[0] << "-" << (point[point.size()-1]-1) << ": " << point[i] << "-" << endPoint << " ^" << topLabel << endl;
@ -178,29 +178,29 @@ void SAMT( SyntaxNodeCollection &tree, ParentNodes &parents )
// cerr << endl;
for(size_t i = 0; i+2 < point.size(); i++) {
// cerr << "\tadding " << point[i] << ";" << point[i+1] << ";" << (point[i+2]-1) << ": " << tree.GetNodes(point[i ],point[i+1]-1)[0]->GetLabel() << "+" << tree.GetNodes(point[i+1],point[i+2]-1)[0]->GetLabel() << endl;
// cerr << "\tadding " << point[i] << ";" << point[i+1] << ";" << (point[i+2]-1) << ": " << tree.GetNodes(point[i ],point[i+1]-1)[0]->label << "+" << tree.GetNodes(point[i+1],point[i+2]-1)[0]->label << endl;
newTree.AddNode( point[i],point[i+2]-1,
tree.GetNodes(point[i ],point[i+1]-1)[0]->GetLabel()
tree.GetNodes(point[i ],point[i+1]-1)[0]->label
+ "+" +
tree.GetNodes(point[i+1],point[i+2]-1)[0]->GetLabel() );
tree.GetNodes(point[i+1],point[i+2]-1)[0]->label);
}
}
if (point.size() >= 4) {
int ps = point.size();
string topLabel = tree.GetNodes(point[0],point[ps-1]-1)[0]->GetLabel();
string topLabel = tree.GetNodes(point[0],point[ps-1]-1)[0]->label;
// cerr << "\tadding " << topLabel + "\\" + tree.GetNodes(point[0],point[1]-1)[0]->GetLabel() << endl;
// cerr << "\tadding " << topLabel + "\\" + tree.GetNodes(point[0],point[1]-1)[0]->label << endl;
newTree.AddNode( point[1],point[ps-1]-1,
topLabel
+ "\\" +
tree.GetNodes(point[0],point[1]-1)[0]->GetLabel() );
tree.GetNodes(point[0],point[1]-1)[0]->label );
// cerr << "\tadding " << topLabel + "/" + tree.GetNodes(point[ps-2],point[ps-1]-1)[0]->GetLabel() << endl;
// cerr << "\tadding " << topLabel + "/" + tree.GetNodes(point[ps-2],point[ps-1]-1)[0]->label << endl;
newTree.AddNode( point[0],point[ps-2]-1,
topLabel
+ "/" +
tree.GetNodes(point[ps-2],point[ps-1]-1)[0]->GetLabel() );
tree.GetNodes(point[ps-2],point[ps-1]-1)[0]->label );
}
}
@ -219,12 +219,12 @@ void SAMT( SyntaxNodeCollection &tree, ParentNodes &parents )
for(int mid=start+1; mid<=end && !done; mid++) {
if (tree.HasNode(start,mid-1) && tree.HasNode(mid,end)) {
// cerr << "\tadding " << tree.GetNodes(start,mid-1)[0]->GetLabel() << "++" << tree.GetNodes(mid, end )[0]->GetLabel() << endl;
// cerr << "\tadding " << tree.GetNodes(start,mid-1)[0]->label << "++" << tree.GetNodes(mid, end )[0]->label << endl;
newTree.AddNode( start, end,
tree.GetNodes(start,mid-1)[0]->GetLabel()
tree.GetNodes(start,mid-1)[0]->label
+ "++" +
tree.GetNodes(mid, end )[0]->GetLabel() );
tree.GetNodes(mid, end )[0]->label );
done = true;
}
}
@ -234,9 +234,9 @@ void SAMT( SyntaxNodeCollection &tree, ParentNodes &parents )
for(int postEnd=end+1; postEnd<numWords && !done; postEnd++) {
if (tree.HasNode(start,postEnd) && tree.HasNode(end+1,postEnd)) {
newTree.AddNode( start, end,
tree.GetNodes(start,postEnd)[0]->GetLabel()
tree.GetNodes(start,postEnd)[0]->label
+ "//" +
tree.GetNodes(end+1,postEnd)[0]->GetLabel() );
tree.GetNodes(end+1,postEnd)[0]->label );
done = true;
}
}
@ -245,11 +245,11 @@ void SAMT( SyntaxNodeCollection &tree, ParentNodes &parents )
// if matching a constituent A left-minus constituent B: use A\\B
for(int preStart=start-1; preStart>=0; preStart--) {
if (tree.HasNode(preStart,end) && tree.HasNode(preStart,start-1)) {
// cerr << "\tadding " << tree.GetNodes(preStart,end )[0]->GetLabel() << "\\\\" <<tree.GetNodes(preStart,start-1)[0]->GetLabel() << endl;
// cerr << "\tadding " << tree.GetNodes(preStart,end )[0]->label << "\\\\" <<tree.GetNodes(preStart,start-1)[0]->label << endl;
newTree.AddNode( start, end,
tree.GetNodes(preStart,end )[0]->GetLabel()
tree.GetNodes(preStart,end )[0]->label
+ "\\\\" +
tree.GetNodes(preStart,start-1)[0]->GetLabel() );
tree.GetNodes(preStart,start-1)[0]->label );
done = true;
}
}
@ -268,6 +268,48 @@ void SAMT( SyntaxNodeCollection &tree, ParentNodes &parents )
// adding all new nodes
vector< SyntaxNode* > nodes = newTree.GetAllNodes();
for( size_t i=0; i<nodes.size(); i++ ) {
tree.AddNode( nodes[i]->GetStart(), nodes[i]->GetEnd(), nodes[i]->GetLabel());
tree.AddNode( nodes[i]->start, nodes[i]->end, nodes[i]->label);
}
}
// Computes, for every span of two or more words that has a node in
// `nodeColl`, the positions at which that span splits into child spans.
//
// Spans are visited by increasing length and, within a length, by increasing
// start position. For each such parent span one SplitPoints vector is appended
// to the result: it begins with the span's start position, followed by the
// position just past each child found. Children are found greedily from left
// to right, always taking the longest span with a node that starts at the
// first uncovered position. The first child may not cover the whole parent.
// The search for a parent stops when the parent is fully covered or no
// further child can be found, so the last split point need not be the end of
// the parent.
ParentNodes determineSplitPoints(const SyntaxNodeCollection &nodeColl)
{
  ParentNodes parents;

  // All span arithmetic below is done in int (the positions passed to
  // HasNode are ints), so convert the word count once instead of mixing
  // signed and unsigned operands in the loop conditions.
  const int numWords = static_cast<int>(nodeColl.GetNumWords());

  // Loop through all spans of size >= 2.
  for (int length = 2; length <= numWords; ++length) {
    for (int startPos = 0; startPos <= numWords - length; ++startPos) {
      if (!nodeColl.HasNode(startPos, startPos + length - 1)) {
        continue;
      }

      // Process one (parent) span.
      SplitPoints splitPoints;
      splitPoints.push_back(startPos);

      bool firstChild = true;      // the first child must be a proper subspan
      int covered = 0;             // number of parent words covered so far
      bool foundSomething = true;  // stop when a pass finds no child
      while (covered < length && foundSomething) {
        foundSomething = false;
        // Find the largest covering subspan (child) starting at the last
        // covered position.
        const int longest = firstChild ? length - 1 : length;
        for (int midPos = longest; midPos > covered; --midPos) {
          if (nodeColl.HasNode(startPos + covered, startPos + midPos - 1)) {
            covered = midPos;
            splitPoints.push_back(startPos + covered);
            firstChild = false;
            foundSomething = true;
            break;  // longest child found; continue after it
          }
        }
      }
      parents.push_back(splitPoints);
    }
  }
  return parents;
}

View File

@ -37,10 +37,14 @@ bool leftBinarizeFlag = false;
bool rightBinarizeFlag = false;
char SAMTLevel = 0;
typedef std::vector< int > SplitPoints;
typedef std::vector< SplitPoints > ParentNodes;
// functions
void init(int argc, char* argv[]);
ParentNodes determineSplitPoints(const MosesTraining::SyntaxNodeCollection &);
void store( MosesTraining::SyntaxNodeCollection &tree, const std::vector<std::string> &words );
void LeftBinarize( MosesTraining::SyntaxNodeCollection &tree, MosesTraining::ParentNodes &parents );
void RightBinarize( MosesTraining::SyntaxNodeCollection &tree, MosesTraining::ParentNodes &parents );
void SAMT( MosesTraining::SyntaxNodeCollection &tree, MosesTraining::ParentNodes &parents );
void LeftBinarize( MosesTraining::SyntaxNodeCollection &tree, ParentNodes &parents );
void RightBinarize( MosesTraining::SyntaxNodeCollection &tree, ParentNodes &parents );
void SAMT( MosesTraining::SyntaxNodeCollection &tree, ParentNodes &parents );

View File

@ -10,30 +10,26 @@
#include "XmlException.h"
#include "XmlTree.h"
#include "exception.h"
namespace MosesTraining {
namespace Syntax {
XmlTreeParser::XmlTreeParser(std::set<std::string> &labelSet,
std::map<std::string, int> &topLabelSet)
: label_set_(labelSet)
, top_label_set_(topLabelSet)
std::auto_ptr<SyntaxTree> XmlTreeParser::Parse(const std::string &line,
bool unescape)
{
}
std::auto_ptr<SyntaxTree> XmlTreeParser::Parse(const std::string &line)
{
line_ = line;
sentence_ = line;
node_collection_.Clear();
try {
if (!ProcessAndStripXMLTags(line_, node_collection_, label_set_,
top_label_set_, false)) {
if (!ProcessAndStripXMLTags(sentence_, node_collection_, label_set_,
top_label_set_, unescape)) {
throw Exception("");
}
} catch (const XmlException &e) {
throw Exception(e.getMsg());
}
std::auto_ptr<SyntaxTree> root = node_collection_.ExtractTree();
words_ = util::tokenize(line_);
words_ = util::tokenize(sentence_);
AttachWords(words_, *root);
return root;
}
@ -51,15 +47,15 @@ void XmlTreeParser::AttachWords(const std::vector<std::string> &words,
for (std::vector<SyntaxTree*>::iterator p = leaves.begin(); p != leaves.end();
++p) {
SyntaxTree *leaf = *p;
const int start = leaf->value().GetStart();
const int end = leaf->value().GetEnd();
const int start = leaf->value().start;
const int end = leaf->value().end;
if (start != end) {
std::ostringstream msg;
msg << "leaf node covers multiple words (" << start << "-" << end
<< "): this is currently unsupported";
throw Exception(msg.str());
}
SyntaxTree *newLeaf = new SyntaxTree(SyntaxNode(start, end, *q++));
SyntaxTree *newLeaf = new SyntaxTree(SyntaxNode(*q++, start, end));
leaf->children().push_back(newLeaf);
newLeaf->parent() = leaf;
}

View File

@ -6,39 +6,52 @@
#include <string>
#include <vector>
#include "SyntaxNode.h"
#include "SyntaxNodeCollection.h"
#include "SyntaxTree.h"
#include "exception.h"
namespace MosesTraining {
namespace Syntax {
// Parses a string in Moses' XML parse tree format and returns a SyntaxTree
// object. This is a wrapper around the ProcessAndStripXMLTags function.
/** Parses string representations of parse trees in Moses' XML format and
* converts them to SyntaxTree objects.
*
* This is a thin wrapper around the ProcessAndStripXMLTags function. After
* calling Parse(), the outputs of the ProcessAndStripXMLTags function (the
* sentence, node collection, label set, and top label set) are available via
* accessors.
*/
class XmlTreeParser {
public:
XmlTreeParser(std::set<std::string> &, std::map<std::string, int> &);
//! Parse a single sentence and return a SyntaxTree (with words attached).
std::auto_ptr<SyntaxTree> Parse(const std::string &, bool unescape=false);
std::auto_ptr<SyntaxTree> Parse(const std::string &);
//! Get the sentence string (as returned by ProcessAndStripXMLTags).
const std::string &sentence() const { return sentence_; }
const std::vector<std::string>& GetWords() {
return words_;
}
//! Get the sentence as a vector of words.
const std::vector<std::string> &words() const { return words_; }
const SyntaxNodeCollection &GetNodeCollection() const {
//! Get the node collection (as returned by ProcessAndStripXMLTags).
const SyntaxNodeCollection &node_collection() const {
return node_collection_;
}
private:
std::set<std::string> &label_set_;
std::map<std::string, int> &top_label_set_;
std::string line_;
SyntaxNodeCollection node_collection_;
std::vector<std::string> words_;
//! Get the label set (as returned by ProcessAndStripXMLTags).
const std::set<std::string> &label_set() const { return label_set_; }
//! Get the top label set (as returned by ProcessAndStripXMLTags).
const std::map<std::string, int> &top_label_set() const {
return top_label_set_;
}
private:
void AttachWords(const std::vector<std::string> &, SyntaxTree &);
std::string sentence_;
SyntaxNodeCollection node_collection_;
std::set<std::string> label_set_;
std::map<std::string, int> top_label_set_;
std::vector<std::string> words_;
};
} // namespace Syntax

View File

@ -0,0 +1,82 @@
#include "xml_tree_writer.h"
#include <cassert>
#include <ostream>
#include <vector>
#include <string>
#include "SyntaxTree.h"
#include "XmlTree.h"
namespace MosesTraining {
namespace Syntax {
void XmlTreeWriter::Write(const SyntaxTree &tree) const {
assert(!tree.IsLeaf());
// Opening tag
out_ << "<tree label=\"" << Escape(tree.value().label) << "\"";
for (SyntaxNode::AttributeMap::const_iterator
p = tree.value().attributes.begin();
p != tree.value().attributes.end(); ++p) {
if (p->first != "label") {
out_ << " " << p->first << "=\"" << p->second << "\"";
}
}
out_ << ">";
// Children
for (std::vector<SyntaxTree *>::const_iterator p = tree.children().begin();
p != tree.children().end(); ++p) {
SyntaxTree &child = **p;
if (child.IsLeaf()) {
out_ << " " << Escape(child.value().label);
} else {
out_ << " ";
Write(child);
}
}
// Closing tag
out_ << " </tree>";
if (tree.parent() == 0) {
out_ << std::endl;
}
}
// Returns `s` unchanged when escaping is disabled for this writer. Otherwise
// returns a copy in which each of the characters < > [ ] | & ' " is replaced
// by its entity or numeric character reference.
std::string XmlTreeWriter::Escape(const std::string &s) const {
  if (!escape_) {
    return s;
  }

  std::string escaped;
  escaped.reserve(s.size());
  for (std::string::const_iterator c = s.begin(); c != s.end(); ++c) {
    switch (*c) {
    case '<':
      escaped += "&lt;";
      break;
    case '>':
      escaped += "&gt;";
      break;
    case '[':
      escaped += "&#91;";
      break;
    case ']':
      escaped += "&#93;";
      break;
    case '|':
      escaped += "&#124;";
      break;
    case '&':
      escaped += "&amp;";
      break;
    case '\'':
      escaped += "&apos;";
      break;
    case '"':
      escaped += "&quot;";
      break;
    default:
      escaped += *c;
      break;
    }
  }
  return escaped;
}
} // namespace Syntax
} // namespace MosesTraining

View File

@ -0,0 +1,27 @@
#pragma once
#include <ostream>
#include <string>
#include "SyntaxTree.h"
namespace MosesTraining {
namespace Syntax {
// Writes SyntaxTree objects to an output stream as nested <tree> elements.
//
// The writer stores a reference to the stream it is given, so the stream must
// outlive the writer.
class XmlTreeWriter {
 public:
  // out:    stream that Write() sends its output to.
  // escape: when true (the default), labels and words are passed through
  //         XML escaping before being written; when false they are written
  //         verbatim.
  //
  // The constructor is explicit so that a std::ostream is never silently
  // converted into a writer.
  explicit XmlTreeWriter(std::ostream &out, bool escape=true)
      : out_(out)
      , escape_(escape) {}

  // Writes the given tree to the stream supplied at construction.
  void Write(const SyntaxTree &) const;

 private:
  // Returns the argument with XML special characters escaped, or unchanged
  // when escaping is disabled.
  std::string Escape(const std::string &) const;

  std::ostream &out_;
  bool escape_;
};
} // namespace Syntax
} // namespace MosesTraining