mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-10-26 11:28:48 +03:00
Merge branch 'master' of github.com:moses-smt/mosesdecoder
This commit is contained in:
commit
0e11919ffb
@ -20,60 +20,23 @@
|
|||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
#include <map>
|
#include <map>
|
||||||
#include <sstream>
|
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <vector>
|
|
||||||
|
|
||||||
namespace MosesTraining
|
namespace MosesTraining {
|
||||||
{
|
|
||||||
|
|
||||||
class SyntaxNode
|
struct SyntaxNode {
|
||||||
{
|
|
||||||
protected:
|
|
||||||
int m_start, m_end;
|
|
||||||
std::string m_label;
|
|
||||||
std::vector< SyntaxNode* > m_children;
|
|
||||||
SyntaxNode* m_parent;
|
|
||||||
float m_pcfgScore;
|
|
||||||
public:
|
|
||||||
typedef std::map<std::string, std::string> AttributeMap;
|
typedef std::map<std::string, std::string> AttributeMap;
|
||||||
|
|
||||||
AttributeMap attributes;
|
SyntaxNode(const std::string &label_, int start_, int end_)
|
||||||
|
: label(label_)
|
||||||
|
, start(start_)
|
||||||
|
, end(end_) {
|
||||||
|
}
|
||||||
|
|
||||||
SyntaxNode( int startPos, int endPos, std::string label )
|
std::string label;
|
||||||
:m_start(startPos)
|
int start;
|
||||||
,m_end(endPos)
|
int end;
|
||||||
,m_label(label)
|
AttributeMap attributes;
|
||||||
,m_parent(0)
|
|
||||||
,m_pcfgScore(0.0f) {
|
|
||||||
}
|
|
||||||
int GetStart() const {
|
|
||||||
return m_start;
|
|
||||||
}
|
|
||||||
int GetEnd() const {
|
|
||||||
return m_end;
|
|
||||||
}
|
|
||||||
std::string GetLabel() const {
|
|
||||||
return m_label;
|
|
||||||
}
|
|
||||||
float GetPcfgScore() const {
|
|
||||||
return m_pcfgScore;
|
|
||||||
}
|
|
||||||
void SetPcfgScore(float score) {
|
|
||||||
m_pcfgScore = score;
|
|
||||||
}
|
|
||||||
SyntaxNode *GetParent() {
|
|
||||||
return m_parent;
|
|
||||||
}
|
|
||||||
void SetParent(SyntaxNode *parent) {
|
|
||||||
m_parent = parent;
|
|
||||||
}
|
|
||||||
void AddChild(SyntaxNode* child) {
|
|
||||||
m_children.push_back(child);
|
|
||||||
}
|
|
||||||
const std::vector< SyntaxNode* > &GetChildren() const {
|
|
||||||
return m_children;
|
|
||||||
}
|
|
||||||
};
|
};
|
||||||
|
|
||||||
} // namespace MosesTraining
|
} // namespace MosesTraining
|
||||||
|
@ -33,7 +33,6 @@ SyntaxNodeCollection::~SyntaxNodeCollection()
|
|||||||
|
|
||||||
void SyntaxNodeCollection::Clear()
|
void SyntaxNodeCollection::Clear()
|
||||||
{
|
{
|
||||||
m_top = 0;
|
|
||||||
// loop through all m_nodes, delete them
|
// loop through all m_nodes, delete them
|
||||||
for(size_t i=0; i<m_nodes.size(); i++) {
|
for(size_t i=0; i<m_nodes.size(); i++) {
|
||||||
delete m_nodes[i];
|
delete m_nodes[i];
|
||||||
@ -45,113 +44,32 @@ void SyntaxNodeCollection::Clear()
|
|||||||
SyntaxNode *SyntaxNodeCollection::AddNode(int startPos, int endPos,
|
SyntaxNode *SyntaxNodeCollection::AddNode(int startPos, int endPos,
|
||||||
const std::string &label)
|
const std::string &label)
|
||||||
{
|
{
|
||||||
SyntaxNode* newNode = new SyntaxNode( startPos, endPos, label );
|
SyntaxNode* newNode = new SyntaxNode(label, startPos, endPos);
|
||||||
m_nodes.push_back( newNode );
|
m_nodes.push_back( newNode );
|
||||||
m_index[ startPos ][ endPos ].push_back( newNode );
|
m_index[ startPos ][ endPos ].push_back( newNode );
|
||||||
m_size = std::max(endPos+1, m_size);
|
m_numWords = std::max(endPos+1, m_numWords);
|
||||||
return newNode;
|
return newNode;
|
||||||
}
|
}
|
||||||
|
|
||||||
ParentNodes SyntaxNodeCollection::Parse()
|
|
||||||
{
|
|
||||||
ParentNodes parents;
|
|
||||||
|
|
||||||
// looping through all spans of size >= 2
|
|
||||||
for( int length=2; length<=m_size; length++ ) {
|
|
||||||
for( int startPos = 0; startPos <= m_size-length; startPos++ ) {
|
|
||||||
if (HasNode( startPos, startPos+length-1 )) {
|
|
||||||
// processing one (parent) span
|
|
||||||
|
|
||||||
//std::cerr << "# " << startPos << "-" << (startPos+length-1) << ":";
|
|
||||||
SplitPoints splitPoints;
|
|
||||||
splitPoints.push_back( startPos );
|
|
||||||
//std::cerr << " " << startPos;
|
|
||||||
|
|
||||||
int first = 1;
|
|
||||||
int covered = 0;
|
|
||||||
int found_somehing = 1; // break loop if nothing found
|
|
||||||
while( covered < length && found_somehing ) {
|
|
||||||
// find largest covering subspan (child)
|
|
||||||
// starting at last covered position
|
|
||||||
found_somehing = 0;
|
|
||||||
for( int midPos=length-first; midPos>covered; midPos-- ) {
|
|
||||||
if( HasNode( startPos+covered, startPos+midPos-1 ) ) {
|
|
||||||
covered = midPos;
|
|
||||||
splitPoints.push_back( startPos+covered );
|
|
||||||
// std::cerr << " " << ( startPos+covered );
|
|
||||||
first = 0;
|
|
||||||
found_somehing = 1;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// std::cerr << std::endl;
|
|
||||||
parents.push_back( splitPoints );
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return parents;
|
|
||||||
}
|
|
||||||
|
|
||||||
bool SyntaxNodeCollection::HasNode( int startPos, int endPos ) const
|
bool SyntaxNodeCollection::HasNode( int startPos, int endPos ) const
|
||||||
{
|
{
|
||||||
return GetNodes( startPos, endPos).size() > 0;
|
return GetNodes( startPos, endPos).size() > 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
const std::vector< SyntaxNode* >& SyntaxNodeCollection::GetNodes( int startPos, int endPos ) const
|
const std::vector< SyntaxNode* >& SyntaxNodeCollection::GetNodes(
|
||||||
|
int startPos, int endPos ) const
|
||||||
{
|
{
|
||||||
SyntaxTreeIndexIterator startIndex = m_index.find( startPos );
|
NodeIndex::const_iterator startIndex = m_index.find( startPos );
|
||||||
if (startIndex == m_index.end() )
|
if (startIndex == m_index.end() )
|
||||||
return m_emptyNode;
|
return m_emptyNode;
|
||||||
|
|
||||||
SyntaxTreeIndexIterator2 endIndex = startIndex->second.find( endPos );
|
InnerNodeIndex::const_iterator endIndex = startIndex->second.find( endPos );
|
||||||
if (endIndex == startIndex->second.end())
|
if (endIndex == startIndex->second.end())
|
||||||
return m_emptyNode;
|
return m_emptyNode;
|
||||||
|
|
||||||
return endIndex->second;
|
return endIndex->second;
|
||||||
}
|
}
|
||||||
|
|
||||||
void SyntaxNodeCollection::ConnectNodes()
|
|
||||||
{
|
|
||||||
typedef SyntaxTreeIndex2::const_reverse_iterator InnerIterator;
|
|
||||||
|
|
||||||
SyntaxNode *prev = 0;
|
|
||||||
// Iterate over all start indices from lowest to highest.
|
|
||||||
for (SyntaxTreeIndexIterator p = m_index.begin(); p != m_index.end(); ++p) {
|
|
||||||
const SyntaxTreeIndex2 &inner = p->second;
|
|
||||||
// Iterate over all end indices from highest to lowest.
|
|
||||||
for (InnerIterator q = inner.rbegin(); q != inner.rend(); ++q) {
|
|
||||||
const std::vector<SyntaxNode*> &nodes = q->second;
|
|
||||||
// Iterate over all nodes that cover the same span in order of tree
|
|
||||||
// depth, top-most first.
|
|
||||||
for (std::vector<SyntaxNode*>::const_reverse_iterator r = nodes.rbegin();
|
|
||||||
r != nodes.rend(); ++r) {
|
|
||||||
SyntaxNode *node = *r;
|
|
||||||
if (!prev) {
|
|
||||||
// node is the root.
|
|
||||||
m_top = node;
|
|
||||||
node->SetParent(0);
|
|
||||||
} else if (prev->GetStart() == node->GetStart()) {
|
|
||||||
// prev is the parent of node.
|
|
||||||
assert(prev->GetEnd() >= node->GetEnd());
|
|
||||||
node->SetParent(prev);
|
|
||||||
prev->AddChild(node);
|
|
||||||
} else {
|
|
||||||
// prev is a descendant of node's parent. The lowest common
|
|
||||||
// ancestor of prev and node will be node's parent.
|
|
||||||
SyntaxNode *ancestor = prev->GetParent();
|
|
||||||
while (ancestor->GetEnd() < node->GetEnd()) {
|
|
||||||
ancestor = ancestor->GetParent();
|
|
||||||
}
|
|
||||||
assert(ancestor);
|
|
||||||
node->SetParent(ancestor);
|
|
||||||
ancestor->AddChild(node);
|
|
||||||
}
|
|
||||||
prev = node;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
std::auto_ptr<SyntaxTree> SyntaxNodeCollection::ExtractTree()
|
std::auto_ptr<SyntaxTree> SyntaxNodeCollection::ExtractTree()
|
||||||
{
|
{
|
||||||
std::map<SyntaxNode *, SyntaxTree *> nodeToTree;
|
std::map<SyntaxNode *, SyntaxTree *> nodeToTree;
|
||||||
@ -163,14 +81,15 @@ std::auto_ptr<SyntaxTree> SyntaxNodeCollection::ExtractTree()
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Connect the SyntaxTrees.
|
// Connect the SyntaxTrees.
|
||||||
typedef SyntaxTreeIndex2::const_reverse_iterator InnerIterator;
|
typedef NodeIndex::const_iterator OuterIterator;
|
||||||
|
typedef InnerNodeIndex::const_reverse_iterator InnerIterator;
|
||||||
|
|
||||||
SyntaxTree *root = 0;
|
SyntaxTree *root = 0;
|
||||||
SyntaxNode *prevNode = 0;
|
SyntaxNode *prevNode = 0;
|
||||||
SyntaxTree *prevTree = 0;
|
SyntaxTree *prevTree = 0;
|
||||||
// Iterate over all start indices from lowest to highest.
|
// Iterate over all start indices from lowest to highest.
|
||||||
for (SyntaxTreeIndexIterator p = m_index.begin(); p != m_index.end(); ++p) {
|
for (OuterIterator p = m_index.begin(); p != m_index.end(); ++p) {
|
||||||
const SyntaxTreeIndex2 &inner = p->second;
|
const InnerNodeIndex &inner = p->second;
|
||||||
// Iterate over all end indices from highest to lowest.
|
// Iterate over all end indices from highest to lowest.
|
||||||
for (InnerIterator q = inner.rbegin(); q != inner.rend(); ++q) {
|
for (InnerIterator q = inner.rbegin(); q != inner.rend(); ++q) {
|
||||||
const std::vector<SyntaxNode*> &nodes = q->second;
|
const std::vector<SyntaxNode*> &nodes = q->second;
|
||||||
@ -184,16 +103,16 @@ std::auto_ptr<SyntaxTree> SyntaxNodeCollection::ExtractTree()
|
|||||||
// node is the root.
|
// node is the root.
|
||||||
root = tree;
|
root = tree;
|
||||||
tree->parent() = 0;
|
tree->parent() = 0;
|
||||||
} else if (prevNode->GetStart() == node->GetStart()) {
|
} else if (prevNode->start == node->start) {
|
||||||
// prevNode is the parent of node.
|
// prevNode is the parent of node.
|
||||||
assert(prevNode->GetEnd() >= node->GetEnd());
|
assert(prevNode->end >= node->end);
|
||||||
tree->parent() = prevTree;
|
tree->parent() = prevTree;
|
||||||
prevTree->children().push_back(tree);
|
prevTree->children().push_back(tree);
|
||||||
} else {
|
} else {
|
||||||
// prevNode is a descendant of node's parent. The lowest common
|
// prevNode is a descendant of node's parent. The lowest common
|
||||||
// ancestor of prevNode and node will be node's parent.
|
// ancestor of prevNode and node will be node's parent.
|
||||||
SyntaxTree *ancestor = prevTree->parent();
|
SyntaxTree *ancestor = prevTree->parent();
|
||||||
while (ancestor->value().GetEnd() < tree->value().GetEnd()) {
|
while (ancestor->value().end < tree->value().end) {
|
||||||
ancestor = ancestor->parent();
|
ancestor = ancestor->parent();
|
||||||
}
|
}
|
||||||
assert(ancestor);
|
assert(ancestor);
|
||||||
|
@ -31,49 +31,47 @@
|
|||||||
namespace MosesTraining
|
namespace MosesTraining
|
||||||
{
|
{
|
||||||
|
|
||||||
typedef std::vector< int > SplitPoints;
|
/** A collection of SyntaxNodes organized by start and end position.
|
||||||
typedef std::vector< SplitPoints > ParentNodes;
|
*
|
||||||
|
*/
|
||||||
class SyntaxNodeCollection
|
class SyntaxNodeCollection
|
||||||
{
|
{
|
||||||
protected:
|
|
||||||
std::vector< SyntaxNode* > m_nodes;
|
|
||||||
SyntaxNode* m_top;
|
|
||||||
|
|
||||||
typedef std::map< int, std::vector< SyntaxNode* > > SyntaxTreeIndex2;
|
|
||||||
typedef SyntaxTreeIndex2::const_iterator SyntaxTreeIndexIterator2;
|
|
||||||
typedef std::map< int, SyntaxTreeIndex2 > SyntaxTreeIndex;
|
|
||||||
typedef SyntaxTreeIndex::const_iterator SyntaxTreeIndexIterator;
|
|
||||||
SyntaxTreeIndex m_index;
|
|
||||||
int m_size;
|
|
||||||
std::vector< SyntaxNode* > m_emptyNode;
|
|
||||||
|
|
||||||
public:
|
public:
|
||||||
SyntaxNodeCollection()
|
SyntaxNodeCollection() : m_numWords(0) {}
|
||||||
: m_top(0) // m_top doesn't get set unless ConnectNodes is called.
|
|
||||||
, m_size(0) {}
|
|
||||||
|
|
||||||
~SyntaxNodeCollection();
|
~SyntaxNodeCollection();
|
||||||
|
|
||||||
|
//! Construct and insert a new SyntaxNode.
|
||||||
SyntaxNode *AddNode( int startPos, int endPos, const std::string &label );
|
SyntaxNode *AddNode( int startPos, int endPos, const std::string &label );
|
||||||
|
|
||||||
SyntaxNode *GetTop() {
|
//! Return true iff there are one or more SyntaxNodes with the given span.
|
||||||
return m_top;
|
|
||||||
}
|
|
||||||
|
|
||||||
ParentNodes Parse();
|
|
||||||
bool HasNode( int startPos, int endPos ) const;
|
bool HasNode( int startPos, int endPos ) const;
|
||||||
|
|
||||||
|
//! Lookup the SyntaxNodes for a given span.
|
||||||
const std::vector< SyntaxNode* >& GetNodes( int startPos, int endPos ) const;
|
const std::vector< SyntaxNode* >& GetNodes( int startPos, int endPos ) const;
|
||||||
const std::vector< SyntaxNode* >& GetAllNodes() {
|
|
||||||
return m_nodes;
|
//! Get a vector of pointers to all SyntaxNodes (unordered).
|
||||||
};
|
const std::vector< SyntaxNode* >& GetAllNodes() { return m_nodes; };
|
||||||
|
|
||||||
size_t GetNumWords() const {
|
size_t GetNumWords() const {
|
||||||
return m_size;
|
return m_numWords;
|
||||||
}
|
}
|
||||||
void ConnectNodes();
|
|
||||||
void Clear();
|
void Clear();
|
||||||
|
|
||||||
std::auto_ptr<SyntaxTree> ExtractTree();
|
std::auto_ptr<SyntaxTree> ExtractTree();
|
||||||
|
|
||||||
|
private:
|
||||||
|
typedef std::map< int, std::vector< SyntaxNode* > > InnerNodeIndex;
|
||||||
|
typedef std::map< int, InnerNodeIndex > NodeIndex;
|
||||||
|
|
||||||
|
// Not copyable.
|
||||||
|
SyntaxNodeCollection(const SyntaxNodeCollection &);
|
||||||
|
SyntaxNodeCollection &operator=(const SyntaxNodeCollection &);
|
||||||
|
|
||||||
|
std::vector< SyntaxNode* > m_nodes;
|
||||||
|
NodeIndex m_index;
|
||||||
|
int m_numWords;
|
||||||
|
std::vector< SyntaxNode* > m_emptyNode;
|
||||||
};
|
};
|
||||||
|
|
||||||
} // namespace MosesTraining
|
} // namespace MosesTraining
|
||||||
|
@ -398,10 +398,6 @@ bool ProcessAndStripXMLTags(string &line, SyntaxNodeCollection &nodeCollection,
|
|||||||
string label = ParseXmlTagAttribute(tagContent,"label");
|
string label = ParseXmlTagAttribute(tagContent,"label");
|
||||||
labelCollection.insert( label );
|
labelCollection.insert( label );
|
||||||
|
|
||||||
string pcfgString = ParseXmlTagAttribute(tagContent,"pcfg");
|
|
||||||
float pcfgScore = pcfgString == "" ? 0.0f
|
|
||||||
: std::atof(pcfgString.c_str());
|
|
||||||
|
|
||||||
// report what we have processed so far
|
// report what we have processed so far
|
||||||
if (0) {
|
if (0) {
|
||||||
cerr << "XML TAG NAME IS: '" << tagName << "'" << endl;
|
cerr << "XML TAG NAME IS: '" << tagName << "'" << endl;
|
||||||
@ -409,7 +405,6 @@ bool ProcessAndStripXMLTags(string &line, SyntaxNodeCollection &nodeCollection,
|
|||||||
cerr << "XML SPAN IS: " << startPos << "-" << (endPos-1) << endl;
|
cerr << "XML SPAN IS: " << startPos << "-" << (endPos-1) << endl;
|
||||||
}
|
}
|
||||||
SyntaxNode *node = nodeCollection.AddNode( startPos, endPos-1, label );
|
SyntaxNode *node = nodeCollection.AddNode( startPos, endPos-1, label );
|
||||||
node->SetPcfgScore(pcfgScore);
|
|
||||||
ParseXmlTagAttributes(tagContent, node->attributes);
|
ParseXmlTagAttributes(tagContent, node->attributes);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -424,7 +419,7 @@ bool ProcessAndStripXMLTags(string &line, SyntaxNodeCollection &nodeCollection,
|
|||||||
const vector< SyntaxNode* >& topNodes = nodeCollection.GetNodes( 0, wordPos-1 );
|
const vector< SyntaxNode* >& topNodes = nodeCollection.GetNodes( 0, wordPos-1 );
|
||||||
for( vector< SyntaxNode* >::const_iterator node = topNodes.begin(); node != topNodes.end(); node++ ) {
|
for( vector< SyntaxNode* >::const_iterator node = topNodes.begin(); node != topNodes.end(); node++ ) {
|
||||||
SyntaxNode *n = *node;
|
SyntaxNode *n = *node;
|
||||||
const string &label = n->GetLabel();
|
const string &label = n->label;
|
||||||
if (topLabelCollection.find( label ) == topLabelCollection.end())
|
if (topLabelCollection.find( label ) == topLabelCollection.end())
|
||||||
topLabelCollection[ label ] = 0;
|
topLabelCollection[ label ] = 0;
|
||||||
topLabelCollection[ label ]++;
|
topLabelCollection[ label ]++;
|
||||||
|
@ -21,6 +21,7 @@
|
|||||||
|
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
#include <cassert>
|
#include <cassert>
|
||||||
|
#include <cstdlib>
|
||||||
#include <memory>
|
#include <memory>
|
||||||
#include <stack>
|
#include <stack>
|
||||||
|
|
||||||
@ -213,7 +214,7 @@ Node *AlignmentGraph::CopyParseTree(const SyntaxTree *root)
|
|||||||
{
|
{
|
||||||
NodeType nodeType = (root->IsLeaf()) ? TARGET : TREE;
|
NodeType nodeType = (root->IsLeaf()) ? TARGET : TREE;
|
||||||
|
|
||||||
std::auto_ptr<Node> n(new Node(root->value().GetLabel(), nodeType));
|
std::auto_ptr<Node> n(new Node(root->value().label, nodeType));
|
||||||
|
|
||||||
if (nodeType == TREE) {
|
if (nodeType == TREE) {
|
||||||
float score = 0.0f;
|
float score = 0.0f;
|
||||||
|
@ -119,14 +119,6 @@ int ExtractGHKM::Main(int argc, char *argv[])
|
|||||||
OpenOutputFileOrDie(options.unknownWordSoftMatchesFile, unknownWordSoftMatchesStream);
|
OpenOutputFileOrDie(options.unknownWordSoftMatchesFile, unknownWordSoftMatchesStream);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Target label sets for producing glue grammar.
|
|
||||||
std::set<std::string> targetLabelSet;
|
|
||||||
std::map<std::string, int> targetTopLabelSet;
|
|
||||||
|
|
||||||
// Source label sets for producing glue grammar.
|
|
||||||
std::set<std::string> sourceLabelSet;
|
|
||||||
std::map<std::string, int> sourceTopLabelSet;
|
|
||||||
|
|
||||||
// Word count statistics for producing unknown word labels.
|
// Word count statistics for producing unknown word labels.
|
||||||
std::map<std::string, int> targetWordCount;
|
std::map<std::string, int> targetWordCount;
|
||||||
std::map<std::string, std::string> targetWordLabel;
|
std::map<std::string, std::string> targetWordLabel;
|
||||||
@ -139,8 +131,8 @@ int ExtractGHKM::Main(int argc, char *argv[])
|
|||||||
std::string sourceLine;
|
std::string sourceLine;
|
||||||
std::string alignmentLine;
|
std::string alignmentLine;
|
||||||
Alignment alignment;
|
Alignment alignment;
|
||||||
Syntax::XmlTreeParser targetXmlTreeParser(targetLabelSet, targetTopLabelSet);
|
Syntax::XmlTreeParser targetXmlTreeParser;
|
||||||
Syntax::XmlTreeParser sourceXmlTreeParser(sourceLabelSet, sourceTopLabelSet);
|
Syntax::XmlTreeParser sourceXmlTreeParser;
|
||||||
ScfgRuleWriter scfgWriter(fwdExtractStream, invExtractStream, options);
|
ScfgRuleWriter scfgWriter(fwdExtractStream, invExtractStream, options);
|
||||||
StsgRuleWriter stsgWriter(fwdExtractStream, invExtractStream, options);
|
StsgRuleWriter stsgWriter(fwdExtractStream, invExtractStream, options);
|
||||||
size_t lineNum = options.sentenceOffset;
|
size_t lineNum = options.sentenceOffset;
|
||||||
@ -194,7 +186,7 @@ int ExtractGHKM::Main(int argc, char *argv[])
|
|||||||
}
|
}
|
||||||
Error(oss.str());
|
Error(oss.str());
|
||||||
}
|
}
|
||||||
sourceTokens = sourceXmlTreeParser.GetWords();
|
sourceTokens = sourceXmlTreeParser.words();
|
||||||
}
|
}
|
||||||
|
|
||||||
// Read word alignments.
|
// Read word alignments.
|
||||||
@ -240,7 +232,7 @@ int ExtractGHKM::Main(int argc, char *argv[])
|
|||||||
|
|
||||||
// Initialize phrase orientation scoring object
|
// Initialize phrase orientation scoring object
|
||||||
PhraseOrientation phraseOrientation(sourceTokens.size(),
|
PhraseOrientation phraseOrientation(sourceTokens.size(),
|
||||||
targetXmlTreeParser.GetWords().size(), alignment);
|
targetXmlTreeParser.words().size(), alignment);
|
||||||
|
|
||||||
// Write the rules, subject to scope pruning.
|
// Write the rules, subject to scope pruning.
|
||||||
const std::vector<Node *> &targetNodes = graph.GetTargetNodes();
|
const std::vector<Node *> &targetNodes = graph.GetTargetNodes();
|
||||||
@ -272,7 +264,7 @@ int ExtractGHKM::Main(int argc, char *argv[])
|
|||||||
// SCFG output.
|
// SCFG output.
|
||||||
ScfgRule *r = 0;
|
ScfgRule *r = 0;
|
||||||
if (options.sourceLabels) {
|
if (options.sourceLabels) {
|
||||||
r = new ScfgRule(**q, &sourceXmlTreeParser.GetNodeCollection());
|
r = new ScfgRule(**q, &sourceXmlTreeParser.node_collection());
|
||||||
} else {
|
} else {
|
||||||
r = new ScfgRule(**q);
|
r = new ScfgRule(**q);
|
||||||
}
|
}
|
||||||
@ -315,14 +307,14 @@ int ExtractGHKM::Main(int argc, char *argv[])
|
|||||||
|
|
||||||
std::map<std::string,size_t> sourceLabels;
|
std::map<std::string,size_t> sourceLabels;
|
||||||
if (options.sourceLabels && !options.sourceLabelSetFile.empty()) {
|
if (options.sourceLabels && !options.sourceLabelSetFile.empty()) {
|
||||||
|
std::set<std::string> extendedLabelSet = sourceXmlTreeParser.label_set();
|
||||||
sourceLabelSet.insert("XLHS"); // non-matching label (left-hand side)
|
extendedLabelSet.insert("XLHS"); // non-matching label (left-hand side)
|
||||||
sourceLabelSet.insert("XRHS"); // non-matching label (right-hand side)
|
extendedLabelSet.insert("XRHS"); // non-matching label (right-hand side)
|
||||||
sourceLabelSet.insert("TOPLABEL"); // as used in the glue grammar
|
extendedLabelSet.insert("TOPLABEL"); // as used in the glue grammar
|
||||||
sourceLabelSet.insert("SOMELABEL"); // as used in the glue grammar
|
extendedLabelSet.insert("SOMELABEL"); // as used in the glue grammar
|
||||||
size_t index = 0;
|
size_t index = 0;
|
||||||
for (std::set<std::string>::const_iterator iter=sourceLabelSet.begin();
|
for (std::set<std::string>::const_iterator iter=extendedLabelSet.begin();
|
||||||
iter!=sourceLabelSet.end(); ++iter, ++index) {
|
iter!=extendedLabelSet.end(); ++iter, ++index) {
|
||||||
sourceLabels.insert(std::pair<std::string,size_t>(*iter,index));
|
sourceLabels.insert(std::pair<std::string,size_t>(*iter,index));
|
||||||
}
|
}
|
||||||
WriteSourceLabelSet(sourceLabels, sourceLabelSetStream);
|
WriteSourceLabelSet(sourceLabels, sourceLabelSetStream);
|
||||||
@ -332,14 +324,18 @@ int ExtractGHKM::Main(int argc, char *argv[])
|
|||||||
std::map<std::string, int> strippedTargetTopLabelSet;
|
std::map<std::string, int> strippedTargetTopLabelSet;
|
||||||
if (options.stripBitParLabels &&
|
if (options.stripBitParLabels &&
|
||||||
(!options.glueGrammarFile.empty() || !options.unknownWordSoftMatchesFile.empty())) {
|
(!options.glueGrammarFile.empty() || !options.unknownWordSoftMatchesFile.empty())) {
|
||||||
StripBitParLabels(targetLabelSet, targetTopLabelSet, strippedTargetLabelSet, strippedTargetTopLabelSet);
|
StripBitParLabels(targetXmlTreeParser.label_set(),
|
||||||
|
targetXmlTreeParser.top_label_set(),
|
||||||
|
strippedTargetLabelSet, strippedTargetTopLabelSet);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!options.glueGrammarFile.empty()) {
|
if (!options.glueGrammarFile.empty()) {
|
||||||
if (options.stripBitParLabels) {
|
if (options.stripBitParLabels) {
|
||||||
WriteGlueGrammar(strippedTargetLabelSet, strippedTargetTopLabelSet, sourceLabels, options, glueGrammarStream);
|
WriteGlueGrammar(strippedTargetLabelSet, strippedTargetTopLabelSet, sourceLabels, options, glueGrammarStream);
|
||||||
} else {
|
} else {
|
||||||
WriteGlueGrammar(targetLabelSet, targetTopLabelSet, sourceLabels, options, glueGrammarStream);
|
WriteGlueGrammar(targetXmlTreeParser.label_set(),
|
||||||
|
targetXmlTreeParser.top_label_set(),
|
||||||
|
sourceLabels, options, glueGrammarStream);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -355,7 +351,8 @@ int ExtractGHKM::Main(int argc, char *argv[])
|
|||||||
if (options.stripBitParLabels) {
|
if (options.stripBitParLabels) {
|
||||||
WriteUnknownWordSoftMatches(strippedTargetLabelSet, unknownWordSoftMatchesStream);
|
WriteUnknownWordSoftMatches(strippedTargetLabelSet, unknownWordSoftMatchesStream);
|
||||||
} else {
|
} else {
|
||||||
WriteUnknownWordSoftMatches(targetLabelSet, unknownWordSoftMatchesStream);
|
WriteUnknownWordSoftMatches(targetXmlTreeParser.label_set(),
|
||||||
|
unknownWordSoftMatchesStream);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -816,7 +813,7 @@ void ExtractGHKM::CollectWordLabelCounts(
|
|||||||
for (SyntaxTree::ConstLeafIterator p(root);
|
for (SyntaxTree::ConstLeafIterator p(root);
|
||||||
p != SyntaxTree::ConstLeafIterator(); ++p) {
|
p != SyntaxTree::ConstLeafIterator(); ++p) {
|
||||||
const SyntaxTree &leaf = *p;
|
const SyntaxTree &leaf = *p;
|
||||||
const std::string &word = leaf.value().GetLabel();
|
const std::string &word = leaf.value().label;
|
||||||
const SyntaxTree *ancestor = leaf.parent();
|
const SyntaxTree *ancestor = leaf.parent();
|
||||||
// If unary rule elimination is enabled and this word is at the end of a
|
// If unary rule elimination is enabled and this word is at the end of a
|
||||||
// chain of unary rewrites, e.g.
|
// chain of unary rewrites, e.g.
|
||||||
@ -828,7 +825,7 @@ void ExtractGHKM::CollectWordLabelCounts(
|
|||||||
ancestor->parent()->children().size() == 1) {
|
ancestor->parent()->children().size() == 1) {
|
||||||
ancestor = ancestor->parent();
|
ancestor = ancestor->parent();
|
||||||
}
|
}
|
||||||
const std::string &label = ancestor->value().GetLabel();
|
const std::string &label = ancestor->value().label;
|
||||||
++wordCount[word];
|
++wordCount[word];
|
||||||
wordLabel[word] = label;
|
wordLabel[word] = label;
|
||||||
}
|
}
|
||||||
@ -840,7 +837,7 @@ std::vector<std::string> ExtractGHKM::ReadTokens(const SyntaxTree &root) const
|
|||||||
for (SyntaxTree::ConstLeafIterator p(root);
|
for (SyntaxTree::ConstLeafIterator p(root);
|
||||||
p != SyntaxTree::ConstLeafIterator(); ++p) {
|
p != SyntaxTree::ConstLeafIterator(); ++p) {
|
||||||
const SyntaxTree &leaf = *p;
|
const SyntaxTree &leaf = *p;
|
||||||
const std::string &word = leaf.value().GetLabel();
|
const std::string &word = leaf.value().label;
|
||||||
tokens.push_back(word);
|
tokens.push_back(word);
|
||||||
}
|
}
|
||||||
return tokens;
|
return tokens;
|
||||||
|
@ -144,7 +144,7 @@ void ScfgRule::PushSourceLabel(const SyntaxNodeCollection *sourceNodeCollection,
|
|||||||
sourceNodeCollection->GetNodes(span.first,span.second);
|
sourceNodeCollection->GetNodes(span.first,span.second);
|
||||||
if (!sourceLabels.empty()) {
|
if (!sourceLabels.empty()) {
|
||||||
// store the topmost matching label from the source syntax tree
|
// store the topmost matching label from the source syntax tree
|
||||||
m_sourceLabels.push_back(sourceLabels.back()->GetLabel());
|
m_sourceLabels.push_back(sourceLabels.back()->label);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
// no matching source-side syntactic constituent: store nonMatchingLabel
|
// no matching source-side syntactic constituent: store nonMatchingLabel
|
||||||
|
@ -110,6 +110,8 @@ void collectWordLabelCounts(SentenceAlignmentWithSyntax &sentence );
|
|||||||
void writeGlueGrammar(const string &, RuleExtractionOptions &options, set< string > &targetLabelCollection, map< string, int > &targetTopLabelCollection);
|
void writeGlueGrammar(const string &, RuleExtractionOptions &options, set< string > &targetLabelCollection, map< string, int > &targetTopLabelCollection);
|
||||||
void writeUnknownWordLabel(const string &);
|
void writeUnknownWordLabel(const string &);
|
||||||
|
|
||||||
|
double getPcfgScore(const SyntaxNode &);
|
||||||
|
|
||||||
|
|
||||||
int main(int argc, char* argv[])
|
int main(int argc, char* argv[])
|
||||||
{
|
{
|
||||||
@ -505,7 +507,7 @@ void ExtractTask::preprocessSourceHieroPhrase( int startT, int endT, int startS,
|
|||||||
|
|
||||||
int labelI = labelIndex[ 2+holeCount+holeTotal ];
|
int labelI = labelIndex[ 2+holeCount+holeTotal ];
|
||||||
string label = m_options.sourceSyntax ?
|
string label = m_options.sourceSyntax ?
|
||||||
m_sentence.sourceTree.GetNodes(currPos,hole.GetEnd(0))[ labelI ]->GetLabel() : "X";
|
m_sentence.sourceTree.GetNodes(currPos,hole.GetEnd(0))[ labelI ]->label : "X";
|
||||||
hole.SetLabel(label, 0);
|
hole.SetLabel(label, 0);
|
||||||
|
|
||||||
currPos = hole.GetEnd(0);
|
currPos = hole.GetEnd(0);
|
||||||
@ -548,7 +550,7 @@ string ExtractTask::saveTargetHieroPhrase( int startT, int endT, int startS, int
|
|||||||
int labelI = labelIndex[ 2+holeCount ];
|
int labelI = labelIndex[ 2+holeCount ];
|
||||||
string targetLabel;
|
string targetLabel;
|
||||||
if (m_options.targetSyntax) {
|
if (m_options.targetSyntax) {
|
||||||
targetLabel = m_sentence.targetTree.GetNodes(currPos,hole.GetEnd(1))[labelI]->GetLabel();
|
targetLabel = m_sentence.targetTree.GetNodes(currPos,hole.GetEnd(1))[labelI]->label;
|
||||||
} else if (m_options.boundaryRules && (startS == 0 || endS == countS - 1)) {
|
} else if (m_options.boundaryRules && (startS == 0 || endS == countS - 1)) {
|
||||||
targetLabel = "S";
|
targetLabel = "S";
|
||||||
} else {
|
} else {
|
||||||
@ -564,8 +566,7 @@ string ExtractTask::saveTargetHieroPhrase( int startT, int endT, int startS, int
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (m_options.pcfgScore) {
|
if (m_options.pcfgScore) {
|
||||||
double score = m_sentence.targetTree.GetNodes(currPos,hole.GetEnd(1))[labelI]->GetPcfgScore();
|
logPCFGScore -= getPcfgScore(*m_sentence.targetTree.GetNodes(currPos,hole.GetEnd(1))[labelI]);
|
||||||
logPCFGScore -= score;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
currPos = hole.GetEnd(1);
|
currPos = hole.GetEnd(1);
|
||||||
@ -674,7 +675,7 @@ void ExtractTask::saveHieroPhrase( int startT, int endT, int startS, int endS
|
|||||||
// phrase labels
|
// phrase labels
|
||||||
string targetLabel;
|
string targetLabel;
|
||||||
if (m_options.targetSyntax) {
|
if (m_options.targetSyntax) {
|
||||||
targetLabel = m_sentence.targetTree.GetNodes(startT,endT)[labelIndex[0] ]->GetLabel();
|
targetLabel = m_sentence.targetTree.GetNodes(startT,endT)[labelIndex[0] ]->label;
|
||||||
} else if (m_options.boundaryRules && (startS == 0 || endS == countS - 1)) {
|
} else if (m_options.boundaryRules && (startS == 0 || endS == countS - 1)) {
|
||||||
targetLabel = "S";
|
targetLabel = "S";
|
||||||
} else {
|
} else {
|
||||||
@ -682,14 +683,14 @@ void ExtractTask::saveHieroPhrase( int startT, int endT, int startS, int endS
|
|||||||
}
|
}
|
||||||
|
|
||||||
string sourceLabel = m_options.sourceSyntax ?
|
string sourceLabel = m_options.sourceSyntax ?
|
||||||
m_sentence.sourceTree.GetNodes(startS,endS)[ labelIndex[1] ]->GetLabel() : "X";
|
m_sentence.sourceTree.GetNodes(startS,endS)[ labelIndex[1] ]->label : "X";
|
||||||
|
|
||||||
// create non-terms on the source side
|
// create non-terms on the source side
|
||||||
preprocessSourceHieroPhrase(startT, endT, startS, endS, indexS, holeColl, labelIndex);
|
preprocessSourceHieroPhrase(startT, endT, startS, endS, indexS, holeColl, labelIndex);
|
||||||
|
|
||||||
// target
|
// target
|
||||||
if (m_options.pcfgScore) {
|
if (m_options.pcfgScore) {
|
||||||
double logPCFGScore = m_sentence.targetTree.GetNodes(startT,endT)[labelIndex[0]]->GetPcfgScore();
|
double logPCFGScore = getPcfgScore(*m_sentence.targetTree.GetNodes(startT,endT)[labelIndex[0]]);
|
||||||
rule.target = saveTargetHieroPhrase(startT, endT, startS, endS, indexT, holeColl, labelIndex, logPCFGScore, countS)
|
rule.target = saveTargetHieroPhrase(startT, endT, startS, endS, indexT, holeColl, labelIndex, logPCFGScore, countS)
|
||||||
+ " [" + targetLabel + "]";
|
+ " [" + targetLabel + "]";
|
||||||
rule.pcfgScore = std::exp(logPCFGScore);
|
rule.pcfgScore = std::exp(logPCFGScore);
|
||||||
@ -946,13 +947,13 @@ void ExtractTask::addRule( int startT, int endT, int startS, int endS, int count
|
|||||||
// phrase labels
|
// phrase labels
|
||||||
string targetLabel,sourceLabel;
|
string targetLabel,sourceLabel;
|
||||||
if (m_options.targetSyntax && m_options.conditionOnTargetLhs) {
|
if (m_options.targetSyntax && m_options.conditionOnTargetLhs) {
|
||||||
sourceLabel = targetLabel = m_sentence.targetTree.GetNodes(startT,endT)[0]->GetLabel();
|
sourceLabel = targetLabel = m_sentence.targetTree.GetNodes(startT,endT)[0]->label;
|
||||||
} else {
|
} else {
|
||||||
sourceLabel = m_options.sourceSyntax ?
|
sourceLabel = m_options.sourceSyntax ?
|
||||||
m_sentence.sourceTree.GetNodes(startS,endS)[0]->GetLabel() : "X";
|
m_sentence.sourceTree.GetNodes(startS,endS)[0]->label : "X";
|
||||||
|
|
||||||
if (m_options.targetSyntax) {
|
if (m_options.targetSyntax) {
|
||||||
targetLabel = m_sentence.targetTree.GetNodes(startT,endT)[0]->GetLabel();
|
targetLabel = m_sentence.targetTree.GetNodes(startT,endT)[0]->label;
|
||||||
} else if (m_options.boundaryRules && (startS == 0 || endS == countS - 1)) {
|
} else if (m_options.boundaryRules && (startS == 0 || endS == countS - 1)) {
|
||||||
targetLabel = "S";
|
targetLabel = "S";
|
||||||
} else {
|
} else {
|
||||||
@ -973,7 +974,7 @@ void ExtractTask::addRule( int startT, int endT, int startS, int endS, int count
|
|||||||
rule.target += "[" + targetLabel + "]";
|
rule.target += "[" + targetLabel + "]";
|
||||||
|
|
||||||
if (m_options.pcfgScore) {
|
if (m_options.pcfgScore) {
|
||||||
double logPCFGScore = m_sentence.targetTree.GetNodes(startT,endT)[0]->GetPcfgScore();
|
double logPCFGScore = getPcfgScore(*m_sentence.targetTree.GetNodes(startT,endT)[0]);
|
||||||
rule.pcfgScore = std::exp(logPCFGScore);
|
rule.pcfgScore = std::exp(logPCFGScore);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1165,7 +1166,7 @@ void collectWordLabelCounts( SentenceAlignmentWithSyntax &sentence )
|
|||||||
const vector< SyntaxNode* >& labels = sentence.targetTree.GetNodes(ti,ti);
|
const vector< SyntaxNode* >& labels = sentence.targetTree.GetNodes(ti,ti);
|
||||||
if (labels.size() > 0) {
|
if (labels.size() > 0) {
|
||||||
wordCount[ word ]++;
|
wordCount[ word ]++;
|
||||||
wordLabel[ word ] = labels[0]->GetLabel();
|
wordLabel[ word ] = labels[0]->label;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -1194,3 +1195,13 @@ void writeUnknownWordLabel(const string & fileName)
|
|||||||
|
|
||||||
outFile.close();
|
outFile.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
double getPcfgScore(const SyntaxNode &node)
|
||||||
|
{
|
||||||
|
double score = 0.0f;
|
||||||
|
SyntaxNode::AttributeMap::const_iterator p = node.attributes.find("pcfg");
|
||||||
|
if (p != node.attributes.end()) {
|
||||||
|
score = std::atof(p->second.c_str());
|
||||||
|
}
|
||||||
|
return score;
|
||||||
|
}
|
||||||
|
@ -126,9 +126,7 @@ void FilterRuleTable::ReadTestSet(
|
|||||||
void FilterRuleTable::ReadTestSet(
|
void FilterRuleTable::ReadTestSet(
|
||||||
std::istream &input, std::vector<boost::shared_ptr<SyntaxTree> > &sentences)
|
std::istream &input, std::vector<boost::shared_ptr<SyntaxTree> > &sentences)
|
||||||
{
|
{
|
||||||
std::set<std::string> labelSet;
|
XmlTreeParser parser;
|
||||||
std::map<std::string, int> topLabelSet;
|
|
||||||
XmlTreeParser parser(labelSet, topLabelSet);
|
|
||||||
int lineNum = 0;
|
int lineNum = 0;
|
||||||
std::string line;
|
std::string line;
|
||||||
while (std::getline(input, line)) {
|
while (std::getline(input, line)) {
|
||||||
|
@ -27,7 +27,7 @@ TreeTsgFilter::TreeTsgFilter(
|
|||||||
|
|
||||||
TreeTsgFilter::IdTree *TreeTsgFilter::SyntaxTreeToIdTree(const SyntaxTree &s)
|
TreeTsgFilter::IdTree *TreeTsgFilter::SyntaxTreeToIdTree(const SyntaxTree &s)
|
||||||
{
|
{
|
||||||
IdTree *t = new IdTree(m_testVocab.Insert(s.value().GetLabel()));
|
IdTree *t = new IdTree(m_testVocab.Insert(s.value().label));
|
||||||
const std::vector<SyntaxTree*> &sChildren = s.children();
|
const std::vector<SyntaxTree*> &sChildren = s.children();
|
||||||
std::vector<IdTree*> &tChildren = t->children();
|
std::vector<IdTree*> &tChildren = t->children();
|
||||||
tChildren.reserve(sChildren.size());
|
tChildren.reserve(sChildren.size());
|
||||||
|
@ -1,79 +0,0 @@
|
|||||||
/***********************************************************************
|
|
||||||
Moses - statistical machine translation system
|
|
||||||
Copyright (C) 2006-2012 University of Edinburgh
|
|
||||||
|
|
||||||
This library is free software; you can redistribute it and/or
|
|
||||||
modify it under the terms of the GNU Lesser General Public
|
|
||||||
License as published by the Free Software Foundation; either
|
|
||||||
version 2.1 of the License, or (at your option) any later version.
|
|
||||||
|
|
||||||
This library is distributed in the hope that it will be useful,
|
|
||||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
||||||
Lesser General Public License for more details.
|
|
||||||
|
|
||||||
You should have received a copy of the GNU Lesser General Public
|
|
||||||
License along with this library; if not, write to the Free Software
|
|
||||||
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
||||||
***********************************************************************/
|
|
||||||
|
|
||||||
#pragma once
|
|
||||||
#ifndef PCFG_PCFG_TREE_H_
|
|
||||||
#define PCFG_PCFG_TREE_H_
|
|
||||||
|
|
||||||
#include <string>
|
|
||||||
|
|
||||||
#include "syntax_tree.h"
|
|
||||||
#include "xml_tree_writer.h"
|
|
||||||
|
|
||||||
namespace MosesTraining {
|
|
||||||
namespace Syntax {
|
|
||||||
namespace PCFG {
|
|
||||||
|
|
||||||
template<typename DerivedType>
|
|
||||||
class PcfgTreeBase : public SyntaxTreeBase<std::string, DerivedType> {
|
|
||||||
public:
|
|
||||||
typedef std::string LabelType;
|
|
||||||
typedef SyntaxTreeBase<LabelType, DerivedType> BaseType;
|
|
||||||
|
|
||||||
PcfgTreeBase(const LabelType &label) : BaseType(label), score_(0.0) {}
|
|
||||||
|
|
||||||
double score() const { return score_; }
|
|
||||||
void set_score(double s) { score_ = s; }
|
|
||||||
|
|
||||||
private:
|
|
||||||
double score_;
|
|
||||||
};
|
|
||||||
|
|
||||||
class PcfgTree : public PcfgTreeBase<PcfgTree> {
|
|
||||||
public:
|
|
||||||
typedef PcfgTreeBase<PcfgTree> BaseType;
|
|
||||||
PcfgTree(const BaseType::LabelType &label) : BaseType(label) {}
|
|
||||||
};
|
|
||||||
|
|
||||||
// Specialise XmlOutputHandler for PcfgTree.
|
|
||||||
template<>
|
|
||||||
class XmlOutputHandler<PcfgTree> {
|
|
||||||
public:
|
|
||||||
typedef std::map<std::string, std::string> AttributeMap;
|
|
||||||
|
|
||||||
void GetLabel(const PcfgTree &tree, std::string &label) const {
|
|
||||||
label = tree.label();
|
|
||||||
}
|
|
||||||
|
|
||||||
void GetAttributes(const PcfgTree &tree, AttributeMap &attribute_map) const {
|
|
||||||
attribute_map.clear();
|
|
||||||
double score = tree.score();
|
|
||||||
if (score != 0.0) {
|
|
||||||
std::ostringstream out;
|
|
||||||
out << tree.score();
|
|
||||||
attribute_map["pcfg"] = out.str();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
} // namespace PCFG
|
|
||||||
} // namespace Syntax
|
|
||||||
} // namespace MosesTraining
|
|
||||||
|
|
||||||
#endif
|
|
@ -1,93 +0,0 @@
|
|||||||
/***********************************************************************
|
|
||||||
Moses - statistical machine translation system
|
|
||||||
Copyright (C) 2006-2012 University of Edinburgh
|
|
||||||
|
|
||||||
This library is free software; you can redistribute it and/or
|
|
||||||
modify it under the terms of the GNU Lesser General Public
|
|
||||||
License as published by the Free Software Foundation; either
|
|
||||||
version 2.1 of the License, or (at your option) any later version.
|
|
||||||
|
|
||||||
This library is distributed in the hope that it will be useful,
|
|
||||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
||||||
Lesser General Public License for more details.
|
|
||||||
|
|
||||||
You should have received a copy of the GNU Lesser General Public
|
|
||||||
License along with this library; if not, write to the Free Software
|
|
||||||
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
||||||
***********************************************************************/
|
|
||||||
|
|
||||||
#pragma once
|
|
||||||
#ifndef PCFG_SYNTAX_TREE_H_
|
|
||||||
#define PCFG_SYNTAX_TREE_H_
|
|
||||||
|
|
||||||
#include <cassert>
|
|
||||||
#include <vector>
|
|
||||||
|
|
||||||
namespace MosesTraining {
|
|
||||||
namespace Syntax {
|
|
||||||
namespace PCFG {
|
|
||||||
|
|
||||||
// Base class for SyntaxTree, AgreementTree, and friends.
|
|
||||||
template<typename T, typename DerivedType>
|
|
||||||
class SyntaxTreeBase {
|
|
||||||
public:
|
|
||||||
// Constructors
|
|
||||||
SyntaxTreeBase(const T &label)
|
|
||||||
: label_(label)
|
|
||||||
, children_()
|
|
||||||
, parent_(0) {}
|
|
||||||
|
|
||||||
SyntaxTreeBase(const T &label, const std::vector<DerivedType *> &children)
|
|
||||||
: label_(label)
|
|
||||||
, children_(children)
|
|
||||||
, parent_(0) {}
|
|
||||||
|
|
||||||
// Destructor
|
|
||||||
virtual ~SyntaxTreeBase();
|
|
||||||
|
|
||||||
const T &label() const { return label_; }
|
|
||||||
const DerivedType *parent() const { return parent_; }
|
|
||||||
DerivedType *parent() { return parent_; }
|
|
||||||
const std::vector<DerivedType *> &children() const { return children_; }
|
|
||||||
std::vector<DerivedType *> &children() { return children_; }
|
|
||||||
|
|
||||||
void set_label(const T &label) { label_ = label; }
|
|
||||||
void set_parent(DerivedType *parent) { parent_ = parent; }
|
|
||||||
void set_children(const std::vector<DerivedType *> &c) { children_ = c; }
|
|
||||||
|
|
||||||
bool IsLeaf() const { return children_.empty(); }
|
|
||||||
|
|
||||||
bool IsPreterminal() const {
|
|
||||||
return children_.size() == 1 && children_[0]->IsLeaf();
|
|
||||||
}
|
|
||||||
|
|
||||||
void AddChild(DerivedType *child) { children_.push_back(child); }
|
|
||||||
|
|
||||||
private:
|
|
||||||
T label_;
|
|
||||||
std::vector<DerivedType *> children_;
|
|
||||||
DerivedType *parent_;
|
|
||||||
};
|
|
||||||
|
|
||||||
template<typename T>
|
|
||||||
class SyntaxTree : public SyntaxTreeBase<T, SyntaxTree<T> > {
|
|
||||||
public:
|
|
||||||
typedef SyntaxTreeBase<T, SyntaxTree<T> > BaseType;
|
|
||||||
SyntaxTree(const T &label) : BaseType(label) {}
|
|
||||||
SyntaxTree(const T &label, const std::vector<SyntaxTree *> &children)
|
|
||||||
: BaseType(label, children) {}
|
|
||||||
};
|
|
||||||
|
|
||||||
template<typename T, typename DerivedType>
|
|
||||||
SyntaxTreeBase<T, DerivedType>::~SyntaxTreeBase() {
|
|
||||||
for (std::size_t i = 0; i < children_.size(); ++i) {
|
|
||||||
delete children_[i];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
} // namespace PCFG
|
|
||||||
} // namespace Syntax
|
|
||||||
} // namespace MosesTraining
|
|
||||||
|
|
||||||
#endif
|
|
@ -24,7 +24,6 @@
|
|||||||
#include <string>
|
#include <string>
|
||||||
|
|
||||||
#include "syntax-common/numbered_set.h"
|
#include "syntax-common/numbered_set.h"
|
||||||
#include "syntax_tree.h"
|
|
||||||
|
|
||||||
namespace MosesTraining {
|
namespace MosesTraining {
|
||||||
namespace Syntax {
|
namespace Syntax {
|
||||||
|
@ -1,89 +0,0 @@
|
|||||||
/***********************************************************************
|
|
||||||
Moses - statistical machine translation system
|
|
||||||
Copyright (C) 2006-2012 University of Edinburgh
|
|
||||||
|
|
||||||
This library is free software; you can redistribute it and/or
|
|
||||||
modify it under the terms of the GNU Lesser General Public
|
|
||||||
License as published by the Free Software Foundation; either
|
|
||||||
version 2.1 of the License, or (at your option) any later version.
|
|
||||||
|
|
||||||
This library is distributed in the hope that it will be useful,
|
|
||||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
||||||
Lesser General Public License for more details.
|
|
||||||
|
|
||||||
You should have received a copy of the GNU Lesser General Public
|
|
||||||
License along with this library; if not, write to the Free Software
|
|
||||||
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
||||||
***********************************************************************/
|
|
||||||
|
|
||||||
#include "xml_tree_parser.h"
|
|
||||||
|
|
||||||
#include <cassert>
|
|
||||||
#include <vector>
|
|
||||||
|
|
||||||
#include "tables-core.h"
|
|
||||||
#include "XmlException.h"
|
|
||||||
#include "XmlTree.h"
|
|
||||||
#include "util/tokenize.hh"
|
|
||||||
|
|
||||||
#include "syntax-common/exception.h"
|
|
||||||
|
|
||||||
namespace MosesTraining {
|
|
||||||
namespace Syntax {
|
|
||||||
namespace PCFG {
|
|
||||||
|
|
||||||
XmlTreeParser::XmlTreeParser() {
|
|
||||||
}
|
|
||||||
|
|
||||||
std::auto_ptr<PcfgTree> XmlTreeParser::Parse(const std::string &line) {
|
|
||||||
m_line = line;
|
|
||||||
m_tree.Clear();
|
|
||||||
try {
|
|
||||||
if (!ProcessAndStripXMLTags(m_line, m_tree, m_labelSet, m_topLabelSet)) {
|
|
||||||
throw Exception("");
|
|
||||||
}
|
|
||||||
} catch (const XmlException &e) {
|
|
||||||
throw Exception(e.getMsg());
|
|
||||||
}
|
|
||||||
m_tree.ConnectNodes();
|
|
||||||
SyntaxNode *root = m_tree.GetTop();
|
|
||||||
if (!root) {
|
|
||||||
// There is no XML tree.
|
|
||||||
return std::auto_ptr<PcfgTree>();
|
|
||||||
}
|
|
||||||
m_words = util::tokenize(m_line);
|
|
||||||
return ConvertTree(*root, m_words);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Converts a SyntaxNode tree to a Moses::PCFG::PcfgTree.
|
|
||||||
std::auto_ptr<PcfgTree> XmlTreeParser::ConvertTree(
|
|
||||||
const SyntaxNode &tree,
|
|
||||||
const std::vector<std::string> &words) {
|
|
||||||
std::auto_ptr<PcfgTree> root(new PcfgTree(tree.GetLabel()));
|
|
||||||
const std::vector<SyntaxNode*> &children = tree.GetChildren();
|
|
||||||
if (children.empty()) {
|
|
||||||
if (tree.GetStart() != tree.GetEnd()) {
|
|
||||||
std::ostringstream msg;
|
|
||||||
msg << "leaf node covers multiple words (" << tree.GetStart()
|
|
||||||
<< "-" << tree.GetEnd() << "): this is currently unsupported";
|
|
||||||
throw Exception(msg.str());
|
|
||||||
}
|
|
||||||
std::auto_ptr<PcfgTree> leaf(new PcfgTree(words[tree.GetStart()]));
|
|
||||||
leaf->set_parent(root.get());
|
|
||||||
root->AddChild(leaf.release());
|
|
||||||
} else {
|
|
||||||
for (std::vector<SyntaxNode*>::const_iterator p = children.begin();
|
|
||||||
p != children.end(); ++p) {
|
|
||||||
assert(*p);
|
|
||||||
std::auto_ptr<PcfgTree> child = ConvertTree(**p, words);
|
|
||||||
child->set_parent(root.get());
|
|
||||||
root->AddChild(child.release());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return root;
|
|
||||||
}
|
|
||||||
|
|
||||||
} // namespace PCFG
|
|
||||||
} // namespace Syntax
|
|
||||||
} // namespace MosesTraining
|
|
@ -1,59 +0,0 @@
|
|||||||
/***********************************************************************
|
|
||||||
Moses - statistical machine translation system
|
|
||||||
Copyright (C) 2006-2012 University of Edinburgh
|
|
||||||
|
|
||||||
This library is free software; you can redistribute it and/or
|
|
||||||
modify it under the terms of the GNU Lesser General Public
|
|
||||||
License as published by the Free Software Foundation; either
|
|
||||||
version 2.1 of the License, or (at your option) any later version.
|
|
||||||
|
|
||||||
This library is distributed in the hope that it will be useful,
|
|
||||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
||||||
Lesser General Public License for more details.
|
|
||||||
|
|
||||||
You should have received a copy of the GNU Lesser General Public
|
|
||||||
License along with this library; if not, write to the Free Software
|
|
||||||
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
||||||
***********************************************************************/
|
|
||||||
|
|
||||||
#pragma once
|
|
||||||
#ifndef PCFG_XML_TREE_PARSER_H_
|
|
||||||
#define PCFG_XML_TREE_PARSER_H_
|
|
||||||
|
|
||||||
#include <map>
|
|
||||||
#include <memory>
|
|
||||||
#include <set>
|
|
||||||
#include <string>
|
|
||||||
#include <vector>
|
|
||||||
|
|
||||||
#include "pcfg_tree.h"
|
|
||||||
#include "SyntaxNode.h"
|
|
||||||
#include "SyntaxNodeCollection.h"
|
|
||||||
|
|
||||||
namespace MosesTraining {
|
|
||||||
namespace Syntax {
|
|
||||||
namespace PCFG {
|
|
||||||
|
|
||||||
// Parses a string in Moses' XML parse tree format and returns a PcfgTree
|
|
||||||
// object.
|
|
||||||
class XmlTreeParser {
|
|
||||||
public:
|
|
||||||
XmlTreeParser();
|
|
||||||
std::auto_ptr<PcfgTree> Parse(const std::string &);
|
|
||||||
private:
|
|
||||||
std::auto_ptr<PcfgTree> ConvertTree(const MosesTraining::SyntaxNode &,
|
|
||||||
const std::vector<std::string> &);
|
|
||||||
|
|
||||||
std::set<std::string> m_labelSet;
|
|
||||||
std::map<std::string, int> m_topLabelSet;
|
|
||||||
std::string m_line;
|
|
||||||
MosesTraining::SyntaxNodeCollection m_tree;
|
|
||||||
std::vector<std::string> m_words;
|
|
||||||
};
|
|
||||||
|
|
||||||
} // namespace PCFG
|
|
||||||
} // namespace Syntax
|
|
||||||
} // namespace MosesTraining
|
|
||||||
|
|
||||||
#endif
|
|
@ -1,135 +0,0 @@
|
|||||||
/***********************************************************************
|
|
||||||
Moses - statistical machine translation system
|
|
||||||
Copyright (C) 2006-2012 University of Edinburgh
|
|
||||||
|
|
||||||
This library is free software; you can redistribute it and/or
|
|
||||||
modify it under the terms of the GNU Lesser General Public
|
|
||||||
License as published by the Free Software Foundation; either
|
|
||||||
version 2.1 of the License, or (at your option) any later version.
|
|
||||||
|
|
||||||
This library is distributed in the hope that it will be useful,
|
|
||||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
||||||
Lesser General Public License for more details.
|
|
||||||
|
|
||||||
You should have received a copy of the GNU Lesser General Public
|
|
||||||
License along with this library; if not, write to the Free Software
|
|
||||||
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
||||||
***********************************************************************/
|
|
||||||
|
|
||||||
#pragma once
|
|
||||||
#ifndef PCFG_XML_TREE_WRITER_H_
|
|
||||||
#define PCFG_XML_TREE_WRITER_H_
|
|
||||||
|
|
||||||
#include <cassert>
|
|
||||||
#include <map>
|
|
||||||
#include <memory>
|
|
||||||
#include <ostream>
|
|
||||||
#include <vector>
|
|
||||||
#include <string>
|
|
||||||
|
|
||||||
#include "XmlTree.h"
|
|
||||||
|
|
||||||
#include "syntax_tree.h"
|
|
||||||
|
|
||||||
namespace MosesTraining {
|
|
||||||
namespace Syntax {
|
|
||||||
namespace PCFG {
|
|
||||||
|
|
||||||
template<typename InputTree>
|
|
||||||
class XmlOutputHandler {
|
|
||||||
public:
|
|
||||||
typedef std::map<std::string, std::string> AttributeMap;
|
|
||||||
|
|
||||||
void GetLabel(const InputTree &, std::string &) const;
|
|
||||||
void GetAttributes(const InputTree &, AttributeMap &) const;
|
|
||||||
};
|
|
||||||
|
|
||||||
template<typename InputTree>
|
|
||||||
class XmlTreeWriter : public XmlOutputHandler<InputTree> {
|
|
||||||
public:
|
|
||||||
typedef XmlOutputHandler<InputTree> Base;
|
|
||||||
void Write(const InputTree &, std::ostream &) const;
|
|
||||||
private:
|
|
||||||
std::string Escape(const std::string &) const;
|
|
||||||
};
|
|
||||||
|
|
||||||
template<typename InputTree>
|
|
||||||
void XmlTreeWriter<InputTree>::Write(const InputTree &tree,
|
|
||||||
std::ostream &out) const {
|
|
||||||
assert(!tree.IsLeaf());
|
|
||||||
|
|
||||||
// Opening tag
|
|
||||||
|
|
||||||
std::string label;
|
|
||||||
Base::GetLabel(tree, label);
|
|
||||||
out << "<tree label=\"" << Escape(label) << "\"";
|
|
||||||
|
|
||||||
typename Base::AttributeMap attribute_map;
|
|
||||||
Base::GetAttributes(tree, attribute_map);
|
|
||||||
|
|
||||||
for (typename Base::AttributeMap::const_iterator p = attribute_map.begin();
|
|
||||||
p != attribute_map.end(); ++p) {
|
|
||||||
out << " " << p->first << "=\"" << p->second << "\"";
|
|
||||||
}
|
|
||||||
|
|
||||||
out << ">";
|
|
||||||
|
|
||||||
// Children
|
|
||||||
|
|
||||||
const std::vector<InputTree *> &children = tree.children();
|
|
||||||
for (typename std::vector<InputTree *>::const_iterator p = children.begin();
|
|
||||||
p != children.end(); ++p) {
|
|
||||||
InputTree &child = **p;
|
|
||||||
if (child.IsLeaf()) {
|
|
||||||
Base::GetLabel(child, label);
|
|
||||||
out << " " << Escape(label);
|
|
||||||
} else {
|
|
||||||
out << " ";
|
|
||||||
Write(**p, out);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Closing tag
|
|
||||||
out << " </tree>";
|
|
||||||
|
|
||||||
if (tree.parent() == 0) {
|
|
||||||
out << std::endl;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Escapes XML special characters.
|
|
||||||
template<typename InputTree>
|
|
||||||
std::string XmlTreeWriter<InputTree>::Escape(const std::string &s) const {
|
|
||||||
std::string t;
|
|
||||||
std::size_t len = s.size();
|
|
||||||
t.reserve(len);
|
|
||||||
for (std::size_t i = 0; i < len; ++i) {
|
|
||||||
if (s[i] == '<') {
|
|
||||||
t += "<";
|
|
||||||
} else if (s[i] == '>') {
|
|
||||||
t += ">";
|
|
||||||
} else if (s[i] == '[') {
|
|
||||||
t += "[";
|
|
||||||
} else if (s[i] == ']') {
|
|
||||||
t += "]";
|
|
||||||
} else if (s[i] == '|') {
|
|
||||||
t += "|";
|
|
||||||
} else if (s[i] == '&') {
|
|
||||||
t += "&";
|
|
||||||
} else if (s[i] == '\'') {
|
|
||||||
t += "'";
|
|
||||||
} else if (s[i] == '"') {
|
|
||||||
t += """;
|
|
||||||
} else {
|
|
||||||
t += s[i];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return t;
|
|
||||||
}
|
|
||||||
|
|
||||||
} // namespace PCFG
|
|
||||||
} // namespace Syntax
|
|
||||||
} // namespace MosesTraining
|
|
||||||
|
|
||||||
#endif
|
|
@ -1 +1 @@
|
|||||||
exe pcfg-extract : [ glob *.cc ] ..//pcfg-common ../..//boost_program_options : <include>.. ;
|
exe pcfg-extract : [ glob *.cc ] ..//syntax-common ..//pcfg-common ../..//boost_program_options : <include>.. ;
|
||||||
|
@ -19,20 +19,6 @@
|
|||||||
|
|
||||||
#include "pcfg_extract.h"
|
#include "pcfg_extract.h"
|
||||||
|
|
||||||
#include "options.h"
|
|
||||||
#include "rule_collection.h"
|
|
||||||
#include "rule_extractor.h"
|
|
||||||
|
|
||||||
#include "syntax-common/exception.h"
|
|
||||||
|
|
||||||
#include "pcfg-common/pcfg.h"
|
|
||||||
#include "pcfg-common/pcfg_tree.h"
|
|
||||||
#include "pcfg-common/syntax_tree.h"
|
|
||||||
#include "pcfg-common/typedef.h"
|
|
||||||
#include "pcfg-common/xml_tree_parser.h"
|
|
||||||
|
|
||||||
#include <boost/program_options.hpp>
|
|
||||||
|
|
||||||
#include <cassert>
|
#include <cassert>
|
||||||
#include <cstdlib>
|
#include <cstdlib>
|
||||||
#include <fstream>
|
#include <fstream>
|
||||||
@ -43,6 +29,20 @@
|
|||||||
#include <string>
|
#include <string>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
|
#include <boost/program_options.hpp>
|
||||||
|
|
||||||
|
#include "syntax-common/exception.h"
|
||||||
|
#include "syntax-common/xml_tree_parser.h"
|
||||||
|
|
||||||
|
#include "SyntaxTree.h"
|
||||||
|
|
||||||
|
#include "pcfg-common/pcfg.h"
|
||||||
|
#include "pcfg-common/typedef.h"
|
||||||
|
|
||||||
|
#include "options.h"
|
||||||
|
#include "rule_collection.h"
|
||||||
|
#include "rule_extractor.h"
|
||||||
|
|
||||||
namespace MosesTraining
|
namespace MosesTraining
|
||||||
{
|
{
|
||||||
namespace Syntax
|
namespace Syntax
|
||||||
@ -63,7 +63,7 @@ int PcfgExtract::Main(int argc, char *argv[])
|
|||||||
XmlTreeParser parser;
|
XmlTreeParser parser;
|
||||||
std::string line;
|
std::string line;
|
||||||
std::size_t line_num = 0;
|
std::size_t line_num = 0;
|
||||||
std::auto_ptr<PcfgTree> tree;
|
std::auto_ptr<MosesTraining::SyntaxTree> tree;
|
||||||
while (std::getline(std::cin, line)) {
|
while (std::getline(std::cin, line)) {
|
||||||
++line_num;
|
++line_num;
|
||||||
try {
|
try {
|
||||||
|
@ -19,8 +19,6 @@
|
|||||||
|
|
||||||
#include "rule_extractor.h"
|
#include "rule_extractor.h"
|
||||||
|
|
||||||
#include "pcfg-common/pcfg_tree.h"
|
|
||||||
|
|
||||||
namespace MosesTraining
|
namespace MosesTraining
|
||||||
{
|
{
|
||||||
namespace Syntax
|
namespace Syntax
|
||||||
@ -33,21 +31,21 @@ RuleExtractor::RuleExtractor(Vocabulary &non_term_vocab)
|
|||||||
{
|
{
|
||||||
}
|
}
|
||||||
|
|
||||||
void RuleExtractor::Extract(const PcfgTree &tree, RuleCollection &rc) const
|
void RuleExtractor::Extract(const SyntaxTree &tree, RuleCollection &rc) const
|
||||||
{
|
{
|
||||||
if (tree.IsPreterminal() || tree.IsLeaf()) {
|
if (tree.IsLeaf() || tree.children()[0]->IsLeaf()) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
std::size_t lhs = non_term_vocab_.Insert(tree.label());
|
std::size_t lhs = non_term_vocab_.Insert(tree.value().label);
|
||||||
std::vector<std::size_t> rhs;
|
std::vector<std::size_t> rhs;
|
||||||
|
|
||||||
const std::vector<PcfgTree *> &children = tree.children();
|
const std::vector<SyntaxTree *> &children = tree.children();
|
||||||
rhs.reserve(children.size());
|
rhs.reserve(children.size());
|
||||||
for (std::vector<PcfgTree *>::const_iterator p(children.begin());
|
for (std::vector<SyntaxTree *>::const_iterator p(children.begin());
|
||||||
p != children.end(); ++p) {
|
p != children.end(); ++p) {
|
||||||
const PcfgTree &child = **p;
|
const SyntaxTree &child = **p;
|
||||||
rhs.push_back(non_term_vocab_.Insert(child.label()));
|
rhs.push_back(non_term_vocab_.Insert(child.value().label));
|
||||||
Extract(child, rc);
|
Extract(child, rc);
|
||||||
}
|
}
|
||||||
rc.Add(lhs, rhs);
|
rc.Add(lhs, rhs);
|
||||||
|
@ -21,6 +21,8 @@
|
|||||||
#ifndef PCFG_EXTRACT_RULE_EXTRACTOR_H_
|
#ifndef PCFG_EXTRACT_RULE_EXTRACTOR_H_
|
||||||
#define PCFG_EXTRACT_RULE_EXTRACTOR_H_
|
#define PCFG_EXTRACT_RULE_EXTRACTOR_H_
|
||||||
|
|
||||||
|
#include "SyntaxTree.h"
|
||||||
|
|
||||||
#include "pcfg-common/typedef.h"
|
#include "pcfg-common/typedef.h"
|
||||||
|
|
||||||
#include "rule_collection.h"
|
#include "rule_collection.h"
|
||||||
@ -32,14 +34,12 @@ namespace Syntax
|
|||||||
namespace PCFG
|
namespace PCFG
|
||||||
{
|
{
|
||||||
|
|
||||||
class PcfgTree;
|
|
||||||
|
|
||||||
// Extracts PCFG rules from syntax trees and adds them to a RuleCollection.
|
// Extracts PCFG rules from syntax trees and adds them to a RuleCollection.
|
||||||
class RuleExtractor
|
class RuleExtractor
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
RuleExtractor(Vocabulary &);
|
RuleExtractor(Vocabulary &);
|
||||||
void Extract(const PcfgTree &, RuleCollection &) const;
|
void Extract(const MosesTraining::SyntaxTree &, RuleCollection &) const;
|
||||||
private:
|
private:
|
||||||
Vocabulary &non_term_vocab_;
|
Vocabulary &non_term_vocab_;
|
||||||
};
|
};
|
||||||
|
@ -33,13 +33,14 @@
|
|||||||
|
|
||||||
#include <boost/program_options.hpp>
|
#include <boost/program_options.hpp>
|
||||||
|
|
||||||
|
#include "SyntaxTree.h"
|
||||||
|
|
||||||
#include "syntax-common/exception.h"
|
#include "syntax-common/exception.h"
|
||||||
|
#include "syntax-common/xml_tree_parser.h"
|
||||||
|
#include "syntax-common/xml_tree_writer.h"
|
||||||
|
|
||||||
#include "pcfg-common/pcfg.h"
|
#include "pcfg-common/pcfg.h"
|
||||||
#include "pcfg-common/pcfg_tree.h"
|
|
||||||
#include "pcfg-common/syntax_tree.h"
|
|
||||||
#include "pcfg-common/typedef.h"
|
#include "pcfg-common/typedef.h"
|
||||||
#include "pcfg-common/xml_tree_parser.h"
|
|
||||||
|
|
||||||
namespace MosesTraining
|
namespace MosesTraining
|
||||||
{
|
{
|
||||||
@ -66,14 +67,14 @@ int PcfgScore::Main(int argc, char *argv[])
|
|||||||
// Score corpus according to PCFG.
|
// Score corpus according to PCFG.
|
||||||
TreeScorer scorer(pcfg, non_term_vocab);
|
TreeScorer scorer(pcfg, non_term_vocab);
|
||||||
XmlTreeParser parser;
|
XmlTreeParser parser;
|
||||||
XmlTreeWriter<PcfgTree> writer;
|
XmlTreeWriter writer(std::cout);
|
||||||
std::string line;
|
std::string line;
|
||||||
std::size_t line_num = 0;
|
std::size_t line_num = 0;
|
||||||
std::auto_ptr<PcfgTree> tree;
|
std::auto_ptr<SyntaxTree> tree;
|
||||||
while (std::getline(std::cin, line)) {
|
while (std::getline(std::cin, line)) {
|
||||||
++line_num;
|
++line_num;
|
||||||
try {
|
try {
|
||||||
tree = parser.Parse(line);
|
tree = parser.Parse(line, true);
|
||||||
} catch (Exception &e) {
|
} catch (Exception &e) {
|
||||||
std::ostringstream msg;
|
std::ostringstream msg;
|
||||||
msg << "line " << line_num << ": " << e.msg();
|
msg << "line " << line_num << ": " << e.msg();
|
||||||
@ -93,7 +94,7 @@ int PcfgScore::Main(int argc, char *argv[])
|
|||||||
std::cout << line << std::endl;
|
std::cout << line << std::endl;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
writer.Write(*tree, std::cout);
|
writer.Write(*tree);
|
||||||
}
|
}
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
|
@ -20,6 +20,7 @@
|
|||||||
#include "tree_scorer.h"
|
#include "tree_scorer.h"
|
||||||
|
|
||||||
#include <cassert>
|
#include <cassert>
|
||||||
|
#include <sstream>
|
||||||
|
|
||||||
namespace MosesTraining
|
namespace MosesTraining
|
||||||
{
|
{
|
||||||
@ -34,30 +35,41 @@ TreeScorer::TreeScorer(const Pcfg &pcfg, const Vocabulary &non_term_vocab)
|
|||||||
{
|
{
|
||||||
}
|
}
|
||||||
|
|
||||||
bool TreeScorer::Score(PcfgTree &root) const
|
bool TreeScorer::Score(SyntaxTree &root)
|
||||||
{
|
{
|
||||||
if (root.IsPreterminal() || root.IsLeaf()) {
|
scores_.clear();
|
||||||
|
ZeroScores(root);
|
||||||
|
if (!CalcScores(root)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
SetAttributes(root);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool TreeScorer::CalcScores(SyntaxTree &root)
|
||||||
|
{
|
||||||
|
if (root.IsLeaf() || root.children()[0]->IsLeaf()) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
const std::vector<PcfgTree *> &children = root.children();
|
const std::vector<SyntaxTree *> &children = root.children();
|
||||||
|
|
||||||
double log_prob = 0.0;
|
double log_prob = 0.0;
|
||||||
|
|
||||||
std::vector<std::size_t> key;
|
std::vector<std::size_t> key;
|
||||||
key.reserve(children.size()+1);
|
key.reserve(children.size()+1);
|
||||||
key.push_back(non_term_vocab_.Lookup(root.label()));
|
key.push_back(non_term_vocab_.Lookup(root.value().label));
|
||||||
|
|
||||||
for (std::vector<PcfgTree *>::const_iterator p(children.begin());
|
for (std::vector<SyntaxTree *>::const_iterator p(children.begin());
|
||||||
p != children.end(); ++p) {
|
p != children.end(); ++p) {
|
||||||
PcfgTree *child = *p;
|
SyntaxTree *child = *p;
|
||||||
assert(!child->IsLeaf());
|
assert(!child->IsLeaf());
|
||||||
key.push_back(non_term_vocab_.Lookup(child->label()));
|
key.push_back(non_term_vocab_.Lookup(child->value().label));
|
||||||
if (!Score(*child)) {
|
if (!CalcScores(*child)) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
if (!child->IsPreterminal()) {
|
if (!child->children()[0]->IsLeaf()) {
|
||||||
log_prob += child->score();
|
log_prob += scores_[child];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
double rule_score;
|
double rule_score;
|
||||||
@ -66,10 +78,42 @@ bool TreeScorer::Score(PcfgTree &root) const
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
log_prob += rule_score;
|
log_prob += rule_score;
|
||||||
root.set_score(log_prob);
|
scores_[&root] = log_prob;
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Recursively copies the scores held in scores_ into the "pcfg" attribute of
// the corresponding tree nodes.
//
// Terminals and preterminals are left untouched (a preterminal has the
// implicit score 0.0), and the recursion stops there.  A node whose stored
// score is exactly 0.0 receives no attribute, but its children are still
// visited.
void TreeScorer::SetAttributes(SyntaxTree &root)
{
  if (root.IsLeaf() || root.children()[0]->IsLeaf()) {
    return;
  }

  const double nodeScore = scores_[&root];
  if (nodeScore != 0.0) {
    // The score is written using the stream's default formatting.
    std::ostringstream formatted;
    formatted << nodeScore;
    root.value().attributes["pcfg"] = formatted.str();
  }

  const std::vector<SyntaxTree *> &kids = root.children();
  for (std::size_t i = 0; i < kids.size(); ++i) {
    SetAttributes(*kids[i]);
  }
}
|
||||||
|
|
||||||
|
// Recursively stores a score of zero in scores_ for root and for every node
// below it.
void TreeScorer::ZeroScores(SyntaxTree &root)
{
  scores_[&root] = 0.0;

  const std::vector<SyntaxTree *> &kids = root.children();
  for (std::size_t i = 0; i < kids.size(); ++i) {
    ZeroScores(*kids[i]);
  }
}
|
||||||
|
|
||||||
} // namespace PCFG
|
} // namespace PCFG
|
||||||
} // namespace Syntax
|
} // namespace Syntax
|
||||||
} // namespace MosesTraining
|
} // namespace MosesTraining
|
||||||
|
@ -21,8 +21,9 @@
|
|||||||
#ifndef PCFG_SCORE_TREE_SCORER_H_
|
#ifndef PCFG_SCORE_TREE_SCORER_H_
|
||||||
#define PCFG_SCORE_TREE_SCORER_H_
|
#define PCFG_SCORE_TREE_SCORER_H_
|
||||||
|
|
||||||
|
#include "SyntaxTree.h"
|
||||||
|
|
||||||
#include "pcfg-common/pcfg.h"
|
#include "pcfg-common/pcfg.h"
|
||||||
#include "pcfg-common/pcfg_tree.h"
|
|
||||||
#include "pcfg-common/typedef.h"
|
#include "pcfg-common/typedef.h"
|
||||||
|
|
||||||
namespace MosesTraining
|
namespace MosesTraining
|
||||||
@ -39,11 +40,16 @@ public:
|
|||||||
|
|
||||||
// Score tree according to PCFG. Returns false if unsuccessful (due to
|
// Score tree according to PCFG. Returns false if unsuccessful (due to
|
||||||
// missing rule).
|
// missing rule).
|
||||||
bool Score(PcfgTree &) const;
|
bool Score(SyntaxTree &);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
const Pcfg &pcfg_;
|
const Pcfg &pcfg_;
|
||||||
const Vocabulary &non_term_vocab_;
|
const Vocabulary &non_term_vocab_;
|
||||||
|
std::map<SyntaxTree *, double> scores_;
|
||||||
|
|
||||||
|
bool CalcScores(SyntaxTree &);
|
||||||
|
void SetAttributes(SyntaxTree &);
|
||||||
|
void ZeroScores(SyntaxTree &);
|
||||||
};
|
};
|
||||||
|
|
||||||
} // namespace PCFG
|
} // namespace PCFG
|
||||||
|
@ -50,7 +50,7 @@ int main(int argc, char* argv[])
|
|||||||
// output tree
|
// output tree
|
||||||
// cerr << "BEFORE:" << endl << tree;
|
// cerr << "BEFORE:" << endl << tree;
|
||||||
|
|
||||||
ParentNodes parents = tree.Parse();
|
ParentNodes parents = determineSplitPoints(tree);
|
||||||
|
|
||||||
// execute selected grammar relaxation schemes
|
// execute selected grammar relaxation schemes
|
||||||
if (leftBinarizeFlag)
|
if (leftBinarizeFlag)
|
||||||
@ -118,9 +118,9 @@ void store( SyntaxNodeCollection &tree, const vector< string > &words )
|
|||||||
// output tree nodes
|
// output tree nodes
|
||||||
vector< SyntaxNode* > nodes = tree.GetAllNodes();
|
vector< SyntaxNode* > nodes = tree.GetAllNodes();
|
||||||
for( size_t i=0; i<nodes.size(); i++ ) {
|
for( size_t i=0; i<nodes.size(); i++ ) {
|
||||||
cout << " <tree span=\"" << nodes[i]->GetStart()
|
cout << " <tree span=\"" << nodes[i]->start
|
||||||
<< "-" << nodes[i]->GetEnd()
|
<< "-" << nodes[i]->end
|
||||||
<< "\" label=\"" << nodes[i]->GetLabel()
|
<< "\" label=\"" << nodes[i]->label
|
||||||
<< "\"/>";
|
<< "\"/>";
|
||||||
}
|
}
|
||||||
cout << endl;
|
cout << endl;
|
||||||
@ -133,7 +133,7 @@ void LeftBinarize( SyntaxNodeCollection &tree, ParentNodes &parents )
|
|||||||
if (point.size() > 3) {
|
if (point.size() > 3) {
|
||||||
const vector< SyntaxNode* >& topNodes
|
const vector< SyntaxNode* >& topNodes
|
||||||
= tree.GetNodes( point[0], point[point.size()-1]-1);
|
= tree.GetNodes( point[0], point[point.size()-1]-1);
|
||||||
string topLabel = topNodes[0]->GetLabel();
|
string topLabel = topNodes[0]->label;
|
||||||
|
|
||||||
for(size_t i=2; i<point.size()-1; i++) {
|
for(size_t i=2; i<point.size()-1; i++) {
|
||||||
// cerr << "LeftBin " << point[0] << "-" << (point[point.size()-1]-1) << ": " << point[0] << "-" << point[i]-1 << " ^" << topLabel << endl;
|
// cerr << "LeftBin " << point[0] << "-" << (point[point.size()-1]-1) << ": " << point[0] << "-" << point[i]-1 << " ^" << topLabel << endl;
|
||||||
@ -151,7 +151,7 @@ void RightBinarize( SyntaxNodeCollection &tree, ParentNodes &parents )
|
|||||||
int endPoint = point[point.size()-1]-1;
|
int endPoint = point[point.size()-1]-1;
|
||||||
const vector< SyntaxNode* >& topNodes
|
const vector< SyntaxNode* >& topNodes
|
||||||
= tree.GetNodes( point[0], endPoint);
|
= tree.GetNodes( point[0], endPoint);
|
||||||
string topLabel = topNodes[0]->GetLabel();
|
string topLabel = topNodes[0]->label;
|
||||||
|
|
||||||
for(size_t i=1; i<point.size()-2; i++) {
|
for(size_t i=1; i<point.size()-2; i++) {
|
||||||
// cerr << "RightBin " << point[0] << "-" << (point[point.size()-1]-1) << ": " << point[i] << "-" << endPoint << " ^" << topLabel << endl;
|
// cerr << "RightBin " << point[0] << "-" << (point[point.size()-1]-1) << ": " << point[i] << "-" << endPoint << " ^" << topLabel << endl;
|
||||||
@ -178,29 +178,29 @@ void SAMT( SyntaxNodeCollection &tree, ParentNodes &parents )
|
|||||||
// cerr << endl;
|
// cerr << endl;
|
||||||
|
|
||||||
for(size_t i = 0; i+2 < point.size(); i++) {
|
for(size_t i = 0; i+2 < point.size(); i++) {
|
||||||
// cerr << "\tadding " << point[i] << ";" << point[i+1] << ";" << (point[i+2]-1) << ": " << tree.GetNodes(point[i ],point[i+1]-1)[0]->GetLabel() << "+" << tree.GetNodes(point[i+1],point[i+2]-1)[0]->GetLabel() << endl;
|
// cerr << "\tadding " << point[i] << ";" << point[i+1] << ";" << (point[i+2]-1) << ": " << tree.GetNodes(point[i ],point[i+1]-1)[0]->label << "+" << tree.GetNodes(point[i+1],point[i+2]-1)[0]->label << endl;
|
||||||
|
|
||||||
newTree.AddNode( point[i],point[i+2]-1,
|
newTree.AddNode( point[i],point[i+2]-1,
|
||||||
tree.GetNodes(point[i ],point[i+1]-1)[0]->GetLabel()
|
tree.GetNodes(point[i ],point[i+1]-1)[0]->label
|
||||||
+ "+" +
|
+ "+" +
|
||||||
tree.GetNodes(point[i+1],point[i+2]-1)[0]->GetLabel() );
|
tree.GetNodes(point[i+1],point[i+2]-1)[0]->label);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (point.size() >= 4) {
|
if (point.size() >= 4) {
|
||||||
int ps = point.size();
|
int ps = point.size();
|
||||||
string topLabel = tree.GetNodes(point[0],point[ps-1]-1)[0]->GetLabel();
|
string topLabel = tree.GetNodes(point[0],point[ps-1]-1)[0]->label;
|
||||||
|
|
||||||
// cerr << "\tadding " << topLabel + "\\" + tree.GetNodes(point[0],point[1]-1)[0]->GetLabel() << endl;
|
// cerr << "\tadding " << topLabel + "\\" + tree.GetNodes(point[0],point[1]-1)[0]->label << endl;
|
||||||
newTree.AddNode( point[1],point[ps-1]-1,
|
newTree.AddNode( point[1],point[ps-1]-1,
|
||||||
topLabel
|
topLabel
|
||||||
+ "\\" +
|
+ "\\" +
|
||||||
tree.GetNodes(point[0],point[1]-1)[0]->GetLabel() );
|
tree.GetNodes(point[0],point[1]-1)[0]->label );
|
||||||
|
|
||||||
// cerr << "\tadding " << topLabel + "/" + tree.GetNodes(point[ps-2],point[ps-1]-1)[0]->GetLabel() << endl;
|
// cerr << "\tadding " << topLabel + "/" + tree.GetNodes(point[ps-2],point[ps-1]-1)[0]->label << endl;
|
||||||
newTree.AddNode( point[0],point[ps-2]-1,
|
newTree.AddNode( point[0],point[ps-2]-1,
|
||||||
topLabel
|
topLabel
|
||||||
+ "/" +
|
+ "/" +
|
||||||
tree.GetNodes(point[ps-2],point[ps-1]-1)[0]->GetLabel() );
|
tree.GetNodes(point[ps-2],point[ps-1]-1)[0]->label );
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -219,12 +219,12 @@ void SAMT( SyntaxNodeCollection &tree, ParentNodes &parents )
|
|||||||
|
|
||||||
for(int mid=start+1; mid<=end && !done; mid++) {
|
for(int mid=start+1; mid<=end && !done; mid++) {
|
||||||
if (tree.HasNode(start,mid-1) && tree.HasNode(mid,end)) {
|
if (tree.HasNode(start,mid-1) && tree.HasNode(mid,end)) {
|
||||||
// cerr << "\tadding " << tree.GetNodes(start,mid-1)[0]->GetLabel() << "++" << tree.GetNodes(mid, end )[0]->GetLabel() << endl;
|
// cerr << "\tadding " << tree.GetNodes(start,mid-1)[0]->label << "++" << tree.GetNodes(mid, end )[0]->label << endl;
|
||||||
|
|
||||||
newTree.AddNode( start, end,
|
newTree.AddNode( start, end,
|
||||||
tree.GetNodes(start,mid-1)[0]->GetLabel()
|
tree.GetNodes(start,mid-1)[0]->label
|
||||||
+ "++" +
|
+ "++" +
|
||||||
tree.GetNodes(mid, end )[0]->GetLabel() );
|
tree.GetNodes(mid, end )[0]->label );
|
||||||
done = true;
|
done = true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -234,9 +234,9 @@ void SAMT( SyntaxNodeCollection &tree, ParentNodes &parents )
|
|||||||
for(int postEnd=end+1; postEnd<numWords && !done; postEnd++) {
|
for(int postEnd=end+1; postEnd<numWords && !done; postEnd++) {
|
||||||
if (tree.HasNode(start,postEnd) && tree.HasNode(end+1,postEnd)) {
|
if (tree.HasNode(start,postEnd) && tree.HasNode(end+1,postEnd)) {
|
||||||
newTree.AddNode( start, end,
|
newTree.AddNode( start, end,
|
||||||
tree.GetNodes(start,postEnd)[0]->GetLabel()
|
tree.GetNodes(start,postEnd)[0]->label
|
||||||
+ "//" +
|
+ "//" +
|
||||||
tree.GetNodes(end+1,postEnd)[0]->GetLabel() );
|
tree.GetNodes(end+1,postEnd)[0]->label );
|
||||||
done = true;
|
done = true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -245,11 +245,11 @@ void SAMT( SyntaxNodeCollection &tree, ParentNodes &parents )
|
|||||||
// if matching a constituent A left-minus constituent B: use A\\B
|
// if matching a constituent A left-minus constituent B: use A\\B
|
||||||
for(int preStart=start-1; preStart>=0; preStart--) {
|
for(int preStart=start-1; preStart>=0; preStart--) {
|
||||||
if (tree.HasNode(preStart,end) && tree.HasNode(preStart,start-1)) {
|
if (tree.HasNode(preStart,end) && tree.HasNode(preStart,start-1)) {
|
||||||
// cerr << "\tadding " << tree.GetNodes(preStart,end )[0]->GetLabel() << "\\\\" <<tree.GetNodes(preStart,start-1)[0]->GetLabel() << endl;
|
// cerr << "\tadding " << tree.GetNodes(preStart,end )[0]->label << "\\\\" <<tree.GetNodes(preStart,start-1)[0]->label << endl;
|
||||||
newTree.AddNode( start, end,
|
newTree.AddNode( start, end,
|
||||||
tree.GetNodes(preStart,end )[0]->GetLabel()
|
tree.GetNodes(preStart,end )[0]->label
|
||||||
+ "\\\\" +
|
+ "\\\\" +
|
||||||
tree.GetNodes(preStart,start-1)[0]->GetLabel() );
|
tree.GetNodes(preStart,start-1)[0]->label );
|
||||||
done = true;
|
done = true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -268,6 +268,48 @@ void SAMT( SyntaxNodeCollection &tree, ParentNodes &parents )
|
|||||||
// adding all new nodes
|
// adding all new nodes
|
||||||
vector< SyntaxNode* > nodes = newTree.GetAllNodes();
|
vector< SyntaxNode* > nodes = newTree.GetAllNodes();
|
||||||
for( size_t i=0; i<nodes.size(); i++ ) {
|
for( size_t i=0; i<nodes.size(); i++ ) {
|
||||||
tree.AddNode( nodes[i]->GetStart(), nodes[i]->GetEnd(), nodes[i]->GetLabel());
|
tree.AddNode( nodes[i]->start, nodes[i]->end, nodes[i]->label);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Computes the split points of every span of two or more words that has a
// node in nodeColl.
//
// For each such (parent) span one SplitPoints vector is appended to the
// result.  It starts with the span's start position; each further entry is
// the position just past a child found by greedily taking, from the last
// covered position, the largest sub-span that has a node.  The first scan
// starts one word short of the full length so that the parent span is not
// selected as its own child.  Scanning stops once the span is covered or no
// further child can be found.
//
// Spans are visited in order of increasing length, then increasing start.
ParentNodes determineSplitPoints(const SyntaxNodeCollection &nodeColl)
{
  ParentNodes parents;

  // Work in int throughout: the positions handed to HasNode() are ints, and
  // this avoids comparing signed loop counters against an unsigned count.
  const int numWords = static_cast<int>(nodeColl.GetNumWords());

  // looping through all spans of size >= 2
  for (int length = 2; length <= numWords; ++length) {
    for (int startPos = 0; startPos + length <= numWords; ++startPos) {
      if (!nodeColl.HasNode(startPos, startPos + length - 1)) {
        continue;
      }

      // processing one (parent) span
      SplitPoints splitPoints;
      splitPoints.push_back(startPos);

      bool first = true;           // no child found yet for this parent
      int covered = 0;             // number of words of the span covered so far
      bool foundSomething = true;  // break loop if nothing found

      while (covered < length && foundSomething) {
        foundSomething = false;
        // find largest covering subspan (child)
        // starting at last covered position
        const int largest = first ? length - 1 : length;
        for (int midPos = largest; midPos > covered; --midPos) {
          if (nodeColl.HasNode(startPos + covered, startPos + midPos - 1)) {
            covered = midPos;
            splitPoints.push_back(startPos + covered);
            first = false;
            foundSomething = true;
            // The largest child from this position has been taken; rescan
            // from the new covered position.
            break;
          }
        }
      }

      parents.push_back(splitPoints);
    }
  }

  return parents;
}
|
||||||
|
@ -37,10 +37,14 @@ bool leftBinarizeFlag = false;
|
|||||||
bool rightBinarizeFlag = false;
|
bool rightBinarizeFlag = false;
|
||||||
char SAMTLevel = 0;
|
char SAMTLevel = 0;
|
||||||
|
|
||||||
|
typedef std::vector< int > SplitPoints;
|
||||||
|
typedef std::vector< SplitPoints > ParentNodes;
|
||||||
|
|
||||||
// functions
|
// functions
|
||||||
void init(int argc, char* argv[]);
|
void init(int argc, char* argv[]);
|
||||||
|
ParentNodes determineSplitPoints(const MosesTraining::SyntaxNodeCollection &);
|
||||||
void store( MosesTraining::SyntaxNodeCollection &tree, const std::vector<std::string> &words );
|
void store( MosesTraining::SyntaxNodeCollection &tree, const std::vector<std::string> &words );
|
||||||
void LeftBinarize( MosesTraining::SyntaxNodeCollection &tree, MosesTraining::ParentNodes &parents );
|
void LeftBinarize( MosesTraining::SyntaxNodeCollection &tree, ParentNodes &parents );
|
||||||
void RightBinarize( MosesTraining::SyntaxNodeCollection &tree, MosesTraining::ParentNodes &parents );
|
void RightBinarize( MosesTraining::SyntaxNodeCollection &tree, ParentNodes &parents );
|
||||||
void SAMT( MosesTraining::SyntaxNodeCollection &tree, MosesTraining::ParentNodes &parents );
|
void SAMT( MosesTraining::SyntaxNodeCollection &tree, ParentNodes &parents );
|
||||||
|
|
||||||
|
@ -10,30 +10,26 @@
|
|||||||
#include "XmlException.h"
|
#include "XmlException.h"
|
||||||
#include "XmlTree.h"
|
#include "XmlTree.h"
|
||||||
|
|
||||||
|
#include "exception.h"
|
||||||
|
|
||||||
namespace MosesTraining {
|
namespace MosesTraining {
|
||||||
namespace Syntax {
|
namespace Syntax {
|
||||||
|
|
||||||
XmlTreeParser::XmlTreeParser(std::set<std::string> &labelSet,
|
std::auto_ptr<SyntaxTree> XmlTreeParser::Parse(const std::string &line,
|
||||||
std::map<std::string, int> &topLabelSet)
|
bool unescape)
|
||||||
: label_set_(labelSet)
|
|
||||||
, top_label_set_(topLabelSet)
|
|
||||||
{
|
{
|
||||||
}
|
sentence_ = line;
|
||||||
|
|
||||||
std::auto_ptr<SyntaxTree> XmlTreeParser::Parse(const std::string &line)
|
|
||||||
{
|
|
||||||
line_ = line;
|
|
||||||
node_collection_.Clear();
|
node_collection_.Clear();
|
||||||
try {
|
try {
|
||||||
if (!ProcessAndStripXMLTags(line_, node_collection_, label_set_,
|
if (!ProcessAndStripXMLTags(sentence_, node_collection_, label_set_,
|
||||||
top_label_set_, false)) {
|
top_label_set_, unescape)) {
|
||||||
throw Exception("");
|
throw Exception("");
|
||||||
}
|
}
|
||||||
} catch (const XmlException &e) {
|
} catch (const XmlException &e) {
|
||||||
throw Exception(e.getMsg());
|
throw Exception(e.getMsg());
|
||||||
}
|
}
|
||||||
std::auto_ptr<SyntaxTree> root = node_collection_.ExtractTree();
|
std::auto_ptr<SyntaxTree> root = node_collection_.ExtractTree();
|
||||||
words_ = util::tokenize(line_);
|
words_ = util::tokenize(sentence_);
|
||||||
AttachWords(words_, *root);
|
AttachWords(words_, *root);
|
||||||
return root;
|
return root;
|
||||||
}
|
}
|
||||||
@ -51,15 +47,15 @@ void XmlTreeParser::AttachWords(const std::vector<std::string> &words,
|
|||||||
for (std::vector<SyntaxTree*>::iterator p = leaves.begin(); p != leaves.end();
|
for (std::vector<SyntaxTree*>::iterator p = leaves.begin(); p != leaves.end();
|
||||||
++p) {
|
++p) {
|
||||||
SyntaxTree *leaf = *p;
|
SyntaxTree *leaf = *p;
|
||||||
const int start = leaf->value().GetStart();
|
const int start = leaf->value().start;
|
||||||
const int end = leaf->value().GetEnd();
|
const int end = leaf->value().end;
|
||||||
if (start != end) {
|
if (start != end) {
|
||||||
std::ostringstream msg;
|
std::ostringstream msg;
|
||||||
msg << "leaf node covers multiple words (" << start << "-" << end
|
msg << "leaf node covers multiple words (" << start << "-" << end
|
||||||
<< "): this is currently unsupported";
|
<< "): this is currently unsupported";
|
||||||
throw Exception(msg.str());
|
throw Exception(msg.str());
|
||||||
}
|
}
|
||||||
SyntaxTree *newLeaf = new SyntaxTree(SyntaxNode(start, end, *q++));
|
SyntaxTree *newLeaf = new SyntaxTree(SyntaxNode(*q++, start, end));
|
||||||
leaf->children().push_back(newLeaf);
|
leaf->children().push_back(newLeaf);
|
||||||
newLeaf->parent() = leaf;
|
newLeaf->parent() = leaf;
|
||||||
}
|
}
|
||||||
|
@ -6,39 +6,52 @@
|
|||||||
#include <string>
|
#include <string>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
#include "SyntaxNode.h"
|
|
||||||
#include "SyntaxNodeCollection.h"
|
#include "SyntaxNodeCollection.h"
|
||||||
#include "SyntaxTree.h"
|
#include "SyntaxTree.h"
|
||||||
|
|
||||||
#include "exception.h"
|
|
||||||
|
|
||||||
namespace MosesTraining {
|
namespace MosesTraining {
|
||||||
namespace Syntax {
|
namespace Syntax {
|
||||||
|
|
||||||
// Parses a string in Moses' XML parse tree format and returns a SyntaxTree
|
/** Parses string representations of parse trees in Moses' XML format and
|
||||||
// object. This is a wrapper around the ProcessAndStripXMLTags function.
|
* converts them to SyntaxTree objects.
|
||||||
|
*
|
||||||
|
* This is a thin wrapper around the ProcessAndStripXMLTags function. After
|
||||||
|
* calling Parse(), the output of the ProcessAndStripXMLTags function (the
|
||||||
|
* sentence, node collection, label set, and top label set) are available via
|
||||||
|
* accessors.
|
||||||
|
*/
|
||||||
class XmlTreeParser {
|
class XmlTreeParser {
|
||||||
public:
|
public:
|
||||||
XmlTreeParser(std::set<std::string> &, std::map<std::string, int> &);
|
//! Parse a single sentence and return a SyntaxTree (with words attached).
|
||||||
|
std::auto_ptr<SyntaxTree> Parse(const std::string &, bool unescape=false);
|
||||||
|
|
||||||
std::auto_ptr<SyntaxTree> Parse(const std::string &);
|
//! Get the sentence string (as returned by ProcessAndStripXMLTags).
|
||||||
|
const std::string &sentence() const { return sentence_; }
|
||||||
|
|
||||||
const std::vector<std::string>& GetWords() {
|
//! Get the sentence as a vector of words.
|
||||||
return words_;
|
const std::vector<std::string> &words() const { return words_; }
|
||||||
}
|
|
||||||
|
|
||||||
const SyntaxNodeCollection &GetNodeCollection() const {
|
//! Get the node collection (as returned by ProcessAndStripXMLTags).
|
||||||
|
const SyntaxNodeCollection &node_collection() const {
|
||||||
return node_collection_;
|
return node_collection_;
|
||||||
}
|
}
|
||||||
|
|
||||||
private:
|
//! Get the label set (as returned by ProcessAndStripXMLTags).
|
||||||
std::set<std::string> &label_set_;
|
const std::set<std::string> &label_set() const { return label_set_; }
|
||||||
std::map<std::string, int> &top_label_set_;
|
|
||||||
std::string line_;
|
|
||||||
SyntaxNodeCollection node_collection_;
|
|
||||||
std::vector<std::string> words_;
|
|
||||||
|
|
||||||
|
//! Get the top label set (as returned by ProcessAndStripXMLTags).
|
||||||
|
const std::map<std::string, int> &top_label_set() const {
|
||||||
|
return top_label_set_;
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
void AttachWords(const std::vector<std::string> &, SyntaxTree &);
|
void AttachWords(const std::vector<std::string> &, SyntaxTree &);
|
||||||
|
|
||||||
|
std::string sentence_;
|
||||||
|
SyntaxNodeCollection node_collection_;
|
||||||
|
std::set<std::string> label_set_;
|
||||||
|
std::map<std::string, int> top_label_set_;
|
||||||
|
std::vector<std::string> words_;
|
||||||
};
|
};
|
||||||
|
|
||||||
} // namespace Syntax
|
} // namespace Syntax
|
||||||
|
82
phrase-extract/syntax-common/xml_tree_writer.cc
Normal file
82
phrase-extract/syntax-common/xml_tree_writer.cc
Normal file
@ -0,0 +1,82 @@
|
|||||||
|
#include "xml_tree_writer.h"
|
||||||
|
|
||||||
|
#include <cassert>
|
||||||
|
#include <ostream>
|
||||||
|
#include <vector>
|
||||||
|
#include <string>
|
||||||
|
|
||||||
|
#include "SyntaxTree.h"
|
||||||
|
#include "XmlTree.h"
|
||||||
|
|
||||||
|
|
||||||
|
namespace MosesTraining {
|
||||||
|
namespace Syntax {
|
||||||
|
|
||||||
|
void XmlTreeWriter::Write(const SyntaxTree &tree) const {
|
||||||
|
assert(!tree.IsLeaf());
|
||||||
|
|
||||||
|
// Opening tag
|
||||||
|
out_ << "<tree label=\"" << Escape(tree.value().label) << "\"";
|
||||||
|
for (SyntaxNode::AttributeMap::const_iterator
|
||||||
|
p = tree.value().attributes.begin();
|
||||||
|
p != tree.value().attributes.end(); ++p) {
|
||||||
|
if (p->first != "label") {
|
||||||
|
out_ << " " << p->first << "=\"" << p->second << "\"";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
out_ << ">";
|
||||||
|
|
||||||
|
// Children
|
||||||
|
for (std::vector<SyntaxTree *>::const_iterator p = tree.children().begin();
|
||||||
|
p != tree.children().end(); ++p) {
|
||||||
|
SyntaxTree &child = **p;
|
||||||
|
if (child.IsLeaf()) {
|
||||||
|
out_ << " " << Escape(child.value().label);
|
||||||
|
} else {
|
||||||
|
out_ << " ";
|
||||||
|
Write(child);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Closing tag
|
||||||
|
out_ << " </tree>";
|
||||||
|
|
||||||
|
if (tree.parent() == 0) {
|
||||||
|
out_ << std::endl;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Escapes XML special characters.
|
||||||
|
std::string XmlTreeWriter::Escape(const std::string &s) const {
|
||||||
|
if (!escape_) {
|
||||||
|
return s;
|
||||||
|
}
|
||||||
|
std::string t;
|
||||||
|
std::size_t len = s.size();
|
||||||
|
t.reserve(len);
|
||||||
|
for (std::size_t i = 0; i < len; ++i) {
|
||||||
|
if (s[i] == '<') {
|
||||||
|
t += "<";
|
||||||
|
} else if (s[i] == '>') {
|
||||||
|
t += ">";
|
||||||
|
} else if (s[i] == '[') {
|
||||||
|
t += "[";
|
||||||
|
} else if (s[i] == ']') {
|
||||||
|
t += "]";
|
||||||
|
} else if (s[i] == '|') {
|
||||||
|
t += "|";
|
||||||
|
} else if (s[i] == '&') {
|
||||||
|
t += "&";
|
||||||
|
} else if (s[i] == '\'') {
|
||||||
|
t += "'";
|
||||||
|
} else if (s[i] == '"') {
|
||||||
|
t += """;
|
||||||
|
} else {
|
||||||
|
t += s[i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return t;
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace Syntax
|
||||||
|
} // namespace MosesTraining
|
27
phrase-extract/syntax-common/xml_tree_writer.h
Normal file
27
phrase-extract/syntax-common/xml_tree_writer.h
Normal file
@ -0,0 +1,27 @@
|
|||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <ostream>
|
||||||
|
#include <string>
|
||||||
|
|
||||||
|
#include "SyntaxTree.h"
|
||||||
|
|
||||||
|
namespace MosesTraining {
|
||||||
|
namespace Syntax {
|
||||||
|
|
||||||
|
class XmlTreeWriter {
|
||||||
|
public:
|
||||||
|
XmlTreeWriter(std::ostream &out, bool escape=true)
|
||||||
|
: out_(out)
|
||||||
|
, escape_(escape) {}
|
||||||
|
|
||||||
|
void Write(const SyntaxTree &) const;
|
||||||
|
|
||||||
|
private:
|
||||||
|
std::string Escape(const std::string &) const;
|
||||||
|
|
||||||
|
std::ostream &out_;
|
||||||
|
bool escape_;
|
||||||
|
};
|
||||||
|
|
||||||
|
} // namespace Syntax
|
||||||
|
} // namespace MosesTraining
|
Loading…
Reference in New Issue
Block a user