mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-08-16 15:00:33 +03:00
Merge branch 'master' of github.com:moses-smt/mosesdecoder
This commit is contained in:
commit
0e11919ffb
@ -20,60 +20,23 @@
|
||||
#pragma once
|
||||
|
||||
#include <map>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
namespace MosesTraining
|
||||
{
|
||||
namespace MosesTraining {
|
||||
|
||||
class SyntaxNode
|
||||
{
|
||||
protected:
|
||||
int m_start, m_end;
|
||||
std::string m_label;
|
||||
std::vector< SyntaxNode* > m_children;
|
||||
SyntaxNode* m_parent;
|
||||
float m_pcfgScore;
|
||||
public:
|
||||
struct SyntaxNode {
|
||||
typedef std::map<std::string, std::string> AttributeMap;
|
||||
|
||||
AttributeMap attributes;
|
||||
SyntaxNode(const std::string &label_, int start_, int end_)
|
||||
: label(label_)
|
||||
, start(start_)
|
||||
, end(end_) {
|
||||
}
|
||||
|
||||
SyntaxNode( int startPos, int endPos, std::string label )
|
||||
:m_start(startPos)
|
||||
,m_end(endPos)
|
||||
,m_label(label)
|
||||
,m_parent(0)
|
||||
,m_pcfgScore(0.0f) {
|
||||
}
|
||||
int GetStart() const {
|
||||
return m_start;
|
||||
}
|
||||
int GetEnd() const {
|
||||
return m_end;
|
||||
}
|
||||
std::string GetLabel() const {
|
||||
return m_label;
|
||||
}
|
||||
float GetPcfgScore() const {
|
||||
return m_pcfgScore;
|
||||
}
|
||||
void SetPcfgScore(float score) {
|
||||
m_pcfgScore = score;
|
||||
}
|
||||
SyntaxNode *GetParent() {
|
||||
return m_parent;
|
||||
}
|
||||
void SetParent(SyntaxNode *parent) {
|
||||
m_parent = parent;
|
||||
}
|
||||
void AddChild(SyntaxNode* child) {
|
||||
m_children.push_back(child);
|
||||
}
|
||||
const std::vector< SyntaxNode* > &GetChildren() const {
|
||||
return m_children;
|
||||
}
|
||||
std::string label;
|
||||
int start;
|
||||
int end;
|
||||
AttributeMap attributes;
|
||||
};
|
||||
|
||||
} // namespace MosesTraining
|
||||
|
@ -33,7 +33,6 @@ SyntaxNodeCollection::~SyntaxNodeCollection()
|
||||
|
||||
void SyntaxNodeCollection::Clear()
|
||||
{
|
||||
m_top = 0;
|
||||
// loop through all m_nodes, delete them
|
||||
for(size_t i=0; i<m_nodes.size(); i++) {
|
||||
delete m_nodes[i];
|
||||
@ -45,113 +44,32 @@ void SyntaxNodeCollection::Clear()
|
||||
SyntaxNode *SyntaxNodeCollection::AddNode(int startPos, int endPos,
|
||||
const std::string &label)
|
||||
{
|
||||
SyntaxNode* newNode = new SyntaxNode( startPos, endPos, label );
|
||||
SyntaxNode* newNode = new SyntaxNode(label, startPos, endPos);
|
||||
m_nodes.push_back( newNode );
|
||||
m_index[ startPos ][ endPos ].push_back( newNode );
|
||||
m_size = std::max(endPos+1, m_size);
|
||||
m_numWords = std::max(endPos+1, m_numWords);
|
||||
return newNode;
|
||||
}
|
||||
|
||||
/**
 * Collect split points for every span of two or more words that has a node.
 *
 * Spans are visited by increasing width and, within a width, by increasing
 * start position.  Each entry of the result begins with the span's start
 * position, followed by the position reached after every child span found.
 */
ParentNodes SyntaxNodeCollection::Parse()
{
  ParentNodes result;

  for (int width = 2; width <= m_size; ++width) {
    const int lastStart = m_size - width;
    for (int begin = 0; begin <= lastStart; ++begin) {
      if (!HasNode(begin, begin + width - 1)) {
        continue;  // nothing stored for this span
      }

      SplitPoints points;
      points.push_back(begin);

      // The first scan stops one word short of the full span (presumably so
      // that the parent span is not picked as its own child).
      bool excludeWholeSpan = true;
      int done = 0;            // words of the span covered so far
      bool progressed = true;  // cleared when a scan finds no child
      while (done < width && progressed) {
        progressed = false;
        // Try the longest candidate child first, starting right after the
        // part that is already covered.
        const int limit = excludeWholeSpan ? width - 1 : width;
        for (int upTo = limit; upTo > done; --upTo) {
          if (HasNode(begin + done, begin + upTo - 1)) {
            done = upTo;
            points.push_back(begin + done);
            excludeWholeSpan = false;
            progressed = true;
          }
        }
      }

      result.push_back(points);
    }
  }

  return result;
}
|
||||
|
||||
bool SyntaxNodeCollection::HasNode( int startPos, int endPos ) const
|
||||
{
|
||||
return GetNodes( startPos, endPos).size() > 0;
|
||||
}
|
||||
|
||||
const std::vector< SyntaxNode* >& SyntaxNodeCollection::GetNodes( int startPos, int endPos ) const
|
||||
const std::vector< SyntaxNode* >& SyntaxNodeCollection::GetNodes(
|
||||
int startPos, int endPos ) const
|
||||
{
|
||||
SyntaxTreeIndexIterator startIndex = m_index.find( startPos );
|
||||
NodeIndex::const_iterator startIndex = m_index.find( startPos );
|
||||
if (startIndex == m_index.end() )
|
||||
return m_emptyNode;
|
||||
|
||||
SyntaxTreeIndexIterator2 endIndex = startIndex->second.find( endPos );
|
||||
InnerNodeIndex::const_iterator endIndex = startIndex->second.find( endPos );
|
||||
if (endIndex == startIndex->second.end())
|
||||
return m_emptyNode;
|
||||
|
||||
return endIndex->second;
|
||||
}
|
||||
|
||||
void SyntaxNodeCollection::ConnectNodes()
|
||||
{
|
||||
typedef SyntaxTreeIndex2::const_reverse_iterator InnerIterator;
|
||||
|
||||
SyntaxNode *prev = 0;
|
||||
// Iterate over all start indices from lowest to highest.
|
||||
for (SyntaxTreeIndexIterator p = m_index.begin(); p != m_index.end(); ++p) {
|
||||
const SyntaxTreeIndex2 &inner = p->second;
|
||||
// Iterate over all end indices from highest to lowest.
|
||||
for (InnerIterator q = inner.rbegin(); q != inner.rend(); ++q) {
|
||||
const std::vector<SyntaxNode*> &nodes = q->second;
|
||||
// Iterate over all nodes that cover the same span in order of tree
|
||||
// depth, top-most first.
|
||||
for (std::vector<SyntaxNode*>::const_reverse_iterator r = nodes.rbegin();
|
||||
r != nodes.rend(); ++r) {
|
||||
SyntaxNode *node = *r;
|
||||
if (!prev) {
|
||||
// node is the root.
|
||||
m_top = node;
|
||||
node->SetParent(0);
|
||||
} else if (prev->GetStart() == node->GetStart()) {
|
||||
// prev is the parent of node.
|
||||
assert(prev->GetEnd() >= node->GetEnd());
|
||||
node->SetParent(prev);
|
||||
prev->AddChild(node);
|
||||
} else {
|
||||
// prev is a descendant of node's parent. The lowest common
|
||||
// ancestor of prev and node will be node's parent.
|
||||
SyntaxNode *ancestor = prev->GetParent();
|
||||
while (ancestor->GetEnd() < node->GetEnd()) {
|
||||
ancestor = ancestor->GetParent();
|
||||
}
|
||||
assert(ancestor);
|
||||
node->SetParent(ancestor);
|
||||
ancestor->AddChild(node);
|
||||
}
|
||||
prev = node;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
std::auto_ptr<SyntaxTree> SyntaxNodeCollection::ExtractTree()
|
||||
{
|
||||
std::map<SyntaxNode *, SyntaxTree *> nodeToTree;
|
||||
@ -163,14 +81,15 @@ std::auto_ptr<SyntaxTree> SyntaxNodeCollection::ExtractTree()
|
||||
}
|
||||
|
||||
// Connect the SyntaxTrees.
|
||||
typedef SyntaxTreeIndex2::const_reverse_iterator InnerIterator;
|
||||
typedef NodeIndex::const_iterator OuterIterator;
|
||||
typedef InnerNodeIndex::const_reverse_iterator InnerIterator;
|
||||
|
||||
SyntaxTree *root = 0;
|
||||
SyntaxNode *prevNode = 0;
|
||||
SyntaxTree *prevTree = 0;
|
||||
// Iterate over all start indices from lowest to highest.
|
||||
for (SyntaxTreeIndexIterator p = m_index.begin(); p != m_index.end(); ++p) {
|
||||
const SyntaxTreeIndex2 &inner = p->second;
|
||||
for (OuterIterator p = m_index.begin(); p != m_index.end(); ++p) {
|
||||
const InnerNodeIndex &inner = p->second;
|
||||
// Iterate over all end indices from highest to lowest.
|
||||
for (InnerIterator q = inner.rbegin(); q != inner.rend(); ++q) {
|
||||
const std::vector<SyntaxNode*> &nodes = q->second;
|
||||
@ -184,16 +103,16 @@ std::auto_ptr<SyntaxTree> SyntaxNodeCollection::ExtractTree()
|
||||
// node is the root.
|
||||
root = tree;
|
||||
tree->parent() = 0;
|
||||
} else if (prevNode->GetStart() == node->GetStart()) {
|
||||
} else if (prevNode->start == node->start) {
|
||||
// prevNode is the parent of node.
|
||||
assert(prevNode->GetEnd() >= node->GetEnd());
|
||||
assert(prevNode->end >= node->end);
|
||||
tree->parent() = prevTree;
|
||||
prevTree->children().push_back(tree);
|
||||
} else {
|
||||
// prevNode is a descendant of node's parent. The lowest common
|
||||
// ancestor of prevNode and node will be node's parent.
|
||||
SyntaxTree *ancestor = prevTree->parent();
|
||||
while (ancestor->value().GetEnd() < tree->value().GetEnd()) {
|
||||
while (ancestor->value().end < tree->value().end) {
|
||||
ancestor = ancestor->parent();
|
||||
}
|
||||
assert(ancestor);
|
||||
|
@ -31,49 +31,47 @@
|
||||
namespace MosesTraining
|
||||
{
|
||||
|
||||
typedef std::vector< int > SplitPoints;
|
||||
typedef std::vector< SplitPoints > ParentNodes;
|
||||
|
||||
/** A collection of SyntaxNodes organized by start and end position.
|
||||
*
|
||||
*/
|
||||
class SyntaxNodeCollection
|
||||
{
|
||||
protected:
|
||||
std::vector< SyntaxNode* > m_nodes;
|
||||
SyntaxNode* m_top;
|
||||
|
||||
typedef std::map< int, std::vector< SyntaxNode* > > SyntaxTreeIndex2;
|
||||
typedef SyntaxTreeIndex2::const_iterator SyntaxTreeIndexIterator2;
|
||||
typedef std::map< int, SyntaxTreeIndex2 > SyntaxTreeIndex;
|
||||
typedef SyntaxTreeIndex::const_iterator SyntaxTreeIndexIterator;
|
||||
SyntaxTreeIndex m_index;
|
||||
int m_size;
|
||||
std::vector< SyntaxNode* > m_emptyNode;
|
||||
|
||||
public:
|
||||
SyntaxNodeCollection()
|
||||
: m_top(0) // m_top doesn't get set unless ConnectNodes is called.
|
||||
, m_size(0) {}
|
||||
SyntaxNodeCollection() : m_numWords(0) {}
|
||||
|
||||
~SyntaxNodeCollection();
|
||||
|
||||
//! Construct and insert a new SyntaxNode.
|
||||
SyntaxNode *AddNode( int startPos, int endPos, const std::string &label );
|
||||
|
||||
SyntaxNode *GetTop() {
|
||||
return m_top;
|
||||
}
|
||||
|
||||
ParentNodes Parse();
|
||||
//! Return true iff there are one or more SyntaxNodes with the given span.
|
||||
bool HasNode( int startPos, int endPos ) const;
|
||||
|
||||
//! Lookup the SyntaxNodes for a given span.
|
||||
const std::vector< SyntaxNode* >& GetNodes( int startPos, int endPos ) const;
|
||||
const std::vector< SyntaxNode* >& GetAllNodes() {
|
||||
return m_nodes;
|
||||
};
|
||||
|
||||
//! Get a vector of pointers to all SyntaxNodes (unordered).
|
||||
const std::vector< SyntaxNode* >& GetAllNodes() { return m_nodes; };
|
||||
|
||||
size_t GetNumWords() const {
|
||||
return m_size;
|
||||
return m_numWords;
|
||||
}
|
||||
void ConnectNodes();
|
||||
void Clear();
|
||||
|
||||
std::auto_ptr<SyntaxTree> ExtractTree();
|
||||
|
||||
private:
|
||||
typedef std::map< int, std::vector< SyntaxNode* > > InnerNodeIndex;
|
||||
typedef std::map< int, InnerNodeIndex > NodeIndex;
|
||||
|
||||
// Not copyable.
|
||||
SyntaxNodeCollection(const SyntaxNodeCollection &);
|
||||
SyntaxNodeCollection &operator=(const SyntaxNodeCollection &);
|
||||
|
||||
std::vector< SyntaxNode* > m_nodes;
|
||||
NodeIndex m_index;
|
||||
int m_numWords;
|
||||
std::vector< SyntaxNode* > m_emptyNode;
|
||||
};
|
||||
|
||||
} // namespace MosesTraining
|
||||
|
@ -398,10 +398,6 @@ bool ProcessAndStripXMLTags(string &line, SyntaxNodeCollection &nodeCollection,
|
||||
string label = ParseXmlTagAttribute(tagContent,"label");
|
||||
labelCollection.insert( label );
|
||||
|
||||
string pcfgString = ParseXmlTagAttribute(tagContent,"pcfg");
|
||||
float pcfgScore = pcfgString == "" ? 0.0f
|
||||
: std::atof(pcfgString.c_str());
|
||||
|
||||
// report what we have processed so far
|
||||
if (0) {
|
||||
cerr << "XML TAG NAME IS: '" << tagName << "'" << endl;
|
||||
@ -409,7 +405,6 @@ bool ProcessAndStripXMLTags(string &line, SyntaxNodeCollection &nodeCollection,
|
||||
cerr << "XML SPAN IS: " << startPos << "-" << (endPos-1) << endl;
|
||||
}
|
||||
SyntaxNode *node = nodeCollection.AddNode( startPos, endPos-1, label );
|
||||
node->SetPcfgScore(pcfgScore);
|
||||
ParseXmlTagAttributes(tagContent, node->attributes);
|
||||
}
|
||||
}
|
||||
@ -424,7 +419,7 @@ bool ProcessAndStripXMLTags(string &line, SyntaxNodeCollection &nodeCollection,
|
||||
const vector< SyntaxNode* >& topNodes = nodeCollection.GetNodes( 0, wordPos-1 );
|
||||
for( vector< SyntaxNode* >::const_iterator node = topNodes.begin(); node != topNodes.end(); node++ ) {
|
||||
SyntaxNode *n = *node;
|
||||
const string &label = n->GetLabel();
|
||||
const string &label = n->label;
|
||||
if (topLabelCollection.find( label ) == topLabelCollection.end())
|
||||
topLabelCollection[ label ] = 0;
|
||||
topLabelCollection[ label ]++;
|
||||
|
@ -21,6 +21,7 @@
|
||||
|
||||
#include <algorithm>
|
||||
#include <cassert>
|
||||
#include <cstdlib>
|
||||
#include <memory>
|
||||
#include <stack>
|
||||
|
||||
@ -213,7 +214,7 @@ Node *AlignmentGraph::CopyParseTree(const SyntaxTree *root)
|
||||
{
|
||||
NodeType nodeType = (root->IsLeaf()) ? TARGET : TREE;
|
||||
|
||||
std::auto_ptr<Node> n(new Node(root->value().GetLabel(), nodeType));
|
||||
std::auto_ptr<Node> n(new Node(root->value().label, nodeType));
|
||||
|
||||
if (nodeType == TREE) {
|
||||
float score = 0.0f;
|
||||
|
@ -119,14 +119,6 @@ int ExtractGHKM::Main(int argc, char *argv[])
|
||||
OpenOutputFileOrDie(options.unknownWordSoftMatchesFile, unknownWordSoftMatchesStream);
|
||||
}
|
||||
|
||||
// Target label sets for producing glue grammar.
|
||||
std::set<std::string> targetLabelSet;
|
||||
std::map<std::string, int> targetTopLabelSet;
|
||||
|
||||
// Source label sets for producing glue grammar.
|
||||
std::set<std::string> sourceLabelSet;
|
||||
std::map<std::string, int> sourceTopLabelSet;
|
||||
|
||||
// Word count statistics for producing unknown word labels.
|
||||
std::map<std::string, int> targetWordCount;
|
||||
std::map<std::string, std::string> targetWordLabel;
|
||||
@ -139,8 +131,8 @@ int ExtractGHKM::Main(int argc, char *argv[])
|
||||
std::string sourceLine;
|
||||
std::string alignmentLine;
|
||||
Alignment alignment;
|
||||
Syntax::XmlTreeParser targetXmlTreeParser(targetLabelSet, targetTopLabelSet);
|
||||
Syntax::XmlTreeParser sourceXmlTreeParser(sourceLabelSet, sourceTopLabelSet);
|
||||
Syntax::XmlTreeParser targetXmlTreeParser;
|
||||
Syntax::XmlTreeParser sourceXmlTreeParser;
|
||||
ScfgRuleWriter scfgWriter(fwdExtractStream, invExtractStream, options);
|
||||
StsgRuleWriter stsgWriter(fwdExtractStream, invExtractStream, options);
|
||||
size_t lineNum = options.sentenceOffset;
|
||||
@ -194,7 +186,7 @@ int ExtractGHKM::Main(int argc, char *argv[])
|
||||
}
|
||||
Error(oss.str());
|
||||
}
|
||||
sourceTokens = sourceXmlTreeParser.GetWords();
|
||||
sourceTokens = sourceXmlTreeParser.words();
|
||||
}
|
||||
|
||||
// Read word alignments.
|
||||
@ -240,7 +232,7 @@ int ExtractGHKM::Main(int argc, char *argv[])
|
||||
|
||||
// Initialize phrase orientation scoring object
|
||||
PhraseOrientation phraseOrientation(sourceTokens.size(),
|
||||
targetXmlTreeParser.GetWords().size(), alignment);
|
||||
targetXmlTreeParser.words().size(), alignment);
|
||||
|
||||
// Write the rules, subject to scope pruning.
|
||||
const std::vector<Node *> &targetNodes = graph.GetTargetNodes();
|
||||
@ -272,7 +264,7 @@ int ExtractGHKM::Main(int argc, char *argv[])
|
||||
// SCFG output.
|
||||
ScfgRule *r = 0;
|
||||
if (options.sourceLabels) {
|
||||
r = new ScfgRule(**q, &sourceXmlTreeParser.GetNodeCollection());
|
||||
r = new ScfgRule(**q, &sourceXmlTreeParser.node_collection());
|
||||
} else {
|
||||
r = new ScfgRule(**q);
|
||||
}
|
||||
@ -315,14 +307,14 @@ int ExtractGHKM::Main(int argc, char *argv[])
|
||||
|
||||
std::map<std::string,size_t> sourceLabels;
|
||||
if (options.sourceLabels && !options.sourceLabelSetFile.empty()) {
|
||||
|
||||
sourceLabelSet.insert("XLHS"); // non-matching label (left-hand side)
|
||||
sourceLabelSet.insert("XRHS"); // non-matching label (right-hand side)
|
||||
sourceLabelSet.insert("TOPLABEL"); // as used in the glue grammar
|
||||
sourceLabelSet.insert("SOMELABEL"); // as used in the glue grammar
|
||||
std::set<std::string> extendedLabelSet = sourceXmlTreeParser.label_set();
|
||||
extendedLabelSet.insert("XLHS"); // non-matching label (left-hand side)
|
||||
extendedLabelSet.insert("XRHS"); // non-matching label (right-hand side)
|
||||
extendedLabelSet.insert("TOPLABEL"); // as used in the glue grammar
|
||||
extendedLabelSet.insert("SOMELABEL"); // as used in the glue grammar
|
||||
size_t index = 0;
|
||||
for (std::set<std::string>::const_iterator iter=sourceLabelSet.begin();
|
||||
iter!=sourceLabelSet.end(); ++iter, ++index) {
|
||||
for (std::set<std::string>::const_iterator iter=extendedLabelSet.begin();
|
||||
iter!=extendedLabelSet.end(); ++iter, ++index) {
|
||||
sourceLabels.insert(std::pair<std::string,size_t>(*iter,index));
|
||||
}
|
||||
WriteSourceLabelSet(sourceLabels, sourceLabelSetStream);
|
||||
@ -332,14 +324,18 @@ int ExtractGHKM::Main(int argc, char *argv[])
|
||||
std::map<std::string, int> strippedTargetTopLabelSet;
|
||||
if (options.stripBitParLabels &&
|
||||
(!options.glueGrammarFile.empty() || !options.unknownWordSoftMatchesFile.empty())) {
|
||||
StripBitParLabels(targetLabelSet, targetTopLabelSet, strippedTargetLabelSet, strippedTargetTopLabelSet);
|
||||
StripBitParLabels(targetXmlTreeParser.label_set(),
|
||||
targetXmlTreeParser.top_label_set(),
|
||||
strippedTargetLabelSet, strippedTargetTopLabelSet);
|
||||
}
|
||||
|
||||
if (!options.glueGrammarFile.empty()) {
|
||||
if (options.stripBitParLabels) {
|
||||
WriteGlueGrammar(strippedTargetLabelSet, strippedTargetTopLabelSet, sourceLabels, options, glueGrammarStream);
|
||||
} else {
|
||||
WriteGlueGrammar(targetLabelSet, targetTopLabelSet, sourceLabels, options, glueGrammarStream);
|
||||
WriteGlueGrammar(targetXmlTreeParser.label_set(),
|
||||
targetXmlTreeParser.top_label_set(),
|
||||
sourceLabels, options, glueGrammarStream);
|
||||
}
|
||||
}
|
||||
|
||||
@ -355,7 +351,8 @@ int ExtractGHKM::Main(int argc, char *argv[])
|
||||
if (options.stripBitParLabels) {
|
||||
WriteUnknownWordSoftMatches(strippedTargetLabelSet, unknownWordSoftMatchesStream);
|
||||
} else {
|
||||
WriteUnknownWordSoftMatches(targetLabelSet, unknownWordSoftMatchesStream);
|
||||
WriteUnknownWordSoftMatches(targetXmlTreeParser.label_set(),
|
||||
unknownWordSoftMatchesStream);
|
||||
}
|
||||
}
|
||||
|
||||
@ -816,7 +813,7 @@ void ExtractGHKM::CollectWordLabelCounts(
|
||||
for (SyntaxTree::ConstLeafIterator p(root);
|
||||
p != SyntaxTree::ConstLeafIterator(); ++p) {
|
||||
const SyntaxTree &leaf = *p;
|
||||
const std::string &word = leaf.value().GetLabel();
|
||||
const std::string &word = leaf.value().label;
|
||||
const SyntaxTree *ancestor = leaf.parent();
|
||||
// If unary rule elimination is enabled and this word is at the end of a
|
||||
// chain of unary rewrites, e.g.
|
||||
@ -828,7 +825,7 @@ void ExtractGHKM::CollectWordLabelCounts(
|
||||
ancestor->parent()->children().size() == 1) {
|
||||
ancestor = ancestor->parent();
|
||||
}
|
||||
const std::string &label = ancestor->value().GetLabel();
|
||||
const std::string &label = ancestor->value().label;
|
||||
++wordCount[word];
|
||||
wordLabel[word] = label;
|
||||
}
|
||||
@ -840,7 +837,7 @@ std::vector<std::string> ExtractGHKM::ReadTokens(const SyntaxTree &root) const
|
||||
for (SyntaxTree::ConstLeafIterator p(root);
|
||||
p != SyntaxTree::ConstLeafIterator(); ++p) {
|
||||
const SyntaxTree &leaf = *p;
|
||||
const std::string &word = leaf.value().GetLabel();
|
||||
const std::string &word = leaf.value().label;
|
||||
tokens.push_back(word);
|
||||
}
|
||||
return tokens;
|
||||
|
@ -144,7 +144,7 @@ void ScfgRule::PushSourceLabel(const SyntaxNodeCollection *sourceNodeCollection,
|
||||
sourceNodeCollection->GetNodes(span.first,span.second);
|
||||
if (!sourceLabels.empty()) {
|
||||
// store the topmost matching label from the source syntax tree
|
||||
m_sourceLabels.push_back(sourceLabels.back()->GetLabel());
|
||||
m_sourceLabels.push_back(sourceLabels.back()->label);
|
||||
}
|
||||
} else {
|
||||
// no matching source-side syntactic constituent: store nonMatchingLabel
|
||||
|
@ -110,6 +110,8 @@ void collectWordLabelCounts(SentenceAlignmentWithSyntax &sentence );
|
||||
void writeGlueGrammar(const string &, RuleExtractionOptions &options, set< string > &targetLabelCollection, map< string, int > &targetTopLabelCollection);
|
||||
void writeUnknownWordLabel(const string &);
|
||||
|
||||
double getPcfgScore(const SyntaxNode &);
|
||||
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
@ -505,7 +507,7 @@ void ExtractTask::preprocessSourceHieroPhrase( int startT, int endT, int startS,
|
||||
|
||||
int labelI = labelIndex[ 2+holeCount+holeTotal ];
|
||||
string label = m_options.sourceSyntax ?
|
||||
m_sentence.sourceTree.GetNodes(currPos,hole.GetEnd(0))[ labelI ]->GetLabel() : "X";
|
||||
m_sentence.sourceTree.GetNodes(currPos,hole.GetEnd(0))[ labelI ]->label : "X";
|
||||
hole.SetLabel(label, 0);
|
||||
|
||||
currPos = hole.GetEnd(0);
|
||||
@ -548,7 +550,7 @@ string ExtractTask::saveTargetHieroPhrase( int startT, int endT, int startS, int
|
||||
int labelI = labelIndex[ 2+holeCount ];
|
||||
string targetLabel;
|
||||
if (m_options.targetSyntax) {
|
||||
targetLabel = m_sentence.targetTree.GetNodes(currPos,hole.GetEnd(1))[labelI]->GetLabel();
|
||||
targetLabel = m_sentence.targetTree.GetNodes(currPos,hole.GetEnd(1))[labelI]->label;
|
||||
} else if (m_options.boundaryRules && (startS == 0 || endS == countS - 1)) {
|
||||
targetLabel = "S";
|
||||
} else {
|
||||
@ -564,8 +566,7 @@ string ExtractTask::saveTargetHieroPhrase( int startT, int endT, int startS, int
|
||||
}
|
||||
|
||||
if (m_options.pcfgScore) {
|
||||
double score = m_sentence.targetTree.GetNodes(currPos,hole.GetEnd(1))[labelI]->GetPcfgScore();
|
||||
logPCFGScore -= score;
|
||||
logPCFGScore -= getPcfgScore(*m_sentence.targetTree.GetNodes(currPos,hole.GetEnd(1))[labelI]);
|
||||
}
|
||||
|
||||
currPos = hole.GetEnd(1);
|
||||
@ -674,7 +675,7 @@ void ExtractTask::saveHieroPhrase( int startT, int endT, int startS, int endS
|
||||
// phrase labels
|
||||
string targetLabel;
|
||||
if (m_options.targetSyntax) {
|
||||
targetLabel = m_sentence.targetTree.GetNodes(startT,endT)[labelIndex[0] ]->GetLabel();
|
||||
targetLabel = m_sentence.targetTree.GetNodes(startT,endT)[labelIndex[0] ]->label;
|
||||
} else if (m_options.boundaryRules && (startS == 0 || endS == countS - 1)) {
|
||||
targetLabel = "S";
|
||||
} else {
|
||||
@ -682,14 +683,14 @@ void ExtractTask::saveHieroPhrase( int startT, int endT, int startS, int endS
|
||||
}
|
||||
|
||||
string sourceLabel = m_options.sourceSyntax ?
|
||||
m_sentence.sourceTree.GetNodes(startS,endS)[ labelIndex[1] ]->GetLabel() : "X";
|
||||
m_sentence.sourceTree.GetNodes(startS,endS)[ labelIndex[1] ]->label : "X";
|
||||
|
||||
// create non-terms on the source side
|
||||
preprocessSourceHieroPhrase(startT, endT, startS, endS, indexS, holeColl, labelIndex);
|
||||
|
||||
// target
|
||||
if (m_options.pcfgScore) {
|
||||
double logPCFGScore = m_sentence.targetTree.GetNodes(startT,endT)[labelIndex[0]]->GetPcfgScore();
|
||||
double logPCFGScore = getPcfgScore(*m_sentence.targetTree.GetNodes(startT,endT)[labelIndex[0]]);
|
||||
rule.target = saveTargetHieroPhrase(startT, endT, startS, endS, indexT, holeColl, labelIndex, logPCFGScore, countS)
|
||||
+ " [" + targetLabel + "]";
|
||||
rule.pcfgScore = std::exp(logPCFGScore);
|
||||
@ -946,13 +947,13 @@ void ExtractTask::addRule( int startT, int endT, int startS, int endS, int count
|
||||
// phrase labels
|
||||
string targetLabel,sourceLabel;
|
||||
if (m_options.targetSyntax && m_options.conditionOnTargetLhs) {
|
||||
sourceLabel = targetLabel = m_sentence.targetTree.GetNodes(startT,endT)[0]->GetLabel();
|
||||
sourceLabel = targetLabel = m_sentence.targetTree.GetNodes(startT,endT)[0]->label;
|
||||
} else {
|
||||
sourceLabel = m_options.sourceSyntax ?
|
||||
m_sentence.sourceTree.GetNodes(startS,endS)[0]->GetLabel() : "X";
|
||||
m_sentence.sourceTree.GetNodes(startS,endS)[0]->label : "X";
|
||||
|
||||
if (m_options.targetSyntax) {
|
||||
targetLabel = m_sentence.targetTree.GetNodes(startT,endT)[0]->GetLabel();
|
||||
targetLabel = m_sentence.targetTree.GetNodes(startT,endT)[0]->label;
|
||||
} else if (m_options.boundaryRules && (startS == 0 || endS == countS - 1)) {
|
||||
targetLabel = "S";
|
||||
} else {
|
||||
@ -973,7 +974,7 @@ void ExtractTask::addRule( int startT, int endT, int startS, int endS, int count
|
||||
rule.target += "[" + targetLabel + "]";
|
||||
|
||||
if (m_options.pcfgScore) {
|
||||
double logPCFGScore = m_sentence.targetTree.GetNodes(startT,endT)[0]->GetPcfgScore();
|
||||
double logPCFGScore = getPcfgScore(*m_sentence.targetTree.GetNodes(startT,endT)[0]);
|
||||
rule.pcfgScore = std::exp(logPCFGScore);
|
||||
}
|
||||
|
||||
@ -1165,7 +1166,7 @@ void collectWordLabelCounts( SentenceAlignmentWithSyntax &sentence )
|
||||
const vector< SyntaxNode* >& labels = sentence.targetTree.GetNodes(ti,ti);
|
||||
if (labels.size() > 0) {
|
||||
wordCount[ word ]++;
|
||||
wordLabel[ word ] = labels[0]->GetLabel();
|
||||
wordLabel[ word ] = labels[0]->label;
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -1194,3 +1195,13 @@ void writeUnknownWordLabel(const string & fileName)
|
||||
|
||||
outFile.close();
|
||||
}
|
||||
|
||||
double getPcfgScore(const SyntaxNode &node)
|
||||
{
|
||||
double score = 0.0f;
|
||||
SyntaxNode::AttributeMap::const_iterator p = node.attributes.find("pcfg");
|
||||
if (p != node.attributes.end()) {
|
||||
score = std::atof(p->second.c_str());
|
||||
}
|
||||
return score;
|
||||
}
|
||||
|
@ -126,9 +126,7 @@ void FilterRuleTable::ReadTestSet(
|
||||
void FilterRuleTable::ReadTestSet(
|
||||
std::istream &input, std::vector<boost::shared_ptr<SyntaxTree> > &sentences)
|
||||
{
|
||||
std::set<std::string> labelSet;
|
||||
std::map<std::string, int> topLabelSet;
|
||||
XmlTreeParser parser(labelSet, topLabelSet);
|
||||
XmlTreeParser parser;
|
||||
int lineNum = 0;
|
||||
std::string line;
|
||||
while (std::getline(input, line)) {
|
||||
|
@ -27,7 +27,7 @@ TreeTsgFilter::TreeTsgFilter(
|
||||
|
||||
TreeTsgFilter::IdTree *TreeTsgFilter::SyntaxTreeToIdTree(const SyntaxTree &s)
|
||||
{
|
||||
IdTree *t = new IdTree(m_testVocab.Insert(s.value().GetLabel()));
|
||||
IdTree *t = new IdTree(m_testVocab.Insert(s.value().label));
|
||||
const std::vector<SyntaxTree*> &sChildren = s.children();
|
||||
std::vector<IdTree*> &tChildren = t->children();
|
||||
tChildren.reserve(sChildren.size());
|
||||
|
@ -1,79 +0,0 @@
|
||||
/***********************************************************************
|
||||
Moses - statistical machine translation system
|
||||
Copyright (C) 2006-2012 University of Edinburgh
|
||||
|
||||
This library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
This library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with this library; if not, write to the Free Software
|
||||
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
***********************************************************************/
|
||||
|
||||
#pragma once
|
||||
#ifndef PCFG_PCFG_TREE_H_
|
||||
#define PCFG_PCFG_TREE_H_
|
||||
|
||||
#include <string>
|
||||
|
||||
#include "syntax_tree.h"
|
||||
#include "xml_tree_writer.h"
|
||||
|
||||
namespace MosesTraining {
|
||||
namespace Syntax {
|
||||
namespace PCFG {
|
||||
|
||||
template<typename DerivedType>
|
||||
class PcfgTreeBase : public SyntaxTreeBase<std::string, DerivedType> {
|
||||
public:
|
||||
typedef std::string LabelType;
|
||||
typedef SyntaxTreeBase<LabelType, DerivedType> BaseType;
|
||||
|
||||
PcfgTreeBase(const LabelType &label) : BaseType(label), score_(0.0) {}
|
||||
|
||||
double score() const { return score_; }
|
||||
void set_score(double s) { score_ = s; }
|
||||
|
||||
private:
|
||||
double score_;
|
||||
};
|
||||
|
||||
// Concrete tree type: PcfgTreeBase instantiated with itself as the derived
// type.
class PcfgTree : public PcfgTreeBase<PcfgTree> {
 public:
  typedef PcfgTreeBase<PcfgTree> BaseType;

  // Creates a node with the given label.
  PcfgTree(const BaseType::LabelType &label)
      : BaseType(label) {}
};
|
||||
|
||||
// Specialise XmlOutputHandler for PcfgTree.
|
||||
template<>
|
||||
class XmlOutputHandler<PcfgTree> {
|
||||
public:
|
||||
typedef std::map<std::string, std::string> AttributeMap;
|
||||
|
||||
void GetLabel(const PcfgTree &tree, std::string &label) const {
|
||||
label = tree.label();
|
||||
}
|
||||
|
||||
void GetAttributes(const PcfgTree &tree, AttributeMap &attribute_map) const {
|
||||
attribute_map.clear();
|
||||
double score = tree.score();
|
||||
if (score != 0.0) {
|
||||
std::ostringstream out;
|
||||
out << tree.score();
|
||||
attribute_map["pcfg"] = out.str();
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace PCFG
|
||||
} // namespace Syntax
|
||||
} // namespace MosesTraining
|
||||
|
||||
#endif
|
@ -1,93 +0,0 @@
|
||||
/***********************************************************************
|
||||
Moses - statistical machine translation system
|
||||
Copyright (C) 2006-2012 University of Edinburgh
|
||||
|
||||
This library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
This library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with this library; if not, write to the Free Software
|
||||
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
***********************************************************************/
|
||||
|
||||
#pragma once
|
||||
#ifndef PCFG_SYNTAX_TREE_H_
|
||||
#define PCFG_SYNTAX_TREE_H_
|
||||
|
||||
#include <cassert>
|
||||
#include <vector>
|
||||
|
||||
namespace MosesTraining {
|
||||
namespace Syntax {
|
||||
namespace PCFG {
|
||||
|
||||
// Base class for SyntaxTree, AgreementTree, and friends.
|
||||
template<typename T, typename DerivedType>
class SyntaxTreeBase {
 public:
  // Creates a node with the given label, no children and a null parent.
  SyntaxTreeBase(const T &label)
      : label_(label)
      , children_()
      , parent_(0) {}

  // Creates a node with the given label and child pointers and a null
  // parent.  The children are deleted by this node's destructor; their own
  // parent pointers are not modified here.
  SyntaxTreeBase(const T &label, const std::vector<DerivedType *> &children)
      : label_(label)
      , children_(children)
      , parent_(0) {}

  // Deletes every child pointer currently held by the node.
  virtual ~SyntaxTreeBase();

  const T &label() const { return label_; }
  const DerivedType *parent() const { return parent_; }
  DerivedType *parent() { return parent_; }
  const std::vector<DerivedType *> &children() const { return children_; }
  std::vector<DerivedType *> &children() { return children_; }

  void set_label(const T &label) { label_ = label; }
  void set_parent(DerivedType *parent) { parent_ = parent; }
  // Replaces the stored child pointers; the previous ones are not deleted.
  void set_children(const std::vector<DerivedType *> &c) { children_ = c; }

  // A leaf is a node without children.
  bool IsLeaf() const { return children_.empty(); }

  // A preterminal has exactly one child and that child is a leaf.
  bool IsPreterminal() const {
    return children_.size() == 1 && children_[0]->IsLeaf();
  }

  // Appends a child pointer.  The child's parent pointer is not modified.
  void AddChild(DerivedType *child) { children_.push_back(child); }

 private:
  // Not copyable: the destructor deletes the children, so a member-wise copy
  // would lead to the same pointers being deleted twice.
  SyntaxTreeBase(const SyntaxTreeBase &);
  SyntaxTreeBase &operator=(const SyntaxTreeBase &);

  T label_;
  std::vector<DerivedType *> children_;
  DerivedType *parent_;
};

// Plain tree whose nodes carry only a label of type T.
template<typename T>
class SyntaxTree : public SyntaxTreeBase<T, SyntaxTree<T> > {
 public:
  typedef SyntaxTreeBase<T, SyntaxTree<T> > BaseType;
  SyntaxTree(const T &label) : BaseType(label) {}
  SyntaxTree(const T &label, const std::vector<SyntaxTree *> &children)
      : BaseType(label, children) {}
};

template<typename T, typename DerivedType>
SyntaxTreeBase<T, DerivedType>::~SyntaxTreeBase() {
  for (std::size_t i = 0; i < children_.size(); ++i) {
    delete children_[i];
  }
}
|
||||
|
||||
} // namespace PCFG
|
||||
} // namespace Syntax
|
||||
} // namespace MosesTraining
|
||||
|
||||
#endif
|
@ -24,7 +24,6 @@
|
||||
#include <string>
|
||||
|
||||
#include "syntax-common/numbered_set.h"
|
||||
#include "syntax_tree.h"
|
||||
|
||||
namespace MosesTraining {
|
||||
namespace Syntax {
|
||||
|
@ -1,89 +0,0 @@
|
||||
/***********************************************************************
|
||||
Moses - statistical machine translation system
|
||||
Copyright (C) 2006-2012 University of Edinburgh
|
||||
|
||||
This library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
This library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with this library; if not, write to the Free Software
|
||||
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
***********************************************************************/
|
||||
|
||||
#include "xml_tree_parser.h"
|
||||
|
||||
#include <cassert>
|
||||
#include <vector>
|
||||
|
||||
#include "tables-core.h"
|
||||
#include "XmlException.h"
|
||||
#include "XmlTree.h"
|
||||
#include "util/tokenize.hh"
|
||||
|
||||
#include "syntax-common/exception.h"
|
||||
|
||||
namespace MosesTraining {
|
||||
namespace Syntax {
|
||||
namespace PCFG {
|
||||
|
||||
// Default constructor: there is nothing to set up, every member is
// default-constructed.
XmlTreeParser::XmlTreeParser() {}
|
||||
|
||||
std::auto_ptr<PcfgTree> XmlTreeParser::Parse(const std::string &line) {
|
||||
m_line = line;
|
||||
m_tree.Clear();
|
||||
try {
|
||||
if (!ProcessAndStripXMLTags(m_line, m_tree, m_labelSet, m_topLabelSet)) {
|
||||
throw Exception("");
|
||||
}
|
||||
} catch (const XmlException &e) {
|
||||
throw Exception(e.getMsg());
|
||||
}
|
||||
m_tree.ConnectNodes();
|
||||
SyntaxNode *root = m_tree.GetTop();
|
||||
if (!root) {
|
||||
// There is no XML tree.
|
||||
return std::auto_ptr<PcfgTree>();
|
||||
}
|
||||
m_words = util::tokenize(m_line);
|
||||
return ConvertTree(*root, m_words);
|
||||
}
|
||||
|
||||
// Converts a SyntaxNode tree to a Moses::PCFG::PcfgTree.
|
||||
std::auto_ptr<PcfgTree> XmlTreeParser::ConvertTree(
|
||||
const SyntaxNode &tree,
|
||||
const std::vector<std::string> &words) {
|
||||
std::auto_ptr<PcfgTree> root(new PcfgTree(tree.GetLabel()));
|
||||
const std::vector<SyntaxNode*> &children = tree.GetChildren();
|
||||
if (children.empty()) {
|
||||
if (tree.GetStart() != tree.GetEnd()) {
|
||||
std::ostringstream msg;
|
||||
msg << "leaf node covers multiple words (" << tree.GetStart()
|
||||
<< "-" << tree.GetEnd() << "): this is currently unsupported";
|
||||
throw Exception(msg.str());
|
||||
}
|
||||
std::auto_ptr<PcfgTree> leaf(new PcfgTree(words[tree.GetStart()]));
|
||||
leaf->set_parent(root.get());
|
||||
root->AddChild(leaf.release());
|
||||
} else {
|
||||
for (std::vector<SyntaxNode*>::const_iterator p = children.begin();
|
||||
p != children.end(); ++p) {
|
||||
assert(*p);
|
||||
std::auto_ptr<PcfgTree> child = ConvertTree(**p, words);
|
||||
child->set_parent(root.get());
|
||||
root->AddChild(child.release());
|
||||
}
|
||||
}
|
||||
return root;
|
||||
}
|
||||
|
||||
} // namespace PCFG
|
||||
} // namespace Syntax
|
||||
} // namespace MosesTraining
|
@ -1,59 +0,0 @@
|
||||
/***********************************************************************
|
||||
Moses - statistical machine translation system
|
||||
Copyright (C) 2006-2012 University of Edinburgh
|
||||
|
||||
This library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
This library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with this library; if not, write to the Free Software
|
||||
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
***********************************************************************/
|
||||
|
||||
#pragma once
|
||||
#ifndef PCFG_XML_TREE_PARSER_H_
|
||||
#define PCFG_XML_TREE_PARSER_H_
|
||||
|
||||
#include <map>
|
||||
#include <memory>
|
||||
#include <set>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "pcfg_tree.h"
|
||||
#include "SyntaxNode.h"
|
||||
#include "SyntaxNodeCollection.h"
|
||||
|
||||
namespace MosesTraining {
|
||||
namespace Syntax {
|
||||
namespace PCFG {
|
||||
|
||||
// Parses a string in Moses' XML parse tree format and returns a PcfgTree
|
||||
// object.
|
||||
class XmlTreeParser {
|
||||
public:
|
||||
XmlTreeParser();
|
||||
std::auto_ptr<PcfgTree> Parse(const std::string &);
|
||||
private:
|
||||
std::auto_ptr<PcfgTree> ConvertTree(const MosesTraining::SyntaxNode &,
|
||||
const std::vector<std::string> &);
|
||||
|
||||
std::set<std::string> m_labelSet;
|
||||
std::map<std::string, int> m_topLabelSet;
|
||||
std::string m_line;
|
||||
MosesTraining::SyntaxNodeCollection m_tree;
|
||||
std::vector<std::string> m_words;
|
||||
};
|
||||
|
||||
} // namespace PCFG
|
||||
} // namespace Syntax
|
||||
} // namespace MosesTraining
|
||||
|
||||
#endif
|
@ -1,135 +0,0 @@
|
||||
/***********************************************************************
|
||||
Moses - statistical machine translation system
|
||||
Copyright (C) 2006-2012 University of Edinburgh
|
||||
|
||||
This library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
This library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with this library; if not, write to the Free Software
|
||||
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
***********************************************************************/
|
||||
|
||||
#pragma once
|
||||
#ifndef PCFG_XML_TREE_WRITER_H_
|
||||
#define PCFG_XML_TREE_WRITER_H_
|
||||
|
||||
#include <cassert>
|
||||
#include <map>
|
||||
#include <memory>
|
||||
#include <ostream>
|
||||
#include <vector>
|
||||
#include <string>
|
||||
|
||||
#include "XmlTree.h"
|
||||
|
||||
#include "syntax_tree.h"
|
||||
|
||||
namespace MosesTraining {
|
||||
namespace Syntax {
|
||||
namespace PCFG {
|
||||
|
||||
// Hooks used by XmlTreeWriter to obtain the printable parts of a tree node.
//
// Only declarations appear here; no definitions are visible in this header, so
// presumably each InputTree type provides them elsewhere (TODO confirm).
template<typename InputTree>
class XmlOutputHandler {
 public:
  typedef std::map<std::string, std::string> AttributeMap;

  // Writes the label of `tree` into `label`.
  void GetLabel(const InputTree &tree, std::string &label) const;

  // Writes the XML attributes (name -> value) of `tree` into `attributes`.
  void GetAttributes(const InputTree &tree, AttributeMap &attributes) const;
};
|
||||
|
||||
template<typename InputTree>
|
||||
class XmlTreeWriter : public XmlOutputHandler<InputTree> {
|
||||
public:
|
||||
typedef XmlOutputHandler<InputTree> Base;
|
||||
void Write(const InputTree &, std::ostream &) const;
|
||||
private:
|
||||
std::string Escape(const std::string &) const;
|
||||
};
|
||||
|
||||
template<typename InputTree>
|
||||
void XmlTreeWriter<InputTree>::Write(const InputTree &tree,
|
||||
std::ostream &out) const {
|
||||
assert(!tree.IsLeaf());
|
||||
|
||||
// Opening tag
|
||||
|
||||
std::string label;
|
||||
Base::GetLabel(tree, label);
|
||||
out << "<tree label=\"" << Escape(label) << "\"";
|
||||
|
||||
typename Base::AttributeMap attribute_map;
|
||||
Base::GetAttributes(tree, attribute_map);
|
||||
|
||||
for (typename Base::AttributeMap::const_iterator p = attribute_map.begin();
|
||||
p != attribute_map.end(); ++p) {
|
||||
out << " " << p->first << "=\"" << p->second << "\"";
|
||||
}
|
||||
|
||||
out << ">";
|
||||
|
||||
// Children
|
||||
|
||||
const std::vector<InputTree *> &children = tree.children();
|
||||
for (typename std::vector<InputTree *>::const_iterator p = children.begin();
|
||||
p != children.end(); ++p) {
|
||||
InputTree &child = **p;
|
||||
if (child.IsLeaf()) {
|
||||
Base::GetLabel(child, label);
|
||||
out << " " << Escape(label);
|
||||
} else {
|
||||
out << " ";
|
||||
Write(**p, out);
|
||||
}
|
||||
}
|
||||
|
||||
// Closing tag
|
||||
out << " </tree>";
|
||||
|
||||
if (tree.parent() == 0) {
|
||||
out << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
// Escapes XML special characters.
|
||||
template<typename InputTree>
|
||||
std::string XmlTreeWriter<InputTree>::Escape(const std::string &s) const {
|
||||
std::string t;
|
||||
std::size_t len = s.size();
|
||||
t.reserve(len);
|
||||
for (std::size_t i = 0; i < len; ++i) {
|
||||
if (s[i] == '<') {
|
||||
t += "<";
|
||||
} else if (s[i] == '>') {
|
||||
t += ">";
|
||||
} else if (s[i] == '[') {
|
||||
t += "[";
|
||||
} else if (s[i] == ']') {
|
||||
t += "]";
|
||||
} else if (s[i] == '|') {
|
||||
t += "|";
|
||||
} else if (s[i] == '&') {
|
||||
t += "&";
|
||||
} else if (s[i] == '\'') {
|
||||
t += "'";
|
||||
} else if (s[i] == '"') {
|
||||
t += """;
|
||||
} else {
|
||||
t += s[i];
|
||||
}
|
||||
}
|
||||
return t;
|
||||
}
|
||||
|
||||
} // namespace PCFG
|
||||
} // namespace Syntax
|
||||
} // namespace MosesTraining
|
||||
|
||||
#endif
|
@ -1 +1 @@
|
||||
exe pcfg-extract : [ glob *.cc ] ..//pcfg-common ../..//boost_program_options : <include>.. ;
|
||||
exe pcfg-extract : [ glob *.cc ] ..//syntax-common ..//pcfg-common ../..//boost_program_options : <include>.. ;
|
||||
|
@ -19,20 +19,6 @@
|
||||
|
||||
#include "pcfg_extract.h"
|
||||
|
||||
#include "options.h"
|
||||
#include "rule_collection.h"
|
||||
#include "rule_extractor.h"
|
||||
|
||||
#include "syntax-common/exception.h"
|
||||
|
||||
#include "pcfg-common/pcfg.h"
|
||||
#include "pcfg-common/pcfg_tree.h"
|
||||
#include "pcfg-common/syntax_tree.h"
|
||||
#include "pcfg-common/typedef.h"
|
||||
#include "pcfg-common/xml_tree_parser.h"
|
||||
|
||||
#include <boost/program_options.hpp>
|
||||
|
||||
#include <cassert>
|
||||
#include <cstdlib>
|
||||
#include <fstream>
|
||||
@ -43,6 +29,20 @@
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include <boost/program_options.hpp>
|
||||
|
||||
#include "syntax-common/exception.h"
|
||||
#include "syntax-common/xml_tree_parser.h"
|
||||
|
||||
#include "SyntaxTree.h"
|
||||
|
||||
#include "pcfg-common/pcfg.h"
|
||||
#include "pcfg-common/typedef.h"
|
||||
|
||||
#include "options.h"
|
||||
#include "rule_collection.h"
|
||||
#include "rule_extractor.h"
|
||||
|
||||
namespace MosesTraining
|
||||
{
|
||||
namespace Syntax
|
||||
@ -63,7 +63,7 @@ int PcfgExtract::Main(int argc, char *argv[])
|
||||
XmlTreeParser parser;
|
||||
std::string line;
|
||||
std::size_t line_num = 0;
|
||||
std::auto_ptr<PcfgTree> tree;
|
||||
std::auto_ptr<MosesTraining::SyntaxTree> tree;
|
||||
while (std::getline(std::cin, line)) {
|
||||
++line_num;
|
||||
try {
|
||||
|
@ -19,8 +19,6 @@
|
||||
|
||||
#include "rule_extractor.h"
|
||||
|
||||
#include "pcfg-common/pcfg_tree.h"
|
||||
|
||||
namespace MosesTraining
|
||||
{
|
||||
namespace Syntax
|
||||
@ -33,21 +31,21 @@ RuleExtractor::RuleExtractor(Vocabulary &non_term_vocab)
|
||||
{
|
||||
}
|
||||
|
||||
void RuleExtractor::Extract(const PcfgTree &tree, RuleCollection &rc) const
|
||||
void RuleExtractor::Extract(const SyntaxTree &tree, RuleCollection &rc) const
|
||||
{
|
||||
if (tree.IsPreterminal() || tree.IsLeaf()) {
|
||||
if (tree.IsLeaf() || tree.children()[0]->IsLeaf()) {
|
||||
return;
|
||||
}
|
||||
|
||||
std::size_t lhs = non_term_vocab_.Insert(tree.label());
|
||||
std::size_t lhs = non_term_vocab_.Insert(tree.value().label);
|
||||
std::vector<std::size_t> rhs;
|
||||
|
||||
const std::vector<PcfgTree *> &children = tree.children();
|
||||
const std::vector<SyntaxTree *> &children = tree.children();
|
||||
rhs.reserve(children.size());
|
||||
for (std::vector<PcfgTree *>::const_iterator p(children.begin());
|
||||
for (std::vector<SyntaxTree *>::const_iterator p(children.begin());
|
||||
p != children.end(); ++p) {
|
||||
const PcfgTree &child = **p;
|
||||
rhs.push_back(non_term_vocab_.Insert(child.label()));
|
||||
const SyntaxTree &child = **p;
|
||||
rhs.push_back(non_term_vocab_.Insert(child.value().label));
|
||||
Extract(child, rc);
|
||||
}
|
||||
rc.Add(lhs, rhs);
|
||||
|
@ -21,6 +21,8 @@
|
||||
#ifndef PCFG_EXTRACT_RULE_EXTRACTOR_H_
|
||||
#define PCFG_EXTRACT_RULE_EXTRACTOR_H_
|
||||
|
||||
#include "SyntaxTree.h"
|
||||
|
||||
#include "pcfg-common/typedef.h"
|
||||
|
||||
#include "rule_collection.h"
|
||||
@ -32,14 +34,12 @@ namespace Syntax
|
||||
namespace PCFG
|
||||
{
|
||||
|
||||
class PcfgTree;
|
||||
|
||||
// Extracts PCFG rules from syntax trees and adds them to a RuleCollection.
|
||||
class RuleExtractor
|
||||
{
|
||||
public:
|
||||
RuleExtractor(Vocabulary &);
|
||||
void Extract(const PcfgTree &, RuleCollection &) const;
|
||||
void Extract(const MosesTraining::SyntaxTree &, RuleCollection &) const;
|
||||
private:
|
||||
Vocabulary &non_term_vocab_;
|
||||
};
|
||||
|
@ -33,13 +33,14 @@
|
||||
|
||||
#include <boost/program_options.hpp>
|
||||
|
||||
#include "SyntaxTree.h"
|
||||
|
||||
#include "syntax-common/exception.h"
|
||||
#include "syntax-common/xml_tree_parser.h"
|
||||
#include "syntax-common/xml_tree_writer.h"
|
||||
|
||||
#include "pcfg-common/pcfg.h"
|
||||
#include "pcfg-common/pcfg_tree.h"
|
||||
#include "pcfg-common/syntax_tree.h"
|
||||
#include "pcfg-common/typedef.h"
|
||||
#include "pcfg-common/xml_tree_parser.h"
|
||||
|
||||
namespace MosesTraining
|
||||
{
|
||||
@ -66,14 +67,14 @@ int PcfgScore::Main(int argc, char *argv[])
|
||||
// Score corpus according to PCFG.
|
||||
TreeScorer scorer(pcfg, non_term_vocab);
|
||||
XmlTreeParser parser;
|
||||
XmlTreeWriter<PcfgTree> writer;
|
||||
XmlTreeWriter writer(std::cout);
|
||||
std::string line;
|
||||
std::size_t line_num = 0;
|
||||
std::auto_ptr<PcfgTree> tree;
|
||||
std::auto_ptr<SyntaxTree> tree;
|
||||
while (std::getline(std::cin, line)) {
|
||||
++line_num;
|
||||
try {
|
||||
tree = parser.Parse(line);
|
||||
tree = parser.Parse(line, true);
|
||||
} catch (Exception &e) {
|
||||
std::ostringstream msg;
|
||||
msg << "line " << line_num << ": " << e.msg();
|
||||
@ -93,7 +94,7 @@ int PcfgScore::Main(int argc, char *argv[])
|
||||
std::cout << line << std::endl;
|
||||
continue;
|
||||
}
|
||||
writer.Write(*tree, std::cout);
|
||||
writer.Write(*tree);
|
||||
}
|
||||
|
||||
return 0;
|
||||
|
@ -20,6 +20,7 @@
|
||||
#include "tree_scorer.h"
|
||||
|
||||
#include <cassert>
|
||||
#include <sstream>
|
||||
|
||||
namespace MosesTraining
|
||||
{
|
||||
@ -34,30 +35,41 @@ TreeScorer::TreeScorer(const Pcfg &pcfg, const Vocabulary &non_term_vocab)
|
||||
{
|
||||
}
|
||||
|
||||
bool TreeScorer::Score(PcfgTree &root) const
|
||||
bool TreeScorer::Score(SyntaxTree &root)
|
||||
{
|
||||
if (root.IsPreterminal() || root.IsLeaf()) {
|
||||
scores_.clear();
|
||||
ZeroScores(root);
|
||||
if (!CalcScores(root)) {
|
||||
return false;
|
||||
}
|
||||
SetAttributes(root);
|
||||
return true;
|
||||
}
|
||||
|
||||
bool TreeScorer::CalcScores(SyntaxTree &root)
|
||||
{
|
||||
if (root.IsLeaf() || root.children()[0]->IsLeaf()) {
|
||||
return true;
|
||||
}
|
||||
|
||||
const std::vector<PcfgTree *> &children = root.children();
|
||||
const std::vector<SyntaxTree *> &children = root.children();
|
||||
|
||||
double log_prob = 0.0;
|
||||
|
||||
std::vector<std::size_t> key;
|
||||
key.reserve(children.size()+1);
|
||||
key.push_back(non_term_vocab_.Lookup(root.label()));
|
||||
key.push_back(non_term_vocab_.Lookup(root.value().label));
|
||||
|
||||
for (std::vector<PcfgTree *>::const_iterator p(children.begin());
|
||||
for (std::vector<SyntaxTree *>::const_iterator p(children.begin());
|
||||
p != children.end(); ++p) {
|
||||
PcfgTree *child = *p;
|
||||
SyntaxTree *child = *p;
|
||||
assert(!child->IsLeaf());
|
||||
key.push_back(non_term_vocab_.Lookup(child->label()));
|
||||
if (!Score(*child)) {
|
||||
key.push_back(non_term_vocab_.Lookup(child->value().label));
|
||||
if (!CalcScores(*child)) {
|
||||
return false;
|
||||
}
|
||||
if (!child->IsPreterminal()) {
|
||||
log_prob += child->score();
|
||||
if (!child->children()[0]->IsLeaf()) {
|
||||
log_prob += scores_[child];
|
||||
}
|
||||
}
|
||||
double rule_score;
|
||||
@ -66,10 +78,42 @@ bool TreeScorer::Score(PcfgTree &root) const
|
||||
return false;
|
||||
}
|
||||
log_prob += rule_score;
|
||||
root.set_score(log_prob);
|
||||
scores_[&root] = log_prob;
|
||||
return true;
|
||||
}
|
||||
|
||||
// Copies the scores held in scores_ into the "pcfg" attribute of the nodes in
// the subtree rooted at root.  Leaves and nodes whose first child is a leaf
// are skipped together with everything below them; a score of exactly 0.0 is
// not written.
void TreeScorer::SetAttributes(SyntaxTree &root)
{
  // Terminals don't need attributes, and neither do preterminals (they have
  // the implicit score 0.0).
  if (root.IsLeaf() || root.children()[0]->IsLeaf()) {
    return;
  }

  // Note: operator[] creates a 0.0 entry if this node has none yet.
  const double nodeScore = scores_[&root];
  if (nodeScore != 0.0) {
    std::ostringstream text;
    text << nodeScore;
    root.value().attributes["pcfg"] = text.str();
  }

  // Recurse into every child.
  std::vector<SyntaxTree *>::const_iterator child = root.children().begin();
  while (child != root.children().end()) {
    SetAttributes(**child);
    ++child;
  }
}
|
||||
|
||||
// Sets the scores_ entry of root and of every node below it to zero
// (creating entries that do not exist yet).
void TreeScorer::ZeroScores(SyntaxTree &root)
{
  scores_[&root] = 0.0f;

  const std::vector<SyntaxTree *> &kids = root.children();
  for (std::size_t i = 0; i < kids.size(); ++i) {
    ZeroScores(*kids[i]);
  }
}
|
||||
|
||||
} // namespace PCFG
|
||||
} // namespace Syntax
|
||||
} // namespace MosesTraining
|
||||
|
@ -21,8 +21,9 @@
|
||||
#ifndef PCFG_SCORE_TREE_SCORER_H_
|
||||
#define PCFG_SCORE_TREE_SCORER_H_
|
||||
|
||||
#include "SyntaxTree.h"
|
||||
|
||||
#include "pcfg-common/pcfg.h"
|
||||
#include "pcfg-common/pcfg_tree.h"
|
||||
#include "pcfg-common/typedef.h"
|
||||
|
||||
namespace MosesTraining
|
||||
@ -39,11 +40,16 @@ public:
|
||||
|
||||
// Score tree according to PCFG. Returns false if unsuccessful (due to
|
||||
// missing rule).
|
||||
bool Score(PcfgTree &) const;
|
||||
bool Score(SyntaxTree &);
|
||||
|
||||
private:
|
||||
const Pcfg &pcfg_;
|
||||
const Vocabulary &non_term_vocab_;
|
||||
std::map<SyntaxTree *, double> scores_;
|
||||
|
||||
bool CalcScores(SyntaxTree &);
|
||||
void SetAttributes(SyntaxTree &);
|
||||
void ZeroScores(SyntaxTree &);
|
||||
};
|
||||
|
||||
} // namespace PCFG
|
||||
|
@ -50,7 +50,7 @@ int main(int argc, char* argv[])
|
||||
// output tree
|
||||
// cerr << "BEFORE:" << endl << tree;
|
||||
|
||||
ParentNodes parents = tree.Parse();
|
||||
ParentNodes parents = determineSplitPoints(tree);
|
||||
|
||||
// execute selected grammar relaxation schemes
|
||||
if (leftBinarizeFlag)
|
||||
@ -118,9 +118,9 @@ void store( SyntaxNodeCollection &tree, const vector< string > &words )
|
||||
// output tree nodes
|
||||
vector< SyntaxNode* > nodes = tree.GetAllNodes();
|
||||
for( size_t i=0; i<nodes.size(); i++ ) {
|
||||
cout << " <tree span=\"" << nodes[i]->GetStart()
|
||||
<< "-" << nodes[i]->GetEnd()
|
||||
<< "\" label=\"" << nodes[i]->GetLabel()
|
||||
cout << " <tree span=\"" << nodes[i]->start
|
||||
<< "-" << nodes[i]->end
|
||||
<< "\" label=\"" << nodes[i]->label
|
||||
<< "\"/>";
|
||||
}
|
||||
cout << endl;
|
||||
@ -133,7 +133,7 @@ void LeftBinarize( SyntaxNodeCollection &tree, ParentNodes &parents )
|
||||
if (point.size() > 3) {
|
||||
const vector< SyntaxNode* >& topNodes
|
||||
= tree.GetNodes( point[0], point[point.size()-1]-1);
|
||||
string topLabel = topNodes[0]->GetLabel();
|
||||
string topLabel = topNodes[0]->label;
|
||||
|
||||
for(size_t i=2; i<point.size()-1; i++) {
|
||||
// cerr << "LeftBin " << point[0] << "-" << (point[point.size()-1]-1) << ": " << point[0] << "-" << point[i]-1 << " ^" << topLabel << endl;
|
||||
@ -151,7 +151,7 @@ void RightBinarize( SyntaxNodeCollection &tree, ParentNodes &parents )
|
||||
int endPoint = point[point.size()-1]-1;
|
||||
const vector< SyntaxNode* >& topNodes
|
||||
= tree.GetNodes( point[0], endPoint);
|
||||
string topLabel = topNodes[0]->GetLabel();
|
||||
string topLabel = topNodes[0]->label;
|
||||
|
||||
for(size_t i=1; i<point.size()-2; i++) {
|
||||
// cerr << "RightBin " << point[0] << "-" << (point[point.size()-1]-1) << ": " << point[i] << "-" << endPoint << " ^" << topLabel << endl;
|
||||
@ -178,29 +178,29 @@ void SAMT( SyntaxNodeCollection &tree, ParentNodes &parents )
|
||||
// cerr << endl;
|
||||
|
||||
for(size_t i = 0; i+2 < point.size(); i++) {
|
||||
// cerr << "\tadding " << point[i] << ";" << point[i+1] << ";" << (point[i+2]-1) << ": " << tree.GetNodes(point[i ],point[i+1]-1)[0]->GetLabel() << "+" << tree.GetNodes(point[i+1],point[i+2]-1)[0]->GetLabel() << endl;
|
||||
// cerr << "\tadding " << point[i] << ";" << point[i+1] << ";" << (point[i+2]-1) << ": " << tree.GetNodes(point[i ],point[i+1]-1)[0]->label << "+" << tree.GetNodes(point[i+1],point[i+2]-1)[0]->label << endl;
|
||||
|
||||
newTree.AddNode( point[i],point[i+2]-1,
|
||||
tree.GetNodes(point[i ],point[i+1]-1)[0]->GetLabel()
|
||||
tree.GetNodes(point[i ],point[i+1]-1)[0]->label
|
||||
+ "+" +
|
||||
tree.GetNodes(point[i+1],point[i+2]-1)[0]->GetLabel() );
|
||||
tree.GetNodes(point[i+1],point[i+2]-1)[0]->label);
|
||||
}
|
||||
}
|
||||
if (point.size() >= 4) {
|
||||
int ps = point.size();
|
||||
string topLabel = tree.GetNodes(point[0],point[ps-1]-1)[0]->GetLabel();
|
||||
string topLabel = tree.GetNodes(point[0],point[ps-1]-1)[0]->label;
|
||||
|
||||
// cerr << "\tadding " << topLabel + "\\" + tree.GetNodes(point[0],point[1]-1)[0]->GetLabel() << endl;
|
||||
// cerr << "\tadding " << topLabel + "\\" + tree.GetNodes(point[0],point[1]-1)[0]->label << endl;
|
||||
newTree.AddNode( point[1],point[ps-1]-1,
|
||||
topLabel
|
||||
+ "\\" +
|
||||
tree.GetNodes(point[0],point[1]-1)[0]->GetLabel() );
|
||||
tree.GetNodes(point[0],point[1]-1)[0]->label );
|
||||
|
||||
// cerr << "\tadding " << topLabel + "/" + tree.GetNodes(point[ps-2],point[ps-1]-1)[0]->GetLabel() << endl;
|
||||
// cerr << "\tadding " << topLabel + "/" + tree.GetNodes(point[ps-2],point[ps-1]-1)[0]->label << endl;
|
||||
newTree.AddNode( point[0],point[ps-2]-1,
|
||||
topLabel
|
||||
+ "/" +
|
||||
tree.GetNodes(point[ps-2],point[ps-1]-1)[0]->GetLabel() );
|
||||
tree.GetNodes(point[ps-2],point[ps-1]-1)[0]->label );
|
||||
}
|
||||
}
|
||||
|
||||
@ -219,12 +219,12 @@ void SAMT( SyntaxNodeCollection &tree, ParentNodes &parents )
|
||||
|
||||
for(int mid=start+1; mid<=end && !done; mid++) {
|
||||
if (tree.HasNode(start,mid-1) && tree.HasNode(mid,end)) {
|
||||
// cerr << "\tadding " << tree.GetNodes(start,mid-1)[0]->GetLabel() << "++" << tree.GetNodes(mid, end )[0]->GetLabel() << endl;
|
||||
// cerr << "\tadding " << tree.GetNodes(start,mid-1)[0]->label << "++" << tree.GetNodes(mid, end )[0]->label << endl;
|
||||
|
||||
newTree.AddNode( start, end,
|
||||
tree.GetNodes(start,mid-1)[0]->GetLabel()
|
||||
tree.GetNodes(start,mid-1)[0]->label
|
||||
+ "++" +
|
||||
tree.GetNodes(mid, end )[0]->GetLabel() );
|
||||
tree.GetNodes(mid, end )[0]->label );
|
||||
done = true;
|
||||
}
|
||||
}
|
||||
@ -234,9 +234,9 @@ void SAMT( SyntaxNodeCollection &tree, ParentNodes &parents )
|
||||
for(int postEnd=end+1; postEnd<numWords && !done; postEnd++) {
|
||||
if (tree.HasNode(start,postEnd) && tree.HasNode(end+1,postEnd)) {
|
||||
newTree.AddNode( start, end,
|
||||
tree.GetNodes(start,postEnd)[0]->GetLabel()
|
||||
tree.GetNodes(start,postEnd)[0]->label
|
||||
+ "//" +
|
||||
tree.GetNodes(end+1,postEnd)[0]->GetLabel() );
|
||||
tree.GetNodes(end+1,postEnd)[0]->label );
|
||||
done = true;
|
||||
}
|
||||
}
|
||||
@ -245,11 +245,11 @@ void SAMT( SyntaxNodeCollection &tree, ParentNodes &parents )
|
||||
// if matching a constituent A left-minus constituent B: use A\\B
|
||||
for(int preStart=start-1; preStart>=0; preStart--) {
|
||||
if (tree.HasNode(preStart,end) && tree.HasNode(preStart,start-1)) {
|
||||
// cerr << "\tadding " << tree.GetNodes(preStart,end )[0]->GetLabel() << "\\\\" <<tree.GetNodes(preStart,start-1)[0]->GetLabel() << endl;
|
||||
// cerr << "\tadding " << tree.GetNodes(preStart,end )[0]->label << "\\\\" <<tree.GetNodes(preStart,start-1)[0]->label << endl;
|
||||
newTree.AddNode( start, end,
|
||||
tree.GetNodes(preStart,end )[0]->GetLabel()
|
||||
tree.GetNodes(preStart,end )[0]->label
|
||||
+ "\\\\" +
|
||||
tree.GetNodes(preStart,start-1)[0]->GetLabel() );
|
||||
tree.GetNodes(preStart,start-1)[0]->label );
|
||||
done = true;
|
||||
}
|
||||
}
|
||||
@ -268,6 +268,48 @@ void SAMT( SyntaxNodeCollection &tree, ParentNodes &parents )
|
||||
// adding all new nodes
|
||||
vector< SyntaxNode* > nodes = newTree.GetAllNodes();
|
||||
for( size_t i=0; i<nodes.size(); i++ ) {
|
||||
tree.AddNode( nodes[i]->GetStart(), nodes[i]->GetEnd(), nodes[i]->GetLabel());
|
||||
tree.AddNode( nodes[i]->start, nodes[i]->end, nodes[i]->label);
|
||||
}
|
||||
}
|
||||
|
||||
// For every span of two or more words that has a node in nodeColl, records
// how that span is divided among child spans.
//
// Each entry of the result starts with the span's start position, followed by
// the position just past each child found.  Children are found greedily from
// left to right: at every step the longest span that starts at the first
// uncovered word and has a node is taken.  The very first child is not
// allowed to cover the whole parent span.  The search for a span stops early
// if no child can be found at the current position.
ParentNodes determineSplitPoints(const SyntaxNodeCollection &nodeColl)
{
  ParentNodes parents;

  const std::size_t numWords = nodeColl.GetNumWords();

  // Visit all spans of size >= 2, shorter spans first.
  for (int length = 2; static_cast<std::size_t>(length) <= numWords;
       ++length) {
    const std::size_t lastStart = numWords - static_cast<std::size_t>(length);
    for (int startPos = 0; static_cast<std::size_t>(startPos) <= lastStart;
         ++startPos) {
      if (!nodeColl.HasNode(startPos, startPos + length - 1)) {
        continue;
      }

      // Processing one (parent) span.
      SplitPoints splitPoints;
      splitPoints.push_back(startPos);

      bool seekingFirstChild = true;
      bool madeProgress = true;  // the loop ends once a pass finds nothing
      int covered = 0;
      while (covered < length && madeProgress) {
        madeProgress = false;
        // Find the largest covering subspan (child) that starts at the last
        // covered position.  For the first child the full parent span itself
        // is excluded.
        const int longest = seekingFirstChild ? length - 1 : length;
        for (int midPos = longest; midPos > covered; --midPos) {
          if (nodeColl.HasNode(startPos + covered, startPos + midPos - 1)) {
            covered = midPos;
            splitPoints.push_back(startPos + covered);
            seekingFirstChild = false;
            madeProgress = true;
            break;  // no shorter candidate remains once covered == midPos
          }
        }
      }

      parents.push_back(splitPoints);
    }
  }

  return parents;
}
|
||||
|
@ -37,10 +37,14 @@ bool leftBinarizeFlag = false;
|
||||
bool rightBinarizeFlag = false;
|
||||
char SAMTLevel = 0;
|
||||
|
||||
typedef std::vector< int > SplitPoints;
|
||||
typedef std::vector< SplitPoints > ParentNodes;
|
||||
|
||||
// functions
|
||||
void init(int argc, char* argv[]);
|
||||
ParentNodes determineSplitPoints(const MosesTraining::SyntaxNodeCollection &);
|
||||
void store( MosesTraining::SyntaxNodeCollection &tree, const std::vector<std::string> &words );
|
||||
void LeftBinarize( MosesTraining::SyntaxNodeCollection &tree, MosesTraining::ParentNodes &parents );
|
||||
void RightBinarize( MosesTraining::SyntaxNodeCollection &tree, MosesTraining::ParentNodes &parents );
|
||||
void SAMT( MosesTraining::SyntaxNodeCollection &tree, MosesTraining::ParentNodes &parents );
|
||||
void LeftBinarize( MosesTraining::SyntaxNodeCollection &tree, ParentNodes &parents );
|
||||
void RightBinarize( MosesTraining::SyntaxNodeCollection &tree, ParentNodes &parents );
|
||||
void SAMT( MosesTraining::SyntaxNodeCollection &tree, ParentNodes &parents );
|
||||
|
||||
|
@ -10,30 +10,26 @@
|
||||
#include "XmlException.h"
|
||||
#include "XmlTree.h"
|
||||
|
||||
#include "exception.h"
|
||||
|
||||
namespace MosesTraining {
|
||||
namespace Syntax {
|
||||
|
||||
XmlTreeParser::XmlTreeParser(std::set<std::string> &labelSet,
|
||||
std::map<std::string, int> &topLabelSet)
|
||||
: label_set_(labelSet)
|
||||
, top_label_set_(topLabelSet)
|
||||
std::auto_ptr<SyntaxTree> XmlTreeParser::Parse(const std::string &line,
|
||||
bool unescape)
|
||||
{
|
||||
}
|
||||
|
||||
std::auto_ptr<SyntaxTree> XmlTreeParser::Parse(const std::string &line)
|
||||
{
|
||||
line_ = line;
|
||||
sentence_ = line;
|
||||
node_collection_.Clear();
|
||||
try {
|
||||
if (!ProcessAndStripXMLTags(line_, node_collection_, label_set_,
|
||||
top_label_set_, false)) {
|
||||
if (!ProcessAndStripXMLTags(sentence_, node_collection_, label_set_,
|
||||
top_label_set_, unescape)) {
|
||||
throw Exception("");
|
||||
}
|
||||
} catch (const XmlException &e) {
|
||||
throw Exception(e.getMsg());
|
||||
}
|
||||
std::auto_ptr<SyntaxTree> root = node_collection_.ExtractTree();
|
||||
words_ = util::tokenize(line_);
|
||||
words_ = util::tokenize(sentence_);
|
||||
AttachWords(words_, *root);
|
||||
return root;
|
||||
}
|
||||
@ -51,15 +47,15 @@ void XmlTreeParser::AttachWords(const std::vector<std::string> &words,
|
||||
for (std::vector<SyntaxTree*>::iterator p = leaves.begin(); p != leaves.end();
|
||||
++p) {
|
||||
SyntaxTree *leaf = *p;
|
||||
const int start = leaf->value().GetStart();
|
||||
const int end = leaf->value().GetEnd();
|
||||
const int start = leaf->value().start;
|
||||
const int end = leaf->value().end;
|
||||
if (start != end) {
|
||||
std::ostringstream msg;
|
||||
msg << "leaf node covers multiple words (" << start << "-" << end
|
||||
<< "): this is currently unsupported";
|
||||
throw Exception(msg.str());
|
||||
}
|
||||
SyntaxTree *newLeaf = new SyntaxTree(SyntaxNode(start, end, *q++));
|
||||
SyntaxTree *newLeaf = new SyntaxTree(SyntaxNode(*q++, start, end));
|
||||
leaf->children().push_back(newLeaf);
|
||||
newLeaf->parent() = leaf;
|
||||
}
|
||||
|
@ -6,39 +6,52 @@
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "SyntaxNode.h"
|
||||
#include "SyntaxNodeCollection.h"
|
||||
#include "SyntaxTree.h"
|
||||
|
||||
#include "exception.h"
|
||||
|
||||
namespace MosesTraining {
|
||||
namespace Syntax {
|
||||
|
||||
// Parses a string in Moses' XML parse tree format and returns a SyntaxTree
|
||||
// object. This is a wrapper around the ProcessAndStripXMLTags function.
|
||||
/** Parses string representations of parse trees in Moses' XML format and
|
||||
* converts them to SyntaxTree objects.
|
||||
*
|
||||
* This is a thin wrapper around the ProcessAndStripXMLTags function. After
|
||||
* calling Parse(), the output of the ProcessAndStripXMLTags function (the
|
||||
* sentence, node collection, label set, and top label set) are available via
|
||||
* accessors.
|
||||
*/
|
||||
class XmlTreeParser {
|
||||
public:
|
||||
XmlTreeParser(std::set<std::string> &, std::map<std::string, int> &);
|
||||
//! Parse a single sentence and return a SyntaxTree (with words attached).
|
||||
std::auto_ptr<SyntaxTree> Parse(const std::string &, bool unescape=false);
|
||||
|
||||
std::auto_ptr<SyntaxTree> Parse(const std::string &);
|
||||
//! Get the sentence string (as returned by ProcessAndStripXMLTags).
|
||||
const std::string &sentence() const { return sentence_; }
|
||||
|
||||
const std::vector<std::string>& GetWords() {
|
||||
return words_;
|
||||
}
|
||||
//! Get the sentence as a vector of words.
|
||||
const std::vector<std::string> &words() const { return words_; }
|
||||
|
||||
const SyntaxNodeCollection &GetNodeCollection() const {
|
||||
//! Get the node collection (as returned by ProcessAndStripXMLTags).
|
||||
const SyntaxNodeCollection &node_collection() const {
|
||||
return node_collection_;
|
||||
}
|
||||
|
||||
private:
|
||||
std::set<std::string> &label_set_;
|
||||
std::map<std::string, int> &top_label_set_;
|
||||
std::string line_;
|
||||
SyntaxNodeCollection node_collection_;
|
||||
std::vector<std::string> words_;
|
||||
//! Get the label set (as returned by ProcessAndStripXMLTags).
|
||||
const std::set<std::string> &label_set() const { return label_set_; }
|
||||
|
||||
//! Get the top label set (as returned by ProcessAndStripXMLTags).
|
||||
const std::map<std::string, int> &top_label_set() const {
|
||||
return top_label_set_;
|
||||
}
|
||||
|
||||
private:
|
||||
void AttachWords(const std::vector<std::string> &, SyntaxTree &);
|
||||
|
||||
std::string sentence_;
|
||||
SyntaxNodeCollection node_collection_;
|
||||
std::set<std::string> label_set_;
|
||||
std::map<std::string, int> top_label_set_;
|
||||
std::vector<std::string> words_;
|
||||
};
|
||||
|
||||
} // namespace Syntax
|
||||
|
82
phrase-extract/syntax-common/xml_tree_writer.cc
Normal file
82
phrase-extract/syntax-common/xml_tree_writer.cc
Normal file
@ -0,0 +1,82 @@
|
||||
#include "xml_tree_writer.h"
|
||||
|
||||
#include <cassert>
|
||||
#include <ostream>
|
||||
#include <vector>
|
||||
#include <string>
|
||||
|
||||
#include "SyntaxTree.h"
|
||||
#include "XmlTree.h"
|
||||
|
||||
|
||||
namespace MosesTraining {
|
||||
namespace Syntax {
|
||||
|
||||
void XmlTreeWriter::Write(const SyntaxTree &tree) const {
|
||||
assert(!tree.IsLeaf());
|
||||
|
||||
// Opening tag
|
||||
out_ << "<tree label=\"" << Escape(tree.value().label) << "\"";
|
||||
for (SyntaxNode::AttributeMap::const_iterator
|
||||
p = tree.value().attributes.begin();
|
||||
p != tree.value().attributes.end(); ++p) {
|
||||
if (p->first != "label") {
|
||||
out_ << " " << p->first << "=\"" << p->second << "\"";
|
||||
}
|
||||
}
|
||||
out_ << ">";
|
||||
|
||||
// Children
|
||||
for (std::vector<SyntaxTree *>::const_iterator p = tree.children().begin();
|
||||
p != tree.children().end(); ++p) {
|
||||
SyntaxTree &child = **p;
|
||||
if (child.IsLeaf()) {
|
||||
out_ << " " << Escape(child.value().label);
|
||||
} else {
|
||||
out_ << " ";
|
||||
Write(child);
|
||||
}
|
||||
}
|
||||
|
||||
// Closing tag
|
||||
out_ << " </tree>";
|
||||
|
||||
if (tree.parent() == 0) {
|
||||
out_ << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
// Escapes XML special characters.
|
||||
std::string XmlTreeWriter::Escape(const std::string &s) const {
|
||||
if (!escape_) {
|
||||
return s;
|
||||
}
|
||||
std::string t;
|
||||
std::size_t len = s.size();
|
||||
t.reserve(len);
|
||||
for (std::size_t i = 0; i < len; ++i) {
|
||||
if (s[i] == '<') {
|
||||
t += "<";
|
||||
} else if (s[i] == '>') {
|
||||
t += ">";
|
||||
} else if (s[i] == '[') {
|
||||
t += "[";
|
||||
} else if (s[i] == ']') {
|
||||
t += "]";
|
||||
} else if (s[i] == '|') {
|
||||
t += "|";
|
||||
} else if (s[i] == '&') {
|
||||
t += "&";
|
||||
} else if (s[i] == '\'') {
|
||||
t += "'";
|
||||
} else if (s[i] == '"') {
|
||||
t += """;
|
||||
} else {
|
||||
t += s[i];
|
||||
}
|
||||
}
|
||||
return t;
|
||||
}
|
||||
|
||||
} // namespace Syntax
|
||||
} // namespace MosesTraining
|
27
phrase-extract/syntax-common/xml_tree_writer.h
Normal file
27
phrase-extract/syntax-common/xml_tree_writer.h
Normal file
@ -0,0 +1,27 @@
|
||||
#pragma once
|
||||
|
||||
#include <ostream>
|
||||
#include <string>
|
||||
|
||||
#include "SyntaxTree.h"
|
||||
|
||||
namespace MosesTraining {
|
||||
namespace Syntax {
|
||||
|
||||
class XmlTreeWriter {
|
||||
public:
|
||||
XmlTreeWriter(std::ostream &out, bool escape=true)
|
||||
: out_(out)
|
||||
, escape_(escape) {}
|
||||
|
||||
void Write(const SyntaxTree &) const;
|
||||
|
||||
private:
|
||||
std::string Escape(const std::string &) const;
|
||||
|
||||
std::ostream &out_;
|
||||
bool escape_;
|
||||
};
|
||||
|
||||
} // namespace Syntax
|
||||
} // namespace MosesTraining
|
Loading…
Reference in New Issue
Block a user