Ongoing moses/phrase-extract refactoring

This commit is contained in:
Phil Williams 2015-06-02 15:23:41 +01:00
parent 5ece895ab4
commit 2f04d4a56e
12 changed files with 91 additions and 228 deletions

View File

@ -30,6 +30,8 @@
#include <boost/program_options.hpp>
#include "syntax-common/xml_tree_parser.h"
#include "InputFileStream.h"
#include "OutputFileStream.h"
#include "SyntaxNode.h"
@ -50,7 +52,6 @@
#include "Span.h"
#include "StsgRule.h"
#include "StsgRuleWriter.h"
#include "XmlTreeParser.h"
namespace MosesTraining
{
@ -138,8 +139,8 @@ int ExtractGHKM::Main(int argc, char *argv[])
std::string sourceLine;
std::string alignmentLine;
Alignment alignment;
XmlTreeParser targetXmlTreeParser(targetLabelSet, targetTopLabelSet);
XmlTreeParser sourceXmlTreeParser(sourceLabelSet, sourceTopLabelSet);
Syntax::XmlTreeParser targetXmlTreeParser(targetLabelSet, targetTopLabelSet);
Syntax::XmlTreeParser sourceXmlTreeParser(sourceLabelSet, sourceTopLabelSet);
ScfgRuleWriter scfgWriter(fwdExtractStream, invExtractStream, options);
StsgRuleWriter stsgWriter(fwdExtractStream, invExtractStream, options);
size_t lineNum = options.sentenceOffset;

View File

@ -1,90 +0,0 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2011 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#include "XmlTreeParser.h"
#include <cassert>
#include <vector>
#include "util/tokenize.hh"
#include "SyntaxTree.h"
#include "tables-core.h"
#include "XmlException.h"
#include "XmlTree.h"
namespace MosesTraining
{
namespace GHKM
{
// Binds the parser to the caller's label containers. Both arguments are kept
// as references (see m_labelSet / m_topLabelSet) rather than copied, and are
// the containers later handed to ProcessAndStripXMLTags by Parse().
XmlTreeParser::XmlTreeParser(std::set<std::string> &labelSet,
                             std::map<std::string, int> &topLabelSet)
  : m_labelSet(labelSet), m_topLabelSet(topLabelSet) {}
std::auto_ptr<SyntaxTree> XmlTreeParser::Parse(const std::string &line)
{
m_line = line;
m_nodeCollection.Clear();
try {
if (!ProcessAndStripXMLTags(m_line, m_nodeCollection, m_labelSet,
m_topLabelSet, false)) {
throw Exception("");
}
} catch (const XmlException &e) {
throw Exception(e.getMsg());
}
std::auto_ptr<SyntaxTree> root = m_nodeCollection.ExtractTree();
m_words = util::tokenize(m_line);
AttachWords(m_words, *root);
return root;
}
// Gives every leaf of the tree rooted at `root` a new child node that holds a
// word. Leaves are visited in SyntaxTree::LeafIterator order and paired
// positionally with `words`: the first leaf receives words[0], the second
// words[1], and so on. The new child reuses the leaf's start/end values.
//
// Throws Exception if a leaf's start and end differ (multi-word leaves are
// unsupported), or if the tree has more leaves than there are words.
void XmlTreeParser::AttachWords(const std::vector<std::string> &words,
                                SyntaxTree &root)
{
  // Collect the leaves up front; presumably this is done so that adding
  // children below does not interfere with the leaf iteration itself.
  std::vector<SyntaxTree*> leaves;
  leaves.reserve(words.size());
  for (SyntaxTree::LeafIterator p(root); p != SyntaxTree::LeafIterator(); ++p) {
    leaves.push_back(&*p);
  }

  std::vector<std::string>::const_iterator q = words.begin();
  for (std::vector<SyntaxTree*>::iterator p = leaves.begin(); p != leaves.end();
       ++p) {
    SyntaxTree *leaf = *p;
    const int start = leaf->value().GetStart();
    const int end = leaf->value().GetEnd();
    if (start != end) {
      std::ostringstream msg;
      msg << "leaf node covers multiple words (" << start << "-" << end
          << "): this is currently unsupported";
      throw Exception(msg.str());
    }
    // Guard the positional pairing: dereferencing q once the words are
    // exhausted would read past the end of the vector.
    if (q == words.end()) {
      std::ostringstream msg;
      msg << "tree has more leaf nodes (" << leaves.size()
          << ") than words (" << words.size() << ")";
      throw Exception(msg.str());
    }
    SyntaxTree *newLeaf = new SyntaxTree(SyntaxNode(start, end, *q));
    ++q;
    leaf->children().push_back(newLeaf);
    newLeaf->parent() = leaf;
  }
}
} // namespace GHKM
} // namespace MosesTraining

View File

@ -1,71 +0,0 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2011 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#pragma once
#ifndef EXTRACT_GHKM_XML_TREE_PARSER_H_
#define EXTRACT_GHKM_XML_TREE_PARSER_H_
#include <map>
#include <memory>
#include <set>
#include <string>
#include <vector>
#include "SyntaxNode.h"
#include "SyntaxNodeCollection.h"
#include "SyntaxTree.h"
#include "Exception.h"
namespace MosesTraining
{
namespace GHKM
{
// Parses a string in Moses' XML parse tree format and returns a SyntaxTree
// object.
//
// The label set and top-label map given to the constructor are stored as
// references, so both must outlive the parser. One parser object can be
// reused for many lines: Parse() overwrites the stored line, node collection
// and word list on every call.
class XmlTreeParser
{
public:
  // Binds the parser to the caller's label set and top-label map.
  XmlTreeParser(std::set<std::string> &, std::map<std::string, int> &);

  // Parses a single line and returns the root of the resulting tree.
  std::auto_ptr<SyntaxTree> Parse(const std::string &);

  // Words produced by tokenizing the most recently parsed line.
  const std::vector<std::string>& GetWords() const {
    return m_words;
  }

  // The parser's node collection; it is cleared at the start of each Parse()
  // and then passed to ProcessAndStripXMLTags.
  const SyntaxNodeCollection &GetNodeCollection() const {
    return m_nodeCollection;
  }

private:
  std::set<std::string> &m_labelSet;          // caller-owned
  std::map<std::string, int> &m_topLabelSet;  // caller-owned
  std::string m_line;                         // working copy of the input line
  SyntaxNodeCollection m_nodeCollection;
  std::vector<std::string> m_words;

  // Adds a word node beneath each leaf of the given tree.
  void AttachWords(const std::vector<std::string> &, SyntaxTree &);
};
} // namespace GHKM
} // namespace MosesTraining
#endif

View File

@ -82,7 +82,7 @@ int FilterRuleTable::Main(int argc, char *argv[])
StringCfgFilter filter(testStrings);
filter.Filter(std::cin, std::cout);
} else if (testSentenceFormat == kTree) {
std::vector<boost::shared_ptr<StringTree> > testTrees;
std::vector<boost::shared_ptr<SyntaxTree> > testTrees;
ReadTestSet(testStream, testTrees);
if (sourceSideRuleFormat == kCfg) {
// TODO Implement TreeCfgFilter
@ -124,9 +124,11 @@ void FilterRuleTable::ReadTestSet(
}
void FilterRuleTable::ReadTestSet(
std::istream &input, std::vector<boost::shared_ptr<StringTree> > &sentences)
std::istream &input, std::vector<boost::shared_ptr<SyntaxTree> > &sentences)
{
XmlTreeParser parser;
std::set<std::string> labelSet;
std::map<std::string, int> topLabelSet;
XmlTreeParser parser(labelSet, topLabelSet);
int lineNum = 0;
std::string line;
while (std::getline(input, line)) {
@ -136,7 +138,8 @@ void FilterRuleTable::ReadTestSet(
<< std::endl;
continue;
}
sentences.push_back(boost::shared_ptr<StringTree>(parser.Parse(line)));
sentences.push_back(
boost::shared_ptr<SyntaxTree>(parser.Parse(line).release()));
}
}

View File

@ -5,7 +5,7 @@
#include <boost/shared_ptr.hpp>
#include "syntax-common/string_tree.h"
#include "SyntaxTree.h"
#include "StringForest.h"
@ -36,7 +36,7 @@ private:
void Filter(const std::vector<std::vector<std::string> > &);
// Filter rule table (on std::cin) for test set (parse tree version).
void Filter(const std::vector<boost::shared_ptr<StringTree> > &);
void Filter(const std::vector<boost::shared_ptr<SyntaxTree> > &);
void ProcessOptions(int, char *[], Options &) const;
@ -46,7 +46,7 @@ private:
// Read test set (tree version)
void ReadTestSet(std::istream &,
std::vector<boost::shared_ptr<StringTree> > &);
std::vector<boost::shared_ptr<SyntaxTree> > &);
// Read test set (forest version)
void ReadTestSet(std::istream &,

View File

@ -10,7 +10,6 @@
#include <boost/unordered_set.hpp>
#include "syntax-common/numbered_set.h"
#include "syntax-common/string_tree.h"
#include "syntax-common/tree.h"
#include "syntax-common/tree_fragment_tokenizer.h"

View File

@ -12,7 +12,7 @@ namespace FilterRuleTable
{
TreeCfgFilter::TreeCfgFilter(
const std::vector<boost::shared_ptr<StringTree> > &sentences)
const std::vector<boost::shared_ptr<SyntaxTree> > &sentences)
{
}

View File

@ -8,8 +8,9 @@
#include <boost/shared_ptr.hpp>
#include <boost/unordered_map.hpp>
#include "SyntaxTree.h"
#include "syntax-common/numbered_set.h"
#include "syntax-common/string_tree.h"
#include "syntax-common/tree.h"
#include "syntax-common/tree_fragment_tokenizer.h"
@ -29,7 +30,7 @@ class TreeCfgFilter : public CfgFilter
{
public:
// Initialize the filter for a given set of test sentences.
TreeCfgFilter(const std::vector<boost::shared_ptr<StringTree> > &);
TreeCfgFilter(const std::vector<boost::shared_ptr<SyntaxTree> > &);
void Filter(std::istream &in, std::ostream &out);
};

View File

@ -8,13 +8,13 @@ namespace FilterRuleTable
{
TreeTsgFilter::TreeTsgFilter(
const std::vector<boost::shared_ptr<StringTree> > &sentences)
const std::vector<boost::shared_ptr<SyntaxTree> > &sentences)
{
// Convert each StringTree to an IdTree.
// Convert each SyntaxTree to an IdTree.
m_sentences.reserve(sentences.size());
for (std::vector<boost::shared_ptr<StringTree> >::const_iterator p =
for (std::vector<boost::shared_ptr<SyntaxTree> >::const_iterator p =
sentences.begin(); p != sentences.end(); ++p) {
m_sentences.push_back(boost::shared_ptr<IdTree>(StringTreeToIdTree(**p)));
m_sentences.push_back(boost::shared_ptr<IdTree>(SyntaxTreeToIdTree(**p)));
}
m_labelToTree.resize(m_testVocab.Size());
@ -25,15 +25,15 @@ TreeTsgFilter::TreeTsgFilter(
}
}
TreeTsgFilter::IdTree *TreeTsgFilter::StringTreeToIdTree(const StringTree &s)
TreeTsgFilter::IdTree *TreeTsgFilter::SyntaxTreeToIdTree(const SyntaxTree &s)
{
IdTree *t = new IdTree(m_testVocab.Insert(s.value()));
const std::vector<StringTree*> &sChildren = s.children();
IdTree *t = new IdTree(m_testVocab.Insert(s.value().GetLabel()));
const std::vector<SyntaxTree*> &sChildren = s.children();
std::vector<IdTree*> &tChildren = t->children();
tChildren.reserve(sChildren.size());
for (std::vector<StringTree*>::const_iterator p = sChildren.begin();
for (std::vector<SyntaxTree*>::const_iterator p = sChildren.begin();
p != sChildren.end(); ++p) {
IdTree *child = StringTreeToIdTree(**p);
IdTree *child = SyntaxTreeToIdTree(**p);
child->parent() = t;
tChildren.push_back(child);
}

View File

@ -8,8 +8,9 @@
#include <boost/shared_ptr.hpp>
#include <boost/unordered_map.hpp>
#include "SyntaxTree.h"
#include "syntax-common/numbered_set.h"
#include "syntax-common/string_tree.h"
#include "syntax-common/tree.h"
#include "syntax-common/tree_fragment_tokenizer.h"
@ -29,7 +30,7 @@ class TreeTsgFilter : public TsgFilter
{
public:
// Initialize the filter for a given set of test sentences.
TreeTsgFilter(const std::vector<boost::shared_ptr<StringTree> > &);
TreeTsgFilter(const std::vector<boost::shared_ptr<SyntaxTree> > &);
private:
// Add an entry to m_labelToTree for every subtree of the given tree.
@ -41,9 +42,9 @@ private:
// Try to match a fragment against a specific subtree of a test tree.
bool MatchFragment(const IdTree &, const IdTree &);
// Convert a StringTree to an IdTree (wrt m_testVocab). Inserts symbols into
// Convert a SyntaxTree to an IdTree (wrt m_testVocab). Inserts symbols into
// m_testVocab.
IdTree *StringTreeToIdTree(const StringTree &);
IdTree *SyntaxTreeToIdTree(const SyntaxTree &);
std::vector<boost::shared_ptr<IdTree> > m_sentences;
std::vector<std::vector<const IdTree *> > m_labelToTree;

View File

@ -1,17 +1,27 @@
#include "xml_tree_parser.h"
#include "tables-core.h"
#include "XmlException.h"
#include "XmlTree.h"
#include "util/tokenize.hh"
#include <cassert>
#include <vector>
#include "util/tokenize.hh"
#include "SyntaxTree.h"
#include "tables-core.h"
#include "XmlException.h"
#include "XmlTree.h"
namespace MosesTraining {
namespace Syntax {
StringTree *XmlTreeParser::Parse(const std::string &line) {
// Binds the parser to the caller's label set and top-label map by
// initializing label_set_ and top_label_set_ from the two arguments.
XmlTreeParser::XmlTreeParser(std::set<std::string> &labelSet,
                             std::map<std::string, int> &topLabelSet)
  : label_set_(labelSet), top_label_set_(topLabelSet) {}
std::auto_ptr<SyntaxTree> XmlTreeParser::Parse(const std::string &line)
{
line_ = line;
node_collection_.Clear();
try {
@ -22,38 +32,37 @@ StringTree *XmlTreeParser::Parse(const std::string &line) {
} catch (const XmlException &e) {
throw Exception(e.getMsg());
}
node_collection_.ConnectNodes();
SyntaxNode *root = node_collection_.GetTop();
assert(root);
std::auto_ptr<SyntaxTree> root = node_collection_.ExtractTree();
words_ = util::tokenize(line_);
return ConvertTree(*root, words_);
AttachWords(words_, *root);
return root;
}
// Converts a SyntaxNode tree to a StringTree.
StringTree *XmlTreeParser::ConvertTree(const SyntaxNode &tree,
const std::vector<std::string> &words) {
StringTree *root = new StringTree(tree.GetLabel());
const std::vector<SyntaxNode*> &children = tree.GetChildren();
if (children.empty()) {
if (tree.GetStart() != tree.GetEnd()) {
void XmlTreeParser::AttachWords(const std::vector<std::string> &words,
SyntaxTree &root)
{
std::vector<SyntaxTree*> leaves;
leaves.reserve(words.size());
for (SyntaxTree::LeafIterator p(root); p != SyntaxTree::LeafIterator(); ++p) {
leaves.push_back(&*p);
}
std::vector<std::string>::const_iterator q = words.begin();
for (std::vector<SyntaxTree*>::iterator p = leaves.begin(); p != leaves.end();
++p) {
SyntaxTree *leaf = *p;
const int start = leaf->value().GetStart();
const int end = leaf->value().GetEnd();
if (start != end) {
std::ostringstream msg;
msg << "leaf node covers multiple words (" << tree.GetStart()
<< "-" << tree.GetEnd() << "): this is currently unsupported";
msg << "leaf node covers multiple words (" << start << "-" << end
<< "): this is currently unsupported";
throw Exception(msg.str());
}
StringTree *leaf = new StringTree(words[tree.GetStart()]);
leaf->parent() = root;
root->children().push_back(leaf);
} else {
for (std::vector<SyntaxNode*>::const_iterator p = children.begin();
p != children.end(); ++p) {
assert(*p);
StringTree *child = ConvertTree(**p, words);
child->parent() = root;
root->children().push_back(child);
}
SyntaxTree *newLeaf = new SyntaxTree(SyntaxNode(start, end, *q++));
leaf->children().push_back(newLeaf);
newLeaf->parent() = leaf;
}
return root;
}
} // namespace Syntax

View File

@ -1,34 +1,44 @@
#pragma once
#include <map>
#include <memory>
#include <set>
#include <string>
#include <vector>
#include "SyntaxNode.h"
#include "SyntaxNodeCollection.h"
#include "SyntaxTree.h"
#include "exception.h"
#include "string_tree.h"
namespace MosesTraining {
namespace Syntax {
// Parses a string in Moses' XML parse tree format and returns a StringTree
// Parses a string in Moses' XML parse tree format and returns a SyntaxTree
// object. This is a wrapper around the ProcessAndStripXMLTags function.
class XmlTreeParser {
public:
StringTree *Parse(const std::string &);
XmlTreeParser(std::set<std::string> &, std::map<std::string, int> &);
std::auto_ptr<SyntaxTree> Parse(const std::string &);
const std::vector<std::string>& GetWords() {
return words_;
}
const SyntaxNodeCollection &GetNodeCollection() const {
return node_collection_;
}
private:
static StringTree *ConvertTree(const MosesTraining::SyntaxNode &,
const std::vector<std::string> &);
std::set<std::string> label_set_;
std::map<std::string, int> top_label_set_;
std::set<std::string> &label_set_;
std::map<std::string, int> &top_label_set_;
std::string line_;
MosesTraining::SyntaxNodeCollection node_collection_;
SyntaxNodeCollection node_collection_;
std::vector<std::string> words_;
void AttachWords(const std::vector<std::string> &, SyntaxTree &);
};
} // namespace Syntax