mosesdecoder/phrase-extract/postprocess-egret-forests/SplitPoint.cpp
Phil Williams 9e88f794e6 Add phrase-extract/postprocess-egret-forests
This performs some minor transformations to Egret forests: escaping of
Moses special characters; removal of "^g" suffixes from constituent labels;
and marking of slash/hyphen split points (using @ characters).
2015-03-10 13:51:30 +00:00

112 lines
3.0 KiB
C++

#include "SplitPoint.h"
#include <map>
#include <set>
#include <sstream>
#include <stdexcept>
#include "util/string_piece.hh"
#include "util/tokenize_piece.hh"
#include "syntax-common/exception.h"
namespace MosesTraining
{
namespace Syntax
{
namespace PostprocessEgretForests
{
// Marks split points in a whitespace-tokenized sentence.
//
// The sentence is split on spaces and tabs (empty tokens are dropped).  For
// every split point, the connector.size() characters starting at zero-based
// offset charPos of the token with zero-based index tokenPos are replaced by
// "@" + connector + "@".  The sentence is then rebuilt with the tokens joined
// by single spaces.  If splitPoints is empty the sentence is left untouched.
//
// Each split point is marked with its own connector.  If the same
// (tokenPos, charPos) pair occurs more than once, the last occurrence wins.
//
// Throws std::out_of_range if a tokenPos does not refer to a token of the
// sentence (std::string::replace throws the same type if a charPos lies
// beyond the end of its token).
void MarkSplitPoints(const std::vector<SplitPoint> &splitPoints,
                     std::string &sentence)
{
  if (splitPoints.empty()) {
    return;
  }

  // Group the split points by token.  The inner map is keyed by character
  // position, so the points of one token are visited from left to right.
  typedef std::map<int, std::string> ConnectorMap;
  typedef std::map<int, ConnectorMap> PointMap;
  PointMap points;
  for (std::vector<SplitPoint>::const_iterator p = splitPoints.begin();
       p != splitPoints.end(); ++p) {
    points[p->tokenPos][p->charPos] = p->connector;
  }

  // Split the sentence into a sequence of tokens.
  std::vector<std::string> terminals;
  const util::AnyCharacter delim(" \t");
  for (util::TokenIter<util::AnyCharacter, true> p(sentence, delim); p; ++p) {
    terminals.resize(terminals.size()+1);
    p->CopyToString(&terminals.back());
  }

  // Mark the split points.
  for (PointMap::const_iterator p = points.begin(); p != points.end(); ++p) {
    if (p->first < 0 ||
        static_cast<std::size_t>(p->first) >= terminals.size()) {
      std::ostringstream msg;
      msg << "split point refers to token " << p->first
          << " but the sentence has " << terminals.size() << " token(s)";
      throw std::out_of_range(msg.str());
    }
    std::string &word = terminals[p->first];
    // Every replacement lengthens the word by two characters (the pair of
    // '@' markers), so later positions in the same word shift by that much.
    int offset = 0;
    for (ConnectorMap::const_iterator q = p->second.begin();
         q != p->second.end(); ++q) {
      const std::string &connector = q->second;
      const std::string marked = std::string("@") + connector + std::string("@");
      word.replace(q->first+offset, connector.size(), marked);
      offset += 2;
    }
  }

  // Rebuild the sentence from the (possibly modified) tokens.
  sentence.clear();
  for (std::size_t i = 0; i < terminals.size(); ++i) {
    if (i > 0) {
      sentence += " ";
    }
    sentence += terminals[i];
  }
}
void MarkSplitPoints(const std::vector<SplitPoint> &splitPoints, Forest &forest)
{
if (splitPoints.empty()) {
return;
}
// FIXME Assumes all split points have same connector
std::string connector;
std::map<int, std::set<int> > points;
for (std::vector<SplitPoint>::const_iterator p = splitPoints.begin();
p != splitPoints.end(); ++p) {
points[p->tokenPos].insert(p->charPos);
connector = p->connector;
}
// Get the terminal vertices in sentence order.
std::vector<Forest::Vertex *> terminals;
for (std::vector<boost::shared_ptr<Forest::Vertex> >::const_iterator
p = forest.vertices.begin(); p != forest.vertices.end(); ++p) {
if (!(*p)->incoming.empty()) {
continue;
}
int pos = (*p)->start;
if (pos >= terminals.size()) {
terminals.resize(pos+1);
}
terminals[pos] = p->get();
}
// Mark the split points.
for (std::map<int, std::set<int> >::const_iterator p = points.begin();
p != points.end(); ++p) {
std::string &word = terminals[p->first]->symbol.value;
int offset = 0;
for (std::set<int>::const_iterator q = p->second.begin();
q != p->second.end(); ++q) {
std::string str = std::string("@") + connector + std::string("@");
word.replace(*q+offset, connector.size(), str);
offset += 2;
}
}
}
} // namespace PostprocessEgretForests
} // namespace Syntax
} // namespace MosesTraining