mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-12-27 05:55:02 +03:00
9e88f794e6
This performs some minor transformations to Egret forests: escaping of Moses special characters; removal of "^g" suffixes from constituent labels; and marking of slash/hyphen split points (using @ characters).
112 lines
3.0 KiB
C++
112 lines
3.0 KiB
C++
#include "SplitPoint.h"
|
|
|
|
#include <map>
|
|
#include <set>
|
|
#include <sstream>
|
|
|
|
#include "util/string_piece.hh"
|
|
#include "util/tokenize_piece.hh"
|
|
|
|
#include "syntax-common/exception.h"
|
|
|
|
namespace MosesTraining
|
|
{
|
|
namespace Syntax
|
|
{
|
|
namespace PostprocessEgretForests
|
|
{
|
|
|
|
void MarkSplitPoints(const std::vector<SplitPoint> &splitPoints,
|
|
std::string &sentence)
|
|
{
|
|
if (splitPoints.empty()) {
|
|
return;
|
|
}
|
|
|
|
// FIXME Assumes all split points have same connector
|
|
std::string connector;
|
|
std::map<int, std::set<int> > points;
|
|
for (std::vector<SplitPoint>::const_iterator p = splitPoints.begin();
|
|
p != splitPoints.end(); ++p) {
|
|
points[p->tokenPos].insert(p->charPos);
|
|
connector = p->connector;
|
|
}
|
|
|
|
// Split the sentence in to a sequence of tokens.
|
|
std::vector<std::string> terminals;
|
|
const util::AnyCharacter delim(" \t");
|
|
for (util::TokenIter<util::AnyCharacter, true> p(sentence, delim); p; ++p) {
|
|
terminals.resize(terminals.size()+1);
|
|
p->CopyToString(&terminals.back());
|
|
}
|
|
|
|
// Mark the split points.
|
|
for (std::map<int, std::set<int> >::const_iterator p = points.begin();
|
|
p != points.end(); ++p) {
|
|
std::string &word = terminals[p->first];
|
|
int offset = 0;
|
|
for (std::set<int>::const_iterator q = p->second.begin();
|
|
q != p->second.end(); ++q) {
|
|
std::string str = std::string("@") + connector + std::string("@");
|
|
word.replace(*q+offset, connector.size(), str);
|
|
offset += 2;
|
|
}
|
|
}
|
|
|
|
sentence.clear();
|
|
for (std::size_t i = 0; i < terminals.size(); ++i) {
|
|
if (i > 0) {
|
|
sentence += " ";
|
|
}
|
|
sentence += terminals[i];
|
|
}
|
|
}
|
|
|
|
void MarkSplitPoints(const std::vector<SplitPoint> &splitPoints, Forest &forest)
|
|
{
|
|
if (splitPoints.empty()) {
|
|
return;
|
|
}
|
|
|
|
// FIXME Assumes all split points have same connector
|
|
std::string connector;
|
|
std::map<int, std::set<int> > points;
|
|
for (std::vector<SplitPoint>::const_iterator p = splitPoints.begin();
|
|
p != splitPoints.end(); ++p) {
|
|
points[p->tokenPos].insert(p->charPos);
|
|
connector = p->connector;
|
|
}
|
|
|
|
// Get the terminal vertices in sentence order.
|
|
std::vector<Forest::Vertex *> terminals;
|
|
for (std::vector<boost::shared_ptr<Forest::Vertex> >::const_iterator
|
|
p = forest.vertices.begin(); p != forest.vertices.end(); ++p) {
|
|
if (!(*p)->incoming.empty()) {
|
|
continue;
|
|
}
|
|
int pos = (*p)->start;
|
|
if (pos >= terminals.size()) {
|
|
terminals.resize(pos+1);
|
|
}
|
|
terminals[pos] = p->get();
|
|
}
|
|
|
|
// Mark the split points.
|
|
for (std::map<int, std::set<int> >::const_iterator p = points.begin();
|
|
p != points.end(); ++p) {
|
|
std::string &word = terminals[p->first]->symbol.value;
|
|
int offset = 0;
|
|
for (std::set<int>::const_iterator q = p->second.begin();
|
|
q != p->second.end(); ++q) {
|
|
std::string str = std::string("@") + connector + std::string("@");
|
|
word.replace(*q+offset, connector.size(), str);
|
|
offset += 2;
|
|
}
|
|
}
|
|
|
|
}
|
|
|
|
} // namespace PostprocessEgretForests
|
|
} // namespace Syntax
|
|
} // namespace MosesTraining
|