mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2025-01-06 03:33:37 +03:00
9e88f794e6
This performs some minor transformations to Egret forests: escaping of Moses special characters; removal of "^g" suffixes from constituent labels; and marking of slash/hyphen split points (using @ characters).
82 lines
2.0 KiB
C++
82 lines
2.0 KiB
C++
#include "SplitPointFileParser.h"
|
|
|
|
#include <istream>
|
|
#include <string>
|
|
|
|
#include "util/string_piece.hh"
|
|
#include "util/tokenize_piece.hh"
|
|
|
|
#include "syntax-common/exception.h"
|
|
|
|
namespace MosesTraining
|
|
{
|
|
namespace Syntax
|
|
{
|
|
namespace PostprocessEgretForests
|
|
{
|
|
|
|
// Constructs a parser that has no input stream attached.
//
// m_input is null, so operator++ is a no-op on this object and it compares
// equal (via operator==) to any parser whose stream has been exhausted.
SplitPointFileParser::SplitPointFileParser()
  : m_input(0)
{
}
|
|
|
|
// Constructs a parser reading from input and immediately advances it, so the
// first line (if any) has already been parsed into m_entry when the
// constructor returns.  Only the address of input is stored.
SplitPointFileParser::SplitPointFileParser(std::istream &input)
  : m_input(&input)
{
  // Prime the parser with the first entry.
  operator++();
}
|
|
|
|
// Advances to the next line of the input stream.
//
// If no stream is attached, nothing happens.  Otherwise the current entry's
// split points are cleared and the next line is read: on success the line is
// parsed into m_entry.splitPoints; on failure (e.g. end of input) m_input is
// reset to null so that later calls do nothing.
//
// Returns a reference to this parser.
SplitPointFileParser &SplitPointFileParser::operator++()
{
  if (m_input) {
    m_entry.splitPoints.clear();
    if (std::getline(*m_input, m_tmpLine)) {
      ParseLine(m_tmpLine, m_entry.splitPoints);
    } else {
      // Nothing more to read: detach from the stream.
      m_input = 0;
    }
  }
  return *this;
}
|
|
|
|
void SplitPointFileParser::ParseLine(const std::string &line,
|
|
std::vector<SplitPoint> &splitPoints)
|
|
{
|
|
std::string tmp;
|
|
const util::AnyCharacter delimiter(" \t");
|
|
for (util::TokenIter<util::AnyCharacter, true> p(line, delimiter); p; ++p) {
|
|
splitPoints.resize(splitPoints.size()+1);
|
|
SplitPoint &splitPoint = splitPoints.back();
|
|
std::size_t pos = p->find(',');
|
|
|
|
StringPiece sp = p->substr(0, pos);
|
|
sp.CopyToString(&tmp);
|
|
splitPoint.tokenPos = std::atoi(tmp.c_str());
|
|
std::size_t begin = pos+1;
|
|
pos = p->find(',', begin);
|
|
|
|
sp = p->substr(begin, pos-begin);
|
|
sp.CopyToString(&tmp);
|
|
splitPoint.charPos = std::atoi(tmp.c_str());
|
|
|
|
sp = p->substr(pos+1);
|
|
sp.CopyToString(&splitPoint.connector);
|
|
if (splitPoint.connector.size() > 1) {
|
|
throw Exception("multi-character connectors not currently supported");
|
|
}
|
|
}
|
|
}
|
|
|
|
bool operator==(const SplitPointFileParser &lhs,
|
|
const SplitPointFileParser &rhs) {
|
|
// TODO Is this right? Compare values of istreams if non-zero?
|
|
return lhs.m_input == rhs.m_input;
|
|
}
|
|
|
|
bool operator!=(const SplitPointFileParser &lhs,
|
|
const SplitPointFileParser &rhs) {
|
|
return !(lhs == rhs);
|
|
}
|
|
|
|
} // namespace PostprocessEgretForests
|
|
} // namespace Syntax
|
|
} // namespace MosesTraining
|