mosesdecoder/moses/PCNTools.cpp

208 lines
4.3 KiB
C++
Raw Normal View History

#include "PCNTools.h"
#include <iostream>
#include <cstdlib>
2013-09-06 00:22:53 +04:00
#include "Util.h"
#include "util/exception.hh"
2013-09-06 00:22:53 +04:00
using namespace std;
namespace PCN
{
// The two characters with special meaning inside a quoted token.
// quote and slash are references to elements of this string.
const std::string chars("'\\");
// Delimiter that opens and closes a quoted token.
const char& quote = chars.at(0);
// Escape character: the character following it is taken literally.
const char& slash = chars.at(1);
// Bounds-checked character access: returns in[c], or 0 when c lies
// outside [0, in.size()).
inline char get(const std::string& in, int c)
{
  const bool inRange = (c >= 0) && (c < static_cast<int>(in.size()));
  return inRange ? in[static_cast<size_t>(c)] : '\0';
}
// Advance the cursor c past any run of space characters in `in`.
// Only ' ' is skipped; the cursor stops at the first other character
// or at the end of the input.
inline void eatws(const std::string& in, int& c)
{
  for (; get(in,c) == ' '; ++c) {
    // nothing to do: the loop header performs the skip
  }
}
2013-09-06 00:22:53 +04:00
std::string getString(const std::string& in, int &c)
{
std::string ret;
eatws(in,c);
while (c < (int)in.size() && get(in,c) != ' ' && get(in,c) != ')' && get(in,c) != ',') {
2013-09-27 12:35:24 +04:00
ret += get(in,c++);
2013-09-06 00:22:53 +04:00
}
eatws(in,c);
return ret;
}
// Parse a single-quoted token: from 'foo' return foo.
//
// Leading spaces are skipped. Inside the quotes a backslash makes the next
// character literal (so \' yields a quote and \\ a backslash). On return the
// cursor c is past the closing quote and any spaces that follow it.
//
// Returns the literal string "ERROR" when the token does not start with a
// quote (the cursor has then advanced by one character).
//
// The loop tests for the closing quote *before* consuming a character, so an
// empty token '' yields "" instead of swallowing the closing quote and
// running on to the next quote in the input. Nothing is appended when the
// input ends in the middle of the token or right after a backslash.
std::string getEscapedString(const std::string& in, int &c)
{
  eatws(in,c);
  if (get(in,c++) != quote) return "ERROR";

  const int len = static_cast<int>(in.size());
  std::string res;
  while (c < len && get(in,c) != quote) {
    const char cur = get(in,c++);
    if (cur == slash) {
      // escaped character: take it verbatim, if there is one
      if (c < len) res += get(in,c++);
    } else {
      res += cur;
    }
  }
  c++; // step over the closing quote
  eatws(in,c);
  return res;
}
// Read one unquoted token (delimited by space, ')' or ',') and convert it
// with atof. Spaces around the token are skipped; the delimiter is left in
// place for the caller.
float getFloat(const std::string& in, int &c)
{
  eatws(in,c);
  const int len = static_cast<int>(in.size());
  std::string digits;
  for (; c < len; ++c) {
    const char ch = get(in,c);
    if (ch == ' ' || ch == ')' || ch == ',') break;
    digits += ch;
  }
  eatws(in,c);
  return std::atof(digits.c_str());
}
// Read one unquoted token (delimited by space, ')' or ',') and convert it
// with atoi. Spaces around the token are skipped; the delimiter is left in
// place for the caller.
int getInt(const std::string& in, int &c)
{
  eatws(in,c);
  std::string digits;
  while (c < static_cast<int>(in.size())) {
    const char ch = get(in,c);
    const bool atDelimiter = (ch == ' ') || (ch == ')') || (ch == ',');
    if (atDelimiter) break;
    digits.push_back(ch);
    ++c;
  }
  eatws(in,c);
  return std::atoi(digits.c_str());
}
// Parse one alternative of the form ('word', f1, ..., fn, next), e.g.
// ('foo', 0.23, 1).
//
// After the quoted word comes a comma-separated list of tokens:
//   - leading tokens without '=' are dense scores (parsed as float),
//   - from the first token containing '=' onwards every token must be a
//     key=value sparse feature,
//   - the last token is always read as the column increment.
//
// On a structural error ('(' , ',' or ')' missing) a message is written to
// std::cerr and a default-constructed CNAlt is returned. A malformed sparse
// feature raises through UTIL_THROW_IF2.
CNAlt getCNAlt(const std::string& in, int &c)
{
  const char opening = get(in,c++);
  if (opening != '(') {
    std::cerr << "PCN/PLF parse error: expected ( at start of cn alt block\n"; // throw "expected (";
    return CNAlt();
  }

  std::string word = getEscapedString(in,c);
  const char separator = get(in,c++);
  if (separator != ',') {
    std::cerr << "PCN/PLF parse error: expected , after string\n"; // throw "expected , after string";
    return CNAlt();
  }

  // collect every comma-separated token that follows the word
  std::vector<std::string> fields;
  for (;;) {
    fields.push_back(getString(in,c));
    if (get(in,c) != ',') break;
    ++c;
  }

  // everything except the final token is a score
  const size_t numScores = fields.size() - 1;

  // dense scores: run up to the first token that contains '='
  std::vector<float> probs;
  size_t pos = 0;
  while (pos < numScores && fields[pos].find('=') == std::string::npos) {
    probs.push_back(Moses::Scan<float>(fields[pos]));
    ++pos;
  }

  // sparse features: all remaining score tokens must be key=value
  std::map<std::string, float> sparseFeatures;
  while (pos < numScores) {
    const std::string &field = fields[pos];
    std::vector<std::string> keyValue = Moses::Tokenize(field, "=");
    UTIL_THROW_IF2(keyValue.size() != 2, "Format error: " << field);
    const float value = Moses::Scan<float>(keyValue[1]);
    sparseFeatures[ keyValue[0] ] = value;
    ++pos;
  }

  // last item is column increment
  size_t cnNext = Moses::Scan<size_t>(fields.back());

  const char closing = get(in,c++);
  if (closing != ')') {
    std::cerr << "PCN/PLF parse error: expected ) at end of cn alt block\n"; // throw "expected )";
    return CNAlt();
  }
  eatws(in,c);
  return CNAlt(word, probs, sparseFeatures, cnNext);
}
// Parse one column: a parenthesised list of alternatives, e.g.
// (('foo', 0.23), ('bar', 0.77))
//
// Alternatives may be separated by a comma, and a trailing comma before the
// closing ')' is accepted (with or without spaces in between). On return the
// cursor c is past the closing ')' and any spaces that follow it.
//
// Returns an empty column when the input at c does not start with '('.
// If the input ends before the closing ')', an error is written to std::cerr
// and the alternatives parsed so far are returned; no alternative is parsed
// from (or appended for) the end of the input.
CNCol getCNCol(const std::string& in, int &c)
{
  CNCol res;
  if (get(in,c++) != '(') return res; // error
  eatws(in,c);

  const int len = static_cast<int>(in.size());
  while (true) {
    // optional separator; also covers the trailing ",)" form
    if (get(in,c) == ',') {
      c++;
      eatws(in,c);
    }
    if (c >= len) {
      std::cerr << "PCN/PLF parse error: unexpected end of input in cn column block\n";
      break;
    }
    if (get(in,c) == ')') {
      c++;
      eatws(in,c);
      break;
    }
    // getCNAlt always advances c, so the loop makes progress even on errors
    res.push_back(getCNAlt(in, c));
  }
  return res;
}
// Parse a whole confusion network: a parenthesised list of columns, e.g.
// ((('foo', 0.23), ('bar', 0.77)), (('a', 0.3), ('c', 0.7)))
//
// Columns may be separated by a comma, and a trailing comma before the
// closing ')' is accepted. Spaces after the opening '(' are skipped, as they
// are at the start of a column.
//
// Returns an empty network when the input does not start with '('. If the
// input ends before the closing ')', the columns parsed so far are returned
// and no further column is appended for the end of the input.
CN parsePCN(const std::string& in)
{
  CN res;
  int c = 0;
  if (get(in,c++) != '(') return res; // error
  eatws(in,c);

  const int len = static_cast<int>(in.size());
  while (true) {
    // optional separator; also covers the trailing ",)" form
    if (get(in,c) == ',') {
      c++;
      eatws(in,c);
    }
    // stop at the closing paren or when the input is exhausted
    if (c >= len || get(in,c) == ')') {
      break;
    }
    // getCNCol always advances c, so the loop makes progress even on errors
    res.push_back(getCNCol(in, c));
  }
  return res;
}
2011-10-30 16:17:15 +04:00
}