mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2025-01-05 02:22:21 +03:00
Merge ../mosesdecoder into perf_moses2
This commit is contained in:
commit
b75ef6f619
59
mert/M2.cpp
59
mert/M2.cpp
@ -3,21 +3,23 @@
|
||||
|
||||
#include "M2.h"
|
||||
|
||||
namespace MosesTuning {
|
||||
namespace MosesTuning
|
||||
{
|
||||
|
||||
namespace M2 {
|
||||
namespace M2
|
||||
{
|
||||
|
||||
bool Annot::lowercase = true;
|
||||
|
||||
std::string Annot::transform(const std::string& e) {
|
||||
std::string temp = e;
|
||||
if(lowercase) {
|
||||
boost::erase_all(temp, " ");
|
||||
return ToLower(temp);
|
||||
}
|
||||
else
|
||||
return e;
|
||||
}
|
||||
std::string Annot::transform(const std::string& e)
|
||||
{
|
||||
std::string temp = e;
|
||||
if(lowercase) {
|
||||
boost::erase_all(temp, " ");
|
||||
return ToLower(temp);
|
||||
} else
|
||||
return e;
|
||||
}
|
||||
|
||||
const std::string ToLower(const std::string& str)
|
||||
{
|
||||
@ -27,27 +29,30 @@ const std::string ToLower(const std::string& str)
|
||||
}
|
||||
|
||||
|
||||
// Concatenates two edits into one.
//
// cost, changed and unchanged are summed. The edit strings are joined with a
// single space when both are non-empty; if only one is non-empty it is used
// as is; if both are empty the result is empty.
//
// The parameters stay non-const references so the definition keeps matching
// the declaration `Edit operator+(Edit& e1, Edit& e2);` in M2.h.
Edit operator+(Edit& e1, Edit& e2)
{
  std::string joined = e1.edit;
  if(!joined.empty() && !e2.edit.empty())
    joined += " ";
  joined += e2.edit;

  return Edit(e1.cost + e2.cost,
              e1.changed + e2.changed,
              e1.unchanged + e2.unchanged,
              joined);
}
|
||||
|
||||
|
||||
// Chains two edges: the result starts at e1.v, ends at e2.u and carries the
// sum of both edits (see operator+ for Edit). The caller is responsible for
// only combining edges where e1.u == e2.v; this function does not check it.
Edge operator+(Edge e1, Edge e2)
{
  return Edge(e1.v, e2.u, e1.edit + e2.edit);
}
|
||||
|
||||
std::ostream& operator<<(std::ostream& o, Sentence s) {
|
||||
for(Sentence::iterator it = s.begin(); it != s.end(); it++)
|
||||
o << *it << " ";
|
||||
return o;
|
||||
std::ostream& operator<<(std::ostream& o, Sentence s)
|
||||
{
|
||||
for(Sentence::iterator it = s.begin(); it != s.end(); it++)
|
||||
o << *it << " ";
|
||||
return o;
|
||||
}
|
||||
|
||||
|
||||
|
712
mert/M2.h
712
mert/M2.h
@ -16,9 +16,11 @@
|
||||
|
||||
|
||||
|
||||
namespace MosesTuning {
|
||||
namespace MosesTuning
|
||||
{
|
||||
|
||||
namespace M2 {
|
||||
namespace M2
|
||||
{
|
||||
|
||||
typedef std::vector<float> Stats;
|
||||
|
||||
@ -29,44 +31,44 @@ std::ostream& operator<<(std::ostream& o, Sentence s);
|
||||
const std::string ToLower(const std::string& str);
|
||||
|
||||
// One annotation read from an M2 file: the span [i, j), its type, the
// replacement text and the id of the annotator who produced it.
struct Annot {
  size_t i;           // first value of the "i j" range field of an "A " line
  size_t j;           // second value of the range field

  std::string type;   // annotation type field
  std::string edit;   // replacement text field

  size_t annotator;   // annotator id field

  // Strict weak ordering used by std::set<Annot>: compares i, then j, then
  // annotator, then the normalised edit text. `type` does not take part.
  // The normalisation (transform) is only evaluated when the three integer
  // keys are equal.
  bool operator<(const Annot& a) const {
    if(i != a.i)
      return i < a.i;
    if(j != a.j)
      return j < a.j;
    if(annotator != a.annotator)
      return annotator < a.annotator;
    return transform(edit) < transform(a.edit);
  }

  // Equivalence under operator<, i.e. `type` is ignored here as well.
  bool operator==(const Annot& a) const {
    return (!(*this < a) && !(a < *this));
  }

  // Normalises an edit string for comparison; defined in M2.cpp.
  static std::string transform(const std::string& e);

  // Global switch read by transform(); defined in M2.cpp.
  static bool lowercase;
};
|
||||
|
||||
typedef std::set<Annot> Annots;
|
||||
typedef std::set<size_t> Users;
|
||||
|
||||
struct Unit {
|
||||
Sentence first;
|
||||
Annots second;
|
||||
Users third;
|
||||
Sentence first;
|
||||
Annots second;
|
||||
Users third;
|
||||
};
|
||||
|
||||
typedef std::vector<Unit> M2File;
|
||||
|
||||
struct Edit {
|
||||
Edit(float c = 1.0, size_t ch = 0, size_t unch = 1, std::string e = "")
|
||||
: cost(c), changed(ch), unchanged(unch), edit(e) {}
|
||||
|
||||
: cost(c), changed(ch), unchanged(unch), edit(e) {}
|
||||
|
||||
float cost;
|
||||
size_t changed;
|
||||
size_t unchanged;
|
||||
@ -77,7 +79,7 @@ Edit operator+(Edit& e1, Edit& e2);
|
||||
|
||||
struct Vertex {
|
||||
Vertex(size_t a = 0, size_t b = 0) : i(a), j(b) {}
|
||||
|
||||
|
||||
bool operator<(const Vertex &v) const {
|
||||
return i < v.i || (i == v.i && j < v.j);
|
||||
}
|
||||
@ -85,19 +87,19 @@ struct Vertex {
|
||||
bool operator==(const Vertex &v) const {
|
||||
return i == v.i && j == v.j;
|
||||
}
|
||||
|
||||
|
||||
size_t i;
|
||||
size_t j;
|
||||
};
|
||||
|
||||
struct Edge {
|
||||
Edge(Vertex vv = Vertex(), Vertex uu = Vertex(), Edit editt = Edit())
|
||||
: v(vv), u(uu), edit(editt) {}
|
||||
|
||||
: v(vv), u(uu), edit(editt) {}
|
||||
|
||||
bool operator<(const Edge &e) const {
|
||||
return v < e.v || (v == e.v && u < e.u);
|
||||
}
|
||||
|
||||
|
||||
Vertex v;
|
||||
Vertex u;
|
||||
Edit edit;
|
||||
@ -110,7 +112,7 @@ typedef std::vector<Row> Matrix;
|
||||
|
||||
struct Info {
|
||||
Info(Vertex vv = Vertex(), Edit editt = Edit())
|
||||
: v(vv), edit(editt) {}
|
||||
: v(vv), edit(editt) {}
|
||||
|
||||
bool operator<(const Info &i) const {
|
||||
return v < i.v;
|
||||
@ -127,352 +129,350 @@ typedef std::vector<TrackRow> TrackMatrix;
|
||||
typedef std::set<Vertex> Vertices;
|
||||
typedef std::set<Edge> Edges;
|
||||
|
||||
class M2 {
|
||||
private:
|
||||
M2File m_m2;
|
||||
|
||||
size_t m_max_unchanged;
|
||||
float m_beta;
|
||||
bool m_lowercase;
|
||||
bool m_verbose;
|
||||
|
||||
public:
|
||||
M2() : m_max_unchanged(2), m_beta(0.5), m_lowercase(true), m_verbose(false) { }
|
||||
M2(size_t max_unchanged, float beta, bool truecase, bool verbose = false)
|
||||
class M2
|
||||
{
|
||||
private:
|
||||
M2File m_m2;
|
||||
|
||||
size_t m_max_unchanged;
|
||||
float m_beta;
|
||||
bool m_lowercase;
|
||||
bool m_verbose;
|
||||
|
||||
public:
|
||||
M2() : m_max_unchanged(2), m_beta(0.5), m_lowercase(true), m_verbose(false) { }
|
||||
M2(size_t max_unchanged, float beta, bool truecase, bool verbose = false)
|
||||
: m_max_unchanged(max_unchanged), m_beta(beta), m_lowercase(!truecase), m_verbose(verbose) {
|
||||
if(!m_lowercase) {
|
||||
Annot::lowercase = false;
|
||||
}
|
||||
if(!m_lowercase) {
|
||||
Annot::lowercase = false;
|
||||
}
|
||||
|
||||
float Beta() {
|
||||
return m_beta;
|
||||
}
|
||||
|
||||
void ReadM2(const std::string& filename) {
|
||||
std::ifstream m2file(filename.c_str());
|
||||
std::string line;
|
||||
|
||||
Unit unit;
|
||||
bool first = true;
|
||||
|
||||
while(std::getline(m2file, line)) {
|
||||
if(line.size() > 2) {
|
||||
if(line.substr(0, 2) == "S ") {
|
||||
if(!first) {
|
||||
if(unit.third.empty())
|
||||
unit.third.insert(0);
|
||||
m_m2.push_back(unit);
|
||||
}
|
||||
first = false;
|
||||
|
||||
unit.first = Sentence();
|
||||
unit.second = Annots();
|
||||
|
||||
std::string sentenceLine = line.substr(2);
|
||||
boost::split(unit.first, sentenceLine, boost::is_any_of(" "), boost::token_compress_on);
|
||||
}
|
||||
if(line.substr(0, 2) == "A ") {
|
||||
std::string annotLine = line.substr(2);
|
||||
|
||||
std::vector<std::string> annot;
|
||||
boost::iter_split(annot, annotLine, boost::algorithm::first_finder("|||"));
|
||||
|
||||
if(annot[1] != "noop") {
|
||||
Annot a;
|
||||
std::stringstream rangeStr(annot[0]);
|
||||
rangeStr >> a.i >> a.j;
|
||||
a.type = annot[1];
|
||||
a.edit = annot[2];
|
||||
|
||||
std::stringstream annotStr(annot[5]);
|
||||
annotStr >> a.annotator;
|
||||
|
||||
unit.third.insert(a.annotator);
|
||||
unit.second.insert(a);
|
||||
}
|
||||
else {
|
||||
std::stringstream annotStr(annot[5]);
|
||||
size_t annotator;
|
||||
annotStr >> annotator;
|
||||
unit.third.insert(annotator);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
float Beta() {
|
||||
return m_beta;
|
||||
}
|
||||
|
||||
void ReadM2(const std::string& filename) {
|
||||
std::ifstream m2file(filename.c_str());
|
||||
std::string line;
|
||||
|
||||
Unit unit;
|
||||
bool first = true;
|
||||
|
||||
while(std::getline(m2file, line)) {
|
||||
if(line.size() > 2) {
|
||||
if(line.substr(0, 2) == "S ") {
|
||||
if(!first) {
|
||||
if(unit.third.empty())
|
||||
unit.third.insert(0);
|
||||
m_m2.push_back(unit);
|
||||
}
|
||||
first = false;
|
||||
|
||||
unit.first = Sentence();
|
||||
unit.second = Annots();
|
||||
|
||||
std::string sentenceLine = line.substr(2);
|
||||
boost::split(unit.first, sentenceLine, boost::is_any_of(" "), boost::token_compress_on);
|
||||
}
|
||||
if(unit.third.empty())
|
||||
unit.third.insert(0);
|
||||
m_m2.push_back(unit);
|
||||
}
|
||||
|
||||
size_t LevenshteinMatrix(const Sentence &s1, const Sentence &s2, Matrix &d, TrackMatrix &bt) {
|
||||
size_t n = s1.size();
|
||||
size_t m = s2.size();
|
||||
|
||||
if (n == 0)
|
||||
return m;
|
||||
if (m == 0)
|
||||
return n;
|
||||
|
||||
d.resize(n + 1, Row(m + 1, 0));
|
||||
bt.resize(n + 1, TrackRow(m + 1));
|
||||
|
||||
for(size_t i = 0; i <= n; ++i) {
|
||||
d[i][0] = i;
|
||||
if(i > 0)
|
||||
bt[i][0].insert(Info(Vertex(i - 1, 0), Edit(1, 1, 0, "")));
|
||||
}
|
||||
for(size_t j = 0; j <= m; ++j) {
|
||||
d[0][j] = j;
|
||||
if(j > 0)
|
||||
bt[0][j].insert(Info(Vertex(0, j - 1), Edit(1, 1, 0, s2[j - 1])));
|
||||
}
|
||||
|
||||
int cost;
|
||||
for(size_t i = 1; i <= n; ++i) {
|
||||
for(size_t j = 1; j <= m; ++j) {
|
||||
if(Annot::transform(s1[i-1]) == Annot::transform(s2[j-1]))
|
||||
cost = 0;
|
||||
else
|
||||
cost = 2;
|
||||
|
||||
size_t left = d[i][j - 1] + 1;
|
||||
size_t down = d[i - 1][j] + 1;
|
||||
size_t diag = d[i - 1][j - 1] + cost;
|
||||
|
||||
d[i][j] = std::min(left, std::min(down, diag));
|
||||
|
||||
if(d[i][j] == left)
|
||||
bt[i][j].insert(Info(Vertex(i, j - 1), Edit(1, 1, 0, s2[j - 1])));
|
||||
if(d[i][j] == down)
|
||||
bt[i][j].insert(Info(Vertex(i - 1, j), Edit(1, 1, 0, "")));
|
||||
if(d[i][j] == diag)
|
||||
bt[i][j].insert(Info(Vertex(i - 1, j - 1), cost ? Edit(1, 1, 0, s2[j - 1]) : Edit(1, 0, 1, s2[j - 1]) ));
|
||||
if(line.substr(0, 2) == "A ") {
|
||||
std::string annotLine = line.substr(2);
|
||||
|
||||
std::vector<std::string> annot;
|
||||
boost::iter_split(annot, annotLine, boost::algorithm::first_finder("|||"));
|
||||
|
||||
if(annot[1] != "noop") {
|
||||
Annot a;
|
||||
std::stringstream rangeStr(annot[0]);
|
||||
rangeStr >> a.i >> a.j;
|
||||
a.type = annot[1];
|
||||
a.edit = annot[2];
|
||||
|
||||
std::stringstream annotStr(annot[5]);
|
||||
annotStr >> a.annotator;
|
||||
|
||||
unit.third.insert(a.annotator);
|
||||
unit.second.insert(a);
|
||||
} else {
|
||||
std::stringstream annotStr(annot[5]);
|
||||
size_t annotator;
|
||||
annotStr >> annotator;
|
||||
unit.third.insert(annotator);
|
||||
}
|
||||
}
|
||||
return d[n][m];
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void BuildGraph(const TrackMatrix &bt, Vertices &V, Edges &E) {
|
||||
Vertex start(bt.size() - 1, bt[0].size() - 1);
|
||||
|
||||
std::queue<Vertex> Q;
|
||||
Q.push(start);
|
||||
while(!Q.empty()) {
|
||||
Vertex v = Q.front();
|
||||
Q.pop();
|
||||
if(V.count(v) > 0)
|
||||
continue;
|
||||
V.insert(v);
|
||||
for(Track::iterator it = bt[v.i][v.j].begin();
|
||||
it != bt[v.i][v.j].end(); ++it) {
|
||||
Edge e(it->v, v, it->edit);
|
||||
E.insert(e);
|
||||
if(V.count(e.v) == 0)
|
||||
Q.push(e.v);
|
||||
}
|
||||
}
|
||||
|
||||
Edges newE;
|
||||
do {
|
||||
newE.clear();
|
||||
for(Edges::iterator it1 = E.begin(); it1 != E.end(); ++it1) {
|
||||
for(Edges::iterator it2 = E.begin(); it2 != E.end(); ++it2) {
|
||||
if(it1->u == it2->v) {
|
||||
Edge e = *it1 + *it2;
|
||||
if(e.edit.changed > 0 &&
|
||||
e.edit.unchanged <= m_max_unchanged &&
|
||||
E.count(e) == 0)
|
||||
newE.insert(e);
|
||||
}
|
||||
}
|
||||
}
|
||||
E.insert(newE.begin(), newE.end());
|
||||
} while(newE.size() > 0);
|
||||
if(unit.third.empty())
|
||||
unit.third.insert(0);
|
||||
m_m2.push_back(unit);
|
||||
}
|
||||
|
||||
size_t LevenshteinMatrix(const Sentence &s1, const Sentence &s2, Matrix &d, TrackMatrix &bt) {
|
||||
size_t n = s1.size();
|
||||
size_t m = s2.size();
|
||||
|
||||
if (n == 0)
|
||||
return m;
|
||||
if (m == 0)
|
||||
return n;
|
||||
|
||||
d.resize(n + 1, Row(m + 1, 0));
|
||||
bt.resize(n + 1, TrackRow(m + 1));
|
||||
|
||||
for(size_t i = 0; i <= n; ++i) {
|
||||
d[i][0] = i;
|
||||
if(i > 0)
|
||||
bt[i][0].insert(Info(Vertex(i - 1, 0), Edit(1, 1, 0, "")));
|
||||
}
|
||||
|
||||
void AddWeights(Edges &E, const Unit &u, size_t aid) {
|
||||
for(Edges::iterator it1 = E.begin(); it1 != E.end(); ++it1) {
|
||||
if(it1->edit.changed > 0) {
|
||||
const_cast<float&>(it1->edit.cost) += 0.001;
|
||||
for(Annots::iterator it2 = u.second.begin(); it2 != u.second.end(); ++it2) {
|
||||
// if matches an annotator
|
||||
if(it1->v.i == it2->i && it1->u.i == it2->j
|
||||
&& Annot::transform(it1->edit.edit) == Annot::transform(it2->edit)
|
||||
&& it2->annotator == aid) {
|
||||
int newWeight = -(m_max_unchanged + 1) * E.size();
|
||||
const_cast<float&>(it1->edit.cost) = newWeight;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void BellmanFord(Vertices &V, Edges &E) {
|
||||
Vertex source(0, 0);
|
||||
std::map<Vertex, float> distance;
|
||||
std::map<Vertex, Vertex> predecessor;
|
||||
|
||||
for(Vertices::iterator it = V.begin(); it != V.end(); ++it) {
|
||||
if(*it == source)
|
||||
distance[*it] = 0;
|
||||
else {
|
||||
distance[*it] = std::numeric_limits<float>::infinity();
|
||||
}
|
||||
}
|
||||
|
||||
for(size_t i = 1; i < V.size(); ++i) {
|
||||
for(Edges::iterator it = E.begin(); it != E.end(); ++it) {
|
||||
if(distance[it->v] + it->edit.cost < distance[it->u]) {
|
||||
distance[it->u] = distance[it->v] + it->edit.cost;
|
||||
predecessor[it->u] = it->v;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Edges newE;
|
||||
|
||||
Vertex v = *V.rbegin();
|
||||
while(true) {
|
||||
//std::cout << predecessor[v] << " -> " << v << std::endl;
|
||||
Edges::iterator it = E.find(Edge(predecessor[v], v));
|
||||
if(it != E.end()) {
|
||||
Edge f = *it;
|
||||
//std::cout << f << std::endl;
|
||||
newE.insert(f);
|
||||
|
||||
v = predecessor[v];
|
||||
if(v == source)
|
||||
break;
|
||||
}
|
||||
else {
|
||||
std::cout << "Error" << std::endl;
|
||||
break;
|
||||
}
|
||||
}
|
||||
E.clear();
|
||||
E.insert(newE.begin(), newE.end());
|
||||
for(size_t j = 0; j <= m; ++j) {
|
||||
d[0][j] = j;
|
||||
if(j > 0)
|
||||
bt[0][j].insert(Info(Vertex(0, j - 1), Edit(1, 1, 0, s2[j - 1])));
|
||||
}
|
||||
|
||||
void AddStats(const std::vector<Edges> &Es, const Unit &u, Stats &stats, size_t line) {
|
||||
|
||||
std::map<size_t, Stats> statsPerAnnotator;
|
||||
for(std::set<size_t>::iterator it = u.third.begin();
|
||||
it != u.third.end(); ++it) {
|
||||
statsPerAnnotator[*it] = Stats(4, 0);
|
||||
}
|
||||
int cost;
|
||||
for(size_t i = 1; i <= n; ++i) {
|
||||
for(size_t j = 1; j <= m; ++j) {
|
||||
if(Annot::transform(s1[i-1]) == Annot::transform(s2[j-1]))
|
||||
cost = 0;
|
||||
else
|
||||
cost = 2;
|
||||
|
||||
for(Annots::iterator it = u.second.begin(); it != u.second.end(); it++)
|
||||
statsPerAnnotator[it->annotator][2]++;
|
||||
size_t left = d[i][j - 1] + 1;
|
||||
size_t down = d[i - 1][j] + 1;
|
||||
size_t diag = d[i - 1][j - 1] + cost;
|
||||
|
||||
for(std::set<size_t>::iterator ait = u.third.begin();
|
||||
ait != u.third.end(); ++ait) {
|
||||
for(Edges::iterator eit = Es[*ait].begin(); eit != Es[*ait].end(); ++eit) {
|
||||
if(eit->edit.changed > 0) {
|
||||
statsPerAnnotator[*ait][1]++;
|
||||
Annot f;
|
||||
f.i = eit->v.i;
|
||||
f.j = eit->u.i;
|
||||
f.annotator = *ait;
|
||||
f.edit = eit->edit.edit;
|
||||
for(Annots::iterator fit = u.second.begin(); fit != u.second.end(); fit++) {
|
||||
if(f == *fit)
|
||||
statsPerAnnotator[*ait][0]++;
|
||||
}
|
||||
}
|
||||
d[i][j] = std::min(left, std::min(down, diag));
|
||||
|
||||
if(d[i][j] == left)
|
||||
bt[i][j].insert(Info(Vertex(i, j - 1), Edit(1, 1, 0, s2[j - 1])));
|
||||
if(d[i][j] == down)
|
||||
bt[i][j].insert(Info(Vertex(i - 1, j), Edit(1, 1, 0, "")));
|
||||
if(d[i][j] == diag)
|
||||
bt[i][j].insert(Info(Vertex(i - 1, j - 1), cost ? Edit(1, 1, 0, s2[j - 1]) : Edit(1, 0, 1, s2[j - 1]) ));
|
||||
}
|
||||
}
|
||||
return d[n][m];
|
||||
}
|
||||
|
||||
|
||||
void BuildGraph(const TrackMatrix &bt, Vertices &V, Edges &E) {
|
||||
Vertex start(bt.size() - 1, bt[0].size() - 1);
|
||||
|
||||
std::queue<Vertex> Q;
|
||||
Q.push(start);
|
||||
while(!Q.empty()) {
|
||||
Vertex v = Q.front();
|
||||
Q.pop();
|
||||
if(V.count(v) > 0)
|
||||
continue;
|
||||
V.insert(v);
|
||||
for(Track::iterator it = bt[v.i][v.j].begin();
|
||||
it != bt[v.i][v.j].end(); ++it) {
|
||||
Edge e(it->v, v, it->edit);
|
||||
E.insert(e);
|
||||
if(V.count(e.v) == 0)
|
||||
Q.push(e.v);
|
||||
}
|
||||
}
|
||||
|
||||
Edges newE;
|
||||
do {
|
||||
newE.clear();
|
||||
for(Edges::iterator it1 = E.begin(); it1 != E.end(); ++it1) {
|
||||
for(Edges::iterator it2 = E.begin(); it2 != E.end(); ++it2) {
|
||||
if(it1->u == it2->v) {
|
||||
Edge e = *it1 + *it2;
|
||||
if(e.edit.changed > 0 &&
|
||||
e.edit.unchanged <= m_max_unchanged &&
|
||||
E.count(e) == 0)
|
||||
newE.insert(e);
|
||||
}
|
||||
}
|
||||
size_t bestAnnot = 0;
|
||||
float bestF = -1;
|
||||
for(std::set<size_t>::iterator it = u.third.begin();
|
||||
it != u.third.end(); ++it) {
|
||||
Stats localStats = stats;
|
||||
localStats[0] += statsPerAnnotator[*it][0];
|
||||
localStats[1] += statsPerAnnotator[*it][1];
|
||||
localStats[2] += statsPerAnnotator[*it][2];
|
||||
if(m_verbose)
|
||||
std::cerr << *it << " : " << localStats[0] << " " << localStats[1] << " " << localStats[2] << std::endl;
|
||||
float f = FScore(localStats);
|
||||
if(m_verbose)
|
||||
std::cerr << f << std::endl;
|
||||
if(f > bestF) {
|
||||
bestF = f;
|
||||
bestAnnot = *it;
|
||||
}
|
||||
}
|
||||
E.insert(newE.begin(), newE.end());
|
||||
} while(newE.size() > 0);
|
||||
}
|
||||
|
||||
void AddWeights(Edges &E, const Unit &u, size_t aid) {
|
||||
for(Edges::iterator it1 = E.begin(); it1 != E.end(); ++it1) {
|
||||
if(it1->edit.changed > 0) {
|
||||
const_cast<float&>(it1->edit.cost) += 0.001;
|
||||
for(Annots::iterator it2 = u.second.begin(); it2 != u.second.end(); ++it2) {
|
||||
// if matches an annotator
|
||||
if(it1->v.i == it2->i && it1->u.i == it2->j
|
||||
&& Annot::transform(it1->edit.edit) == Annot::transform(it2->edit)
|
||||
&& it2->annotator == aid) {
|
||||
int newWeight = -(m_max_unchanged + 1) * E.size();
|
||||
const_cast<float&>(it1->edit.cost) = newWeight;
|
||||
}
|
||||
}
|
||||
if(m_verbose)
|
||||
std::cerr << ">> Chosen Annotator for line " << line + 1 << " : " << bestAnnot << std::endl;
|
||||
stats[0] += statsPerAnnotator[bestAnnot][0];
|
||||
stats[1] += statsPerAnnotator[bestAnnot][1];
|
||||
stats[2] += statsPerAnnotator[bestAnnot][2];
|
||||
}
|
||||
}
|
||||
|
||||
void SufStats(const std::string &sStr, size_t i, Stats &stats) {
|
||||
std::string temp = sStr;
|
||||
|
||||
Sentence s;
|
||||
boost::split(s, temp, boost::is_any_of(" "), boost::token_compress_on);
|
||||
|
||||
Unit &unit = m_m2[i];
|
||||
|
||||
Matrix d;
|
||||
TrackMatrix bt;
|
||||
size_t distance = LevenshteinMatrix(unit.first, s, d, bt);
|
||||
}
|
||||
|
||||
std::vector<Vertices> Vs(unit.third.size());
|
||||
std::vector<Edges> Es(unit.third.size());
|
||||
void BellmanFord(Vertices &V, Edges &E) {
|
||||
Vertex source(0, 0);
|
||||
std::map<Vertex, float> distance;
|
||||
std::map<Vertex, Vertex> predecessor;
|
||||
|
||||
if(distance > unit.first.size()) {
|
||||
std::cerr << "Levenshtein distance is greater than source size." << std::endl;
|
||||
stats[0] = 0;
|
||||
stats[1] = distance;
|
||||
stats[2] = 0;
|
||||
stats[3] = unit.first.size();
|
||||
return;
|
||||
}
|
||||
else if(distance > 0) {
|
||||
for(size_t j = 0; j < unit.third.size(); j++) {
|
||||
BuildGraph(bt, Vs[j], Es[j]);
|
||||
AddWeights(Es[j], unit, j);
|
||||
BellmanFord(Vs[j], Es[j]);
|
||||
}
|
||||
}
|
||||
AddStats(Es, unit, stats, i);
|
||||
stats[3] = unit.first.size();
|
||||
for(Vertices::iterator it = V.begin(); it != V.end(); ++it) {
|
||||
if(*it == source)
|
||||
distance[*it] = 0;
|
||||
else {
|
||||
distance[*it] = std::numeric_limits<float>::infinity();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
float FScore(const Stats& stats) {
|
||||
float p = 1.0;
|
||||
if(stats[1] != 0)
|
||||
p = (float)stats[0] / (float)stats[1];
|
||||
|
||||
float r = 1.0;
|
||||
if(stats[2] != 0)
|
||||
r = (float)stats[0] / (float)stats[2];
|
||||
|
||||
float denom = (m_beta * m_beta * p + r);
|
||||
float f = 0.0;
|
||||
if(denom != 0)
|
||||
f = ((1 + m_beta * m_beta) * p * r) / denom;
|
||||
return f;
|
||||
for(size_t i = 1; i < V.size(); ++i) {
|
||||
for(Edges::iterator it = E.begin(); it != E.end(); ++it) {
|
||||
if(distance[it->v] + it->edit.cost < distance[it->u]) {
|
||||
distance[it->u] = distance[it->v] + it->edit.cost;
|
||||
predecessor[it->u] = it->v;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void FScore(const Stats& stats, float &p, float &r, float &f) {
|
||||
p = 1.0;
|
||||
if(stats[1] != 0)
|
||||
p = (float)stats[0] / (float)stats[1];
|
||||
|
||||
r = 1.0;
|
||||
if(stats[2] != 0)
|
||||
r = (float)stats[0] / (float)stats[2];
|
||||
|
||||
float denom = (m_beta * m_beta * p + r);
|
||||
f = 0.0;
|
||||
if(denom != 0)
|
||||
f = ((1 + m_beta * m_beta) * p * r) / denom;
|
||||
|
||||
Edges newE;
|
||||
|
||||
Vertex v = *V.rbegin();
|
||||
while(true) {
|
||||
//std::cout << predecessor[v] << " -> " << v << std::endl;
|
||||
Edges::iterator it = E.find(Edge(predecessor[v], v));
|
||||
if(it != E.end()) {
|
||||
Edge f = *it;
|
||||
//std::cout << f << std::endl;
|
||||
newE.insert(f);
|
||||
|
||||
v = predecessor[v];
|
||||
if(v == source)
|
||||
break;
|
||||
} else {
|
||||
std::cout << "Error" << std::endl;
|
||||
break;
|
||||
}
|
||||
}
|
||||
E.clear();
|
||||
E.insert(newE.begin(), newE.end());
|
||||
}
|
||||
|
||||
void AddStats(const std::vector<Edges> &Es, const Unit &u, Stats &stats, size_t line) {
|
||||
|
||||
std::map<size_t, Stats> statsPerAnnotator;
|
||||
for(std::set<size_t>::iterator it = u.third.begin();
|
||||
it != u.third.end(); ++it) {
|
||||
statsPerAnnotator[*it] = Stats(4, 0);
|
||||
}
|
||||
|
||||
for(Annots::iterator it = u.second.begin(); it != u.second.end(); it++)
|
||||
statsPerAnnotator[it->annotator][2]++;
|
||||
|
||||
for(std::set<size_t>::iterator ait = u.third.begin();
|
||||
ait != u.third.end(); ++ait) {
|
||||
for(Edges::iterator eit = Es[*ait].begin(); eit != Es[*ait].end(); ++eit) {
|
||||
if(eit->edit.changed > 0) {
|
||||
statsPerAnnotator[*ait][1]++;
|
||||
Annot f;
|
||||
f.i = eit->v.i;
|
||||
f.j = eit->u.i;
|
||||
f.annotator = *ait;
|
||||
f.edit = eit->edit.edit;
|
||||
for(Annots::iterator fit = u.second.begin(); fit != u.second.end(); fit++) {
|
||||
if(f == *fit)
|
||||
statsPerAnnotator[*ait][0]++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
size_t bestAnnot = 0;
|
||||
float bestF = -1;
|
||||
for(std::set<size_t>::iterator it = u.third.begin();
|
||||
it != u.third.end(); ++it) {
|
||||
Stats localStats = stats;
|
||||
localStats[0] += statsPerAnnotator[*it][0];
|
||||
localStats[1] += statsPerAnnotator[*it][1];
|
||||
localStats[2] += statsPerAnnotator[*it][2];
|
||||
if(m_verbose)
|
||||
std::cerr << *it << " : " << localStats[0] << " " << localStats[1] << " " << localStats[2] << std::endl;
|
||||
float f = FScore(localStats);
|
||||
if(m_verbose)
|
||||
std::cerr << f << std::endl;
|
||||
if(f > bestF) {
|
||||
bestF = f;
|
||||
bestAnnot = *it;
|
||||
}
|
||||
}
|
||||
if(m_verbose)
|
||||
std::cerr << ">> Chosen Annotator for line " << line + 1 << " : " << bestAnnot << std::endl;
|
||||
stats[0] += statsPerAnnotator[bestAnnot][0];
|
||||
stats[1] += statsPerAnnotator[bestAnnot][1];
|
||||
stats[2] += statsPerAnnotator[bestAnnot][2];
|
||||
}
|
||||
|
||||
void SufStats(const std::string &sStr, size_t i, Stats &stats) {
|
||||
std::string temp = sStr;
|
||||
|
||||
Sentence s;
|
||||
boost::split(s, temp, boost::is_any_of(" "), boost::token_compress_on);
|
||||
|
||||
Unit &unit = m_m2[i];
|
||||
|
||||
Matrix d;
|
||||
TrackMatrix bt;
|
||||
size_t distance = LevenshteinMatrix(unit.first, s, d, bt);
|
||||
|
||||
std::vector<Vertices> Vs(unit.third.size());
|
||||
std::vector<Edges> Es(unit.third.size());
|
||||
|
||||
if(distance > unit.first.size()) {
|
||||
std::cerr << "Levenshtein distance is greater than source size." << std::endl;
|
||||
stats[0] = 0;
|
||||
stats[1] = distance;
|
||||
stats[2] = 0;
|
||||
stats[3] = unit.first.size();
|
||||
return;
|
||||
} else if(distance > 0) {
|
||||
for(size_t j = 0; j < unit.third.size(); j++) {
|
||||
BuildGraph(bt, Vs[j], Es[j]);
|
||||
AddWeights(Es[j], unit, j);
|
||||
BellmanFord(Vs[j], Es[j]);
|
||||
}
|
||||
}
|
||||
AddStats(Es, unit, stats, i);
|
||||
stats[3] = unit.first.size();
|
||||
}
|
||||
|
||||
|
||||
float FScore(const Stats& stats) {
|
||||
float p = 1.0;
|
||||
if(stats[1] != 0)
|
||||
p = (float)stats[0] / (float)stats[1];
|
||||
|
||||
float r = 1.0;
|
||||
if(stats[2] != 0)
|
||||
r = (float)stats[0] / (float)stats[2];
|
||||
|
||||
float denom = (m_beta * m_beta * p + r);
|
||||
float f = 0.0;
|
||||
if(denom != 0)
|
||||
f = ((1 + m_beta * m_beta) * p * r) / denom;
|
||||
return f;
|
||||
}
|
||||
|
||||
void FScore(const Stats& stats, float &p, float &r, float &f) {
|
||||
p = 1.0;
|
||||
if(stats[1] != 0)
|
||||
p = (float)stats[0] / (float)stats[1];
|
||||
|
||||
r = 1.0;
|
||||
if(stats[2] != 0)
|
||||
r = (float)stats[0] / (float)stats[2];
|
||||
|
||||
float denom = (m_beta * m_beta * p + r);
|
||||
f = 0.0;
|
||||
if(denom != 0)
|
||||
f = ((1 + m_beta * m_beta) * p * r) / denom;
|
||||
}
|
||||
};
|
||||
|
||||
}
|
||||
|
@ -45,90 +45,92 @@ float M2Scorer::calculateScore(const vector<ScoreStatsType>& comps) const
|
||||
if (comps.size() != NumberOfScores()) {
|
||||
throw runtime_error("Size of stat vector for M2Scorer is not " + NumberOfScores());
|
||||
}
|
||||
|
||||
|
||||
float beta = beta_;
|
||||
|
||||
|
||||
|
||||
|
||||
float p = 0.0;
|
||||
float r = 0.0;
|
||||
float f = 0.0;
|
||||
|
||||
|
||||
if(comps[1] != 0)
|
||||
p = comps[0] / (double)comps[1];
|
||||
else
|
||||
p = 1.0;
|
||||
|
||||
|
||||
if(comps[2] != 0)
|
||||
r = comps[0] / (double)comps[2];
|
||||
else
|
||||
r = 1.0;
|
||||
|
||||
|
||||
float denom = beta * beta * p + r;
|
||||
if(denom != 0)
|
||||
f = (1.0 + beta * beta) * p * r / denom;
|
||||
else
|
||||
f = 0.0;
|
||||
|
||||
if(verbose_)
|
||||
if(verbose_)
|
||||
std::cerr << comps[0] << " " << comps[1] << " " << comps[2] << std::endl;
|
||||
|
||||
if(verbose_)
|
||||
if(verbose_)
|
||||
std::cerr << p << " " << r << " " << f << std::endl;
|
||||
|
||||
|
||||
return f;
|
||||
}
|
||||
|
||||
float M2Scorer::getReferenceLength(const vector<ScoreStatsType>& comps) const {
|
||||
float M2Scorer::getReferenceLength(const vector<ScoreStatsType>& comps) const
|
||||
{
|
||||
return comps[3];
|
||||
}
|
||||
|
||||
std::vector<ScoreStatsType> randomStats(float decay, int max) {
|
||||
std::vector<ScoreStatsType> randomStats(float decay, int max)
|
||||
{
|
||||
int gold = rand() % max;
|
||||
int prop = rand() % max;
|
||||
int corr = 0.0;
|
||||
|
||||
|
||||
if(std::min(prop, gold) > 0)
|
||||
corr = rand() % std::min(prop, gold);
|
||||
|
||||
|
||||
//std::cerr << corr << " " << prop << " " << gold << std::endl;
|
||||
|
||||
|
||||
std::vector<ScoreStatsType> stats(3, 0.0);
|
||||
stats[0] = corr * decay;
|
||||
stats[1] = prop * decay;
|
||||
stats[2] = gold * decay;
|
||||
|
||||
|
||||
return stats;
|
||||
}
|
||||
|
||||
float sentenceM2(const std::vector<ScoreStatsType>& stats)
|
||||
{
|
||||
float beta = 0.5;
|
||||
|
||||
|
||||
std::vector<ScoreStatsType> smoothStats(3, 0.0); // = randomStats(0.001, 5);
|
||||
smoothStats[0] += stats[0];
|
||||
smoothStats[1] += stats[1];
|
||||
smoothStats[2] += stats[2];
|
||||
|
||||
smoothStats[0] += stats[0];
|
||||
smoothStats[1] += stats[1];
|
||||
smoothStats[2] += stats[2];
|
||||
|
||||
float p = 0.0;
|
||||
float r = 0.0;
|
||||
float f = 0.0;
|
||||
|
||||
|
||||
if(smoothStats[1] != 0)
|
||||
p = smoothStats[0] / smoothStats[1];
|
||||
else
|
||||
p = 1.0;
|
||||
|
||||
|
||||
if(smoothStats[2] != 0)
|
||||
r = smoothStats[0] / smoothStats[2];
|
||||
else
|
||||
r = 1.0;
|
||||
|
||||
|
||||
float denom = beta * beta * p + r;
|
||||
if(denom != 0)
|
||||
f = (1.0 + beta * beta) * p * r / denom;
|
||||
else
|
||||
f = 0.0;
|
||||
|
||||
|
||||
return f;
|
||||
}
|
||||
|
||||
|
@ -31,15 +31,15 @@ public:
|
||||
virtual float calculateScore(const std::vector<ScoreStatsType>& comps) const;
|
||||
virtual float getReferenceLength(const std::vector<ScoreStatsType>& comps) const;
|
||||
|
||||
private:
|
||||
float beta_;
|
||||
private:
|
||||
float beta_;
|
||||
int max_unchanged_words_;
|
||||
bool truecase_;
|
||||
bool verbose_;
|
||||
M2::M2 m2_;
|
||||
|
||||
|
||||
std::map<std::pair<size_t, std::string>, std::vector<ScoreStatsType> > seen_;
|
||||
|
||||
|
||||
// no copying allowed
|
||||
M2Scorer(const M2Scorer&);
|
||||
M2Scorer& operator=(const M2Scorer&);
|
||||
|
@ -23,21 +23,22 @@ namespace Moses
|
||||
|
||||
using namespace std;
|
||||
|
||||
std::string MakePair(const std::string &s1, const std::string &s2, bool general) {
|
||||
std::string MakePair(const std::string &s1, const std::string &s2, bool general)
|
||||
{
|
||||
std::vector<std::string> sourceList;
|
||||
std::vector<std::string> targetList;
|
||||
|
||||
|
||||
if(general) {
|
||||
Diffs diffs = CreateDiff(s1, s2);
|
||||
|
||||
|
||||
size_t i = 0, j = 0;
|
||||
char lastType = 'm';
|
||||
|
||||
std::string source, target;
|
||||
|
||||
std::string source, target;
|
||||
std::string match;
|
||||
|
||||
|
||||
int count = 1;
|
||||
|
||||
|
||||
BOOST_FOREACH(Diff type, diffs) {
|
||||
if(type == 'm') {
|
||||
if(lastType != 'm') {
|
||||
@ -46,7 +47,7 @@ std::string MakePair(const std::string &s1, const std::string &s2, bool general)
|
||||
}
|
||||
source.clear();
|
||||
target.clear();
|
||||
|
||||
|
||||
if(s1[i] == '+') {
|
||||
if(match.size() >= 3) {
|
||||
sourceList.push_back("(\\w{3,})·");
|
||||
@ -54,56 +55,51 @@ std::string MakePair(const std::string &s1, const std::string &s2, bool general)
|
||||
sprintf((char*)temp.c_str(), "%d", count);
|
||||
targetList.push_back("\\" + temp + "·");
|
||||
count++;
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
sourceList.push_back(match + "·");
|
||||
targetList.push_back(match + "·");
|
||||
targetList.push_back(match + "·");
|
||||
}
|
||||
match.clear();
|
||||
}
|
||||
else
|
||||
} else
|
||||
match.push_back(s1[i]);
|
||||
|
||||
|
||||
i++;
|
||||
j++;
|
||||
}
|
||||
else if(type == 'd') {
|
||||
} else if(type == 'd') {
|
||||
if(s1[i] == '+')
|
||||
source += "·";
|
||||
else
|
||||
source.push_back(s1[i]);
|
||||
i++;
|
||||
}
|
||||
else if(type == 'i') {
|
||||
} else if(type == 'i') {
|
||||
if(s2[j] == '+')
|
||||
target += "·";
|
||||
else
|
||||
target.push_back(s2[j]);
|
||||
j++;
|
||||
}
|
||||
if(type != 'm' && !match.empty()) {
|
||||
if(type != 'm' && !match.empty()) {
|
||||
if(match.size() >= 3) {
|
||||
sourceList.push_back("(\\w{3,})");
|
||||
std::string temp = "1";
|
||||
sprintf((char*)temp.c_str(), "%d", count);
|
||||
targetList.push_back("\\" + temp);
|
||||
count++;
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
sourceList.push_back(match);
|
||||
targetList.push_back(match);
|
||||
targetList.push_back(match);
|
||||
}
|
||||
|
||||
|
||||
match.clear();
|
||||
}
|
||||
|
||||
|
||||
lastType = type;
|
||||
}
|
||||
if(lastType != 'm') {
|
||||
sourceList.push_back(source);
|
||||
targetList.push_back(target);
|
||||
}
|
||||
|
||||
|
||||
if(!match.empty()) {
|
||||
if(match.size() >= 3) {
|
||||
sourceList.push_back("(\\w{3,})");
|
||||
@ -111,45 +107,42 @@ std::string MakePair(const std::string &s1, const std::string &s2, bool general)
|
||||
sprintf((char*)temp.c_str(), "%d", count);
|
||||
targetList.push_back("\\"+ temp);
|
||||
count++;
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
sourceList.push_back(match);
|
||||
targetList.push_back(match);
|
||||
targetList.push_back(match);
|
||||
}
|
||||
}
|
||||
match.clear();
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
std::string cs1 = s1;
|
||||
std::string cs2 = s2;
|
||||
boost::replace_all(cs1, "+", "·");
|
||||
boost::replace_all(cs2, "+", "·");
|
||||
|
||||
|
||||
sourceList.push_back(cs1);
|
||||
targetList.push_back(cs2);
|
||||
}
|
||||
|
||||
|
||||
std::stringstream out;
|
||||
out << "sub(«";
|
||||
out << boost::join(sourceList, "");
|
||||
out << "»,«";
|
||||
out << boost::join(targetList, "");
|
||||
out << "»)";
|
||||
|
||||
|
||||
return out.str();
|
||||
}
|
||||
|
||||
std::string CorrectionPattern::CreateSinglePattern(const Tokens &s1, const Tokens &s2) const {
|
||||
std::string CorrectionPattern::CreateSinglePattern(const Tokens &s1, const Tokens &s2) const
|
||||
{
|
||||
std::stringstream out;
|
||||
if(s1.empty()) {
|
||||
out << "ins(«" << boost::join(s2, "·") << "»)";
|
||||
return out.str();
|
||||
}
|
||||
else if(s2.empty()) {
|
||||
} else if(s2.empty()) {
|
||||
out << "del(«" << boost::join(s1, "·") << "»)";
|
||||
return out.str();
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
typename Tokens::value_type v1 = boost::join(s1, "+");
|
||||
typename Tokens::value_type v2 = boost::join(s2, "+");
|
||||
out << MakePair(v1, v2, m_general);
|
||||
@ -158,36 +151,36 @@ std::string CorrectionPattern::CreateSinglePattern(const Tokens &s1, const Token
|
||||
}
|
||||
|
||||
std::vector<std::string> GetContext(size_t pos,
|
||||
size_t len,
|
||||
size_t window,
|
||||
const InputType &input,
|
||||
const InputPath &inputPath,
|
||||
const std::vector<FactorType>& factorTypes,
|
||||
bool isRight) {
|
||||
size_t len,
|
||||
size_t window,
|
||||
const InputType &input,
|
||||
const InputPath &inputPath,
|
||||
const std::vector<FactorType>& factorTypes,
|
||||
bool isRight)
|
||||
{
|
||||
|
||||
const Sentence& sentence = static_cast<const Sentence&>(input);
|
||||
const Range& range = inputPath.GetWordsRange();
|
||||
|
||||
int leftPos = range.GetStartPos() + pos - len - 1;
|
||||
int rightPos = range.GetStartPos() + pos;
|
||||
|
||||
const Range& range = inputPath.GetWordsRange();
|
||||
|
||||
int leftPos = range.GetStartPos() + pos - len - 1;
|
||||
int rightPos = range.GetStartPos() + pos;
|
||||
|
||||
std::vector<std::string> contexts;
|
||||
|
||||
|
||||
for(int length = 1; length <= (int)window; ++length) {
|
||||
std::vector<std::string> current;
|
||||
if(!isRight) {
|
||||
for(int i = 0; i < length; i++) {
|
||||
if(leftPos - i >= 0) {
|
||||
current.push_back(sentence.GetWord(leftPos - i).GetString(factorTypes, false));
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
current.push_back("<s>");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
if(current.back() == "<s>" && current.size() >= 2 && current[current.size()-2] == "<s>")
|
||||
continue;
|
||||
|
||||
|
||||
std::reverse(current.begin(), current.end());
|
||||
contexts.push_back("left(«" + boost::join(current, "·") + "»)_");
|
||||
}
|
||||
@ -195,8 +188,7 @@ std::vector<std::string> GetContext(size_t pos,
|
||||
for(int i = 0; i < length; i++) {
|
||||
if(rightPos + i < (int)sentence.GetSize()) {
|
||||
current.push_back(sentence.GetWord(rightPos + i).GetString(factorTypes, false));
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
current.push_back("</s>");
|
||||
}
|
||||
}
|
||||
@ -206,7 +198,7 @@ std::vector<std::string> GetContext(size_t pos,
|
||||
|
||||
contexts.push_back("_right(«" + boost::join(current, "·") + "»)");
|
||||
}
|
||||
}
|
||||
}
|
||||
return contexts;
|
||||
}
|
||||
|
||||
@ -214,8 +206,9 @@ std::vector<std::string>
|
||||
CorrectionPattern::CreatePattern(const Tokens &s1,
|
||||
const Tokens &s2,
|
||||
const InputType &input,
|
||||
const InputPath &inputPath) const {
|
||||
|
||||
const InputPath &inputPath) const
|
||||
{
|
||||
|
||||
Diffs diffs = CreateDiff(s1, s2);
|
||||
size_t i = 0, j = 0;
|
||||
char lastType = 'm';
|
||||
@ -226,20 +219,20 @@ CorrectionPattern::CreatePattern(const Tokens &s1,
|
||||
if(lastType != 'm') {
|
||||
std::string pattern = CreateSinglePattern(source, target);
|
||||
patternList.push_back(pattern);
|
||||
|
||||
|
||||
if(m_context > 0) {
|
||||
std::vector<std::string> leftContexts = GetContext(i, source.size(), m_context, input, inputPath, m_contextFactors, false);
|
||||
std::vector<std::string> rightContexts = GetContext(i, source.size(), m_context, input, inputPath, m_contextFactors, true);
|
||||
|
||||
|
||||
BOOST_FOREACH(std::string left, leftContexts)
|
||||
patternList.push_back(left + pattern);
|
||||
patternList.push_back(left + pattern);
|
||||
|
||||
BOOST_FOREACH(std::string right, rightContexts)
|
||||
patternList.push_back(pattern + right);
|
||||
|
||||
patternList.push_back(pattern + right);
|
||||
|
||||
BOOST_FOREACH(std::string left, leftContexts)
|
||||
BOOST_FOREACH(std::string right, rightContexts)
|
||||
patternList.push_back(left + pattern + right);
|
||||
BOOST_FOREACH(std::string right, rightContexts)
|
||||
patternList.push_back(left + pattern + right);
|
||||
}
|
||||
}
|
||||
source.clear();
|
||||
@ -250,12 +243,10 @@ CorrectionPattern::CreatePattern(const Tokens &s1,
|
||||
}
|
||||
i++;
|
||||
j++;
|
||||
}
|
||||
else if(type == 'd') {
|
||||
} else if(type == 'd') {
|
||||
source.push_back(s1[i]);
|
||||
i++;
|
||||
}
|
||||
else if(type == 'i') {
|
||||
} else if(type == 'i') {
|
||||
target.push_back(s2[j]);
|
||||
j++;
|
||||
}
|
||||
@ -264,23 +255,23 @@ CorrectionPattern::CreatePattern(const Tokens &s1,
|
||||
if(lastType != 'm') {
|
||||
std::string pattern = CreateSinglePattern(source, target);
|
||||
patternList.push_back(pattern);
|
||||
|
||||
|
||||
if(m_context > 0) {
|
||||
std::vector<std::string> leftContexts = GetContext(i, source.size(), m_context, input, inputPath, m_contextFactors, false);
|
||||
std::vector<std::string> rightContexts = GetContext(i, source.size(), m_context, input, inputPath, m_contextFactors, true);
|
||||
|
||||
|
||||
BOOST_FOREACH(std::string left, leftContexts)
|
||||
patternList.push_back(left + pattern);
|
||||
patternList.push_back(left + pattern);
|
||||
|
||||
BOOST_FOREACH(std::string right, rightContexts)
|
||||
patternList.push_back(pattern + right);
|
||||
|
||||
patternList.push_back(pattern + right);
|
||||
|
||||
BOOST_FOREACH(std::string left, leftContexts)
|
||||
BOOST_FOREACH(std::string right, rightContexts)
|
||||
patternList.push_back(left + pattern + right);
|
||||
BOOST_FOREACH(std::string right, rightContexts)
|
||||
patternList.push_back(left + pattern + right);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
return patternList;
|
||||
}
|
||||
|
||||
@ -308,36 +299,36 @@ void CorrectionPattern::SetParameter(const std::string& key, const std::string&
|
||||
}
|
||||
|
||||
void CorrectionPattern::EvaluateWithSourceContext(const InputType &input
|
||||
, const InputPath &inputPath
|
||||
, const TargetPhrase &targetPhrase
|
||||
, const StackVec *stackVec
|
||||
, ScoreComponentCollection &scoreBreakdown
|
||||
, ScoreComponentCollection *estimatedFutureScore) const
|
||||
, const InputPath &inputPath
|
||||
, const TargetPhrase &targetPhrase
|
||||
, const StackVec *stackVec
|
||||
, ScoreComponentCollection &scoreBreakdown
|
||||
, ScoreComponentCollection *estimatedFutureScore) const
|
||||
{
|
||||
ComputeFeatures(input, inputPath, targetPhrase, &scoreBreakdown);
|
||||
}
|
||||
|
||||
void CorrectionPattern::ComputeFeatures(
|
||||
const InputType &input,
|
||||
const InputPath &inputPath,
|
||||
const TargetPhrase& target,
|
||||
ScoreComponentCollection* accumulator) const
|
||||
const InputType &input,
|
||||
const InputPath &inputPath,
|
||||
const TargetPhrase& target,
|
||||
ScoreComponentCollection* accumulator) const
|
||||
{
|
||||
const Phrase &source = inputPath.GetPhrase();
|
||||
|
||||
|
||||
std::vector<std::string> sourceTokens;
|
||||
for(size_t i = 0; i < source.GetSize(); ++i)
|
||||
sourceTokens.push_back(source.GetWord(i).GetString(m_factors, false));
|
||||
|
||||
|
||||
std::vector<std::string> targetTokens;
|
||||
for(size_t i = 0; i < target.GetSize(); ++i)
|
||||
targetTokens.push_back(target.GetWord(i).GetString(m_factors, false));
|
||||
|
||||
|
||||
std::vector<std::string> patternList = CreatePattern(sourceTokens, targetTokens, input, inputPath);
|
||||
for(size_t i = 0; i < patternList.size(); ++i)
|
||||
accumulator->PlusEquals(this, patternList[i], 1);
|
||||
|
||||
/*
|
||||
/*
|
||||
BOOST_FOREACH(std::string w, sourceTokens)
|
||||
std::cerr << w << " ";
|
||||
std::cerr << std::endl;
|
||||
|
@ -29,41 +29,41 @@ public:
|
||||
bool IsUseable(const FactorMask &mask) const;
|
||||
|
||||
void EvaluateInIsolation(const Phrase &source
|
||||
, const TargetPhrase &targetPhrase
|
||||
, ScoreComponentCollection &scoreBreakdown
|
||||
, ScoreComponentCollection &estimatedFutureScore) const
|
||||
, const TargetPhrase &targetPhrase
|
||||
, ScoreComponentCollection &scoreBreakdown
|
||||
, ScoreComponentCollection &estimatedFutureScore) const
|
||||
{}
|
||||
|
||||
virtual void EvaluateWithSourceContext(const InputType &input
|
||||
, const InputPath &inputPath
|
||||
, const TargetPhrase &targetPhrase
|
||||
, const StackVec *stackVec
|
||||
, ScoreComponentCollection &scoreBreakdown
|
||||
, ScoreComponentCollection *estimatedFutureScore = NULL) const;
|
||||
|
||||
void EvaluateTranslationOptionListWithSourceContext(const InputType &input
|
||||
, const TranslationOptionList &translationOptionList) const
|
||||
{}
|
||||
|
||||
virtual void EvaluateWithSourceContext(const InputType &input
|
||||
, const InputPath &inputPath
|
||||
, const TargetPhrase &targetPhrase
|
||||
, const StackVec *stackVec
|
||||
, ScoreComponentCollection &scoreBreakdown
|
||||
, ScoreComponentCollection *estimatedFutureScore = NULL) const;
|
||||
|
||||
void EvaluateTranslationOptionListWithSourceContext(const InputType &input
|
||||
, const TranslationOptionList &translationOptionList) const
|
||||
{}
|
||||
|
||||
void EvaluateWhenApplied(const Hypothesis& hypo,
|
||||
ScoreComponentCollection* accumulator) const
|
||||
ScoreComponentCollection* accumulator) const
|
||||
{}
|
||||
void EvaluateWhenApplied(const ChartHypothesis &hypo,
|
||||
ScoreComponentCollection* accumulator) const
|
||||
ScoreComponentCollection* accumulator) const
|
||||
{}
|
||||
|
||||
void ComputeFeatures(const InputType &input,
|
||||
const InputPath &inputPath,
|
||||
const TargetPhrase& targetPhrase,
|
||||
ScoreComponentCollection* accumulator) const;
|
||||
|
||||
|
||||
void SetParameter(const std::string& key, const std::string& value);
|
||||
|
||||
|
||||
std::vector<std::string> CreatePattern(const Tokens &s1,
|
||||
const Tokens &s2,
|
||||
const InputType &input,
|
||||
const InputPath &inputPath) const;
|
||||
|
||||
|
||||
std::string CreateSinglePattern(const Tokens &s1, const Tokens &s2) const;
|
||||
|
||||
};
|
||||
|
103
moses/FF/Diffs.h
103
moses/FF/Diffs.h
@ -11,22 +11,21 @@ typedef std::vector<Diff> Diffs;
|
||||
|
||||
template <class Sequence, class Pred>
|
||||
void CreateDiffRec(size_t** c,
|
||||
const Sequence &s1,
|
||||
const Sequence &s2,
|
||||
size_t start,
|
||||
size_t i,
|
||||
size_t j,
|
||||
Diffs& diffs,
|
||||
Pred pred) {
|
||||
const Sequence &s1,
|
||||
const Sequence &s2,
|
||||
size_t start,
|
||||
size_t i,
|
||||
size_t j,
|
||||
Diffs& diffs,
|
||||
Pred pred)
|
||||
{
|
||||
if(i > 0 && j > 0 && pred(s1[i - 1 + start], s2[j - 1 + start])) {
|
||||
CreateDiffRec(c, s1, s2, start, i - 1, j - 1, diffs, pred);
|
||||
diffs.push_back(Diff('m'));
|
||||
}
|
||||
else if(j > 0 && (i == 0 || c[i][j-1] >= c[i-1][j])) {
|
||||
} else if(j > 0 && (i == 0 || c[i][j-1] >= c[i-1][j])) {
|
||||
CreateDiffRec(c, s1, s2, start, i, j-1, diffs, pred);
|
||||
diffs.push_back(Diff('i'));
|
||||
}
|
||||
else if(i > 0 && (j == 0 || c[i][j-1] < c[i-1][j])) {
|
||||
} else if(i > 0 && (j == 0 || c[i][j-1] < c[i-1][j])) {
|
||||
CreateDiffRec(c, s1, s2, start, i-1, j, diffs, pred);
|
||||
diffs.push_back(Diff('d'));
|
||||
}
|
||||
@ -34,17 +33,18 @@ void CreateDiffRec(size_t** c,
|
||||
|
||||
template <class Sequence, class Pred>
|
||||
Diffs CreateDiff(const Sequence& s1,
|
||||
const Sequence& s2,
|
||||
Pred pred) {
|
||||
|
||||
const Sequence& s2,
|
||||
Pred pred)
|
||||
{
|
||||
|
||||
Diffs diffs;
|
||||
|
||||
|
||||
size_t n = s2.size();
|
||||
|
||||
|
||||
int start = 0;
|
||||
int m_end = s1.size() - 1;
|
||||
int n_end = s2.size() - 1;
|
||||
|
||||
|
||||
while(start <= m_end && start <= n_end && pred(s1[start], s2[start])) {
|
||||
diffs.push_back(Diff('m'));
|
||||
start++;
|
||||
@ -53,49 +53,51 @@ Diffs CreateDiff(const Sequence& s1,
|
||||
m_end--;
|
||||
n_end--;
|
||||
}
|
||||
|
||||
|
||||
size_t m_new = m_end - start + 1;
|
||||
size_t n_new = n_end - start + 1;
|
||||
|
||||
|
||||
size_t** c = new size_t*[m_new + 1];
|
||||
for(size_t i = 0; i <= m_new; ++i) {
|
||||
c[i] = new size_t[n_new + 1];
|
||||
c[i][0] = 0;
|
||||
}
|
||||
for(size_t j = 0; j <= n_new; ++j)
|
||||
c[0][j] = 0;
|
||||
c[0][j] = 0;
|
||||
for(size_t i = 1; i <= m_new; ++i)
|
||||
for(size_t j = 1; j <= n_new; ++j)
|
||||
if(pred(s1[i - 1 + start], s2[j - 1 + start]))
|
||||
c[i][j] = c[i-1][j-1] + 1;
|
||||
else
|
||||
c[i][j] = c[i][j-1] > c[i-1][j] ? c[i][j-1] : c[i-1][j];
|
||||
|
||||
|
||||
CreateDiffRec(c, s1, s2, start, m_new, n_new, diffs, pred);
|
||||
|
||||
|
||||
for(size_t i = 0; i <= m_new; ++i)
|
||||
delete[] c[i];
|
||||
delete[] c;
|
||||
|
||||
|
||||
for (size_t i = n_end + 1; i < n; ++i)
|
||||
diffs.push_back(Diff('m'));
|
||||
|
||||
|
||||
return diffs;
|
||||
}
|
||||
|
||||
template <class Sequence>
|
||||
Diffs CreateDiff(const Sequence& s1, const Sequence& s2) {
|
||||
Diffs CreateDiff(const Sequence& s1, const Sequence& s2)
|
||||
{
|
||||
return CreateDiff(s1, s2, std::equal_to<typename Sequence::value_type>());
|
||||
}
|
||||
|
||||
template <class Sequence, class Sig, class Stats>
|
||||
void AddStats(const Sequence& s1, const Sequence& s2, const Sig& sig, Stats& stats) {
|
||||
void AddStats(const Sequence& s1, const Sequence& s2, const Sig& sig, Stats& stats)
|
||||
{
|
||||
if(sig.size() != stats.size())
|
||||
throw "Signature size differs from score array size.";
|
||||
|
||||
|
||||
size_t m = 0, d = 0, i = 0, s = 0;
|
||||
Diffs diff = CreateDiff(s1, s2);
|
||||
|
||||
Diffs diff = CreateDiff(s1, s2);
|
||||
|
||||
for(int j = 0; j < (int)diff.size(); ++j) {
|
||||
if(diff[j] == 'm')
|
||||
m++;
|
||||
@ -109,27 +111,36 @@ void AddStats(const Sequence& s1, const Sequence& s2, const Sig& sig, Stats& sta
|
||||
k++;
|
||||
}
|
||||
j += k;
|
||||
}
|
||||
else if(diff[j] == 'i')
|
||||
} else if(diff[j] == 'i')
|
||||
i++;
|
||||
}
|
||||
|
||||
|
||||
for(size_t j = 0; j < sig.size(); ++j) {
|
||||
switch (sig[j]) {
|
||||
case 'l': stats[j] += d + i + s; break;
|
||||
case 'm': stats[j] += m; break;
|
||||
case 'd': stats[j] += d; break;
|
||||
case 'i': stats[j] += i; break;
|
||||
case 's': stats[j] += s; break;
|
||||
case 'r':
|
||||
float macc = 1;
|
||||
if (d + i + s + m)
|
||||
macc = 1.0 - (float)(d + i + s)/(float)(d + i + s + m);
|
||||
if(macc > 0)
|
||||
stats[j] += log(macc);
|
||||
else
|
||||
stats[j] += log(1.0/(float)(d + i + s + m + 1));
|
||||
break;
|
||||
case 'l':
|
||||
stats[j] += d + i + s;
|
||||
break;
|
||||
case 'm':
|
||||
stats[j] += m;
|
||||
break;
|
||||
case 'd':
|
||||
stats[j] += d;
|
||||
break;
|
||||
case 'i':
|
||||
stats[j] += i;
|
||||
break;
|
||||
case 's':
|
||||
stats[j] += s;
|
||||
break;
|
||||
case 'r':
|
||||
float macc = 1;
|
||||
if (d + i + s + m)
|
||||
macc = 1.0 - (float)(d + i + s)/(float)(d + i + s + m);
|
||||
if(macc > 0)
|
||||
stats[j] += log(macc);
|
||||
else
|
||||
stats[j] += log(1.0/(float)(d + i + s + m + 1));
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -21,14 +21,15 @@ namespace Moses
|
||||
|
||||
using namespace std;
|
||||
|
||||
std::string ParseScores(const std::string &line, const std::string& defaultScores) {
|
||||
std::string ParseScores(const std::string &line, const std::string& defaultScores)
|
||||
{
|
||||
std::vector<std::string> toks = Tokenize(line);
|
||||
UTIL_THROW_IF2(toks.empty(), "Empty line");
|
||||
|
||||
for (size_t i = 1; i < toks.size(); ++i) {
|
||||
std::vector<std::string> args = TokenizeFirstOnly(toks[i], "=");
|
||||
UTIL_THROW_IF2(args.size() != 2,
|
||||
"Incorrect format for feature function arg: " << toks[i]);
|
||||
"Incorrect format for feature function arg: " << toks[i]);
|
||||
|
||||
if (args[0] == "scores") {
|
||||
return args[1];
|
||||
@ -62,30 +63,29 @@ void EditOps::Load()
|
||||
{ }
|
||||
|
||||
void EditOps::EvaluateInIsolation(const Phrase &source
|
||||
, const TargetPhrase &target
|
||||
, ScoreComponentCollection &scoreBreakdown
|
||||
, ScoreComponentCollection &estimatedFutureScore) const
|
||||
, const TargetPhrase &target
|
||||
, ScoreComponentCollection &scoreBreakdown
|
||||
, ScoreComponentCollection &estimatedFutureScore) const
|
||||
{
|
||||
ComputeFeatures(source, target, &scoreBreakdown);
|
||||
}
|
||||
|
||||
void EditOps::ComputeFeatures(
|
||||
const Phrase &source,
|
||||
const TargetPhrase& target,
|
||||
ScoreComponentCollection* accumulator) const
|
||||
const Phrase &source,
|
||||
const TargetPhrase& target,
|
||||
ScoreComponentCollection* accumulator) const
|
||||
{
|
||||
std::vector<float> ops(GetNumScoreComponents(), 0);
|
||||
|
||||
|
||||
if(m_chars) {
|
||||
std::vector<FactorType> factors;
|
||||
factors.push_back(m_factorType);
|
||||
|
||||
|
||||
std::string sourceStr = source.GetStringRep(factors);
|
||||
std::string targetStr = target.GetStringRep(factors);
|
||||
|
||||
|
||||
AddStats(sourceStr, targetStr, m_scores, ops);
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
std::vector<std::string> sourceTokens;
|
||||
//std::cerr << "Ed src: ";
|
||||
for(size_t i = 0; i < source.GetSize(); ++i) {
|
||||
@ -94,7 +94,7 @@ void EditOps::ComputeFeatures(
|
||||
//std::cerr << sourceTokens.back() << " ";
|
||||
}
|
||||
//std::cerr << std::endl;
|
||||
|
||||
|
||||
std::vector<std::string> targetTokens;
|
||||
//std::cerr << "Ed trg: ";
|
||||
for(size_t i = 0; i < target.GetSize(); ++i) {
|
||||
@ -103,10 +103,10 @@ void EditOps::ComputeFeatures(
|
||||
//std::cerr << targetTokens.back() << " ";
|
||||
}
|
||||
//std::cerr << std::endl;
|
||||
|
||||
|
||||
AddStats(sourceTokens, targetTokens, m_scores, ops);
|
||||
}
|
||||
|
||||
|
||||
accumulator->PlusEquals(this, ops);
|
||||
}
|
||||
|
||||
|
@ -32,26 +32,26 @@ public:
|
||||
void Load();
|
||||
|
||||
virtual void EvaluateInIsolation(const Phrase &source
|
||||
, const TargetPhrase &targetPhrase
|
||||
, ScoreComponentCollection &scoreBreakdown
|
||||
, ScoreComponentCollection &estimatedFutureScore) const;
|
||||
|
||||
, const TargetPhrase &targetPhrase
|
||||
, ScoreComponentCollection &scoreBreakdown
|
||||
, ScoreComponentCollection &estimatedFutureScore) const;
|
||||
|
||||
void EvaluateWithSourceContext(const InputType &input
|
||||
, const InputPath &inputPath
|
||||
, const TargetPhrase &targetPhrase
|
||||
, const StackVec *stackVec
|
||||
, ScoreComponentCollection &scoreBreakdown
|
||||
, ScoreComponentCollection *estimatedFutureScore = NULL) const
|
||||
, const InputPath &inputPath
|
||||
, const TargetPhrase &targetPhrase
|
||||
, const StackVec *stackVec
|
||||
, ScoreComponentCollection &scoreBreakdown
|
||||
, ScoreComponentCollection *estimatedFutureScore = NULL) const
|
||||
{}
|
||||
void EvaluateWhenApplied(const Hypothesis& hypo,
|
||||
ScoreComponentCollection* accumulator) const
|
||||
ScoreComponentCollection* accumulator) const
|
||||
{}
|
||||
void EvaluateWhenApplied(const ChartHypothesis &hypo,
|
||||
ScoreComponentCollection* accumulator) const
|
||||
ScoreComponentCollection* accumulator) const
|
||||
{}
|
||||
void EvaluateTranslationOptionListWithSourceContext(const InputType &input
|
||||
, const TranslationOptionList &translationOptionList) const
|
||||
{}
|
||||
void EvaluateTranslationOptionListWithSourceContext(const InputType &input
|
||||
, const TranslationOptionList &translationOptionList) const
|
||||
{}
|
||||
|
||||
void ComputeFeatures(const Phrase &source,
|
||||
const TargetPhrase& targetPhrase,
|
||||
|
@ -1540,6 +1540,150 @@ analysis-precision
|
||||
rerun-on-change: precision-by-coverage-base
|
||||
final-model: yes
|
||||
|
||||
[QUALITY-ESTIMATION] single
|
||||
tokenize-input
|
||||
in: raw-input
|
||||
out: tokenized-input
|
||||
default-name: quality-estimation/input.tok
|
||||
pass-unless: input-tokenizer
|
||||
template: $input-tokenizer < IN > OUT
|
||||
tokenize-input-devtest
|
||||
in: raw-input-devtest
|
||||
out: tokenized-input-devtest
|
||||
default-name: quality-estimation/input.devtest.tok
|
||||
pass-unless: input-tokenizer
|
||||
template: $input-tokenizer < IN > OUT
|
||||
lowercase-input
|
||||
in: tokenized-input
|
||||
out: truecased-input
|
||||
default-name: quality-estimation/input.lc
|
||||
pass-unless: input-lowercaser
|
||||
ignore-if: input-truecaser
|
||||
template: $input-lowercaser < IN > OUT
|
||||
lowercase-input-devtest
|
||||
in: tokenized-input-devtest
|
||||
out: truecased-input-devtest
|
||||
default-name: quality-estimation/input.devtest.lc
|
||||
pass-unless: input-lowercaser
|
||||
ignore-if: input-truecaser
|
||||
template: $input-lowercaser < IN > OUT
|
||||
truecase-input
|
||||
in: tokenized-input TRUECASER:truecase-model
|
||||
out: truecased-input
|
||||
rerun-on-change: input-truecaser
|
||||
default-name: quality-estimation/input.tc
|
||||
ignore-unless: input-truecaser
|
||||
template: $input-truecaser -model IN1.$input-extension < IN > OUT
|
||||
truecase-input-devtest
|
||||
in: tokenized-input-devtest TRUECASER:truecase-model
|
||||
out: truecased-input-devtest
|
||||
rerun-on-change: input-truecaser
|
||||
ignore-unless: input-truecaser
|
||||
default-name: quality-estimation/input.devtest.tc
|
||||
template: $input-truecaser -model IN1.$input-extension < IN > OUT
|
||||
split-input
|
||||
in: truecased-input SPLITTER:splitter-model
|
||||
out: split-input
|
||||
rerun-on-change: input-splitter
|
||||
default-name: quality-estimation/input.split
|
||||
pass-unless: input-splitter
|
||||
template: $input-splitter -model IN1.$input-extension < IN > OUT
|
||||
split-input-devtest
|
||||
in: truecased-input-devtest SPLITTER:splitter-model
|
||||
out: split-input-devtest
|
||||
rerun-on-change: input-splitter
|
||||
default-name: quality-estimation/input.devtest.split
|
||||
pass-unless: input-splitter
|
||||
template: $input-splitter -model IN1.$input-extension < IN > OUT
|
||||
tokenize-reference
|
||||
in: raw-reference
|
||||
out: tokenized-reference
|
||||
default-name: quality-estimation/reference.tok
|
||||
pass-unless: output-tokenizer
|
||||
multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl
|
||||
template: $output-tokenizer < IN > OUT
|
||||
tokenize-reference-devtest
|
||||
in: raw-reference-devtest
|
||||
out: tokenized-reference-devtest
|
||||
default-name: quality-estimation/reference.devtest.tok
|
||||
pass-unless: output-tokenizer
|
||||
multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl
|
||||
template: $output-tokenizer < IN > OUT
|
||||
lowercase-reference
|
||||
in: tokenized-reference
|
||||
out: truecased-reference
|
||||
default-name: quality-estimation/reference.lc
|
||||
pass-unless: output-lowercaser
|
||||
ignore-if: output-truecaser
|
||||
multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl
|
||||
template: $output-lowercaser < IN > OUT
|
||||
lowercase-reference-devtest
|
||||
in: tokenized-reference-devtest
|
||||
out: truecased-reference-devtest
|
||||
default-name: quality-estimation/reference.devtest.lc
|
||||
pass-unless: output-lowercaser
|
||||
ignore-if: output-truecaser
|
||||
multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl
|
||||
template: $output-lowercaser < IN > OUT
|
||||
truecase-reference
|
||||
in: tokenized-reference TRUECASER:truecase-model
|
||||
out: truecased-reference
|
||||
rerun-on-change: output-truecaser
|
||||
default-name: quality-estimation/reference.tc
|
||||
ignore-unless: output-truecaser
|
||||
multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl
|
||||
template: $output-truecaser -model IN1.$output-extension < IN > OUT
|
||||
truecase-reference-devtest
|
||||
in: tokenized-reference-devtest TRUECASER:truecase-model
|
||||
out: truecased-reference-devtest
|
||||
rerun-on-change: output-truecaser
|
||||
default-name: quality-estimation/reference.devtest.tc
|
||||
ignore-unless: output-truecaser
|
||||
multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl
|
||||
template: $output-truecaser -model IN1.$output-extension < IN > OUT
|
||||
decode
|
||||
in: TUNING:config-with-reused-weights split-input
|
||||
out: rich-output
|
||||
default-name: quality-estimation/output
|
||||
template: $decoder -v 0 -tt -f IN < IN1 > OUT
|
||||
error: Translation was not performed correctly
|
||||
not-error: trans: No such file or directory
|
||||
decode-devtest
|
||||
in: TUNING:config-with-reused-weights split-input-devtest
|
||||
out: rich-output-devtest
|
||||
default-name: quality-estimation/output-devtest
|
||||
template: $decoder -v 0 -tt -f IN < IN1 > OUT
|
||||
error: Translation was not performed correctly
|
||||
not-error: trans: No such file or directory
|
||||
remove-markup
|
||||
in: rich-output
|
||||
out: cleaned-output
|
||||
default-name: quality-estimation/tokenized-output
|
||||
template: $moses-script-dir/ems/support/remove-segmentation-markup.perl < IN > OUT
|
||||
remove-markup-devtest
|
||||
in: rich-output-devtest
|
||||
out: cleaned-output-devtest
|
||||
default-name: quality-estimation/tokenized-output-devtest
|
||||
template: $moses-script-dir/ems/support/remove-segmentation-markup.perl < IN > OUT
|
||||
score-output
|
||||
in: cleaned-output truecased-reference
|
||||
out: scored-output
|
||||
default-name: quality-estimation/output-scored
|
||||
tmp-name: quality-estimation/ter
|
||||
template: mkdir TMP ; $moses-script-dir/ems/support/ter.perl $tercom IN IN1 TMP > OUT
|
||||
score-output-devtest
|
||||
in: cleaned-output-devtest truecased-reference-devtest
|
||||
out: scored-output-devtest
|
||||
default-name: quality-estimation/output-scored-devtest
|
||||
tmp-name: quality-estimation/ter-devtest
|
||||
template: mkdir TMP ; $moses-script-dir/ems/support/ter.perl $tercom IN IN1 TMP > OUT
|
||||
train
|
||||
in: input rich-output scored-output input-devtest rich-output-devtest scored-output-devtest
|
||||
out: quality-estimation-model
|
||||
default-name: quality-estimation/model
|
||||
template: $trainer --train-rich IN1 --train-ter IN2 --eval-rich IN4 --eval-ter IN5 --model OUT
|
||||
final-model: yes
|
||||
|
||||
[REPORTING] single
|
||||
report
|
||||
in: EVALUATION:nist-bleu-score EVALUATION:nist-bleu-c-score EVALUATION:bolt-bleu-score EVALUATION:bolt-bleu-c-score EVALUATION:multi-bleu-score EVALUATION:multi-bleu-c-score EVALUATION:multi-bleu-detok-score EVALUATION:multi-bleu-c-detok-score EVALUATION:meteor-score EVALUATION:ter-score EVALUATION:wer-score EVALUATION:ibm-bleu-score EVALUATION:ibm-bleu-c-score EVALUATION:analysis EVALUATION:analysis-coverage EVALUATION:analysis-prec TRAINING:biconcor-model EVALUATION:wade-analysis
|
||||
|
42
scripts/ems/support/create-xml.perl
Executable file
42
scripts/ems/support/create-xml.perl
Executable file
@ -0,0 +1,42 @@
|
||||
#!/usr/bin/env perl
|
||||
#
|
||||
# This file is part of moses. Its use is licensed under the GNU Lesser General
|
||||
# Public License version 2.1 or, at your option, any later version.
|
||||
|
||||
use warnings;
|
||||
use strict;
|
||||
|
||||
my ($type) = @ARGV;
|
||||
if ($type =~ /^s/i) {
|
||||
print "<srcset setid=\"test\" srclang=\"any\">\n";
|
||||
print "<doc docid=\"doc\">\n";
|
||||
}
|
||||
elsif ($type =~ /^t/i) {
|
||||
print "<tstset setid=\"test\" tgtlang=\"any\" srclang=\"any\">\n";
|
||||
print "<doc sysid=\"moses\" docid=\"doc\">\n";
|
||||
}
|
||||
elsif ($type =~ /^r/i) {
|
||||
print "<refset setid=\"test\" tgtlang=\"any\" srclang=\"any\">\n";
|
||||
print "<doc sysid=\"ref\" docid=\"doc\">\n";
|
||||
}
|
||||
else {
|
||||
die("ERROR: specify source / target / ref");
|
||||
}
|
||||
|
||||
my $i = 0;
|
||||
while(<STDIN>) {
|
||||
chomp;
|
||||
print "<seg id=\"".(++$i)."\">$_</seg>\n";
|
||||
}
|
||||
|
||||
print "</doc>\n";
|
||||
|
||||
if ($type =~ /^s/i) {
|
||||
print "</srcset>\n";
|
||||
}
|
||||
elsif ($type =~ /^t/i) {
|
||||
print "</tstset>\n";
|
||||
}
|
||||
elsif ($type =~ /^r/i) {
|
||||
print "</refset>\n";
|
||||
}
|
@ -9,7 +9,16 @@ use strict;
|
||||
$|++;
|
||||
|
||||
while(<STDIN>) {
|
||||
s/ \|\d+\-\d+\| / /g;
|
||||
s/ \|\d+\-\d+\|$//;
|
||||
print $_;
|
||||
chop;
|
||||
s/\|[^\|]+\|//g;
|
||||
s/\s+/ /g;
|
||||
s/^ //;
|
||||
s/ $//;
|
||||
print $_."\n";
|
||||
}
|
||||
|
||||
#while(<STDIN>) {
|
||||
# s/ \|\d+\-\d+\| / /g;
|
||||
# s/ \|\d+\-\d+\|$//;
|
||||
# print $_;
|
||||
#}
|
||||
|
15
scripts/ems/support/ter.perl
Normal file
15
scripts/ems/support/ter.perl
Normal file
@ -0,0 +1,15 @@
|
||||
#!/usr/bin/env perl
|
||||
#
|
||||
# This file is part of moses. Its use is licensed under the GNU Lesser General
|
||||
# Public License version 2.1 or, at your option, any later version.
|
||||
|
||||
use strict;
|
||||
use FindBin qw($RealBin);
|
||||
|
||||
my ($jar, $hyp,$ref,$tmp) = @ARGV;
|
||||
`mkdir -p $tmp`;
|
||||
`$RealBin/create-xml.perl test < $hyp > $tmp/hyp`;
|
||||
`$RealBin/create-xml.perl ref < $ref > $tmp/ref`;
|
||||
`java -jar $jar -h $tmp/hyp -r $tmp/ref -o ter -n $tmp/out`;
|
||||
print `cat $tmp/out.ter`;
|
||||
|
Loading…
Reference in New Issue
Block a user