daily automatic beautifier

This commit is contained in:
MosesAdmin 2016-06-02 00:00:39 +01:00
parent 2ad1eacef7
commit ea306f62b7
9 changed files with 606 additions and 597 deletions

View File

@ -3,21 +3,23 @@
#include "M2.h" #include "M2.h"
namespace MosesTuning { namespace MosesTuning
{
namespace M2 { namespace M2
{
bool Annot::lowercase = true; bool Annot::lowercase = true;
std::string Annot::transform(const std::string& e) { std::string Annot::transform(const std::string& e)
std::string temp = e; {
if(lowercase) { std::string temp = e;
boost::erase_all(temp, " "); if(lowercase) {
return ToLower(temp); boost::erase_all(temp, " ");
} return ToLower(temp);
else } else
return e; return e;
} }
const std::string ToLower(const std::string& str) const std::string ToLower(const std::string& str)
{ {
@ -27,27 +29,30 @@ const std::string ToLower(const std::string& str)
} }
Edit operator+(Edit& e1, Edit& e2) { Edit operator+(Edit& e1, Edit& e2)
std::string edit; {
if(e1.edit.size() > 0 && e2.edit.size() > 0) std::string edit;
edit = e1.edit + " " + e2.edit; if(e1.edit.size() > 0 && e2.edit.size() > 0)
else if(e1.edit.size() > 0) edit = e1.edit + " " + e2.edit;
edit = e1.edit; else if(e1.edit.size() > 0)
else if(e2.edit.size() > 0) edit = e1.edit;
edit = e2.edit; else if(e2.edit.size() > 0)
edit = e2.edit;
return Edit(e1.cost + e2.cost, e1.changed + e2.changed, e1.unchanged + e2.unchanged, edit);
return Edit(e1.cost + e2.cost, e1.changed + e2.changed, e1.unchanged + e2.unchanged, edit);
} }
Edge operator+(Edge e1, Edge e2) { Edge operator+(Edge e1, Edge e2)
return Edge(e1.v, e2.u, e1.edit + e2.edit); {
return Edge(e1.v, e2.u, e1.edit + e2.edit);
} }
std::ostream& operator<<(std::ostream& o, Sentence s) { std::ostream& operator<<(std::ostream& o, Sentence s)
for(Sentence::iterator it = s.begin(); it != s.end(); it++) {
o << *it << " "; for(Sentence::iterator it = s.begin(); it != s.end(); it++)
return o; o << *it << " ";
return o;
} }

712
mert/M2.h
View File

@ -16,9 +16,11 @@
namespace MosesTuning { namespace MosesTuning
{
namespace M2 { namespace M2
{
typedef std::vector<float> Stats; typedef std::vector<float> Stats;
@ -29,44 +31,44 @@ std::ostream& operator<<(std::ostream& o, Sentence s);
const std::string ToLower(const std::string& str); const std::string ToLower(const std::string& str);
struct Annot { struct Annot {
size_t i; size_t i;
size_t j; size_t j;
std::string type;
std::string edit;
size_t annotator;
bool operator<(Annot a) const {
return i < a.i || (i == a.i && j < a.j)
|| (i == a.i && j == a.j && annotator < a.annotator)
|| (i == a.i && j == a.j && annotator == a.annotator && transform(edit) < transform(a.edit));
}
bool operator==(Annot a) const { std::string type;
return (!(*this < a) && !(a < *this)); std::string edit;
}
static std::string transform(const std::string& e); size_t annotator;
static bool lowercase; bool operator<(Annot a) const {
return i < a.i || (i == a.i && j < a.j)
|| (i == a.i && j == a.j && annotator < a.annotator)
|| (i == a.i && j == a.j && annotator == a.annotator && transform(edit) < transform(a.edit));
}
bool operator==(Annot a) const {
return (!(*this < a) && !(a < *this));
}
static std::string transform(const std::string& e);
static bool lowercase;
}; };
typedef std::set<Annot> Annots; typedef std::set<Annot> Annots;
typedef std::set<size_t> Users; typedef std::set<size_t> Users;
struct Unit { struct Unit {
Sentence first; Sentence first;
Annots second; Annots second;
Users third; Users third;
}; };
typedef std::vector<Unit> M2File; typedef std::vector<Unit> M2File;
struct Edit { struct Edit {
Edit(float c = 1.0, size_t ch = 0, size_t unch = 1, std::string e = "") Edit(float c = 1.0, size_t ch = 0, size_t unch = 1, std::string e = "")
: cost(c), changed(ch), unchanged(unch), edit(e) {} : cost(c), changed(ch), unchanged(unch), edit(e) {}
float cost; float cost;
size_t changed; size_t changed;
size_t unchanged; size_t unchanged;
@ -77,7 +79,7 @@ Edit operator+(Edit& e1, Edit& e2);
struct Vertex { struct Vertex {
Vertex(size_t a = 0, size_t b = 0) : i(a), j(b) {} Vertex(size_t a = 0, size_t b = 0) : i(a), j(b) {}
bool operator<(const Vertex &v) const { bool operator<(const Vertex &v) const {
return i < v.i || (i == v.i && j < v.j); return i < v.i || (i == v.i && j < v.j);
} }
@ -85,19 +87,19 @@ struct Vertex {
bool operator==(const Vertex &v) const { bool operator==(const Vertex &v) const {
return i == v.i && j == v.j; return i == v.i && j == v.j;
} }
size_t i; size_t i;
size_t j; size_t j;
}; };
struct Edge { struct Edge {
Edge(Vertex vv = Vertex(), Vertex uu = Vertex(), Edit editt = Edit()) Edge(Vertex vv = Vertex(), Vertex uu = Vertex(), Edit editt = Edit())
: v(vv), u(uu), edit(editt) {} : v(vv), u(uu), edit(editt) {}
bool operator<(const Edge &e) const { bool operator<(const Edge &e) const {
return v < e.v || (v == e.v && u < e.u); return v < e.v || (v == e.v && u < e.u);
} }
Vertex v; Vertex v;
Vertex u; Vertex u;
Edit edit; Edit edit;
@ -110,7 +112,7 @@ typedef std::vector<Row> Matrix;
struct Info { struct Info {
Info(Vertex vv = Vertex(), Edit editt = Edit()) Info(Vertex vv = Vertex(), Edit editt = Edit())
: v(vv), edit(editt) {} : v(vv), edit(editt) {}
bool operator<(const Info &i) const { bool operator<(const Info &i) const {
return v < i.v; return v < i.v;
@ -127,352 +129,350 @@ typedef std::vector<TrackRow> TrackMatrix;
typedef std::set<Vertex> Vertices; typedef std::set<Vertex> Vertices;
typedef std::set<Edge> Edges; typedef std::set<Edge> Edges;
class M2 { class M2
private: {
M2File m_m2; private:
M2File m_m2;
size_t m_max_unchanged;
float m_beta; size_t m_max_unchanged;
bool m_lowercase; float m_beta;
bool m_verbose; bool m_lowercase;
bool m_verbose;
public:
M2() : m_max_unchanged(2), m_beta(0.5), m_lowercase(true), m_verbose(false) { } public:
M2(size_t max_unchanged, float beta, bool truecase, bool verbose = false) M2() : m_max_unchanged(2), m_beta(0.5), m_lowercase(true), m_verbose(false) { }
M2(size_t max_unchanged, float beta, bool truecase, bool verbose = false)
: m_max_unchanged(max_unchanged), m_beta(beta), m_lowercase(!truecase), m_verbose(verbose) { : m_max_unchanged(max_unchanged), m_beta(beta), m_lowercase(!truecase), m_verbose(verbose) {
if(!m_lowercase) { if(!m_lowercase) {
Annot::lowercase = false; Annot::lowercase = false;
}
} }
}
float Beta() {
return m_beta; float Beta() {
} return m_beta;
}
void ReadM2(const std::string& filename) {
std::ifstream m2file(filename.c_str()); void ReadM2(const std::string& filename) {
std::string line; std::ifstream m2file(filename.c_str());
std::string line;
Unit unit;
bool first = true; Unit unit;
bool first = true;
while(std::getline(m2file, line)) {
if(line.size() > 2) { while(std::getline(m2file, line)) {
if(line.substr(0, 2) == "S ") { if(line.size() > 2) {
if(!first) { if(line.substr(0, 2) == "S ") {
if(unit.third.empty()) if(!first) {
unit.third.insert(0); if(unit.third.empty())
m_m2.push_back(unit); unit.third.insert(0);
} m_m2.push_back(unit);
first = false; }
first = false;
unit.first = Sentence();
unit.second = Annots(); unit.first = Sentence();
unit.second = Annots();
std::string sentenceLine = line.substr(2);
boost::split(unit.first, sentenceLine, boost::is_any_of(" "), boost::token_compress_on); std::string sentenceLine = line.substr(2);
} boost::split(unit.first, sentenceLine, boost::is_any_of(" "), boost::token_compress_on);
if(line.substr(0, 2) == "A ") {
std::string annotLine = line.substr(2);
std::vector<std::string> annot;
boost::iter_split(annot, annotLine, boost::algorithm::first_finder("|||"));
if(annot[1] != "noop") {
Annot a;
std::stringstream rangeStr(annot[0]);
rangeStr >> a.i >> a.j;
a.type = annot[1];
a.edit = annot[2];
std::stringstream annotStr(annot[5]);
annotStr >> a.annotator;
unit.third.insert(a.annotator);
unit.second.insert(a);
}
else {
std::stringstream annotStr(annot[5]);
size_t annotator;
annotStr >> annotator;
unit.third.insert(annotator);
}
}
}
} }
if(unit.third.empty()) if(line.substr(0, 2) == "A ") {
unit.third.insert(0); std::string annotLine = line.substr(2);
m_m2.push_back(unit);
} std::vector<std::string> annot;
boost::iter_split(annot, annotLine, boost::algorithm::first_finder("|||"));
size_t LevenshteinMatrix(const Sentence &s1, const Sentence &s2, Matrix &d, TrackMatrix &bt) {
size_t n = s1.size(); if(annot[1] != "noop") {
size_t m = s2.size(); Annot a;
std::stringstream rangeStr(annot[0]);
if (n == 0) rangeStr >> a.i >> a.j;
return m; a.type = annot[1];
if (m == 0) a.edit = annot[2];
return n;
std::stringstream annotStr(annot[5]);
d.resize(n + 1, Row(m + 1, 0)); annotStr >> a.annotator;
bt.resize(n + 1, TrackRow(m + 1));
unit.third.insert(a.annotator);
for(size_t i = 0; i <= n; ++i) { unit.second.insert(a);
d[i][0] = i; } else {
if(i > 0) std::stringstream annotStr(annot[5]);
bt[i][0].insert(Info(Vertex(i - 1, 0), Edit(1, 1, 0, ""))); size_t annotator;
} annotStr >> annotator;
for(size_t j = 0; j <= m; ++j) { unit.third.insert(annotator);
d[0][j] = j;
if(j > 0)
bt[0][j].insert(Info(Vertex(0, j - 1), Edit(1, 1, 0, s2[j - 1])));
}
int cost;
for(size_t i = 1; i <= n; ++i) {
for(size_t j = 1; j <= m; ++j) {
if(Annot::transform(s1[i-1]) == Annot::transform(s2[j-1]))
cost = 0;
else
cost = 2;
size_t left = d[i][j - 1] + 1;
size_t down = d[i - 1][j] + 1;
size_t diag = d[i - 1][j - 1] + cost;
d[i][j] = std::min(left, std::min(down, diag));
if(d[i][j] == left)
bt[i][j].insert(Info(Vertex(i, j - 1), Edit(1, 1, 0, s2[j - 1])));
if(d[i][j] == down)
bt[i][j].insert(Info(Vertex(i - 1, j), Edit(1, 1, 0, "")));
if(d[i][j] == diag)
bt[i][j].insert(Info(Vertex(i - 1, j - 1), cost ? Edit(1, 1, 0, s2[j - 1]) : Edit(1, 0, 1, s2[j - 1]) ));
} }
} }
return d[n][m]; }
} }
if(unit.third.empty())
unit.third.insert(0);
void BuildGraph(const TrackMatrix &bt, Vertices &V, Edges &E) { m_m2.push_back(unit);
Vertex start(bt.size() - 1, bt[0].size() - 1); }
std::queue<Vertex> Q; size_t LevenshteinMatrix(const Sentence &s1, const Sentence &s2, Matrix &d, TrackMatrix &bt) {
Q.push(start); size_t n = s1.size();
while(!Q.empty()) { size_t m = s2.size();
Vertex v = Q.front();
Q.pop(); if (n == 0)
if(V.count(v) > 0) return m;
continue; if (m == 0)
V.insert(v); return n;
for(Track::iterator it = bt[v.i][v.j].begin();
it != bt[v.i][v.j].end(); ++it) { d.resize(n + 1, Row(m + 1, 0));
Edge e(it->v, v, it->edit); bt.resize(n + 1, TrackRow(m + 1));
E.insert(e);
if(V.count(e.v) == 0) for(size_t i = 0; i <= n; ++i) {
Q.push(e.v); d[i][0] = i;
} if(i > 0)
} bt[i][0].insert(Info(Vertex(i - 1, 0), Edit(1, 1, 0, "")));
Edges newE;
do {
newE.clear();
for(Edges::iterator it1 = E.begin(); it1 != E.end(); ++it1) {
for(Edges::iterator it2 = E.begin(); it2 != E.end(); ++it2) {
if(it1->u == it2->v) {
Edge e = *it1 + *it2;
if(e.edit.changed > 0 &&
e.edit.unchanged <= m_max_unchanged &&
E.count(e) == 0)
newE.insert(e);
}
}
}
E.insert(newE.begin(), newE.end());
} while(newE.size() > 0);
} }
for(size_t j = 0; j <= m; ++j) {
void AddWeights(Edges &E, const Unit &u, size_t aid) { d[0][j] = j;
for(Edges::iterator it1 = E.begin(); it1 != E.end(); ++it1) { if(j > 0)
if(it1->edit.changed > 0) { bt[0][j].insert(Info(Vertex(0, j - 1), Edit(1, 1, 0, s2[j - 1])));
const_cast<float&>(it1->edit.cost) += 0.001;
for(Annots::iterator it2 = u.second.begin(); it2 != u.second.end(); ++it2) {
// if matches an annotator
if(it1->v.i == it2->i && it1->u.i == it2->j
&& Annot::transform(it1->edit.edit) == Annot::transform(it2->edit)
&& it2->annotator == aid) {
int newWeight = -(m_max_unchanged + 1) * E.size();
const_cast<float&>(it1->edit.cost) = newWeight;
}
}
}
}
}
void BellmanFord(Vertices &V, Edges &E) {
Vertex source(0, 0);
std::map<Vertex, float> distance;
std::map<Vertex, Vertex> predecessor;
for(Vertices::iterator it = V.begin(); it != V.end(); ++it) {
if(*it == source)
distance[*it] = 0;
else {
distance[*it] = std::numeric_limits<float>::infinity();
}
}
for(size_t i = 1; i < V.size(); ++i) {
for(Edges::iterator it = E.begin(); it != E.end(); ++it) {
if(distance[it->v] + it->edit.cost < distance[it->u]) {
distance[it->u] = distance[it->v] + it->edit.cost;
predecessor[it->u] = it->v;
}
}
}
Edges newE;
Vertex v = *V.rbegin();
while(true) {
//std::cout << predecessor[v] << " -> " << v << std::endl;
Edges::iterator it = E.find(Edge(predecessor[v], v));
if(it != E.end()) {
Edge f = *it;
//std::cout << f << std::endl;
newE.insert(f);
v = predecessor[v];
if(v == source)
break;
}
else {
std::cout << "Error" << std::endl;
break;
}
}
E.clear();
E.insert(newE.begin(), newE.end());
} }
void AddStats(const std::vector<Edges> &Es, const Unit &u, Stats &stats, size_t line) { int cost;
for(size_t i = 1; i <= n; ++i) {
std::map<size_t, Stats> statsPerAnnotator; for(size_t j = 1; j <= m; ++j) {
for(std::set<size_t>::iterator it = u.third.begin(); if(Annot::transform(s1[i-1]) == Annot::transform(s2[j-1]))
it != u.third.end(); ++it) { cost = 0;
statsPerAnnotator[*it] = Stats(4, 0); else
} cost = 2;
for(Annots::iterator it = u.second.begin(); it != u.second.end(); it++) size_t left = d[i][j - 1] + 1;
statsPerAnnotator[it->annotator][2]++; size_t down = d[i - 1][j] + 1;
size_t diag = d[i - 1][j - 1] + cost;
for(std::set<size_t>::iterator ait = u.third.begin(); d[i][j] = std::min(left, std::min(down, diag));
ait != u.third.end(); ++ait) {
for(Edges::iterator eit = Es[*ait].begin(); eit != Es[*ait].end(); ++eit) { if(d[i][j] == left)
if(eit->edit.changed > 0) { bt[i][j].insert(Info(Vertex(i, j - 1), Edit(1, 1, 0, s2[j - 1])));
statsPerAnnotator[*ait][1]++; if(d[i][j] == down)
Annot f; bt[i][j].insert(Info(Vertex(i - 1, j), Edit(1, 1, 0, "")));
f.i = eit->v.i; if(d[i][j] == diag)
f.j = eit->u.i; bt[i][j].insert(Info(Vertex(i - 1, j - 1), cost ? Edit(1, 1, 0, s2[j - 1]) : Edit(1, 0, 1, s2[j - 1]) ));
f.annotator = *ait; }
f.edit = eit->edit.edit; }
for(Annots::iterator fit = u.second.begin(); fit != u.second.end(); fit++) { return d[n][m];
if(f == *fit) }
statsPerAnnotator[*ait][0]++;
}
} void BuildGraph(const TrackMatrix &bt, Vertices &V, Edges &E) {
Vertex start(bt.size() - 1, bt[0].size() - 1);
std::queue<Vertex> Q;
Q.push(start);
while(!Q.empty()) {
Vertex v = Q.front();
Q.pop();
if(V.count(v) > 0)
continue;
V.insert(v);
for(Track::iterator it = bt[v.i][v.j].begin();
it != bt[v.i][v.j].end(); ++it) {
Edge e(it->v, v, it->edit);
E.insert(e);
if(V.count(e.v) == 0)
Q.push(e.v);
}
}
Edges newE;
do {
newE.clear();
for(Edges::iterator it1 = E.begin(); it1 != E.end(); ++it1) {
for(Edges::iterator it2 = E.begin(); it2 != E.end(); ++it2) {
if(it1->u == it2->v) {
Edge e = *it1 + *it2;
if(e.edit.changed > 0 &&
e.edit.unchanged <= m_max_unchanged &&
E.count(e) == 0)
newE.insert(e);
} }
} }
size_t bestAnnot = 0; }
float bestF = -1; E.insert(newE.begin(), newE.end());
for(std::set<size_t>::iterator it = u.third.begin(); } while(newE.size() > 0);
it != u.third.end(); ++it) { }
Stats localStats = stats;
localStats[0] += statsPerAnnotator[*it][0]; void AddWeights(Edges &E, const Unit &u, size_t aid) {
localStats[1] += statsPerAnnotator[*it][1]; for(Edges::iterator it1 = E.begin(); it1 != E.end(); ++it1) {
localStats[2] += statsPerAnnotator[*it][2]; if(it1->edit.changed > 0) {
if(m_verbose) const_cast<float&>(it1->edit.cost) += 0.001;
std::cerr << *it << " : " << localStats[0] << " " << localStats[1] << " " << localStats[2] << std::endl; for(Annots::iterator it2 = u.second.begin(); it2 != u.second.end(); ++it2) {
float f = FScore(localStats); // if matches an annotator
if(m_verbose) if(it1->v.i == it2->i && it1->u.i == it2->j
std::cerr << f << std::endl; && Annot::transform(it1->edit.edit) == Annot::transform(it2->edit)
if(f > bestF) { && it2->annotator == aid) {
bestF = f; int newWeight = -(m_max_unchanged + 1) * E.size();
bestAnnot = *it; const_cast<float&>(it1->edit.cost) = newWeight;
} }
} }
if(m_verbose) }
std::cerr << ">> Chosen Annotator for line " << line + 1 << " : " << bestAnnot << std::endl;
stats[0] += statsPerAnnotator[bestAnnot][0];
stats[1] += statsPerAnnotator[bestAnnot][1];
stats[2] += statsPerAnnotator[bestAnnot][2];
} }
}
void SufStats(const std::string &sStr, size_t i, Stats &stats) {
std::string temp = sStr;
Sentence s;
boost::split(s, temp, boost::is_any_of(" "), boost::token_compress_on);
Unit &unit = m_m2[i];
Matrix d;
TrackMatrix bt;
size_t distance = LevenshteinMatrix(unit.first, s, d, bt);
std::vector<Vertices> Vs(unit.third.size()); void BellmanFord(Vertices &V, Edges &E) {
std::vector<Edges> Es(unit.third.size()); Vertex source(0, 0);
std::map<Vertex, float> distance;
std::map<Vertex, Vertex> predecessor;
if(distance > unit.first.size()) { for(Vertices::iterator it = V.begin(); it != V.end(); ++it) {
std::cerr << "Levenshtein distance is greater than source size." << std::endl; if(*it == source)
stats[0] = 0; distance[*it] = 0;
stats[1] = distance; else {
stats[2] = 0; distance[*it] = std::numeric_limits<float>::infinity();
stats[3] = unit.first.size(); }
return;
}
else if(distance > 0) {
for(size_t j = 0; j < unit.third.size(); j++) {
BuildGraph(bt, Vs[j], Es[j]);
AddWeights(Es[j], unit, j);
BellmanFord(Vs[j], Es[j]);
}
}
AddStats(Es, unit, stats, i);
stats[3] = unit.first.size();
} }
for(size_t i = 1; i < V.size(); ++i) {
float FScore(const Stats& stats) { for(Edges::iterator it = E.begin(); it != E.end(); ++it) {
float p = 1.0; if(distance[it->v] + it->edit.cost < distance[it->u]) {
if(stats[1] != 0) distance[it->u] = distance[it->v] + it->edit.cost;
p = (float)stats[0] / (float)stats[1]; predecessor[it->u] = it->v;
}
float r = 1.0; }
if(stats[2] != 0)
r = (float)stats[0] / (float)stats[2];
float denom = (m_beta * m_beta * p + r);
float f = 0.0;
if(denom != 0)
f = ((1 + m_beta * m_beta) * p * r) / denom;
return f;
} }
void FScore(const Stats& stats, float &p, float &r, float &f) { Edges newE;
p = 1.0;
if(stats[1] != 0) Vertex v = *V.rbegin();
p = (float)stats[0] / (float)stats[1]; while(true) {
//std::cout << predecessor[v] << " -> " << v << std::endl;
r = 1.0; Edges::iterator it = E.find(Edge(predecessor[v], v));
if(stats[2] != 0) if(it != E.end()) {
r = (float)stats[0] / (float)stats[2]; Edge f = *it;
//std::cout << f << std::endl;
float denom = (m_beta * m_beta * p + r); newE.insert(f);
f = 0.0;
if(denom != 0) v = predecessor[v];
f = ((1 + m_beta * m_beta) * p * r) / denom; if(v == source)
break;
} else {
std::cout << "Error" << std::endl;
break;
}
} }
E.clear();
E.insert(newE.begin(), newE.end());
}
void AddStats(const std::vector<Edges> &Es, const Unit &u, Stats &stats, size_t line) {
std::map<size_t, Stats> statsPerAnnotator;
for(std::set<size_t>::iterator it = u.third.begin();
it != u.third.end(); ++it) {
statsPerAnnotator[*it] = Stats(4, 0);
}
for(Annots::iterator it = u.second.begin(); it != u.second.end(); it++)
statsPerAnnotator[it->annotator][2]++;
for(std::set<size_t>::iterator ait = u.third.begin();
ait != u.third.end(); ++ait) {
for(Edges::iterator eit = Es[*ait].begin(); eit != Es[*ait].end(); ++eit) {
if(eit->edit.changed > 0) {
statsPerAnnotator[*ait][1]++;
Annot f;
f.i = eit->v.i;
f.j = eit->u.i;
f.annotator = *ait;
f.edit = eit->edit.edit;
for(Annots::iterator fit = u.second.begin(); fit != u.second.end(); fit++) {
if(f == *fit)
statsPerAnnotator[*ait][0]++;
}
}
}
}
size_t bestAnnot = 0;
float bestF = -1;
for(std::set<size_t>::iterator it = u.third.begin();
it != u.third.end(); ++it) {
Stats localStats = stats;
localStats[0] += statsPerAnnotator[*it][0];
localStats[1] += statsPerAnnotator[*it][1];
localStats[2] += statsPerAnnotator[*it][2];
if(m_verbose)
std::cerr << *it << " : " << localStats[0] << " " << localStats[1] << " " << localStats[2] << std::endl;
float f = FScore(localStats);
if(m_verbose)
std::cerr << f << std::endl;
if(f > bestF) {
bestF = f;
bestAnnot = *it;
}
}
if(m_verbose)
std::cerr << ">> Chosen Annotator for line " << line + 1 << " : " << bestAnnot << std::endl;
stats[0] += statsPerAnnotator[bestAnnot][0];
stats[1] += statsPerAnnotator[bestAnnot][1];
stats[2] += statsPerAnnotator[bestAnnot][2];
}
void SufStats(const std::string &sStr, size_t i, Stats &stats) {
std::string temp = sStr;
Sentence s;
boost::split(s, temp, boost::is_any_of(" "), boost::token_compress_on);
Unit &unit = m_m2[i];
Matrix d;
TrackMatrix bt;
size_t distance = LevenshteinMatrix(unit.first, s, d, bt);
std::vector<Vertices> Vs(unit.third.size());
std::vector<Edges> Es(unit.third.size());
if(distance > unit.first.size()) {
std::cerr << "Levenshtein distance is greater than source size." << std::endl;
stats[0] = 0;
stats[1] = distance;
stats[2] = 0;
stats[3] = unit.first.size();
return;
} else if(distance > 0) {
for(size_t j = 0; j < unit.third.size(); j++) {
BuildGraph(bt, Vs[j], Es[j]);
AddWeights(Es[j], unit, j);
BellmanFord(Vs[j], Es[j]);
}
}
AddStats(Es, unit, stats, i);
stats[3] = unit.first.size();
}
float FScore(const Stats& stats) {
float p = 1.0;
if(stats[1] != 0)
p = (float)stats[0] / (float)stats[1];
float r = 1.0;
if(stats[2] != 0)
r = (float)stats[0] / (float)stats[2];
float denom = (m_beta * m_beta * p + r);
float f = 0.0;
if(denom != 0)
f = ((1 + m_beta * m_beta) * p * r) / denom;
return f;
}
void FScore(const Stats& stats, float &p, float &r, float &f) {
p = 1.0;
if(stats[1] != 0)
p = (float)stats[0] / (float)stats[1];
r = 1.0;
if(stats[2] != 0)
r = (float)stats[0] / (float)stats[2];
float denom = (m_beta * m_beta * p + r);
f = 0.0;
if(denom != 0)
f = ((1 + m_beta * m_beta) * p * r) / denom;
}
}; };
} }

View File

@ -45,90 +45,92 @@ float M2Scorer::calculateScore(const vector<ScoreStatsType>& comps) const
if (comps.size() != NumberOfScores()) { if (comps.size() != NumberOfScores()) {
throw runtime_error("Size of stat vector for M2Scorer is not " + NumberOfScores()); throw runtime_error("Size of stat vector for M2Scorer is not " + NumberOfScores());
} }
float beta = beta_; float beta = beta_;
float p = 0.0; float p = 0.0;
float r = 0.0; float r = 0.0;
float f = 0.0; float f = 0.0;
if(comps[1] != 0) if(comps[1] != 0)
p = comps[0] / (double)comps[1]; p = comps[0] / (double)comps[1];
else else
p = 1.0; p = 1.0;
if(comps[2] != 0) if(comps[2] != 0)
r = comps[0] / (double)comps[2]; r = comps[0] / (double)comps[2];
else else
r = 1.0; r = 1.0;
float denom = beta * beta * p + r; float denom = beta * beta * p + r;
if(denom != 0) if(denom != 0)
f = (1.0 + beta * beta) * p * r / denom; f = (1.0 + beta * beta) * p * r / denom;
else else
f = 0.0; f = 0.0;
if(verbose_) if(verbose_)
std::cerr << comps[0] << " " << comps[1] << " " << comps[2] << std::endl; std::cerr << comps[0] << " " << comps[1] << " " << comps[2] << std::endl;
if(verbose_) if(verbose_)
std::cerr << p << " " << r << " " << f << std::endl; std::cerr << p << " " << r << " " << f << std::endl;
return f; return f;
} }
float M2Scorer::getReferenceLength(const vector<ScoreStatsType>& comps) const { float M2Scorer::getReferenceLength(const vector<ScoreStatsType>& comps) const
{
return comps[3]; return comps[3];
} }
std::vector<ScoreStatsType> randomStats(float decay, int max) { std::vector<ScoreStatsType> randomStats(float decay, int max)
{
int gold = rand() % max; int gold = rand() % max;
int prop = rand() % max; int prop = rand() % max;
int corr = 0.0; int corr = 0.0;
if(std::min(prop, gold) > 0) if(std::min(prop, gold) > 0)
corr = rand() % std::min(prop, gold); corr = rand() % std::min(prop, gold);
//std::cerr << corr << " " << prop << " " << gold << std::endl; //std::cerr << corr << " " << prop << " " << gold << std::endl;
std::vector<ScoreStatsType> stats(3, 0.0); std::vector<ScoreStatsType> stats(3, 0.0);
stats[0] = corr * decay; stats[0] = corr * decay;
stats[1] = prop * decay; stats[1] = prop * decay;
stats[2] = gold * decay; stats[2] = gold * decay;
return stats; return stats;
} }
float sentenceM2(const std::vector<ScoreStatsType>& stats) float sentenceM2(const std::vector<ScoreStatsType>& stats)
{ {
float beta = 0.5; float beta = 0.5;
std::vector<ScoreStatsType> smoothStats(3, 0.0); // = randomStats(0.001, 5); std::vector<ScoreStatsType> smoothStats(3, 0.0); // = randomStats(0.001, 5);
smoothStats[0] += stats[0]; smoothStats[0] += stats[0];
smoothStats[1] += stats[1]; smoothStats[1] += stats[1];
smoothStats[2] += stats[2]; smoothStats[2] += stats[2];
float p = 0.0; float p = 0.0;
float r = 0.0; float r = 0.0;
float f = 0.0; float f = 0.0;
if(smoothStats[1] != 0) if(smoothStats[1] != 0)
p = smoothStats[0] / smoothStats[1]; p = smoothStats[0] / smoothStats[1];
else else
p = 1.0; p = 1.0;
if(smoothStats[2] != 0) if(smoothStats[2] != 0)
r = smoothStats[0] / smoothStats[2]; r = smoothStats[0] / smoothStats[2];
else else
r = 1.0; r = 1.0;
float denom = beta * beta * p + r; float denom = beta * beta * p + r;
if(denom != 0) if(denom != 0)
f = (1.0 + beta * beta) * p * r / denom; f = (1.0 + beta * beta) * p * r / denom;
else else
f = 0.0; f = 0.0;
return f; return f;
} }

View File

@ -31,15 +31,15 @@ public:
virtual float calculateScore(const std::vector<ScoreStatsType>& comps) const; virtual float calculateScore(const std::vector<ScoreStatsType>& comps) const;
virtual float getReferenceLength(const std::vector<ScoreStatsType>& comps) const; virtual float getReferenceLength(const std::vector<ScoreStatsType>& comps) const;
private: private:
float beta_; float beta_;
int max_unchanged_words_; int max_unchanged_words_;
bool truecase_; bool truecase_;
bool verbose_; bool verbose_;
M2::M2 m2_; M2::M2 m2_;
std::map<std::pair<size_t, std::string>, std::vector<ScoreStatsType> > seen_; std::map<std::pair<size_t, std::string>, std::vector<ScoreStatsType> > seen_;
// no copying allowed // no copying allowed
M2Scorer(const M2Scorer&); M2Scorer(const M2Scorer&);
M2Scorer& operator=(const M2Scorer&); M2Scorer& operator=(const M2Scorer&);

View File

@ -23,21 +23,22 @@ namespace Moses
using namespace std; using namespace std;
std::string MakePair(const std::string &s1, const std::string &s2, bool general) { std::string MakePair(const std::string &s1, const std::string &s2, bool general)
{
std::vector<std::string> sourceList; std::vector<std::string> sourceList;
std::vector<std::string> targetList; std::vector<std::string> targetList;
if(general) { if(general) {
Diffs diffs = CreateDiff(s1, s2); Diffs diffs = CreateDiff(s1, s2);
size_t i = 0, j = 0; size_t i = 0, j = 0;
char lastType = 'm'; char lastType = 'm';
std::string source, target; std::string source, target;
std::string match; std::string match;
int count = 1; int count = 1;
BOOST_FOREACH(Diff type, diffs) { BOOST_FOREACH(Diff type, diffs) {
if(type == 'm') { if(type == 'm') {
if(lastType != 'm') { if(lastType != 'm') {
@ -46,7 +47,7 @@ std::string MakePair(const std::string &s1, const std::string &s2, bool general)
} }
source.clear(); source.clear();
target.clear(); target.clear();
if(s1[i] == '+') { if(s1[i] == '+') {
if(match.size() >= 3) { if(match.size() >= 3) {
sourceList.push_back("(\\w{3,})·"); sourceList.push_back("(\\w{3,})·");
@ -54,56 +55,51 @@ std::string MakePair(const std::string &s1, const std::string &s2, bool general)
sprintf((char*)temp.c_str(), "%d", count); sprintf((char*)temp.c_str(), "%d", count);
targetList.push_back("\\" + temp + "·"); targetList.push_back("\\" + temp + "·");
count++; count++;
} } else {
else {
sourceList.push_back(match + "·"); sourceList.push_back(match + "·");
targetList.push_back(match + "·"); targetList.push_back(match + "·");
} }
match.clear(); match.clear();
} } else
else
match.push_back(s1[i]); match.push_back(s1[i]);
i++; i++;
j++; j++;
} } else if(type == 'd') {
else if(type == 'd') {
if(s1[i] == '+') if(s1[i] == '+')
source += "·"; source += "·";
else else
source.push_back(s1[i]); source.push_back(s1[i]);
i++; i++;
} } else if(type == 'i') {
else if(type == 'i') {
if(s2[j] == '+') if(s2[j] == '+')
target += "·"; target += "·";
else else
target.push_back(s2[j]); target.push_back(s2[j]);
j++; j++;
} }
if(type != 'm' && !match.empty()) { if(type != 'm' && !match.empty()) {
if(match.size() >= 3) { if(match.size() >= 3) {
sourceList.push_back("(\\w{3,})"); sourceList.push_back("(\\w{3,})");
std::string temp = "1"; std::string temp = "1";
sprintf((char*)temp.c_str(), "%d", count); sprintf((char*)temp.c_str(), "%d", count);
targetList.push_back("\\" + temp); targetList.push_back("\\" + temp);
count++; count++;
} } else {
else {
sourceList.push_back(match); sourceList.push_back(match);
targetList.push_back(match); targetList.push_back(match);
} }
match.clear(); match.clear();
} }
lastType = type; lastType = type;
} }
if(lastType != 'm') { if(lastType != 'm') {
sourceList.push_back(source); sourceList.push_back(source);
targetList.push_back(target); targetList.push_back(target);
} }
if(!match.empty()) { if(!match.empty()) {
if(match.size() >= 3) { if(match.size() >= 3) {
sourceList.push_back("(\\w{3,})"); sourceList.push_back("(\\w{3,})");
@ -111,45 +107,42 @@ std::string MakePair(const std::string &s1, const std::string &s2, bool general)
sprintf((char*)temp.c_str(), "%d", count); sprintf((char*)temp.c_str(), "%d", count);
targetList.push_back("\\"+ temp); targetList.push_back("\\"+ temp);
count++; count++;
} } else {
else {
sourceList.push_back(match); sourceList.push_back(match);
targetList.push_back(match); targetList.push_back(match);
} }
} }
match.clear(); match.clear();
} } else {
else {
std::string cs1 = s1; std::string cs1 = s1;
std::string cs2 = s2; std::string cs2 = s2;
boost::replace_all(cs1, "+", "·"); boost::replace_all(cs1, "+", "·");
boost::replace_all(cs2, "+", "·"); boost::replace_all(cs2, "+", "·");
sourceList.push_back(cs1); sourceList.push_back(cs1);
targetList.push_back(cs2); targetList.push_back(cs2);
} }
std::stringstream out; std::stringstream out;
out << "sub(«"; out << "sub(«";
out << boost::join(sourceList, ""); out << boost::join(sourceList, "");
out << "»,«"; out << "»,«";
out << boost::join(targetList, ""); out << boost::join(targetList, "");
out << "»)"; out << "»)";
return out.str(); return out.str();
} }
std::string CorrectionPattern::CreateSinglePattern(const Tokens &s1, const Tokens &s2) const { std::string CorrectionPattern::CreateSinglePattern(const Tokens &s1, const Tokens &s2) const
{
std::stringstream out; std::stringstream out;
if(s1.empty()) { if(s1.empty()) {
out << "ins(«" << boost::join(s2, "·") << "»)"; out << "ins(«" << boost::join(s2, "·") << "»)";
return out.str(); return out.str();
} } else if(s2.empty()) {
else if(s2.empty()) {
out << "del(«" << boost::join(s1, "·") << "»)"; out << "del(«" << boost::join(s1, "·") << "»)";
return out.str(); return out.str();
} } else {
else {
typename Tokens::value_type v1 = boost::join(s1, "+"); typename Tokens::value_type v1 = boost::join(s1, "+");
typename Tokens::value_type v2 = boost::join(s2, "+"); typename Tokens::value_type v2 = boost::join(s2, "+");
out << MakePair(v1, v2, m_general); out << MakePair(v1, v2, m_general);
@ -158,36 +151,36 @@ std::string CorrectionPattern::CreateSinglePattern(const Tokens &s1, const Token
} }
std::vector<std::string> GetContext(size_t pos, std::vector<std::string> GetContext(size_t pos,
size_t len, size_t len,
size_t window, size_t window,
const InputType &input, const InputType &input,
const InputPath &inputPath, const InputPath &inputPath,
const std::vector<FactorType>& factorTypes, const std::vector<FactorType>& factorTypes,
bool isRight) { bool isRight)
{
const Sentence& sentence = static_cast<const Sentence&>(input); const Sentence& sentence = static_cast<const Sentence&>(input);
const Range& range = inputPath.GetWordsRange(); const Range& range = inputPath.GetWordsRange();
int leftPos = range.GetStartPos() + pos - len - 1; int leftPos = range.GetStartPos() + pos - len - 1;
int rightPos = range.GetStartPos() + pos; int rightPos = range.GetStartPos() + pos;
std::vector<std::string> contexts; std::vector<std::string> contexts;
for(int length = 1; length <= (int)window; ++length) { for(int length = 1; length <= (int)window; ++length) {
std::vector<std::string> current; std::vector<std::string> current;
if(!isRight) { if(!isRight) {
for(int i = 0; i < length; i++) { for(int i = 0; i < length; i++) {
if(leftPos - i >= 0) { if(leftPos - i >= 0) {
current.push_back(sentence.GetWord(leftPos - i).GetString(factorTypes, false)); current.push_back(sentence.GetWord(leftPos - i).GetString(factorTypes, false));
} } else {
else {
current.push_back("<s>"); current.push_back("<s>");
} }
} }
if(current.back() == "<s>" && current.size() >= 2 && current[current.size()-2] == "<s>") if(current.back() == "<s>" && current.size() >= 2 && current[current.size()-2] == "<s>")
continue; continue;
std::reverse(current.begin(), current.end()); std::reverse(current.begin(), current.end());
contexts.push_back("left(«" + boost::join(current, "·") + "»)_"); contexts.push_back("left(«" + boost::join(current, "·") + "»)_");
} }
@ -195,8 +188,7 @@ std::vector<std::string> GetContext(size_t pos,
for(int i = 0; i < length; i++) { for(int i = 0; i < length; i++) {
if(rightPos + i < (int)sentence.GetSize()) { if(rightPos + i < (int)sentence.GetSize()) {
current.push_back(sentence.GetWord(rightPos + i).GetString(factorTypes, false)); current.push_back(sentence.GetWord(rightPos + i).GetString(factorTypes, false));
} } else {
else {
current.push_back("</s>"); current.push_back("</s>");
} }
} }
@ -206,7 +198,7 @@ std::vector<std::string> GetContext(size_t pos,
contexts.push_back("_right(«" + boost::join(current, "·") + "»)"); contexts.push_back("_right(«" + boost::join(current, "·") + "»)");
} }
} }
return contexts; return contexts;
} }
@ -214,8 +206,9 @@ std::vector<std::string>
CorrectionPattern::CreatePattern(const Tokens &s1, CorrectionPattern::CreatePattern(const Tokens &s1,
const Tokens &s2, const Tokens &s2,
const InputType &input, const InputType &input,
const InputPath &inputPath) const { const InputPath &inputPath) const
{
Diffs diffs = CreateDiff(s1, s2); Diffs diffs = CreateDiff(s1, s2);
size_t i = 0, j = 0; size_t i = 0, j = 0;
char lastType = 'm'; char lastType = 'm';
@ -226,20 +219,20 @@ CorrectionPattern::CreatePattern(const Tokens &s1,
if(lastType != 'm') { if(lastType != 'm') {
std::string pattern = CreateSinglePattern(source, target); std::string pattern = CreateSinglePattern(source, target);
patternList.push_back(pattern); patternList.push_back(pattern);
if(m_context > 0) { if(m_context > 0) {
std::vector<std::string> leftContexts = GetContext(i, source.size(), m_context, input, inputPath, m_contextFactors, false); std::vector<std::string> leftContexts = GetContext(i, source.size(), m_context, input, inputPath, m_contextFactors, false);
std::vector<std::string> rightContexts = GetContext(i, source.size(), m_context, input, inputPath, m_contextFactors, true); std::vector<std::string> rightContexts = GetContext(i, source.size(), m_context, input, inputPath, m_contextFactors, true);
BOOST_FOREACH(std::string left, leftContexts) BOOST_FOREACH(std::string left, leftContexts)
patternList.push_back(left + pattern); patternList.push_back(left + pattern);
BOOST_FOREACH(std::string right, rightContexts) BOOST_FOREACH(std::string right, rightContexts)
patternList.push_back(pattern + right); patternList.push_back(pattern + right);
BOOST_FOREACH(std::string left, leftContexts) BOOST_FOREACH(std::string left, leftContexts)
BOOST_FOREACH(std::string right, rightContexts) BOOST_FOREACH(std::string right, rightContexts)
patternList.push_back(left + pattern + right); patternList.push_back(left + pattern + right);
} }
} }
source.clear(); source.clear();
@ -250,12 +243,10 @@ CorrectionPattern::CreatePattern(const Tokens &s1,
} }
i++; i++;
j++; j++;
} } else if(type == 'd') {
else if(type == 'd') {
source.push_back(s1[i]); source.push_back(s1[i]);
i++; i++;
} } else if(type == 'i') {
else if(type == 'i') {
target.push_back(s2[j]); target.push_back(s2[j]);
j++; j++;
} }
@ -264,23 +255,23 @@ CorrectionPattern::CreatePattern(const Tokens &s1,
if(lastType != 'm') { if(lastType != 'm') {
std::string pattern = CreateSinglePattern(source, target); std::string pattern = CreateSinglePattern(source, target);
patternList.push_back(pattern); patternList.push_back(pattern);
if(m_context > 0) { if(m_context > 0) {
std::vector<std::string> leftContexts = GetContext(i, source.size(), m_context, input, inputPath, m_contextFactors, false); std::vector<std::string> leftContexts = GetContext(i, source.size(), m_context, input, inputPath, m_contextFactors, false);
std::vector<std::string> rightContexts = GetContext(i, source.size(), m_context, input, inputPath, m_contextFactors, true); std::vector<std::string> rightContexts = GetContext(i, source.size(), m_context, input, inputPath, m_contextFactors, true);
BOOST_FOREACH(std::string left, leftContexts) BOOST_FOREACH(std::string left, leftContexts)
patternList.push_back(left + pattern); patternList.push_back(left + pattern);
BOOST_FOREACH(std::string right, rightContexts) BOOST_FOREACH(std::string right, rightContexts)
patternList.push_back(pattern + right); patternList.push_back(pattern + right);
BOOST_FOREACH(std::string left, leftContexts) BOOST_FOREACH(std::string left, leftContexts)
BOOST_FOREACH(std::string right, rightContexts) BOOST_FOREACH(std::string right, rightContexts)
patternList.push_back(left + pattern + right); patternList.push_back(left + pattern + right);
} }
} }
return patternList; return patternList;
} }
@ -308,36 +299,36 @@ void CorrectionPattern::SetParameter(const std::string& key, const std::string&
} }
void CorrectionPattern::EvaluateWithSourceContext(const InputType &input void CorrectionPattern::EvaluateWithSourceContext(const InputType &input
, const InputPath &inputPath , const InputPath &inputPath
, const TargetPhrase &targetPhrase , const TargetPhrase &targetPhrase
, const StackVec *stackVec , const StackVec *stackVec
, ScoreComponentCollection &scoreBreakdown , ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection *estimatedFutureScore) const , ScoreComponentCollection *estimatedFutureScore) const
{ {
ComputeFeatures(input, inputPath, targetPhrase, &scoreBreakdown); ComputeFeatures(input, inputPath, targetPhrase, &scoreBreakdown);
} }
void CorrectionPattern::ComputeFeatures( void CorrectionPattern::ComputeFeatures(
const InputType &input, const InputType &input,
const InputPath &inputPath, const InputPath &inputPath,
const TargetPhrase& target, const TargetPhrase& target,
ScoreComponentCollection* accumulator) const ScoreComponentCollection* accumulator) const
{ {
const Phrase &source = inputPath.GetPhrase(); const Phrase &source = inputPath.GetPhrase();
std::vector<std::string> sourceTokens; std::vector<std::string> sourceTokens;
for(size_t i = 0; i < source.GetSize(); ++i) for(size_t i = 0; i < source.GetSize(); ++i)
sourceTokens.push_back(source.GetWord(i).GetString(m_factors, false)); sourceTokens.push_back(source.GetWord(i).GetString(m_factors, false));
std::vector<std::string> targetTokens; std::vector<std::string> targetTokens;
for(size_t i = 0; i < target.GetSize(); ++i) for(size_t i = 0; i < target.GetSize(); ++i)
targetTokens.push_back(target.GetWord(i).GetString(m_factors, false)); targetTokens.push_back(target.GetWord(i).GetString(m_factors, false));
std::vector<std::string> patternList = CreatePattern(sourceTokens, targetTokens, input, inputPath); std::vector<std::string> patternList = CreatePattern(sourceTokens, targetTokens, input, inputPath);
for(size_t i = 0; i < patternList.size(); ++i) for(size_t i = 0; i < patternList.size(); ++i)
accumulator->PlusEquals(this, patternList[i], 1); accumulator->PlusEquals(this, patternList[i], 1);
/* /*
BOOST_FOREACH(std::string w, sourceTokens) BOOST_FOREACH(std::string w, sourceTokens)
std::cerr << w << " "; std::cerr << w << " ";
std::cerr << std::endl; std::cerr << std::endl;

View File

@ -29,41 +29,41 @@ public:
bool IsUseable(const FactorMask &mask) const; bool IsUseable(const FactorMask &mask) const;
void EvaluateInIsolation(const Phrase &source void EvaluateInIsolation(const Phrase &source
, const TargetPhrase &targetPhrase , const TargetPhrase &targetPhrase
, ScoreComponentCollection &scoreBreakdown , ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection &estimatedFutureScore) const , ScoreComponentCollection &estimatedFutureScore) const
{}
virtual void EvaluateWithSourceContext(const InputType &input
, const InputPath &inputPath
, const TargetPhrase &targetPhrase
, const StackVec *stackVec
, ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection *estimatedFutureScore = NULL) const;
void EvaluateTranslationOptionListWithSourceContext(const InputType &input
, const TranslationOptionList &translationOptionList) const
{} {}
virtual void EvaluateWithSourceContext(const InputType &input
, const InputPath &inputPath
, const TargetPhrase &targetPhrase
, const StackVec *stackVec
, ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection *estimatedFutureScore = NULL) const;
void EvaluateTranslationOptionListWithSourceContext(const InputType &input
, const TranslationOptionList &translationOptionList) const
{}
void EvaluateWhenApplied(const Hypothesis& hypo, void EvaluateWhenApplied(const Hypothesis& hypo,
ScoreComponentCollection* accumulator) const ScoreComponentCollection* accumulator) const
{} {}
void EvaluateWhenApplied(const ChartHypothesis &hypo, void EvaluateWhenApplied(const ChartHypothesis &hypo,
ScoreComponentCollection* accumulator) const ScoreComponentCollection* accumulator) const
{} {}
void ComputeFeatures(const InputType &input, void ComputeFeatures(const InputType &input,
const InputPath &inputPath, const InputPath &inputPath,
const TargetPhrase& targetPhrase, const TargetPhrase& targetPhrase,
ScoreComponentCollection* accumulator) const; ScoreComponentCollection* accumulator) const;
void SetParameter(const std::string& key, const std::string& value); void SetParameter(const std::string& key, const std::string& value);
std::vector<std::string> CreatePattern(const Tokens &s1, std::vector<std::string> CreatePattern(const Tokens &s1,
const Tokens &s2, const Tokens &s2,
const InputType &input, const InputType &input,
const InputPath &inputPath) const; const InputPath &inputPath) const;
std::string CreateSinglePattern(const Tokens &s1, const Tokens &s2) const; std::string CreateSinglePattern(const Tokens &s1, const Tokens &s2) const;
}; };

View File

@ -11,22 +11,21 @@ typedef std::vector<Diff> Diffs;
template <class Sequence, class Pred> template <class Sequence, class Pred>
void CreateDiffRec(size_t** c, void CreateDiffRec(size_t** c,
const Sequence &s1, const Sequence &s1,
const Sequence &s2, const Sequence &s2,
size_t start, size_t start,
size_t i, size_t i,
size_t j, size_t j,
Diffs& diffs, Diffs& diffs,
Pred pred) { Pred pred)
{
if(i > 0 && j > 0 && pred(s1[i - 1 + start], s2[j - 1 + start])) { if(i > 0 && j > 0 && pred(s1[i - 1 + start], s2[j - 1 + start])) {
CreateDiffRec(c, s1, s2, start, i - 1, j - 1, diffs, pred); CreateDiffRec(c, s1, s2, start, i - 1, j - 1, diffs, pred);
diffs.push_back(Diff('m')); diffs.push_back(Diff('m'));
} } else if(j > 0 && (i == 0 || c[i][j-1] >= c[i-1][j])) {
else if(j > 0 && (i == 0 || c[i][j-1] >= c[i-1][j])) {
CreateDiffRec(c, s1, s2, start, i, j-1, diffs, pred); CreateDiffRec(c, s1, s2, start, i, j-1, diffs, pred);
diffs.push_back(Diff('i')); diffs.push_back(Diff('i'));
} } else if(i > 0 && (j == 0 || c[i][j-1] < c[i-1][j])) {
else if(i > 0 && (j == 0 || c[i][j-1] < c[i-1][j])) {
CreateDiffRec(c, s1, s2, start, i-1, j, diffs, pred); CreateDiffRec(c, s1, s2, start, i-1, j, diffs, pred);
diffs.push_back(Diff('d')); diffs.push_back(Diff('d'));
} }
@ -34,17 +33,18 @@ void CreateDiffRec(size_t** c,
template <class Sequence, class Pred> template <class Sequence, class Pred>
Diffs CreateDiff(const Sequence& s1, Diffs CreateDiff(const Sequence& s1,
const Sequence& s2, const Sequence& s2,
Pred pred) { Pred pred)
{
Diffs diffs; Diffs diffs;
size_t n = s2.size(); size_t n = s2.size();
int start = 0; int start = 0;
int m_end = s1.size() - 1; int m_end = s1.size() - 1;
int n_end = s2.size() - 1; int n_end = s2.size() - 1;
while(start <= m_end && start <= n_end && pred(s1[start], s2[start])) { while(start <= m_end && start <= n_end && pred(s1[start], s2[start])) {
diffs.push_back(Diff('m')); diffs.push_back(Diff('m'));
start++; start++;
@ -53,49 +53,51 @@ Diffs CreateDiff(const Sequence& s1,
m_end--; m_end--;
n_end--; n_end--;
} }
size_t m_new = m_end - start + 1; size_t m_new = m_end - start + 1;
size_t n_new = n_end - start + 1; size_t n_new = n_end - start + 1;
size_t** c = new size_t*[m_new + 1]; size_t** c = new size_t*[m_new + 1];
for(size_t i = 0; i <= m_new; ++i) { for(size_t i = 0; i <= m_new; ++i) {
c[i] = new size_t[n_new + 1]; c[i] = new size_t[n_new + 1];
c[i][0] = 0; c[i][0] = 0;
} }
for(size_t j = 0; j <= n_new; ++j) for(size_t j = 0; j <= n_new; ++j)
c[0][j] = 0; c[0][j] = 0;
for(size_t i = 1; i <= m_new; ++i) for(size_t i = 1; i <= m_new; ++i)
for(size_t j = 1; j <= n_new; ++j) for(size_t j = 1; j <= n_new; ++j)
if(pred(s1[i - 1 + start], s2[j - 1 + start])) if(pred(s1[i - 1 + start], s2[j - 1 + start]))
c[i][j] = c[i-1][j-1] + 1; c[i][j] = c[i-1][j-1] + 1;
else else
c[i][j] = c[i][j-1] > c[i-1][j] ? c[i][j-1] : c[i-1][j]; c[i][j] = c[i][j-1] > c[i-1][j] ? c[i][j-1] : c[i-1][j];
CreateDiffRec(c, s1, s2, start, m_new, n_new, diffs, pred); CreateDiffRec(c, s1, s2, start, m_new, n_new, diffs, pred);
for(size_t i = 0; i <= m_new; ++i) for(size_t i = 0; i <= m_new; ++i)
delete[] c[i]; delete[] c[i];
delete[] c; delete[] c;
for (size_t i = n_end + 1; i < n; ++i) for (size_t i = n_end + 1; i < n; ++i)
diffs.push_back(Diff('m')); diffs.push_back(Diff('m'));
return diffs; return diffs;
} }
/**
 * Convenience overload: builds the diff of s1 against s2 by delegating to the
 * three-argument CreateDiff with std::equal_to over the sequence's value_type
 * as the element comparison.
 */
template <class Sequence>
Diffs CreateDiff(const Sequence& s1, const Sequence& s2)
{
  typedef std::equal_to<typename Sequence::value_type> ElementEquals;
  return CreateDiff(s1, s2, ElementEquals());
}
template <class Sequence, class Sig, class Stats> template <class Sequence, class Sig, class Stats>
void AddStats(const Sequence& s1, const Sequence& s2, const Sig& sig, Stats& stats) { void AddStats(const Sequence& s1, const Sequence& s2, const Sig& sig, Stats& stats)
{
if(sig.size() != stats.size()) if(sig.size() != stats.size())
throw "Signature size differs from score array size."; throw "Signature size differs from score array size.";
size_t m = 0, d = 0, i = 0, s = 0; size_t m = 0, d = 0, i = 0, s = 0;
Diffs diff = CreateDiff(s1, s2); Diffs diff = CreateDiff(s1, s2);
for(int j = 0; j < (int)diff.size(); ++j) { for(int j = 0; j < (int)diff.size(); ++j) {
if(diff[j] == 'm') if(diff[j] == 'm')
m++; m++;
@ -109,27 +111,36 @@ void AddStats(const Sequence& s1, const Sequence& s2, const Sig& sig, Stats& sta
k++; k++;
} }
j += k; j += k;
} } else if(diff[j] == 'i')
else if(diff[j] == 'i')
i++; i++;
} }
for(size_t j = 0; j < sig.size(); ++j) { for(size_t j = 0; j < sig.size(); ++j) {
switch (sig[j]) { switch (sig[j]) {
case 'l': stats[j] += d + i + s; break; case 'l':
case 'm': stats[j] += m; break; stats[j] += d + i + s;
case 'd': stats[j] += d; break; break;
case 'i': stats[j] += i; break; case 'm':
case 's': stats[j] += s; break; stats[j] += m;
case 'r': break;
float macc = 1; case 'd':
if (d + i + s + m) stats[j] += d;
macc = 1.0 - (float)(d + i + s)/(float)(d + i + s + m); break;
if(macc > 0) case 'i':
stats[j] += log(macc); stats[j] += i;
else break;
stats[j] += log(1.0/(float)(d + i + s + m + 1)); case 's':
break; stats[j] += s;
break;
case 'r':
float macc = 1;
if (d + i + s + m)
macc = 1.0 - (float)(d + i + s)/(float)(d + i + s + m);
if(macc > 0)
stats[j] += log(macc);
else
stats[j] += log(1.0/(float)(d + i + s + m + 1));
break;
} }
} }
} }

View File

@ -21,14 +21,15 @@ namespace Moses
using namespace std; using namespace std;
std::string ParseScores(const std::string &line, const std::string& defaultScores) { std::string ParseScores(const std::string &line, const std::string& defaultScores)
{
std::vector<std::string> toks = Tokenize(line); std::vector<std::string> toks = Tokenize(line);
UTIL_THROW_IF2(toks.empty(), "Empty line"); UTIL_THROW_IF2(toks.empty(), "Empty line");
for (size_t i = 1; i < toks.size(); ++i) { for (size_t i = 1; i < toks.size(); ++i) {
std::vector<std::string> args = TokenizeFirstOnly(toks[i], "="); std::vector<std::string> args = TokenizeFirstOnly(toks[i], "=");
UTIL_THROW_IF2(args.size() != 2, UTIL_THROW_IF2(args.size() != 2,
"Incorrect format for feature function arg: " << toks[i]); "Incorrect format for feature function arg: " << toks[i]);
if (args[0] == "scores") { if (args[0] == "scores") {
return args[1]; return args[1];
@ -62,30 +63,29 @@ void EditOps::Load()
{ } { }
void EditOps::EvaluateInIsolation(const Phrase &source void EditOps::EvaluateInIsolation(const Phrase &source
, const TargetPhrase &target , const TargetPhrase &target
, ScoreComponentCollection &scoreBreakdown , ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection &estimatedFutureScore) const , ScoreComponentCollection &estimatedFutureScore) const
{ {
ComputeFeatures(source, target, &scoreBreakdown); ComputeFeatures(source, target, &scoreBreakdown);
} }
void EditOps::ComputeFeatures( void EditOps::ComputeFeatures(
const Phrase &source, const Phrase &source,
const TargetPhrase& target, const TargetPhrase& target,
ScoreComponentCollection* accumulator) const ScoreComponentCollection* accumulator) const
{ {
std::vector<float> ops(GetNumScoreComponents(), 0); std::vector<float> ops(GetNumScoreComponents(), 0);
if(m_chars) { if(m_chars) {
std::vector<FactorType> factors; std::vector<FactorType> factors;
factors.push_back(m_factorType); factors.push_back(m_factorType);
std::string sourceStr = source.GetStringRep(factors); std::string sourceStr = source.GetStringRep(factors);
std::string targetStr = target.GetStringRep(factors); std::string targetStr = target.GetStringRep(factors);
AddStats(sourceStr, targetStr, m_scores, ops); AddStats(sourceStr, targetStr, m_scores, ops);
} } else {
else {
std::vector<std::string> sourceTokens; std::vector<std::string> sourceTokens;
//std::cerr << "Ed src: "; //std::cerr << "Ed src: ";
for(size_t i = 0; i < source.GetSize(); ++i) { for(size_t i = 0; i < source.GetSize(); ++i) {
@ -94,7 +94,7 @@ void EditOps::ComputeFeatures(
//std::cerr << sourceTokens.back() << " "; //std::cerr << sourceTokens.back() << " ";
} }
//std::cerr << std::endl; //std::cerr << std::endl;
std::vector<std::string> targetTokens; std::vector<std::string> targetTokens;
//std::cerr << "Ed trg: "; //std::cerr << "Ed trg: ";
for(size_t i = 0; i < target.GetSize(); ++i) { for(size_t i = 0; i < target.GetSize(); ++i) {
@ -103,10 +103,10 @@ void EditOps::ComputeFeatures(
//std::cerr << targetTokens.back() << " "; //std::cerr << targetTokens.back() << " ";
} }
//std::cerr << std::endl; //std::cerr << std::endl;
AddStats(sourceTokens, targetTokens, m_scores, ops); AddStats(sourceTokens, targetTokens, m_scores, ops);
} }
accumulator->PlusEquals(this, ops); accumulator->PlusEquals(this, ops);
} }

View File

@ -32,26 +32,26 @@ public:
void Load(); void Load();
virtual void EvaluateInIsolation(const Phrase &source virtual void EvaluateInIsolation(const Phrase &source
, const TargetPhrase &targetPhrase , const TargetPhrase &targetPhrase
, ScoreComponentCollection &scoreBreakdown , ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection &estimatedFutureScore) const; , ScoreComponentCollection &estimatedFutureScore) const;
void EvaluateWithSourceContext(const InputType &input void EvaluateWithSourceContext(const InputType &input
, const InputPath &inputPath , const InputPath &inputPath
, const TargetPhrase &targetPhrase , const TargetPhrase &targetPhrase
, const StackVec *stackVec , const StackVec *stackVec
, ScoreComponentCollection &scoreBreakdown , ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection *estimatedFutureScore = NULL) const , ScoreComponentCollection *estimatedFutureScore = NULL) const
{} {}
void EvaluateWhenApplied(const Hypothesis& hypo, void EvaluateWhenApplied(const Hypothesis& hypo,
ScoreComponentCollection* accumulator) const ScoreComponentCollection* accumulator) const
{} {}
void EvaluateWhenApplied(const ChartHypothesis &hypo, void EvaluateWhenApplied(const ChartHypothesis &hypo,
ScoreComponentCollection* accumulator) const ScoreComponentCollection* accumulator) const
{}
void EvaluateTranslationOptionListWithSourceContext(const InputType &input
, const TranslationOptionList &translationOptionList) const
{} {}
void EvaluateTranslationOptionListWithSourceContext(const InputType &input
, const TranslationOptionList &translationOptionList) const
{}
void ComputeFeatures(const Phrase &source, void ComputeFeatures(const Phrase &source,
const TargetPhrase& targetPhrase, const TargetPhrase& targetPhrase,