diff --git a/mert/M2.cpp b/mert/M2.cpp index 45e223f1a..58181d38e 100644 --- a/mert/M2.cpp +++ b/mert/M2.cpp @@ -3,21 +3,23 @@ #include "M2.h" -namespace MosesTuning { +namespace MosesTuning +{ -namespace M2 { +namespace M2 +{ bool Annot::lowercase = true; - std::string Annot::transform(const std::string& e) { - std::string temp = e; - if(lowercase) { - boost::erase_all(temp, " "); - return ToLower(temp); - } - else - return e; - } +std::string Annot::transform(const std::string& e) +{ + std::string temp = e; + if(lowercase) { + boost::erase_all(temp, " "); + return ToLower(temp); + } else + return e; +} const std::string ToLower(const std::string& str) { @@ -27,27 +29,30 @@ const std::string ToLower(const std::string& str) } -Edit operator+(Edit& e1, Edit& e2) { - std::string edit; - if(e1.edit.size() > 0 && e2.edit.size() > 0) - edit = e1.edit + " " + e2.edit; - else if(e1.edit.size() > 0) - edit = e1.edit; - else if(e2.edit.size() > 0) - edit = e2.edit; - - return Edit(e1.cost + e2.cost, e1.changed + e2.changed, e1.unchanged + e2.unchanged, edit); +Edit operator+(Edit& e1, Edit& e2) +{ + std::string edit; + if(e1.edit.size() > 0 && e2.edit.size() > 0) + edit = e1.edit + " " + e2.edit; + else if(e1.edit.size() > 0) + edit = e1.edit; + else if(e2.edit.size() > 0) + edit = e2.edit; + + return Edit(e1.cost + e2.cost, e1.changed + e2.changed, e1.unchanged + e2.unchanged, edit); } -Edge operator+(Edge e1, Edge e2) { - return Edge(e1.v, e2.u, e1.edit + e2.edit); +Edge operator+(Edge e1, Edge e2) +{ + return Edge(e1.v, e2.u, e1.edit + e2.edit); } -std::ostream& operator<<(std::ostream& o, Sentence s) { - for(Sentence::iterator it = s.begin(); it != s.end(); it++) - o << *it << " "; - return o; +std::ostream& operator<<(std::ostream& o, Sentence s) +{ + for(Sentence::iterator it = s.begin(); it != s.end(); it++) + o << *it << " "; + return o; } diff --git a/mert/M2.h b/mert/M2.h index 59d3427af..76f1aed6e 100644 --- a/mert/M2.h +++ b/mert/M2.h @@ -16,9 +16,11 @@ -namespace MosesTuning { +namespace MosesTuning +{ -namespace M2 { +namespace M2 +{ typedef std::vector Stats; @@ -29,44 +31,44 @@ std::ostream& operator<<(std::ostream& o, Sentence s); const std::string ToLower(const std::string& str); struct Annot { - size_t i; - size_t j; - - std::string type; - std::string edit; - - size_t annotator; - - bool operator<(Annot a) const { - return i < a.i || (i == a.i && j < a.j) - || (i == a.i && j == a.j && annotator < a.annotator) - || (i == a.i && j == a.j && annotator == a.annotator && transform(edit) < transform(a.edit)); - } + size_t i; + size_t j; - bool operator==(Annot a) const { - return (!(*this < a) && !(a < *this)); - } + std::string type; + std::string edit; - static std::string transform(const std::string& e); + size_t annotator; - static bool lowercase; + bool operator<(Annot a) const { + return i < a.i || (i == a.i && j < a.j) + || (i == a.i && j == a.j && annotator < a.annotator) + || (i == a.i && j == a.j && annotator == a.annotator && transform(edit) < transform(a.edit)); + } + + bool operator==(Annot a) const { + return (!(*this < a) && !(a < *this)); + } + + static std::string transform(const std::string& e); + + static bool lowercase; }; typedef std::set Annots; typedef std::set Users; struct Unit { - Sentence first; - Annots second; - Users third; + Sentence first; + Annots second; + Users third; }; typedef std::vector M2File; struct Edit { Edit(float c = 1.0, size_t ch = 0, size_t unch = 1, std::string e = "") - : cost(c), changed(ch), unchanged(unch), edit(e) {} - + : cost(c), changed(ch), unchanged(unch), edit(e) {} + float cost; size_t changed; size_t unchanged; @@ -77,7 +79,7 @@ Edit operator+(Edit& e1, Edit& e2); struct Vertex { Vertex(size_t a = 0, size_t b = 0) : i(a), j(b) {} - + bool operator<(const Vertex &v) const { return i < v.i || (i == v.i && j < v.j); } @@ -85,19 +87,19 @@ struct Vertex { bool operator==(const Vertex &v) const { return i == v.i && j == v.j; } - + size_t i; size_t j; }; struct Edge { Edge(Vertex vv = Vertex(), Vertex uu = Vertex(), Edit editt = Edit()) - : v(vv), u(uu), edit(editt) {} - + : v(vv), u(uu), edit(editt) {} + bool operator<(const Edge &e) const { return v < e.v || (v == e.v && u < e.u); } - + Vertex v; Vertex u; Edit edit; @@ -110,7 +112,7 @@ typedef std::vector Matrix; struct Info { Info(Vertex vv = Vertex(), Edit editt = Edit()) - : v(vv), edit(editt) {} + : v(vv), edit(editt) {} bool operator<(const Info &i) const { return v < i.v; @@ -127,352 +129,350 @@ typedef std::vector TrackMatrix; typedef std::set Vertices; typedef std::set Edges; -class M2 { - private: - M2File m_m2; - - size_t m_max_unchanged; - float m_beta; - bool m_lowercase; - bool m_verbose; - - public: - M2() : m_max_unchanged(2), m_beta(0.5), m_lowercase(true), m_verbose(false) { } - M2(size_t max_unchanged, float beta, bool truecase, bool verbose = false) +class M2 +{ +private: + M2File m_m2; + + size_t m_max_unchanged; + float m_beta; + bool m_lowercase; + bool m_verbose; + +public: + M2() : m_max_unchanged(2), m_beta(0.5), m_lowercase(true), m_verbose(false) { } + M2(size_t max_unchanged, float beta, bool truecase, bool verbose = false) : m_max_unchanged(max_unchanged), m_beta(beta), m_lowercase(!truecase), m_verbose(verbose) { - if(!m_lowercase) { - Annot::lowercase = false; - } + if(!m_lowercase) { + Annot::lowercase = false; } - - float Beta() { - return m_beta; - } - - void ReadM2(const std::string& filename) { - std::ifstream m2file(filename.c_str()); - std::string line; - - Unit unit; - bool first = true; - - while(std::getline(m2file, line)) { - if(line.size() > 2) { - if(line.substr(0, 2) == "S ") { - if(!first) { - if(unit.third.empty()) - unit.third.insert(0); - m_m2.push_back(unit); - } - first = false; - - unit.first = Sentence(); - unit.second = Annots(); - - std::string sentenceLine = line.substr(2); - boost::split(unit.first, sentenceLine, boost::is_any_of(" "), boost::token_compress_on); - } - if(line.substr(0, 2) == "A ") { - std::string annotLine = line.substr(2); - - std::vector annot; - boost::iter_split(annot, annotLine, boost::algorithm::first_finder("|||")); - - if(annot[1] != "noop") { - Annot a; - std::stringstream rangeStr(annot[0]); - rangeStr >> a.i >> a.j; - a.type = annot[1]; - a.edit = annot[2]; - - std::stringstream annotStr(annot[5]); - annotStr >> a.annotator; - - unit.third.insert(a.annotator); - unit.second.insert(a); - } - else { - std::stringstream annotStr(annot[5]); - size_t annotator; - annotStr >> annotator; - unit.third.insert(annotator); - } - } - } + } + + float Beta() { + return m_beta; + } + + void ReadM2(const std::string& filename) { + std::ifstream m2file(filename.c_str()); + std::string line; + + Unit unit; + bool first = true; + + while(std::getline(m2file, line)) { + if(line.size() > 2) { + if(line.substr(0, 2) == "S ") { + if(!first) { + if(unit.third.empty()) + unit.third.insert(0); + m_m2.push_back(unit); + } + first = false; + + unit.first = Sentence(); + unit.second = Annots(); + + std::string sentenceLine = line.substr(2); + boost::split(unit.first, sentenceLine, boost::is_any_of(" "), boost::token_compress_on); } - if(unit.third.empty()) - unit.third.insert(0); - m_m2.push_back(unit); - } - - size_t LevenshteinMatrix(const Sentence &s1, const Sentence &s2, Matrix &d, TrackMatrix &bt) { - size_t n = s1.size(); - size_t m = s2.size(); - - if (n == 0) - return m; - if (m == 0) - return n; - - d.resize(n + 1, Row(m + 1, 0)); - bt.resize(n + 1, TrackRow(m + 1)); - - for(size_t i = 0; i <= n; ++i) { - d[i][0] = i; - if(i > 0) - bt[i][0].insert(Info(Vertex(i - 1, 0), Edit(1, 1, 0, ""))); - } - for(size_t j = 0; j <= m; ++j) { - d[0][j] = j; - if(j > 0) - bt[0][j].insert(Info(Vertex(0, j - 1), Edit(1, 1, 0, s2[j - 1]))); - } - - int cost; - for(size_t i = 1; i <= n; ++i) { - for(size_t j = 1; j <= m; ++j) { - if(Annot::transform(s1[i-1]) == Annot::transform(s2[j-1])) - cost = 0; - else - cost = 2; - - size_t left = d[i][j - 1] + 1; - size_t down = d[i - 1][j] + 1; - size_t diag = d[i - 1][j - 1] + cost; - - d[i][j] = std::min(left, std::min(down, diag)); - - if(d[i][j] == left) - bt[i][j].insert(Info(Vertex(i, j - 1), Edit(1, 1, 0, s2[j - 1]))); - if(d[i][j] == down) - bt[i][j].insert(Info(Vertex(i - 1, j), Edit(1, 1, 0, ""))); - if(d[i][j] == diag) - bt[i][j].insert(Info(Vertex(i - 1, j - 1), cost ? Edit(1, 1, 0, s2[j - 1]) : Edit(1, 0, 1, s2[j - 1]) )); + if(line.substr(0, 2) == "A ") { + std::string annotLine = line.substr(2); + + std::vector annot; + boost::iter_split(annot, annotLine, boost::algorithm::first_finder("|||")); + + if(annot[1] != "noop") { + Annot a; + std::stringstream rangeStr(annot[0]); + rangeStr >> a.i >> a.j; + a.type = annot[1]; + a.edit = annot[2]; + + std::stringstream annotStr(annot[5]); + annotStr >> a.annotator; + + unit.third.insert(a.annotator); + unit.second.insert(a); + } else { + std::stringstream annotStr(annot[5]); + size_t annotator; + annotStr >> annotator; + unit.third.insert(annotator); } } - return d[n][m]; + } } - - - void BuildGraph(const TrackMatrix &bt, Vertices &V, Edges &E) { - Vertex start(bt.size() - 1, bt[0].size() - 1); - - std::queue Q; - Q.push(start); - while(!Q.empty()) { - Vertex v = Q.front(); - Q.pop(); - if(V.count(v) > 0) - continue; - V.insert(v); - for(Track::iterator it = bt[v.i][v.j].begin(); - it != bt[v.i][v.j].end(); ++it) { - Edge e(it->v, v, it->edit); - E.insert(e); - if(V.count(e.v) == 0) - Q.push(e.v); - } - } - - Edges newE; - do { - newE.clear(); - for(Edges::iterator it1 = E.begin(); it1 != E.end(); ++it1) { - for(Edges::iterator it2 = E.begin(); it2 != E.end(); ++it2) { - if(it1->u == it2->v) { - Edge e = *it1 + *it2; - if(e.edit.changed > 0 && - e.edit.unchanged <= m_max_unchanged && - E.count(e) == 0) - newE.insert(e); - } - } - } - E.insert(newE.begin(), newE.end()); - } while(newE.size() > 0); + if(unit.third.empty()) + unit.third.insert(0); + m_m2.push_back(unit); + } + + size_t LevenshteinMatrix(const Sentence &s1, const Sentence &s2, Matrix &d, TrackMatrix &bt) { + size_t n = s1.size(); + size_t m = s2.size(); + + if (n == 0) + return m; + if (m == 0) + return n; + + d.resize(n + 1, Row(m + 1, 0)); + bt.resize(n + 1, TrackRow(m + 1)); + + for(size_t i = 0; i <= n; ++i) { + d[i][0] = i; + if(i > 0) + bt[i][0].insert(Info(Vertex(i - 1, 0), Edit(1, 1, 0, ""))); } - - void AddWeights(Edges &E, const Unit &u, size_t aid) { - for(Edges::iterator it1 = E.begin(); it1 != E.end(); ++it1) { - if(it1->edit.changed > 0) { - const_cast(it1->edit.cost) += 0.001; - for(Annots::iterator it2 = u.second.begin(); it2 != u.second.end(); ++it2) { - // if matches an annotator - if(it1->v.i == it2->i && it1->u.i == it2->j - && Annot::transform(it1->edit.edit) == Annot::transform(it2->edit) - && it2->annotator == aid) { - int newWeight = -(m_max_unchanged + 1) * E.size(); - const_cast(it1->edit.cost) = newWeight; - } - } - } - } - } - - void BellmanFord(Vertices &V, Edges &E) { - Vertex source(0, 0); - std::map distance; - std::map predecessor; - - for(Vertices::iterator it = V.begin(); it != V.end(); ++it) { - if(*it == source) - distance[*it] = 0; - else { - distance[*it] = std::numeric_limits::infinity(); - } - } - - for(size_t i = 1; i < V.size(); ++i) { - for(Edges::iterator it = E.begin(); it != E.end(); ++it) { - if(distance[it->v] + it->edit.cost < distance[it->u]) { - distance[it->u] = distance[it->v] + it->edit.cost; - predecessor[it->u] = it->v; - } - } - } - - Edges newE; - - Vertex v = *V.rbegin(); - while(true) { - //std::cout << predecessor[v] << " -> " << v << std::endl; - Edges::iterator it = E.find(Edge(predecessor[v], v)); - if(it != E.end()) { - Edge f = *it; - //std::cout << f << std::endl; - newE.insert(f); - - v = predecessor[v]; - if(v == source) - break; - } - else { - std::cout << "Error" << std::endl; - break; - } - } - E.clear(); - E.insert(newE.begin(), newE.end()); + for(size_t j = 0; j <= m; ++j) { + d[0][j] = j; + if(j > 0) + bt[0][j].insert(Info(Vertex(0, j - 1), Edit(1, 1, 0, s2[j - 1]))); } - void AddStats(const std::vector &Es, const Unit &u, Stats &stats, size_t line) { - - std::map statsPerAnnotator; - for(std::set::iterator it = u.third.begin(); - it != u.third.end(); ++it) { - statsPerAnnotator[*it] = Stats(4, 0); - } + int cost; + for(size_t i = 1; i <= n; ++i) { + for(size_t j = 1; j <= m; ++j) { + if(Annot::transform(s1[i-1]) == Annot::transform(s2[j-1])) + cost = 0; + else + cost = 2; - for(Annots::iterator it = u.second.begin(); it != u.second.end(); it++) - statsPerAnnotator[it->annotator][2]++; + size_t left = d[i][j - 1] + 1; + size_t down = d[i - 1][j] + 1; + size_t diag = d[i - 1][j - 1] + cost; - for(std::set::iterator ait = u.third.begin(); - ait != u.third.end(); ++ait) { - for(Edges::iterator eit = Es[*ait].begin(); eit != Es[*ait].end(); ++eit) { - if(eit->edit.changed > 0) { - statsPerAnnotator[*ait][1]++; - Annot f; - f.i = eit->v.i; - f.j = eit->u.i; - f.annotator = *ait; - f.edit = eit->edit.edit; - for(Annots::iterator fit = u.second.begin(); fit != u.second.end(); fit++) { - if(f == *fit) - statsPerAnnotator[*ait][0]++; - } - } + d[i][j] = std::min(left, std::min(down, diag)); + + if(d[i][j] == left) + bt[i][j].insert(Info(Vertex(i, j - 1), Edit(1, 1, 0, s2[j - 1]))); + if(d[i][j] == down) + bt[i][j].insert(Info(Vertex(i - 1, j), Edit(1, 1, 0, ""))); + if(d[i][j] == diag) + bt[i][j].insert(Info(Vertex(i - 1, j - 1), cost ? Edit(1, 1, 0, s2[j - 1]) : Edit(1, 0, 1, s2[j - 1]) )); + } + } + return d[n][m]; + } + + + void BuildGraph(const TrackMatrix &bt, Vertices &V, Edges &E) { + Vertex start(bt.size() - 1, bt[0].size() - 1); + + std::queue Q; + Q.push(start); + while(!Q.empty()) { + Vertex v = Q.front(); + Q.pop(); + if(V.count(v) > 0) + continue; + V.insert(v); + for(Track::iterator it = bt[v.i][v.j].begin(); + it != bt[v.i][v.j].end(); ++it) { + Edge e(it->v, v, it->edit); + E.insert(e); + if(V.count(e.v) == 0) + Q.push(e.v); + } + } + + Edges newE; + do { + newE.clear(); + for(Edges::iterator it1 = E.begin(); it1 != E.end(); ++it1) { + for(Edges::iterator it2 = E.begin(); it2 != E.end(); ++it2) { + if(it1->u == it2->v) { + Edge e = *it1 + *it2; + if(e.edit.changed > 0 && + e.edit.unchanged <= m_max_unchanged && + E.count(e) == 0) + newE.insert(e); } } - size_t bestAnnot = 0; - float bestF = -1; - for(std::set::iterator it = u.third.begin(); - it != u.third.end(); ++it) { - Stats localStats = stats; - localStats[0] += statsPerAnnotator[*it][0]; - localStats[1] += statsPerAnnotator[*it][1]; - localStats[2] += statsPerAnnotator[*it][2]; - if(m_verbose) - std::cerr << *it << " : " << localStats[0] << " " << localStats[1] << " " << localStats[2] << std::endl; - float f = FScore(localStats); - if(m_verbose) - std::cerr << f << std::endl; - if(f > bestF) { - bestF = f; - bestAnnot = *it; - } + } + E.insert(newE.begin(), newE.end()); + } while(newE.size() > 0); + } + + void AddWeights(Edges &E, const Unit &u, size_t aid) { + for(Edges::iterator it1 = E.begin(); it1 != E.end(); ++it1) { + if(it1->edit.changed > 0) { + const_cast(it1->edit.cost) += 0.001; + for(Annots::iterator it2 = u.second.begin(); it2 != u.second.end(); ++it2) { + // if matches an annotator + if(it1->v.i == it2->i && it1->u.i == it2->j + && Annot::transform(it1->edit.edit) == Annot::transform(it2->edit) + && it2->annotator == aid) { + int newWeight = -(m_max_unchanged + 1) * E.size(); + const_cast(it1->edit.cost) = newWeight; + } } - if(m_verbose) - std::cerr << ">> Chosen Annotator for line " << line + 1 << " : " << bestAnnot << std::endl; - stats[0] += statsPerAnnotator[bestAnnot][0]; - stats[1] += statsPerAnnotator[bestAnnot][1]; - stats[2] += statsPerAnnotator[bestAnnot][2]; + } } - - void SufStats(const std::string &sStr, size_t i, Stats &stats) { - std::string temp = sStr; - - Sentence s; - boost::split(s, temp, boost::is_any_of(" "), boost::token_compress_on); - - Unit &unit = m_m2[i]; - - Matrix d; - TrackMatrix bt; - size_t distance = LevenshteinMatrix(unit.first, s, d, bt); + } - std::vector Vs(unit.third.size()); - std::vector Es(unit.third.size()); + void BellmanFord(Vertices &V, Edges &E) { + Vertex source(0, 0); + std::map distance; + std::map predecessor; - if(distance > unit.first.size()) { - std::cerr << "Levenshtein distance is greater than source size." << std::endl; - stats[0] = 0; - stats[1] = distance; - stats[2] = 0; - stats[3] = unit.first.size(); - return; - } - else if(distance > 0) { - for(size_t j = 0; j < unit.third.size(); j++) { - BuildGraph(bt, Vs[j], Es[j]); - AddWeights(Es[j], unit, j); - BellmanFord(Vs[j], Es[j]); - } - } - AddStats(Es, unit, stats, i); - stats[3] = unit.first.size(); + for(Vertices::iterator it = V.begin(); it != V.end(); ++it) { + if(*it == source) + distance[*it] = 0; + else { + distance[*it] = std::numeric_limits::infinity(); + } } - - float FScore(const Stats& stats) { - float p = 1.0; - if(stats[1] != 0) - p = (float)stats[0] / (float)stats[1]; - - float r = 1.0; - if(stats[2] != 0) - r = (float)stats[0] / (float)stats[2]; - - float denom = (m_beta * m_beta * p + r); - float f = 0.0; - if(denom != 0) - f = ((1 + m_beta * m_beta) * p * r) / denom; - return f; + for(size_t i = 1; i < V.size(); ++i) { + for(Edges::iterator it = E.begin(); it != E.end(); ++it) { + if(distance[it->v] + it->edit.cost < distance[it->u]) { + distance[it->u] = distance[it->v] + it->edit.cost; + predecessor[it->u] = it->v; + } + } } - - void FScore(const Stats& stats, float &p, float &r, float &f) { - p = 1.0; - if(stats[1] != 0) - p = (float)stats[0] / (float)stats[1]; - - r = 1.0; - if(stats[2] != 0) - r = (float)stats[0] / (float)stats[2]; - - float denom = (m_beta * m_beta * p + r); - f = 0.0; - if(denom != 0) - f = ((1 + m_beta * m_beta) * p * r) / denom; + + Edges newE; + + Vertex v = *V.rbegin(); + while(true) { + //std::cout << predecessor[v] << " -> " << v << std::endl; + Edges::iterator it = E.find(Edge(predecessor[v], v)); + if(it != E.end()) { + Edge f = *it; + //std::cout << f << std::endl; + newE.insert(f); + + v = predecessor[v]; + if(v == source) + break; + } else { + std::cout << "Error" << std::endl; + break; + } } + E.clear(); + E.insert(newE.begin(), newE.end()); + } + + void AddStats(const std::vector &Es, const Unit &u, Stats &stats, size_t line) { + + std::map statsPerAnnotator; + for(std::set::iterator it = u.third.begin(); + it != u.third.end(); ++it) { + statsPerAnnotator[*it] = Stats(4, 0); + } + + for(Annots::iterator it = u.second.begin(); it != u.second.end(); it++) + statsPerAnnotator[it->annotator][2]++; + + for(std::set::iterator ait = u.third.begin(); + ait != u.third.end(); ++ait) { + for(Edges::iterator eit = Es[*ait].begin(); eit != Es[*ait].end(); ++eit) { + if(eit->edit.changed > 0) { + statsPerAnnotator[*ait][1]++; + Annot f; + f.i = eit->v.i; + f.j = eit->u.i; + f.annotator = *ait; + f.edit = eit->edit.edit; + for(Annots::iterator fit = u.second.begin(); fit != u.second.end(); fit++) { + if(f == *fit) + statsPerAnnotator[*ait][0]++; + } + } + } + } + size_t bestAnnot = 0; + float bestF = -1; + for(std::set::iterator it = u.third.begin(); + it != u.third.end(); ++it) { + Stats localStats = stats; + localStats[0] += statsPerAnnotator[*it][0]; + localStats[1] += statsPerAnnotator[*it][1]; + localStats[2] += statsPerAnnotator[*it][2]; + if(m_verbose) + std::cerr << *it << " : " << localStats[0] << " " << localStats[1] << " " << localStats[2] << std::endl; + float f = FScore(localStats); + if(m_verbose) + std::cerr << f << std::endl; + if(f > bestF) { + bestF = f; + bestAnnot = *it; + } + } + if(m_verbose) + std::cerr << ">> Chosen Annotator for line " << line + 1 << " : " << bestAnnot << std::endl; + stats[0] += statsPerAnnotator[bestAnnot][0]; + stats[1] += statsPerAnnotator[bestAnnot][1]; + stats[2] += statsPerAnnotator[bestAnnot][2]; + } + + void SufStats(const std::string &sStr, size_t i, Stats &stats) { + std::string temp = sStr; + + Sentence s; + boost::split(s, temp, boost::is_any_of(" "), boost::token_compress_on); + + Unit &unit = m_m2[i]; + + Matrix d; + TrackMatrix bt; + size_t distance = LevenshteinMatrix(unit.first, s, d, bt); + + std::vector Vs(unit.third.size()); + std::vector Es(unit.third.size()); + + if(distance > unit.first.size()) { + std::cerr << "Levenshtein distance is greater than source size." << std::endl; + stats[0] = 0; + stats[1] = distance; + stats[2] = 0; + stats[3] = unit.first.size(); + return; + } else if(distance > 0) { + for(size_t j = 0; j < unit.third.size(); j++) { + BuildGraph(bt, Vs[j], Es[j]); + AddWeights(Es[j], unit, j); + BellmanFord(Vs[j], Es[j]); + } + } + AddStats(Es, unit, stats, i); + stats[3] = unit.first.size(); + } + + + float FScore(const Stats& stats) { + float p = 1.0; + if(stats[1] != 0) + p = (float)stats[0] / (float)stats[1]; + + float r = 1.0; + if(stats[2] != 0) + r = (float)stats[0] / (float)stats[2]; + + float denom = (m_beta * m_beta * p + r); + float f = 0.0; + if(denom != 0) + f = ((1 + m_beta * m_beta) * p * r) / denom; + return f; + } + + void FScore(const Stats& stats, float &p, float &r, float &f) { + p = 1.0; + if(stats[1] != 0) + p = (float)stats[0] / (float)stats[1]; + + r = 1.0; + if(stats[2] != 0) + r = (float)stats[0] / (float)stats[2]; + + float denom = (m_beta * m_beta * p + r); + f = 0.0; + if(denom != 0) + f = ((1 + m_beta * m_beta) * p * r) / denom; + } }; } diff --git a/mert/M2Scorer.cpp b/mert/M2Scorer.cpp index 183d91273..f7e276631 100644 --- a/mert/M2Scorer.cpp +++ b/mert/M2Scorer.cpp @@ -45,90 +45,92 @@ float M2Scorer::calculateScore(const vector& comps) const if (comps.size() != NumberOfScores()) { throw runtime_error("Size of stat vector for M2Scorer is not " + NumberOfScores()); } - + float beta = beta_; - - + + float p = 0.0; float r = 0.0; float f = 0.0; - + if(comps[1] != 0) p = comps[0] / (double)comps[1]; else p = 1.0; - + if(comps[2] != 0) r = comps[0] / (double)comps[2]; else r = 1.0; - + float denom = beta * beta * p + r; if(denom != 0) f = (1.0 + beta * beta) * p * r / denom; else f = 0.0; - if(verbose_) + if(verbose_) std::cerr << comps[0] << " " << comps[1] << " " << comps[2] << std::endl; - if(verbose_) + if(verbose_) std::cerr << p << " " << r << " " << f << std::endl; - + return f; } -float M2Scorer::getReferenceLength(const vector& comps) const { +float M2Scorer::getReferenceLength(const vector& comps) const +{ return comps[3]; } -std::vector randomStats(float decay, int max) { +std::vector randomStats(float decay, int max) +{ int gold = rand() % max; int prop = rand() % max; int corr = 0.0; - + if(std::min(prop, gold) > 0) corr = rand() % std::min(prop, gold); - + //std::cerr << corr << " " << prop << " " << gold << std::endl; - + std::vector stats(3, 0.0); stats[0] = corr * decay; stats[1] = prop * decay; stats[2] = gold * decay; - + return stats; } float sentenceM2(const std::vector& stats) { float beta = 0.5; - + std::vector smoothStats(3, 0.0); // = randomStats(0.001, 5); - smoothStats[0] += stats[0]; - smoothStats[1] += stats[1]; - smoothStats[2] += stats[2]; - + smoothStats[0] += stats[0]; + smoothStats[1] += stats[1]; + smoothStats[2] += stats[2]; + float p = 0.0; float r = 0.0; float f = 0.0; - + if(smoothStats[1] != 0) p = smoothStats[0] / smoothStats[1]; else p = 1.0; - + if(smoothStats[2] != 0) r = smoothStats[0] / smoothStats[2]; else r = 1.0; - + float denom = beta * beta * p + r; if(denom != 0) f = (1.0 + beta * beta) * p * r / denom; else f = 0.0; - + return f; } diff --git a/mert/M2Scorer.h b/mert/M2Scorer.h index b4da298ac..2a807e447 100644 --- a/mert/M2Scorer.h +++ b/mert/M2Scorer.h @@ -31,15 +31,15 @@ public: virtual float calculateScore(const std::vector& comps) const; virtual float getReferenceLength(const std::vector& comps) const; -private: - float beta_; +private: + float beta_; int max_unchanged_words_; bool truecase_; bool verbose_; M2::M2 m2_; - + std::map, std::vector > seen_; - + // no copying allowed M2Scorer(const M2Scorer&); M2Scorer& operator=(const M2Scorer&); diff --git a/moses/FF/CorrectionPattern.cpp b/moses/FF/CorrectionPattern.cpp index 04a62b0ec..915eaff2c 100644 --- a/moses/FF/CorrectionPattern.cpp +++ b/moses/FF/CorrectionPattern.cpp @@ -23,21 +23,22 @@ namespace Moses using namespace std; -std::string MakePair(const std::string &s1, const std::string &s2, bool general) { +std::string MakePair(const std::string &s1, const std::string &s2, bool general) +{ std::vector sourceList; std::vector targetList; - + if(general) { Diffs diffs = CreateDiff(s1, s2); - + size_t i = 0, j = 0; char lastType = 'm'; - - std::string source, target; + + std::string source, target; std::string match; - + int count = 1; - + BOOST_FOREACH(Diff type, diffs) { if(type == 'm') { if(lastType != 'm') { @@ -46,7 +47,7 @@ std::string MakePair(const std::string &s1, const std::string &s2, bool general) } source.clear(); target.clear(); - + if(s1[i] == '+') { if(match.size() >= 3) { sourceList.push_back("(\\w{3,})·"); @@ -54,56 +55,51 @@ std::string MakePair(const std::string &s1, const std::string &s2, bool general) sprintf((char*)temp.c_str(), "%d", count); targetList.push_back("\\" + temp + "·"); count++; - } - else { + } else { sourceList.push_back(match + "·"); - targetList.push_back(match + "·"); + targetList.push_back(match + "·"); } match.clear(); - } - else + } else match.push_back(s1[i]); - + i++; j++; - } - else if(type == 'd') { + } else if(type == 'd') { if(s1[i] == '+') source += "·"; else source.push_back(s1[i]); i++; - } - else if(type == 'i') { + } else if(type == 'i') { if(s2[j] == '+') target += "·"; else target.push_back(s2[j]); j++; } - if(type != 'm' && !match.empty()) { + if(type != 'm' && !match.empty()) { if(match.size() >= 3) { sourceList.push_back("(\\w{3,})"); std::string temp = "1"; sprintf((char*)temp.c_str(), "%d", count); targetList.push_back("\\" + temp); count++; - } - else { + } else { sourceList.push_back(match); - targetList.push_back(match); + targetList.push_back(match); } - + match.clear(); } - + lastType = type; } if(lastType != 'm') { sourceList.push_back(source); targetList.push_back(target); } - + if(!match.empty()) { if(match.size() >= 3) { sourceList.push_back("(\\w{3,})"); @@ -111,45 +107,42 @@ std::string MakePair(const std::string &s1, const std::string &s2, bool general) sprintf((char*)temp.c_str(), "%d", count); targetList.push_back("\\"+ temp); count++; - } - else { + } else { sourceList.push_back(match); - targetList.push_back(match); + targetList.push_back(match); } } match.clear(); - } - else { + } else { std::string cs1 = s1; std::string cs2 = s2; boost::replace_all(cs1, "+", "·"); boost::replace_all(cs2, "+", "·"); - + sourceList.push_back(cs1); targetList.push_back(cs2); } - + std::stringstream out; out << "sub(«"; out << boost::join(sourceList, ""); out << "»,«"; out << boost::join(targetList, ""); out << "»)"; - + return out.str(); } -std::string CorrectionPattern::CreateSinglePattern(const Tokens &s1, const Tokens &s2) const { +std::string CorrectionPattern::CreateSinglePattern(const Tokens &s1, const Tokens &s2) const +{ std::stringstream out; if(s1.empty()) { out << "ins(«" << boost::join(s2, "·") << "»)"; return out.str(); - } - else if(s2.empty()) { + } else if(s2.empty()) { out << "del(«" << boost::join(s1, "·") << "»)"; return out.str(); - } - else { + } else { typename Tokens::value_type v1 = boost::join(s1, "+"); typename Tokens::value_type v2 = boost::join(s2, "+"); out << MakePair(v1, v2, m_general); @@ -158,36 +151,36 @@ std::string CorrectionPattern::CreateSinglePattern(const Tokens &s1, const Token } std::vector GetContext(size_t pos, - size_t len, - size_t window, - const InputType &input, - const InputPath &inputPath, - const std::vector& factorTypes, - bool isRight) { + size_t len, + size_t window, + const InputType &input, + const InputPath &inputPath, + const std::vector& factorTypes, + bool isRight) +{ const Sentence& sentence = static_cast(input); - const Range& range = inputPath.GetWordsRange(); - - int leftPos = range.GetStartPos() + pos - len - 1; - int rightPos = range.GetStartPos() + pos; - + const Range& range = inputPath.GetWordsRange(); + + int leftPos = range.GetStartPos() + pos - len - 1; + int rightPos = range.GetStartPos() + pos; + std::vector contexts; - + for(int length = 1; length <= (int)window; ++length) { std::vector current; if(!isRight) { for(int i = 0; i < length; i++) { if(leftPos - i >= 0) { current.push_back(sentence.GetWord(leftPos - i).GetString(factorTypes, false)); - } - else { + } else { current.push_back(""); } } - + if(current.back() == "" && current.size() >= 2 && current[current.size()-2] == "") continue; - + std::reverse(current.begin(), current.end()); contexts.push_back("left(«" + boost::join(current, "·") + "»)_"); } @@ -195,8 +188,7 @@ std::vector GetContext(size_t pos, for(int i = 0; i < length; i++) { if(rightPos + i < (int)sentence.GetSize()) { current.push_back(sentence.GetWord(rightPos + i).GetString(factorTypes, false)); - } - else { + } else { current.push_back(""); } } @@ -206,7 +198,7 @@ std::vector GetContext(size_t pos, contexts.push_back("_right(«" + boost::join(current, "·") + "»)"); } - } + } return contexts; } @@ -214,8 +206,9 @@ std::vector CorrectionPattern::CreatePattern(const Tokens &s1, const Tokens &s2, const InputType &input, - const InputPath &inputPath) const { - + const InputPath &inputPath) const +{ + Diffs diffs = CreateDiff(s1, s2); size_t i = 0, j = 0; char lastType = 'm'; @@ -226,20 +219,20 @@ CorrectionPattern::CreatePattern(const Tokens &s1, if(lastType != 'm') { std::string pattern = CreateSinglePattern(source, target); patternList.push_back(pattern); - + if(m_context > 0) { std::vector leftContexts = GetContext(i, source.size(), m_context, input, inputPath, m_contextFactors, false); std::vector rightContexts = GetContext(i, source.size(), m_context, input, inputPath, m_contextFactors, true); - + BOOST_FOREACH(std::string left, leftContexts) - patternList.push_back(left + pattern); + patternList.push_back(left + pattern); BOOST_FOREACH(std::string right, rightContexts) - patternList.push_back(pattern + right); - + patternList.push_back(pattern + right); + BOOST_FOREACH(std::string left, leftContexts) - BOOST_FOREACH(std::string right, rightContexts) - patternList.push_back(left + pattern + right); + BOOST_FOREACH(std::string right, rightContexts) + patternList.push_back(left + pattern + right); } } source.clear(); @@ -250,12 +243,10 @@ CorrectionPattern::CreatePattern(const Tokens &s1, } i++; j++; - } - else if(type == 'd') { + } else if(type == 'd') { source.push_back(s1[i]); i++; - } - else if(type == 'i') { + } else if(type == 'i') { target.push_back(s2[j]); j++; } @@ -264,23 +255,23 @@ CorrectionPattern::CreatePattern(const Tokens &s1, if(lastType != 'm') { std::string pattern = CreateSinglePattern(source, target); patternList.push_back(pattern); - + if(m_context > 0) { std::vector leftContexts = GetContext(i, source.size(), m_context, input, inputPath, m_contextFactors, false); std::vector rightContexts = GetContext(i, source.size(), m_context, input, inputPath, m_contextFactors, true); - + BOOST_FOREACH(std::string left, leftContexts) - patternList.push_back(left + pattern); + patternList.push_back(left + pattern); BOOST_FOREACH(std::string right, rightContexts) - patternList.push_back(pattern + right); - + patternList.push_back(pattern + right); + BOOST_FOREACH(std::string left, leftContexts) - BOOST_FOREACH(std::string right, rightContexts) - patternList.push_back(left + pattern + right); + BOOST_FOREACH(std::string right, rightContexts) + patternList.push_back(left + pattern + right); } } - + return patternList; } @@ -308,36 +299,36 @@ void CorrectionPattern::SetParameter(const std::string& key, const std::string& } void CorrectionPattern::EvaluateWithSourceContext(const InputType &input - , const InputPath &inputPath - , const TargetPhrase &targetPhrase - , const StackVec *stackVec - , ScoreComponentCollection &scoreBreakdown - , ScoreComponentCollection *estimatedFutureScore) const + , const InputPath &inputPath + , const TargetPhrase &targetPhrase + , const StackVec *stackVec + , ScoreComponentCollection &scoreBreakdown + , ScoreComponentCollection *estimatedFutureScore) const { ComputeFeatures(input, inputPath, targetPhrase, &scoreBreakdown); } void CorrectionPattern::ComputeFeatures( - const InputType &input, - const InputPath &inputPath, - const TargetPhrase& target, - ScoreComponentCollection* accumulator) const + const InputType &input, + const InputPath &inputPath, + const TargetPhrase& target, + ScoreComponentCollection* accumulator) const { const Phrase &source = inputPath.GetPhrase(); - + std::vector sourceTokens; for(size_t i = 0; i < source.GetSize(); ++i) sourceTokens.push_back(source.GetWord(i).GetString(m_factors, false)); - + std::vector targetTokens; for(size_t i = 0; i < target.GetSize(); ++i) targetTokens.push_back(target.GetWord(i).GetString(m_factors, false)); - + std::vector patternList = CreatePattern(sourceTokens, targetTokens, input, inputPath); for(size_t i = 0; i < patternList.size(); ++i) accumulator->PlusEquals(this, patternList[i], 1); - /* + /* BOOST_FOREACH(std::string w, sourceTokens) std::cerr << w << " "; std::cerr << std::endl; diff --git a/moses/FF/CorrectionPattern.h b/moses/FF/CorrectionPattern.h index 63ca125c6..516a56ce2 100644 --- a/moses/FF/CorrectionPattern.h +++ b/moses/FF/CorrectionPattern.h @@ -29,41 +29,41 @@ public: bool IsUseable(const FactorMask &mask) const; void EvaluateInIsolation(const Phrase &source - , const TargetPhrase &targetPhrase - , ScoreComponentCollection &scoreBreakdown - , ScoreComponentCollection &estimatedFutureScore) const + , const TargetPhrase &targetPhrase + , ScoreComponentCollection &scoreBreakdown + , ScoreComponentCollection &estimatedFutureScore) const + {} + + virtual void EvaluateWithSourceContext(const InputType &input + , const InputPath &inputPath + , const TargetPhrase &targetPhrase + , const StackVec *stackVec + , ScoreComponentCollection &scoreBreakdown + , ScoreComponentCollection *estimatedFutureScore = NULL) const; + + void EvaluateTranslationOptionListWithSourceContext(const InputType &input + , const TranslationOptionList &translationOptionList) const {} - - virtual void EvaluateWithSourceContext(const InputType &input - , const InputPath &inputPath - , const TargetPhrase &targetPhrase - , const StackVec *stackVec - , ScoreComponentCollection &scoreBreakdown - , ScoreComponentCollection *estimatedFutureScore = NULL) const; - void EvaluateTranslationOptionListWithSourceContext(const InputType &input - , const TranslationOptionList &translationOptionList) const - {} - void EvaluateWhenApplied(const Hypothesis& hypo, - ScoreComponentCollection* accumulator) const + ScoreComponentCollection* accumulator) const {} void EvaluateWhenApplied(const ChartHypothesis &hypo, - ScoreComponentCollection* accumulator) const + ScoreComponentCollection* accumulator) const {} void ComputeFeatures(const InputType &input, const InputPath &inputPath, const TargetPhrase& targetPhrase, ScoreComponentCollection* accumulator) const; - + void SetParameter(const std::string& key, const std::string& value); - + std::vector CreatePattern(const Tokens &s1, const Tokens &s2, const InputType &input, const InputPath &inputPath) const; - + std::string CreateSinglePattern(const Tokens &s1, const Tokens &s2) const; }; diff --git a/moses/FF/Diffs.h b/moses/FF/Diffs.h index bf0a7cefc..8935d1fb9 100644 --- a/moses/FF/Diffs.h +++ b/moses/FF/Diffs.h @@ -11,22 +11,21 @@ typedef std::vector Diffs; template void CreateDiffRec(size_t** c, - const Sequence &s1, - const Sequence &s2, - size_t start, - size_t i, - size_t j, - Diffs& diffs, - Pred pred) { + const Sequence &s1, + const Sequence &s2, + size_t start, + size_t i, + size_t j, + Diffs& diffs, + Pred pred) +{ if(i > 0 && j > 0 && pred(s1[i - 1 + start], s2[j - 1 + start])) { CreateDiffRec(c, s1, s2, start, i - 1, j - 1, diffs, pred); diffs.push_back(Diff('m')); - } - else if(j > 0 && (i == 0 || c[i][j-1] >= c[i-1][j])) { + } else if(j > 0 && (i == 0 || c[i][j-1] >= c[i-1][j])) { CreateDiffRec(c, s1, s2, start, i, j-1, diffs, pred); diffs.push_back(Diff('i')); - } - else if(i > 0 && (j == 0 || c[i][j-1] < c[i-1][j])) { + } else if(i > 0 && (j == 0 || c[i][j-1] < c[i-1][j])) { CreateDiffRec(c, s1, s2, start, i-1, j, diffs, pred); diffs.push_back(Diff('d')); } @@ -34,17 +33,18 @@ void CreateDiffRec(size_t** c, template Diffs CreateDiff(const Sequence& s1, - const Sequence& s2, - Pred pred) { - + const Sequence& s2, + Pred pred) +{ + Diffs diffs; - + size_t n = s2.size(); - + int start = 0; int m_end = s1.size() - 1; int n_end = s2.size() - 1; - + while(start <= m_end && start <= n_end && pred(s1[start], s2[start])) { diffs.push_back(Diff('m')); start++; @@ -53,49 +53,51 @@ Diffs CreateDiff(const Sequence& s1, m_end--; n_end--; } - + size_t m_new = m_end - start + 1; size_t n_new = n_end - start + 1; - + size_t** c = new size_t*[m_new + 1]; for(size_t i = 0; i <= m_new; ++i) { c[i] = new size_t[n_new + 1]; c[i][0] = 0; } for(size_t j = 0; j <= n_new; ++j) - c[0][j] = 0; + c[0][j] = 0; for(size_t i = 1; i <= m_new; ++i) for(size_t j = 1; j <= n_new; ++j) if(pred(s1[i - 1 + start], s2[j - 1 + start])) c[i][j] = c[i-1][j-1] + 1; else c[i][j] = c[i][j-1] > c[i-1][j] ? c[i][j-1] : c[i-1][j]; - + CreateDiffRec(c, s1, s2, start, m_new, n_new, diffs, pred); - + for(size_t i = 0; i <= m_new; ++i) delete[] c[i]; delete[] c; - + for (size_t i = n_end + 1; i < n; ++i) diffs.push_back(Diff('m')); - + return diffs; } template -Diffs CreateDiff(const Sequence& s1, const Sequence& s2) { +Diffs CreateDiff(const Sequence& s1, const Sequence& s2) +{ return CreateDiff(s1, s2, std::equal_to()); } template -void AddStats(const Sequence& s1, const Sequence& s2, const Sig& sig, Stats& stats) { +void AddStats(const Sequence& s1, const Sequence& s2, const Sig& sig, Stats& stats) +{ if(sig.size() != stats.size()) throw "Signature size differs from score array size."; - + size_t m = 0, d = 0, i = 0, s = 0; - Diffs diff = CreateDiff(s1, s2); - + Diffs diff = CreateDiff(s1, s2); + for(int j = 0; j < (int)diff.size(); ++j) { if(diff[j] == 'm') m++; @@ -109,27 +111,36 @@ void AddStats(const Sequence& s1, const Sequence& s2, const Sig& sig, Stats& sta k++; } j += k; - } - else if(diff[j] == 'i') + } else if(diff[j] == 'i') i++; } - + for(size_t j = 0; j < sig.size(); ++j) { switch (sig[j]) { - case 'l': stats[j] += d + i + s; break; - case 'm': stats[j] += m; break; - case 'd': stats[j] += d; break; - case 'i': stats[j] += i; break; - case 's': stats[j] += s; break; - case 'r': - float macc = 1; - if (d + i + s + m) - macc = 1.0 - (float)(d + i + s)/(float)(d + i + s + m); - if(macc > 0) - stats[j] += log(macc); - else - stats[j] += log(1.0/(float)(d + i + s + m + 1)); - break; + case 'l': + stats[j] += d + i + s; + break; + case 'm': + stats[j] += m; + break; + case 'd': + stats[j] += d; + break; + case 'i': + stats[j] += i; + break; + case 's': + stats[j] += s; + break; + case 'r': + float macc = 1; + if (d + i + s + m) + macc = 1.0 - (float)(d + i + s)/(float)(d + i + s + m); + if(macc > 0) + stats[j] += log(macc); + else + stats[j] += log(1.0/(float)(d + i + s + m + 1)); + break; } } } diff --git a/moses/FF/EditOps.cpp b/moses/FF/EditOps.cpp index fdca93963..fa66acf1c 100644 --- a/moses/FF/EditOps.cpp +++ b/moses/FF/EditOps.cpp @@ -21,14 +21,15 @@ namespace Moses using namespace std; -std::string ParseScores(const std::string &line, const std::string& defaultScores) { +std::string ParseScores(const std::string &line, const std::string& defaultScores) +{ std::vector toks = Tokenize(line); UTIL_THROW_IF2(toks.empty(), "Empty line"); for (size_t i = 1; i < toks.size(); ++i) { std::vector args = TokenizeFirstOnly(toks[i], "="); UTIL_THROW_IF2(args.size() != 2, - "Incorrect format for feature function arg: " << toks[i]); + "Incorrect format for feature function arg: " << toks[i]); if (args[0] == "scores") { return args[1]; @@ -62,30 +63,29 @@ void EditOps::Load() { } void EditOps::EvaluateInIsolation(const Phrase &source - , const TargetPhrase &target - , ScoreComponentCollection &scoreBreakdown - , ScoreComponentCollection &estimatedFutureScore) const + , const TargetPhrase &target + , ScoreComponentCollection &scoreBreakdown + , ScoreComponentCollection &estimatedFutureScore) const { ComputeFeatures(source, target, &scoreBreakdown); } void EditOps::ComputeFeatures( - const Phrase &source, - const TargetPhrase& target, - ScoreComponentCollection* accumulator) const + const Phrase &source, + const TargetPhrase& target, + ScoreComponentCollection* accumulator) const { std::vector ops(GetNumScoreComponents(), 0); - + if(m_chars) { std::vector factors; factors.push_back(m_factorType); - + std::string sourceStr = source.GetStringRep(factors); std::string targetStr = target.GetStringRep(factors); - + AddStats(sourceStr, targetStr, m_scores, ops); - } - else { + } else { std::vector sourceTokens; //std::cerr << "Ed src: "; for(size_t i = 0; i < source.GetSize(); ++i) { @@ -94,7 +94,7 @@ void EditOps::ComputeFeatures( //std::cerr << sourceTokens.back() << " "; } //std::cerr << std::endl; - + std::vector targetTokens; //std::cerr << "Ed trg: "; for(size_t i = 0; i < target.GetSize(); ++i) { @@ -103,10 +103,10 @@ void EditOps::ComputeFeatures( //std::cerr << targetTokens.back() << " "; } //std::cerr << std::endl; - + AddStats(sourceTokens, targetTokens, m_scores, ops); } - + accumulator->PlusEquals(this, ops); } diff --git a/moses/FF/EditOps.h b/moses/FF/EditOps.h index b1a1cef7e..e7e7dd315 100644 --- a/moses/FF/EditOps.h +++ b/moses/FF/EditOps.h @@ -32,26 +32,26 @@ public: void Load(); virtual void EvaluateInIsolation(const Phrase &source - , const TargetPhrase &targetPhrase - , ScoreComponentCollection &scoreBreakdown - , ScoreComponentCollection &estimatedFutureScore) const; - + , const TargetPhrase &targetPhrase + , ScoreComponentCollection &scoreBreakdown + , ScoreComponentCollection &estimatedFutureScore) const; + void EvaluateWithSourceContext(const InputType &input - , const InputPath &inputPath - , const TargetPhrase &targetPhrase - , const StackVec *stackVec - , ScoreComponentCollection &scoreBreakdown - , ScoreComponentCollection *estimatedFutureScore = NULL) const + , const InputPath &inputPath + , const TargetPhrase &targetPhrase + , const StackVec *stackVec + , ScoreComponentCollection &scoreBreakdown + , ScoreComponentCollection *estimatedFutureScore = NULL) const {} void EvaluateWhenApplied(const Hypothesis& hypo, - ScoreComponentCollection* accumulator) const + ScoreComponentCollection* accumulator) const {} void EvaluateWhenApplied(const ChartHypothesis &hypo, - ScoreComponentCollection* accumulator) const + ScoreComponentCollection* accumulator) const + {} + void EvaluateTranslationOptionListWithSourceContext(const InputType &input + , const TranslationOptionList &translationOptionList) const {} - void EvaluateTranslationOptionListWithSourceContext(const InputType &input - , const TranslationOptionList &translationOptionList) const - {} void ComputeFeatures(const Phrase &source, const TargetPhrase& targetPhrase, diff --git a/scripts/ems/experiment.meta b/scripts/ems/experiment.meta index f335d3814..8713af8bf 100644 --- a/scripts/ems/experiment.meta +++ b/scripts/ems/experiment.meta @@ -1540,6 +1540,150 @@ analysis-precision rerun-on-change: precision-by-coverage-base final-model: yes +[QUALITY-ESTIMATION] single +tokenize-input + in: raw-input + out: tokenized-input + default-name: quality-estimation/input.tok + pass-unless: input-tokenizer + template: $input-tokenizer < IN > OUT +tokenize-input-devtest + in: raw-input-devtest + out: tokenized-input-devtest + default-name: quality-estimation/input.devtest.tok + pass-unless: input-tokenizer + template: $input-tokenizer < IN > OUT +lowercase-input + in: tokenized-input + out: truecased-input + default-name: quality-estimation/input.lc + pass-unless: input-lowercaser + ignore-if: input-truecaser + template: $input-lowercaser < IN > OUT +lowercase-input-devtest + in: tokenized-input-devtest + out: truecased-input-devtest + default-name: quality-estimation/input.devtest.lc + pass-unless: input-lowercaser + ignore-if: input-truecaser + template: $input-lowercaser < IN > OUT +truecase-input + in: tokenized-input TRUECASER:truecase-model + out: truecased-input + rerun-on-change: input-truecaser + default-name: quality-estimation/input.tc + ignore-unless: input-truecaser + template: $input-truecaser -model IN1.$input-extension < IN > OUT +truecase-input-devtest + in: tokenized-input-devtest TRUECASER:truecase-model + out: truecased-input-devtest + rerun-on-change: input-truecaser + ignore-unless: input-truecaser + default-name: quality-estimation/input.devtest.tc + template: $input-truecaser -model IN1.$input-extension < IN > OUT +split-input + in: truecased-input SPLITTER:splitter-model + out: split-input + rerun-on-change: input-splitter + default-name: quality-estimation/input.split + pass-unless: input-splitter + template: $input-splitter -model IN1.$input-extension < IN > OUT +split-input-devtest + in: truecased-input-devtest SPLITTER:splitter-model + out: split-input-devtest + rerun-on-change: input-splitter + default-name: quality-estimation/input.devtest.split + pass-unless: input-splitter + template: $input-splitter -model IN1.$input-extension < IN > OUT +tokenize-reference + in: raw-reference + out: tokenized-reference + default-name: quality-estimation/reference.tok + pass-unless: output-tokenizer + multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl + template: $output-tokenizer < IN > OUT +tokenize-reference-devtest + in: raw-reference-devtest + out: tokenized-reference-devtest + default-name: quality-estimation/reference.devtest.tok + pass-unless: output-tokenizer + multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl + template: $output-tokenizer < IN > OUT +lowercase-reference + in: tokenized-reference + out: truecased-reference + default-name: quality-estimation/reference.lc + pass-unless: output-lowercaser + ignore-if: output-truecaser + multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl + template: $output-lowercaser < IN > OUT +lowercase-reference-devtest + in: tokenized-reference-devtest + out: truecased-reference-devtest + default-name: quality-estimation/reference.devtest.lc + pass-unless: output-lowercaser + ignore-if: output-truecaser + multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl + template: $output-lowercaser < IN > OUT +truecase-reference + in: tokenized-reference TRUECASER:truecase-model + out: truecased-reference + rerun-on-change: output-truecaser + default-name: quality-estimation/reference.tc + ignore-unless: output-truecaser + multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl + template: $output-truecaser -model IN1.$output-extension < IN > OUT +truecase-reference-devtest + in: tokenized-reference-devtest TRUECASER:truecase-model + out: truecased-reference-devtest + rerun-on-change: output-truecaser + default-name: quality-estimation/reference.devtest.tc + ignore-unless: output-truecaser + multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl + template: $output-truecaser -model IN1.$output-extension < IN > OUT +decode + in: TUNING:config-with-reused-weights split-input + out: rich-output + default-name: quality-estimation/output + template: $decoder -v 0 -tt -f IN < IN1 > OUT + error: Translation was not performed correctly + not-error: trans: No such file or directory +decode-devtest + in: TUNING:config-with-reused-weights split-input-devtest + out: rich-output-devtest + default-name: quality-estimation/output-devtest + template: $decoder -v 0 -tt -f IN < IN1 > OUT + error: Translation was not performed correctly + not-error: trans: No such file or directory +remove-markup + in: rich-output + out: cleaned-output + default-name: quality-estimation/tokenized-output + template: $moses-script-dir/ems/support/remove-segmentation-markup.perl < IN > OUT +remove-markup-devtest + in: rich-output-devtest + out: cleaned-output-devtest + default-name: quality-estimation/tokenized-output-devtest + template: $moses-script-dir/ems/support/remove-segmentation-markup.perl < IN > OUT +score-output + in: cleaned-output truecased-reference + out: scored-output + default-name: quality-estimation/output-scored + tmp-name: quality-estimation/ter + template: mkdir TMP ; $moses-script-dir/ems/support/ter.perl $tercom IN IN1 TMP > OUT +score-output-devtest + in: cleaned-output-devtest truecased-reference-devtest + out: scored-output-devtest + default-name: quality-estimation/output-scored-devtest + tmp-name: quality-estimation/ter-devtest + template: mkdir TMP ; $moses-script-dir/ems/support/ter.perl $tercom IN IN1 TMP > OUT +train + in: input rich-output scored-output input-devtest rich-output-devtest scored-output-devtest + out: quality-estimation-model + default-name: quality-estimation/model + template: $trainer --train-rich IN1 --train-ter IN2 --eval-rich IN4 --eval-ter IN5 --model OUT + final-model: yes + [REPORTING] single report in: EVALUATION:nist-bleu-score EVALUATION:nist-bleu-c-score EVALUATION:bolt-bleu-score EVALUATION:bolt-bleu-c-score EVALUATION:multi-bleu-score EVALUATION:multi-bleu-c-score EVALUATION:multi-bleu-detok-score EVALUATION:multi-bleu-c-detok-score EVALUATION:meteor-score EVALUATION:ter-score EVALUATION:wer-score EVALUATION:ibm-bleu-score EVALUATION:ibm-bleu-c-score EVALUATION:analysis EVALUATION:analysis-coverage EVALUATION:analysis-prec TRAINING:biconcor-model EVALUATION:wade-analysis diff --git a/scripts/ems/support/create-xml.perl b/scripts/ems/support/create-xml.perl new file mode 100755 index 000000000..610c2ccf8 --- /dev/null +++ b/scripts/ems/support/create-xml.perl @@ -0,0 +1,42 @@ +#!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. + +use warnings; +use strict; + +my ($type) = @ARGV; +if ($type =~ /^s/i) { + print "\n"; + print "\n"; +} +elsif ($type =~ /^t/i) { + print "\n"; + print "\n"; +} +elsif ($type =~ /^r/i) { + print "\n"; + print "\n"; +} +else { + die("ERROR: specify source / target / ref"); +} + +my $i = 0; +while() { + chomp; + print "$_\n"; +} + +print "\n"; + +if ($type =~ /^s/i) { + print "\n"; +} +elsif ($type =~ /^t/i) { + print "\n"; +} +elsif ($type =~ /^r/i) { + print "\n"; +} diff --git a/scripts/ems/support/remove-segmentation-markup.perl b/scripts/ems/support/remove-segmentation-markup.perl index 3b02bceaf..1e5820dd5 100755 --- a/scripts/ems/support/remove-segmentation-markup.perl +++ b/scripts/ems/support/remove-segmentation-markup.perl @@ -9,7 +9,16 @@ use strict; $|++; while() { - s/ \|\d+\-\d+\| / /g; - s/ \|\d+\-\d+\|$//; - print $_; + chop; + s/\|[^\|]+\|//g; + s/\s+/ /g; + s/^ //; + s/ $//; + print $_."\n"; } + +#while() { +# s/ \|\d+\-\d+\| / /g; +# s/ \|\d+\-\d+\|$//; +# print $_; +#} diff --git a/scripts/ems/support/ter.perl b/scripts/ems/support/ter.perl new file mode 100644 index 000000000..1bae6f146 --- /dev/null +++ b/scripts/ems/support/ter.perl @@ -0,0 +1,15 @@ +#!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. + +use strict; +use FindBin qw($RealBin); + +my ($jar, $hyp,$ref,$tmp) = @ARGV; +`mkdir -p $tmp`; +`$RealBin/create-xml.perl test < $hyp > $tmp/hyp`; +`$RealBin/create-xml.perl ref < $ref > $tmp/ref`; +`java -jar $jar -h $tmp/hyp -r $tmp/ref -o ter -n $tmp/out`; +print `cat $tmp/out.ter`; +