mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-10-26 11:28:48 +03:00
HWCM for MERT
This commit is contained in:
parent
2c66ae5e34
commit
f40bb2c53c
@ -164,10 +164,9 @@ void Data::loadNBest(const string &file)
|
||||
++it; // skip model score.
|
||||
|
||||
if (it) {
|
||||
++it;
|
||||
alignment = it->as_string(); //fifth field (if present) is either phrase or word alignment
|
||||
++it;
|
||||
if (it) {
|
||||
++it;
|
||||
alignment = it->as_string(); //sixth field (if present) is word alignment
|
||||
}
|
||||
}
|
||||
|
165
mert/HwcmScorer.cpp
Normal file
165
mert/HwcmScorer.cpp
Normal file
@ -0,0 +1,165 @@
|
||||
#include "HwcmScorer.h"
|
||||
|
||||
#include <fstream>
|
||||
|
||||
#include "ScoreStats.h"
|
||||
#include "Util.h"
|
||||
|
||||
#include "util/tokenize_piece.hh"
|
||||
|
||||
// HWCM score (Liu and Gildea, 2005). Implements F1 instead of precision for better modelling of hypothesis length.
|
||||
// assumes dependency trees on target side (generated by scripts/training/wrappers/conll2mosesxml.py ; use with option --brackets for reference).
|
||||
// reads reference trees from separate file {REFERENCE_FILE}.trees to support mix of string-based and tree-based metrics.
|
||||
|
||||
using namespace std;
|
||||
|
||||
namespace MosesTuning
|
||||
{
|
||||
|
||||
|
||||
// Constructs the scorer; the scorer name "HWCM" and the caller-supplied
// configuration string are forwarded to the StatisticsBasedScorer base.
HwcmScorer::HwcmScorer(const string& config)
  : StatisticsBasedScorer("HWCM", config)
{
}

// Nothing to release explicitly: all members clean up after themselves.
HwcmScorer::~HwcmScorer()
{
}
|
||||
|
||||
void HwcmScorer::setReferenceFiles(const vector<string>& referenceFiles)
|
||||
{
|
||||
// For each line in the reference file, create a tree object
|
||||
if (referenceFiles.size() != 1) {
|
||||
throw runtime_error("HWCM only supports a single reference");
|
||||
}
|
||||
m_ref_trees.clear();
|
||||
m_ref_hwc.clear();
|
||||
ifstream in((referenceFiles[0] + ".trees").c_str());
|
||||
if (!in) {
|
||||
throw runtime_error("Unable to open " + referenceFiles[0] + ".trees");
|
||||
}
|
||||
string line;
|
||||
while (getline(in,line)) {
|
||||
line = this->preprocessSentence(line);
|
||||
TreePointer tree (boost::make_shared<InternalTree>(line));
|
||||
m_ref_trees.push_back(tree);
|
||||
vector<map<string, int> > hwc (kHwcmOrder);
|
||||
vector<string> history(kHwcmOrder);
|
||||
extractHeadWordChain(tree, history, hwc);
|
||||
m_ref_hwc.push_back(hwc);
|
||||
vector<int> totals(kHwcmOrder);
|
||||
for (size_t i = 0; i < kHwcmOrder; i++) {
|
||||
for (map<string, int>::const_iterator it = m_ref_hwc.back()[i].begin(); it != m_ref_hwc.back()[i].end(); it++) {
|
||||
totals[i] += it->second;
|
||||
}
|
||||
}
|
||||
m_ref_lengths.push_back(totals);
|
||||
}
|
||||
TRACE_ERR(endl);
|
||||
|
||||
}
|
||||
|
||||
/**
 * Recursively collects head-word chains from a tree.
 *
 * @param tree     subtree to process; nodes with GetLength() == 0 are ignored.
 * @param history  chains ending at the closest ancestor that has a head:
 *                 history[k] is the chain of k+1 head words, or empty if no
 *                 such chain exists. Only read, never modified.
 * @param hwc      output counts; hwc[k] maps each chain of k+1 head words
 *                 (joined by single spaces) to its number of occurrences.
 */
void HwcmScorer::extractHeadWordChain(TreePointer tree, vector<string> & history, vector<map<string, int> > & hwc)
{
  if (!(tree->GetLength() > 0)) {
    return;
  }

  const std::vector<TreePointer> & children = tree->GetChildren();
  const string head = getHead(tree);

  // Node without a head word: it contributes no chain, and its children
  // inherit the history unchanged.
  if (head.empty()) {
    for (size_t c = 0; c < children.size(); ++c) {
      extractHeadWordChain(children[c], history, hwc);
    }
    return;
  }

  // The head on its own is a chain of length 1.
  vector<string> extended(kHwcmOrder);
  extended[0] = head;
  hwc[0][head]++;

  // Append the head to every inherited chain, giving chains of length order+1.
  for (size_t order = 1; order < kHwcmOrder; ++order) {
    const string & prefix = history[order-1];
    if (prefix.empty()) {
      continue;
    }
    const string chain = prefix + " " + head;
    hwc[order][chain]++;
    // Chains of maximal length cannot be extended any further, so they are
    // not handed down to the children.
    if (order+1 < kHwcmOrder) {
      extended[order] = chain;
    }
  }

  for (size_t c = 0; c < children.size(); ++c) {
    extractHeadWordChain(children[c], extended, hwc);
  }
}
|
||||
|
||||
/**
 * Returns the head word of a constituent.
 *
 * Assumption (only true for dependency parses): each constituent has a
 * preterminal child, and the terminal below that preterminal is the head.
 * If the constituent has several preterminals, the first one is picked;
 * if it has none, the empty string is returned.
 */
string HwcmScorer::getHead(TreePointer tree)
{
  const std::vector<TreePointer> & children = tree->GetChildren();
  for (size_t c = 0; c < children.size(); ++c) {
    const TreePointer & candidate = children[c];

    // a preterminal has exactly one child ...
    if (candidate->GetLength() != 1) {
      continue;
    }

    // ... and that child is a terminal
    TreePointer leaf = candidate->GetChildren()[0];
    if (leaf->IsTerminal()) {
      return leaf->GetLabel();
    }
  }
  return "";
}
|
||||
|
||||
void HwcmScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry)
|
||||
{
|
||||
if (sid >= m_ref_trees.size()) {
|
||||
stringstream msg;
|
||||
msg << "Sentence id (" << sid << ") not found in reference set";
|
||||
throw runtime_error(msg.str());
|
||||
}
|
||||
|
||||
string sentence = this->preprocessSentence(text);
|
||||
|
||||
// if sentence has '|||', assume that tree is in second position (n-best-list);
|
||||
// otherwise, assume it is in first position (calling 'evaluate' with tree as reference)
|
||||
util::TokenIter<util::MultiCharacter> it(sentence, util::MultiCharacter("|||"));
|
||||
++it;
|
||||
if (it) {
|
||||
sentence = it->as_string();
|
||||
}
|
||||
|
||||
TreePointer tree (boost::make_shared<InternalTree>(sentence));
|
||||
vector<map<string, int> > hwc_test (kHwcmOrder);
|
||||
vector<string> history(kHwcmOrder);
|
||||
extractHeadWordChain(tree, history, hwc_test);
|
||||
|
||||
ostringstream stats;
|
||||
for (size_t i = 0; i < kHwcmOrder; i++) {
|
||||
int correct = 0;
|
||||
int test_total = 0;
|
||||
for (map<string, int>::const_iterator it = hwc_test[i].begin(); it != hwc_test[i].end(); it++) {
|
||||
test_total += it->second;
|
||||
map<string, int>::const_iterator it2 = m_ref_hwc[sid][i].find(it->first);
|
||||
if (it2 != m_ref_hwc[sid][i].end()) {
|
||||
correct += std::min(it->second, it2->second);
|
||||
}
|
||||
}
|
||||
stats << correct << " " << test_total << " " << m_ref_lengths[sid][i] << " " ;
|
||||
}
|
||||
|
||||
string stats_str = stats.str();
|
||||
entry.set(stats_str);
|
||||
}
|
||||
|
||||
float HwcmScorer::calculateScore(const vector<int>& comps) const
|
||||
{
|
||||
float precision = 0;
|
||||
float recall = 0;
|
||||
for (size_t i = 0; i < kHwcmOrder; i++) {
|
||||
float matches = comps[i*3];
|
||||
float test_total = comps[1+(i*3)];
|
||||
float ref_total = comps[2+(i*3)];
|
||||
if (test_total > 0) {
|
||||
precision += matches/test_total;
|
||||
}
|
||||
if (ref_total > 0) {
|
||||
recall += matches/ref_total;
|
||||
}
|
||||
}
|
||||
|
||||
precision /= (float)kHwcmOrder;
|
||||
recall /= (float)kHwcmOrder;
|
||||
return (2*precision*recall)/(precision+recall); // f1-score
|
||||
}
|
||||
|
||||
}
|
60
mert/HwcmScorer.h
Normal file
60
mert/HwcmScorer.h
Normal file
@ -0,0 +1,60 @@
|
||||
#ifndef MERT_HWCM_SCORER_H_
|
||||
#define MERT_HWCM_SCORER_H_
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "StatisticsBasedScorer.h"
|
||||
#include "moses/FF/InternalTree.h"
|
||||
|
||||
using Moses::TreePointer;
|
||||
using Moses::InternalTree;
|
||||
|
||||
namespace MosesTuning
|
||||
{
|
||||
|
||||
|
||||
class ScoreStats;
|
||||
const size_t kHwcmOrder = 4;
|
||||
|
||||
/**
|
||||
* HWCM scoring (Liu and Gildea 2005), but F1 instead of precision.
|
||||
*/
|
||||
class HwcmScorer: public StatisticsBasedScorer
|
||||
{
|
||||
public:
|
||||
explicit HwcmScorer(const std::string& config = "");
|
||||
~HwcmScorer();
|
||||
|
||||
virtual void setReferenceFiles(const std::vector<std::string>& referenceFiles);
|
||||
virtual void prepareStats(std::size_t sid, const std::string& text, ScoreStats& entry);
|
||||
|
||||
virtual std::size_t NumberOfScores() const {
|
||||
return kHwcmOrder*3;
|
||||
}
|
||||
|
||||
virtual float calculateScore(const std::vector<int>& comps) const;
|
||||
|
||||
//TODO: actually, we use trees which we store in place of alignment. Maybe use something analogous to Phrase Properties to cleanly store trees?
|
||||
bool useAlignment() const {
|
||||
return true;
|
||||
}
|
||||
|
||||
private:
|
||||
|
||||
// data extracted from reference files
|
||||
std::vector<TreePointer> m_ref_trees;
|
||||
std::vector<std::vector<std::map<std::string, int> > > m_ref_hwc;
|
||||
std::vector<std::vector<int> > m_ref_lengths;
|
||||
|
||||
void extractHeadWordChain(TreePointer tree, std::vector<std::string> & history, std::vector<std::map<std::string, int> > & hwc);
|
||||
std::string getHead(TreePointer tree);
|
||||
|
||||
// no copying allowed
|
||||
HwcmScorer(const HwcmScorer&);
|
||||
HwcmScorer& operator=(const HwcmScorer&);
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
#endif // MERT_HWCM_SCORER_H_
|
@ -29,6 +29,8 @@ SemposOverlapping.cpp
|
||||
InterpolatedScorer.cpp
|
||||
Point.cpp
|
||||
PerScorer.cpp
|
||||
HwcmScorer.cpp
|
||||
../moses/FF/InternalTree.cpp
|
||||
Scorer.cpp
|
||||
ScorerFactory.cpp
|
||||
Optimizer.cpp
|
||||
|
@ -11,6 +11,7 @@
|
||||
#include "SemposScorer.h"
|
||||
#include "PermutationScorer.h"
|
||||
#include "MeteorScorer.h"
|
||||
#include "HwcmScorer.h"
|
||||
#include "Reference.h"
|
||||
|
||||
using namespace std;
|
||||
@ -32,6 +33,7 @@ vector<string> ScorerFactory::getTypes()
|
||||
types.push_back(string("SEMPOS"));
|
||||
types.push_back(string("LRSCORE"));
|
||||
types.push_back(string("METEOR"));
|
||||
types.push_back(string("HWCM"));
|
||||
return types;
|
||||
}
|
||||
|
||||
@ -56,6 +58,8 @@ Scorer* ScorerFactory::getScorer(const string& type, const string& config)
|
||||
return (PermutationScorer*) new PermutationScorer(type, config);
|
||||
} else if (type == "METEOR") {
|
||||
return new MeteorScorer(config);
|
||||
} else if (type == "HWCM") {
|
||||
return new HwcmScorer(config);
|
||||
} else {
|
||||
if (type.find(',') != string::npos) {
|
||||
return new InterpolatedScorer(type, config);
|
||||
|
Loading…
Reference in New Issue
Block a user