2012-02-20 04:46:08 +04:00
|
|
|
#ifndef MERT_SCORER_H_
|
|
|
|
#define MERT_SCORER_H_
|
2008-05-14 12:46:15 +04:00
|
|
|
|
|
|
|
#include <iostream>
|
2008-05-15 20:03:49 +04:00
|
|
|
#include <sstream>
|
2008-05-14 16:23:58 +04:00
|
|
|
#include <stdexcept>
|
2008-05-14 12:46:15 +04:00
|
|
|
#include <string>
|
|
|
|
#include <vector>
|
2008-05-15 23:09:01 +04:00
|
|
|
#include "Types.h"
|
2008-05-14 20:31:22 +04:00
|
|
|
#include "ScoreData.h"
|
2008-05-14 16:23:58 +04:00
|
|
|
|
2008-05-14 23:47:34 +04:00
|
|
|
using namespace std;
|
|
|
|
|
2008-05-14 16:23:58 +04:00
|
|
|
class ScoreStats;
|
2008-05-14 12:46:15 +04:00
|
|
|
|
2008-05-14 23:47:34 +04:00
|
|
|
/**
|
2011-11-12 03:58:23 +04:00
|
|
|
* Superclass of all scorers and dummy implementation.
|
|
|
|
*
|
|
|
|
* In order to add a new scorer it should be sufficient to override the members
|
|
|
|
* prepareStats(), setReferenceFiles() and score() (or calculateScore()).
|
|
|
|
*/
|
2011-02-24 15:42:19 +03:00
|
|
|
class Scorer
|
|
|
|
{
|
2012-02-01 16:19:25 +04:00
|
|
|
public:
|
2011-11-12 05:16:31 +04:00
|
|
|
Scorer(const string& name, const string& config);
|
2012-02-01 16:19:25 +04:00
|
|
|
virtual ~Scorer();
|
2008-06-24 23:27:18 +04:00
|
|
|
|
2011-02-24 15:42:19 +03:00
|
|
|
/**
|
2011-11-12 03:58:23 +04:00
|
|
|
* Return the number of statistics needed for the computation of the score.
|
|
|
|
*/
|
2011-11-12 05:58:14 +04:00
|
|
|
virtual size_t NumberOfScores() const {
|
2011-02-24 15:42:19 +03:00
|
|
|
cerr << "Scorer: 0" << endl;
|
|
|
|
return 0;
|
2011-11-12 04:40:01 +04:00
|
|
|
}
|
2011-02-24 15:42:19 +03:00
|
|
|
|
|
|
|
/**
|
2011-11-12 03:58:23 +04:00
|
|
|
* Set the reference files. This must be called before prepareStats().
|
|
|
|
*/
|
2011-02-24 15:42:19 +03:00
|
|
|
virtual void setReferenceFiles(const vector<string>& referenceFiles) {
|
2012-02-01 16:26:47 +04:00
|
|
|
// do nothing
|
2011-02-24 15:42:19 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Process the given guessed text, corresponding to the given reference sindex
|
|
|
|
* and add the appropriate statistics to the entry.
|
2011-11-12 03:58:23 +04:00
|
|
|
*/
|
2012-02-01 16:26:47 +04:00
|
|
|
virtual void prepareStats(size_t sindex, const string& text, ScoreStats& entry) {
|
|
|
|
// do nothing.
|
|
|
|
}
|
2008-06-24 23:27:18 +04:00
|
|
|
|
2011-02-24 15:42:19 +03:00
|
|
|
virtual void prepareStats(const string& sindex, const string& text, ScoreStats& entry) {
|
2012-02-01 13:13:00 +04:00
|
|
|
this->prepareStats(static_cast<size_t>(atoi(sindex.c_str())), text, entry);
|
2011-02-24 15:42:19 +03:00
|
|
|
}
|
2008-05-14 23:47:34 +04:00
|
|
|
|
2011-02-24 15:42:19 +03:00
|
|
|
/**
|
2011-11-12 03:58:23 +04:00
|
|
|
* Score using each of the candidate index, then go through the diffs
|
|
|
|
* applying each in turn, and calculating a new score each time.
|
|
|
|
*/
|
2011-02-24 15:42:19 +03:00
|
|
|
virtual void score(const candidates_t& candidates, const diffs_t& diffs,
|
2011-11-12 05:58:14 +04:00
|
|
|
statscores_t& scores) const {
|
2011-02-24 15:42:19 +03:00
|
|
|
//dummy impl
|
2012-02-01 15:54:20 +04:00
|
|
|
if (!m_score_data) {
|
2011-02-24 15:42:19 +03:00
|
|
|
throw runtime_error("score data not loaded");
|
|
|
|
}
|
|
|
|
scores.push_back(0);
|
|
|
|
for (size_t i = 0; i < diffs.size(); ++i) {
|
|
|
|
scores.push_back(0);
|
|
|
|
}
|
|
|
|
}
|
2008-05-14 23:47:34 +04:00
|
|
|
|
2011-02-24 15:42:19 +03:00
|
|
|
/**
|
2011-11-12 03:58:23 +04:00
|
|
|
* Calculate the score of the sentences corresponding to the list of candidate
|
|
|
|
* indices. Each index indicates the 1-best choice from the n-best list.
|
|
|
|
*/
|
2011-11-12 05:58:14 +04:00
|
|
|
float score(const candidates_t& candidates) const {
|
2011-02-24 15:42:19 +03:00
|
|
|
diffs_t diffs;
|
|
|
|
statscores_t scores;
|
|
|
|
score(candidates, diffs, scores);
|
|
|
|
return scores[0];
|
|
|
|
}
|
2008-05-14 23:47:34 +04:00
|
|
|
|
2011-02-24 15:42:19 +03:00
|
|
|
const string& getName() const {
|
2012-02-01 15:54:20 +04:00
|
|
|
return m_name;
|
2011-02-24 15:42:19 +03:00
|
|
|
}
|
2008-05-14 16:23:58 +04:00
|
|
|
|
2011-11-12 05:58:14 +04:00
|
|
|
size_t getReferenceSize() const {
|
2012-02-01 15:54:20 +04:00
|
|
|
if (m_score_data) {
|
|
|
|
return m_score_data->size();
|
2011-02-24 15:42:19 +03:00
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
2008-05-14 16:23:58 +04:00
|
|
|
|
2011-02-24 15:42:19 +03:00
|
|
|
/**
|
2011-11-12 03:58:23 +04:00
|
|
|
* Set the score data, prior to scoring.
|
|
|
|
*/
|
2011-02-24 15:42:19 +03:00
|
|
|
void setScoreData(ScoreData* data) {
|
2012-02-01 15:54:20 +04:00
|
|
|
m_score_data = data;
|
2011-02-24 15:42:19 +03:00
|
|
|
}
|
|
|
|
|
2012-02-01 16:19:25 +04:00
|
|
|
private:
|
|
|
|
class Encoder {
|
|
|
|
public:
|
|
|
|
Encoder();
|
|
|
|
virtual ~Encoder();
|
|
|
|
int Encode(const std::string& token);
|
|
|
|
void Clear() { m_vocab.clear(); }
|
2011-02-24 15:42:19 +03:00
|
|
|
|
2012-02-01 16:19:25 +04:00
|
|
|
private:
|
|
|
|
std::map<std::string, int> m_vocab;
|
|
|
|
};
|
|
|
|
|
2012-02-01 16:26:47 +04:00
|
|
|
void InitConfig(const string& config);
|
|
|
|
|
2012-02-01 16:19:25 +04:00
|
|
|
string m_name;
|
|
|
|
Encoder* m_encoder;
|
|
|
|
map<string, string> m_config;
|
|
|
|
|
|
|
|
protected:
|
2012-02-01 15:54:20 +04:00
|
|
|
ScoreData* m_score_data;
|
|
|
|
bool m_enable_preserve_case;
|
2011-02-24 15:42:19 +03:00
|
|
|
|
|
|
|
/**
|
2011-11-12 03:58:23 +04:00
|
|
|
* Get value of config variable. If not provided, return default.
|
|
|
|
*/
|
2011-11-12 05:58:14 +04:00
|
|
|
string getConfig(const string& key, const string& def="") const {
|
2012-02-01 15:54:20 +04:00
|
|
|
map<string,string>::const_iterator i = m_config.find(key);
|
|
|
|
if (i == m_config.end()) {
|
2011-02-24 15:42:19 +03:00
|
|
|
return def;
|
|
|
|
} else {
|
|
|
|
return i->second;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Tokenise line and encode.
|
2011-11-12 03:58:23 +04:00
|
|
|
* Note: We assume that all tokens are separated by single spaces.
|
|
|
|
*/
|
2012-02-01 16:19:25 +04:00
|
|
|
void TokenizeAndEncode(const string& line, vector<int>& encoded);
|
2008-06-24 23:27:18 +04:00
|
|
|
|
2012-02-01 16:19:25 +04:00
|
|
|
void ClearEncoder() { m_encoder->Clear(); }
|
2008-05-14 12:46:15 +04:00
|
|
|
};
|
|
|
|
|
|
|
|
|
2008-05-15 18:48:11 +04:00
|
|
|
/**
|
2011-11-12 03:58:23 +04:00
|
|
|
* Abstract base class for Scorers that work by adding statistics across all
|
|
|
|
* outout sentences, then apply some formula, e.g., BLEU, PER.
|
|
|
|
*/
|
2011-02-24 15:42:19 +03:00
|
|
|
class StatisticsBasedScorer : public Scorer
|
|
|
|
{
|
2012-02-01 15:58:49 +04:00
|
|
|
public:
|
2011-11-12 05:16:31 +04:00
|
|
|
StatisticsBasedScorer(const string& name, const string& config);
|
|
|
|
virtual ~StatisticsBasedScorer() {}
|
2011-02-24 15:42:19 +03:00
|
|
|
virtual void score(const candidates_t& candidates, const diffs_t& diffs,
|
2011-11-12 05:58:14 +04:00
|
|
|
statscores_t& scores) const;
|
2008-05-15 18:48:11 +04:00
|
|
|
|
2012-02-01 15:58:49 +04:00
|
|
|
protected:
|
|
|
|
|
|
|
|
enum RegularisationType {
|
|
|
|
NONE,
|
|
|
|
AVERAGE,
|
|
|
|
MINIMUM,
|
|
|
|
};
|
|
|
|
|
2011-11-12 03:58:23 +04:00
|
|
|
/**
|
|
|
|
* Calculate the actual score.
|
|
|
|
*/
|
2011-11-12 05:40:54 +04:00
|
|
|
virtual statscore_t calculateScore(const vector<int>& totals) const = 0;
|
2008-05-15 18:48:11 +04:00
|
|
|
|
2011-11-12 03:58:23 +04:00
|
|
|
// regularisation
|
2012-02-01 15:58:49 +04:00
|
|
|
RegularisationType m_regularization_type;
|
2012-02-01 15:54:20 +04:00
|
|
|
size_t m_regularization_window;
|
2008-05-15 18:48:11 +04:00
|
|
|
};
|
|
|
|
|
2012-02-20 04:46:08 +04:00
|
|
|
#endif // MERT_SCORER_H_
|