mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-12-27 05:55:02 +03:00
457d004368
example: Use --factor "0|2" to use only first and third factor from nbest list and from reference. If you use interpolated scorer, separate records with comma (e.g. --factor "0|2,1").
254 lines
7.1 KiB
C++
254 lines
7.1 KiB
C++
#include "Scorer.h"
|
|
#include <limits>
|
|
#include "Util.h"
|
|
|
|
namespace {
|
|
|
|
//regularisation strategies
|
|
inline float score_min(const statscores_t& scores, size_t start, size_t end)
|
|
{
|
|
float min = numeric_limits<float>::max();
|
|
for (size_t i = start; i < end; ++i) {
|
|
if (scores[i] < min) {
|
|
min = scores[i];
|
|
}
|
|
}
|
|
return min;
|
|
}
|
|
|
|
inline float score_average(const statscores_t& scores, size_t start, size_t end)
|
|
{
|
|
if ((end - start) < 1) {
|
|
// this shouldn't happen
|
|
return 0;
|
|
}
|
|
float total = 0;
|
|
for (size_t j = start; j < end; ++j) {
|
|
total += scores[j];
|
|
}
|
|
|
|
return total / (end - start);
|
|
}
|
|
|
|
} // namespace
|
|
|
|
/**
 * Create a scorer called `name` and load its settings from `config`.
 *
 * A fresh token encoder is allocated, no score data is attached yet and
 * case preservation starts out enabled. `config` is handed to InitConfig().
 *
 * @param name    identifier stored for this scorer.
 * @param config  configuration string, parsed by InitConfig().
 * @throws runtime_error propagated from InitConfig() on a malformed config.
 */
Scorer::Scorer(const string& name, const string& config)
  : m_name(name),
    m_encoder(new Encoder),
    m_score_data(0),
    m_enable_preserve_case(true) {
  // A destructor is not run for an object whose constructor throws, so the
  // encoder allocated in the initialiser list has to be released here before
  // the exception leaves the constructor; otherwise it would leak.
  try {
    InitConfig(config);
  } catch (...) {
    delete m_encoder;
    throw;
  }
}

/**
 * Free the token encoder allocated by the constructor.
 */
Scorer::~Scorer() {
  delete m_encoder;
}
|
|
|
|
void Scorer::InitConfig(const string& config) {
|
|
// cerr << "Scorer config string: " << config << endl;
|
|
size_t start = 0;
|
|
while (start < config.size()) {
|
|
size_t end = config.find(",", start);
|
|
if (end == string::npos) {
|
|
end = config.size();
|
|
}
|
|
string nv = config.substr(start, end - start);
|
|
size_t split = nv.find(":");
|
|
if (split == string::npos) {
|
|
throw runtime_error("Missing colon when processing scorer config: " + config);
|
|
}
|
|
const string name = nv.substr(0, split);
|
|
const string value = nv.substr(split + 1, nv.size() - split - 1);
|
|
cerr << "name: " << name << " value: " << value << endl;
|
|
m_config[name] = value;
|
|
start = end + 1;
|
|
}
|
|
}
|
|
|
|
Scorer::Encoder::Encoder() {}
|
|
|
|
Scorer::Encoder::~Encoder() {}
|
|
|
|
int Scorer::Encoder::Encode(const string& token) {
|
|
map<string, int>::iterator it = m_vocab.find(token);
|
|
int encoded_token;
|
|
if (it == m_vocab.end()) {
|
|
// Add an new entry to the vocaburary.
|
|
encoded_token = static_cast<int>(m_vocab.size());
|
|
m_vocab[token] = encoded_token;
|
|
} else {
|
|
encoded_token = it->second;
|
|
}
|
|
return encoded_token;
|
|
}
|
|
|
|
void Scorer::TokenizeAndEncode(const string& line, vector<int>& encoded) {
|
|
std::istringstream in(line);
|
|
std::string token;
|
|
while (in >> token) {
|
|
if (!m_enable_preserve_case) {
|
|
for (std::string::iterator it = token.begin();
|
|
it != token.end(); ++it) {
|
|
*it = tolower(*it);
|
|
}
|
|
}
|
|
encoded.push_back(m_encoder->Encode(token));
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Set the factors, which should be used for this metric
|
|
*/
|
|
void Scorer::setFactors(const string& factors)
|
|
{
|
|
if (factors.empty()) return;
|
|
vector<string> factors_vec;
|
|
split(factors, '|', factors_vec);
|
|
for(vector<string>::iterator it = factors_vec.begin(); it != factors_vec.end(); ++it)
|
|
{
|
|
int factor = atoi(it->c_str());
|
|
m_factors.push_back(factor);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Take the factored sentence and return the desired factors
|
|
*/
|
|
string Scorer::applyFactors(const string& sentence)
|
|
{
|
|
if (m_factors.size() == 0) return sentence;
|
|
|
|
vector<string> tokens;
|
|
split(sentence, ' ', tokens);
|
|
|
|
stringstream sstream;
|
|
for (size_t i = 0; i < tokens.size(); ++i)
|
|
{
|
|
if (tokens[i] == "") continue;
|
|
|
|
vector<string> factors;
|
|
split(tokens[i], '|', factors);
|
|
|
|
int fsize = factors.size();
|
|
|
|
if (i>0) sstream << " ";
|
|
|
|
for (size_t j = 0; j < m_factors.size(); ++j)
|
|
{
|
|
int findex = m_factors[j];
|
|
if (findex < 0 || findex >= fsize) throw runtime_error("Factor index is out of range.");
|
|
|
|
if (j>0) sstream << "|";
|
|
sstream << factors[findex];
|
|
}
|
|
}
|
|
return sstream.str();
|
|
}
|
|
|
|
/**
 * Create a statistics-based scorer and read its options from the config.
 *
 * Options read (via getConfig()):
 *   - "regtype": "none" (default), "average" or "min" - regularisation strategy.
 *   - "regwin":  regularisation window, converted with atoi() (default "0").
 *   - "case":    "true" (default) or "false" - whether token case is preserved.
 *
 * @param name    scorer name, forwarded to Scorer.
 * @param config  configuration string, forwarded to Scorer.
 * @throws runtime_error if "regtype" holds an unknown strategy.
 */
StatisticsBasedScorer::StatisticsBasedScorer(const string& name, const string& config)
  : Scorer(name, config) {
  // Option names and values. They are immutable, and deliberately not called
  // TRUE/FALSE because those identifiers are macros on some platforms.
  static const string kKeyType = "regtype";
  static const string kKeyWindow = "regwin";
  static const string kKeyCase = "case";
  static const string kTypeNone = "none";
  static const string kTypeAverage = "average";
  static const string kTypeMinimum = "min";
  static const string kTrue = "true";
  static const string kFalse = "false";

  // Configure the regularisation strategy.
  const string type = getConfig(kKeyType, kTypeNone);
  if (type == kTypeNone) {
    m_regularization_type = NONE;
  } else if (type == kTypeAverage) {
    m_regularization_type = AVERAGE;
  } else if (type == kTypeMinimum) {
    m_regularization_type = MINIMUM;
  } else {
    throw runtime_error("Unknown scorer regularisation strategy: " + type);
  }

  // Config values are copied rather than bound by reference so they remain
  // valid regardless of how getConfig() hands back its result.
  const string window = getConfig(kKeyWindow, "0");
  m_regularization_window = atoi(window.c_str());

  const string preserve_case = getConfig(kKeyCase, kTrue);
  if (preserve_case == kTrue) {
    m_enable_preserve_case = true;
  } else if (preserve_case == kFalse) {
    m_enable_preserve_case = false;
  } else {
    // Previously an unrecognised value was dropped without a trace. The
    // setting is still left as it was, but the problem is now reported.
    cerr << "Warning: ignoring unknown value for scorer option 'case': "
         << preserve_case << " (expected 'true' or 'false')" << endl;
  }
}
|
|
|
|
void StatisticsBasedScorer::score(const candidates_t& candidates, const diffs_t& diffs,
|
|
statscores_t& scores) const
|
|
{
|
|
if (!m_score_data) {
|
|
throw runtime_error("Score data not loaded");
|
|
}
|
|
// calculate the score for the candidates
|
|
if (m_score_data->size() == 0) {
|
|
throw runtime_error("Score data is empty");
|
|
}
|
|
if (candidates.size() == 0) {
|
|
throw runtime_error("No candidates supplied");
|
|
}
|
|
int numCounts = m_score_data->get(0,candidates[0]).size();
|
|
vector<int> totals(numCounts);
|
|
for (size_t i = 0; i < candidates.size(); ++i) {
|
|
ScoreStats stats = m_score_data->get(i,candidates[i]);
|
|
if (stats.size() != totals.size()) {
|
|
stringstream msg;
|
|
msg << "Statistics for (" << "," << candidates[i] << ") have incorrect "
|
|
<< "number of fields. Found: " << stats.size() << " Expected: "
|
|
<< totals.size();
|
|
throw runtime_error(msg.str());
|
|
}
|
|
for (size_t k = 0; k < totals.size(); ++k) {
|
|
totals[k] += stats.get(k);
|
|
}
|
|
}
|
|
scores.push_back(calculateScore(totals));
|
|
|
|
candidates_t last_candidates(candidates);
|
|
// apply each of the diffs, and get new scores
|
|
for (size_t i = 0; i < diffs.size(); ++i) {
|
|
for (size_t j = 0; j < diffs[i].size(); ++j) {
|
|
size_t sid = diffs[i][j].first;
|
|
size_t nid = diffs[i][j].second;
|
|
size_t last_nid = last_candidates[sid];
|
|
for (size_t k = 0; k < totals.size(); ++k) {
|
|
int diff = m_score_data->get(sid,nid).get(k)
|
|
- m_score_data->get(sid,last_nid).get(k);
|
|
totals[k] += diff;
|
|
}
|
|
last_candidates[sid] = nid;
|
|
}
|
|
scores.push_back(calculateScore(totals));
|
|
}
|
|
|
|
// Regularisation. This can either be none, or the min or average as described in
|
|
// Cer, Jurafsky and Manning at WMT08.
|
|
if (m_regularization_type == NONE || m_regularization_window <= 0) {
|
|
// no regularisation
|
|
return;
|
|
}
|
|
|
|
// window size specifies the +/- in each direction
|
|
statscores_t raw_scores(scores); // copy scores
|
|
for (size_t i = 0; i < scores.size(); ++i) {
|
|
size_t start = 0;
|
|
if (i >= m_regularization_window) {
|
|
start = i - m_regularization_window;
|
|
}
|
|
const size_t end = min(scores.size(), i + m_regularization_window + 1);
|
|
if (m_regularization_type == AVERAGE) {
|
|
scores[i] = score_average(raw_scores,start,end);
|
|
} else {
|
|
scores[i] = score_min(raw_scores,start,end);
|
|
}
|
|
}
|
|
}
|