mosesdecoder/mert/Scorer.h
Tetsuo Kiso 8fdec9bf30 Use boost::unordered_map instead of std::map.
For storing the word vocabulary used in computation of
BLEU scores. This change will reduce the running time
of extractor about 2-3 seconds (9% reduction).
2012-12-07 05:12:24 +09:00

216 lines
5.1 KiB
C++

#ifndef MERT_SCORER_H_
#define MERT_SCORER_H_
#include <iostream>
#include <sstream>
#include <stdexcept>
#include <string>
#include <vector>
#include <limits>
#include "Types.h"
#include "ScoreData.h"
namespace mert {
class Vocabulary;
} // namespace mert
namespace MosesTuning
{
class PreProcessFilter;
class ScoreStats;
enum ScorerRegularisationStrategy {REG_NONE, REG_AVERAGE, REG_MINIMUM};
/**
* Superclass of all scorers and dummy implementation.
*
* In order to add a new scorer it should be sufficient to override the members
* prepareStats(), setReferenceFiles() and score() (or calculateScore()).
*/
class Scorer
{
public:
Scorer(const std::string& name, const std::string& config);
virtual ~Scorer();
/**
* Return the number of statistics needed for the computation of the score.
*/
virtual std::size_t NumberOfScores() const = 0;
/**
* Set the reference files. This must be called before prepareStats().
*/
virtual void setReferenceFiles(const std::vector<std::string>& referenceFiles) {
// do nothing
}
/**
* Process the given guessed text, corresponding to the given reference sindex
* and add the appropriate statistics to the entry.
*/
virtual void prepareStats(std::size_t sindex, const std::string& text, ScoreStats& entry) {
// do nothing.
}
virtual void prepareStats(const std::string& sindex, const std::string& text, ScoreStats& entry) {
this->prepareStats(static_cast<std::size_t>(atoi(sindex.c_str())), text, entry);
}
/**
* Score using each of the candidate index, then go through the diffs
* applying each in turn, and calculating a new score each time.
*/
virtual void score(const candidates_t& candidates, const diffs_t& diffs,
statscores_t& scores) const = 0;
/*
{
//dummy impl
if (!m_score_data) {
throw runtime_error("score data not loaded");
}
scores.push_back(0);
for (std::size_t i = 0; i < diffs.size(); ++i) {
scores.push_back(0);
}
}
*/
/**
* Calculate the score of the sentences corresponding to the list of candidate
* indices. Each index indicates the 1-best choice from the n-best list.
*/
float score(const candidates_t& candidates) const;
const std::string& getName() const {
return m_name;
}
std::size_t getReferenceSize() const {
if (m_score_data) {
return m_score_data->size();
}
return 0;
}
/**
* Set the score data, prior to scoring.
*/
virtual void setScoreData(ScoreData* data) {
m_score_data = data;
}
/**
* The scorer returns if it uses the reference alignment data
* for permutation distance scores
**/
virtual bool useAlignment() const {
//cout << "Scorer::useAlignment returning false " << endl;
return false;
};
/**
* Set the factors, which should be used for this metric
*/
virtual void setFactors(const std::string& factors);
mert::Vocabulary* GetVocab() const { return m_vocab; }
/**
* Set unix filter, which will be used to preprocess the sentences
*/
virtual void setFilter(const std::string& filterCommand);
private:
void InitConfig(const std::string& config);
/**
* Take the factored sentence and return the desired factors
*/
std::string applyFactors(const std::string& sentece) const;
/**
* Preprocess the sentence with the filter (if given)
*/
std::string applyFilter(const std::string& sentence) const;
std::string m_name;
mert::Vocabulary* m_vocab;
std::map<std::string, std::string> m_config;
std::vector<int> m_factors;
PreProcessFilter* m_filter;
protected:
ScoreData* m_score_data;
bool m_enable_preserve_case;
/**
* Get value of config variable. If not provided, return default.
*/
std::string getConfig(const std::string& key, const std::string& def="") const {
std::map<std::string,std::string>::const_iterator i = m_config.find(key);
if (i == m_config.end()) {
return def;
} else {
return i->second;
}
}
/**
* Tokenise line and encode.
* Note: We assume that all tokens are separated by whitespaces.
*/
void TokenizeAndEncode(const std::string& line, std::vector<int>& encoded);
/*
* Tokenize functions for testing only.
*/
void TokenizeAndEncodeTesting(const std::string& line, std::vector<int>& encoded);
/**
* Every inherited scorer should call this function for each sentence
*/
std::string preprocessSentence(const std::string& sentence) const
{
return applyFactors(applyFilter(sentence));
}
};
namespace {
//regularisation strategies
inline float score_min(const statscores_t& scores, size_t start, size_t end)
{
float min = std::numeric_limits<float>::max();
for (size_t i = start; i < end; ++i) {
if (scores[i] < min) {
min = scores[i];
}
}
return min;
}
inline float score_average(const statscores_t& scores, size_t start, size_t end)
{
if ((end - start) < 1) {
// this shouldn't happen
return 0;
}
float total = 0;
for (size_t j = start; j < end; ++j) {
total += scores[j];
}
return total / (end - start);
}
} // namespace
}
#endif // MERT_SCORER_H_