2008-05-27 20:50:52 +04:00
|
|
|
#include "PerScorer.h"
|
2008-05-15 00:36:11 +04:00
|
|
|
|
2011-11-14 10:15:30 +04:00
|
|
|
#include <fstream>
|
|
|
|
#include <stdexcept>
|
|
|
|
|
|
|
|
#include "ScoreStats.h"
|
|
|
|
#include "Util.h"
|
|
|
|
|
|
|
|
using namespace std;
|
|
|
|
|
2012-06-30 23:23:45 +04:00
|
|
|
namespace MosesTuning
|
|
|
|
{
|
|
|
|
|
|
|
|
|
2011-11-12 05:16:31 +04:00
|
|
|
// Builds a PER scorer. The fixed scorer name "PER" and the caller-supplied
// configuration string are handed straight to the StatisticsBasedScorer base;
// nothing else is initialised here.
PerScorer::PerScorer(const string& config)
  : StatisticsBasedScorer("PER", config)
{
}
|
|
|
|
|
|
|
|
// The destructor body is intentionally empty: nothing is released explicitly.
PerScorer::~PerScorer()
{
}
|
2008-05-15 00:36:11 +04:00
|
|
|
|
2011-02-24 15:42:19 +03:00
|
|
|
void PerScorer::setReferenceFiles(const vector<string>& referenceFiles)
|
|
|
|
{
|
2011-11-12 03:58:23 +04:00
|
|
|
// For each line in the reference file, create a multiset of
|
|
|
|
// the word ids.
|
2011-02-24 15:42:19 +03:00
|
|
|
if (referenceFiles.size() != 1) {
|
|
|
|
throw runtime_error("PER only supports a single reference");
|
|
|
|
}
|
2012-02-01 15:54:20 +04:00
|
|
|
m_ref_tokens.clear();
|
|
|
|
m_ref_lengths.clear();
|
2011-02-24 15:42:19 +03:00
|
|
|
ifstream in(referenceFiles[0].c_str());
|
|
|
|
if (!in) {
|
|
|
|
throw runtime_error("Unable to open " + referenceFiles[0]);
|
|
|
|
}
|
|
|
|
string line;
|
|
|
|
int sid = 0;
|
|
|
|
while (getline(in,line)) {
|
2012-05-09 21:21:41 +04:00
|
|
|
line = this->preprocessSentence(line);
|
2011-02-24 15:42:19 +03:00
|
|
|
vector<int> tokens;
|
2012-02-01 16:19:25 +04:00
|
|
|
TokenizeAndEncode(line, tokens);
|
2012-02-01 15:54:20 +04:00
|
|
|
m_ref_tokens.push_back(multiset<int>());
|
2011-02-24 15:42:19 +03:00
|
|
|
for (size_t i = 0; i < tokens.size(); ++i) {
|
2012-02-01 15:54:20 +04:00
|
|
|
m_ref_tokens.back().insert(tokens[i]);
|
2008-05-15 18:13:32 +04:00
|
|
|
}
|
2012-02-01 15:54:20 +04:00
|
|
|
m_ref_lengths.push_back(tokens.size());
|
2011-02-24 15:42:19 +03:00
|
|
|
if (sid > 0 && sid % 100 == 0) {
|
|
|
|
TRACE_ERR(".");
|
2008-05-15 18:13:32 +04:00
|
|
|
}
|
2011-02-24 15:42:19 +03:00
|
|
|
++sid;
|
|
|
|
}
|
|
|
|
TRACE_ERR(endl);
|
2008-05-15 18:13:32 +04:00
|
|
|
|
2008-05-15 00:36:11 +04:00
|
|
|
}
|
|
|
|
|
2011-02-24 15:42:19 +03:00
|
|
|
void PerScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry)
|
|
|
|
{
|
2012-02-01 15:54:20 +04:00
|
|
|
if (sid >= m_ref_lengths.size()) {
|
2011-02-24 15:42:19 +03:00
|
|
|
stringstream msg;
|
|
|
|
msg << "Sentence id (" << sid << ") not found in reference set";
|
|
|
|
throw runtime_error(msg.str());
|
|
|
|
}
|
2012-02-28 05:27:23 +04:00
|
|
|
|
2012-05-09 21:21:41 +04:00
|
|
|
string sentence = this->preprocessSentence(text);
|
2012-02-28 05:27:23 +04:00
|
|
|
|
2011-11-12 03:58:23 +04:00
|
|
|
// Calculate correct, output_length and ref_length for
|
|
|
|
// the line and store it in entry
|
2011-02-24 15:42:19 +03:00
|
|
|
vector<int> testtokens;
|
2012-02-28 05:27:23 +04:00
|
|
|
TokenizeAndEncode(sentence, testtokens);
|
2011-02-24 15:42:19 +03:00
|
|
|
multiset<int> testtokens_all(testtokens.begin(),testtokens.end());
|
|
|
|
set<int> testtokens_unique(testtokens.begin(),testtokens.end());
|
|
|
|
int correct = 0;
|
|
|
|
for (set<int>::iterator i = testtokens_unique.begin();
|
|
|
|
i != testtokens_unique.end(); ++i) {
|
|
|
|
int token = *i;
|
2012-02-01 15:54:20 +04:00
|
|
|
correct += min(m_ref_tokens[sid].count(token), testtokens_all.count(token));
|
2011-02-24 15:42:19 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
ostringstream stats;
|
2012-02-01 15:54:20 +04:00
|
|
|
stats << correct << " " << testtokens.size() << " " << m_ref_lengths[sid] << " " ;
|
2011-02-24 15:42:19 +03:00
|
|
|
string stats_str = stats.str();
|
|
|
|
entry.set(stats_str);
|
2008-05-15 18:13:32 +04:00
|
|
|
}
|
|
|
|
|
2011-11-12 05:40:54 +04:00
|
|
|
float PerScorer::calculateScore(const vector<int>& comps) const
|
2011-02-24 15:42:19 +03:00
|
|
|
{
|
|
|
|
float denom = comps[2];
|
|
|
|
float num = comps[0] - max(0,comps[1]-comps[2]);
|
|
|
|
if (denom == 0) {
|
2011-11-12 03:58:23 +04:00
|
|
|
// This shouldn't happen!
|
2011-02-24 15:42:19 +03:00
|
|
|
return 0.0;
|
|
|
|
} else {
|
|
|
|
return num/denom;
|
|
|
|
}
|
2008-05-15 00:36:11 +04:00
|
|
|
}
|
2012-06-30 23:23:45 +04:00
|
|
|
|
|
|
|
}
|
|
|
|
|