mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-09-20 23:58:15 +03:00
Merge branch 'master' of github.com:moses-smt/mosesdecoder
This commit is contained in:
commit
205c1a868b
@ -4,21 +4,90 @@
|
||||
#include <cmath>
|
||||
#include <climits>
|
||||
#include <fstream>
|
||||
#include <iterator>
|
||||
#include <iostream>
|
||||
#include <stdexcept>
|
||||
#include "Util.h"
|
||||
|
||||
namespace {
|
||||
|
||||
// configure regularisation
|
||||
const char KEY_REFLEN[] = "reflen";
|
||||
const char REFLEN_AVERAGE[] = "average";
|
||||
const char REFLEN_SHORTEST[] = "shortest";
|
||||
const char REFLEN_CLOSEST[] = "closest";
|
||||
|
||||
} // namespace
|
||||
|
||||
// A simple STL-map based n-gram counts.
|
||||
// Basically, we provide typical accessors and mutaors, but
|
||||
// we intentionally does not allow erasing elements.
|
||||
class BleuScorer::NgramCounts {
|
||||
public:
|
||||
// Used to construct the ngram map
|
||||
struct NgramComparator {
|
||||
bool operator()(const vector<int>& a, const vector<int>& b) const {
|
||||
size_t i;
|
||||
const size_t as = a.size();
|
||||
const size_t bs = b.size();
|
||||
for (i = 0; i < as && i < bs; ++i) {
|
||||
if (a[i] < b[i]) {
|
||||
return true;
|
||||
}
|
||||
if (a[i] > b[i]) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
// entries are equal, shortest wins
|
||||
return as < bs;
|
||||
}
|
||||
};
|
||||
|
||||
typedef vector<int> Key;
|
||||
typedef int Value;
|
||||
typedef map<Key, Value, NgramComparator>::iterator iterator;
|
||||
typedef map<Key, Value, NgramComparator>::const_iterator const_iterator;
|
||||
|
||||
NgramCounts() : kDefaultCount(1) { }
|
||||
virtual ~NgramCounts() { }
|
||||
|
||||
// If the specified "ngram" is found, we add counts.
|
||||
// If not, we insert the default count in the container.
|
||||
void add(const Key& ngram) {
|
||||
const_iterator it = find(ngram);
|
||||
if (it != end()) {
|
||||
m_counts[ngram] = it->second + 1;
|
||||
} else {
|
||||
m_counts[ngram] = kDefaultCount;
|
||||
}
|
||||
}
|
||||
|
||||
void clear() { m_counts.clear(); }
|
||||
|
||||
bool empty() const { return m_counts.empty(); }
|
||||
|
||||
size_t size() const { return m_counts.size(); }
|
||||
size_t max_size() const { return m_counts.max_size(); }
|
||||
|
||||
iterator find(const Key& ngram) { return m_counts.find(ngram); }
|
||||
const_iterator find(const Key& ngram) const { return m_counts.find(ngram); }
|
||||
|
||||
Value& operator[](const Key& ngram) { return m_counts[ngram]; }
|
||||
|
||||
iterator begin() { return m_counts.begin(); }
|
||||
const_iterator begin() const { return m_counts.begin(); }
|
||||
iterator end() { return m_counts.end(); }
|
||||
const_iterator end() const { return m_counts.end(); }
|
||||
|
||||
private:
|
||||
const int kDefaultCount;
|
||||
map<Key, Value, NgramComparator> m_counts;
|
||||
};
|
||||
|
||||
BleuScorer::BleuScorer(const string& config)
|
||||
: StatisticsBasedScorer("BLEU",config),
|
||||
: StatisticsBasedScorer("BLEU", config),
|
||||
kLENGTH(4),
|
||||
m_ref_length_type(CLOSEST) {
|
||||
//configure regularisation
|
||||
static string KEY_REFLEN = "reflen";
|
||||
static string REFLEN_AVERAGE = "average";
|
||||
static string REFLEN_SHORTEST = "shortest";
|
||||
static string REFLEN_CLOSEST = "closest";
|
||||
|
||||
string reflen = getConfig(KEY_REFLEN,REFLEN_CLOSEST);
|
||||
const string reflen = getConfig(KEY_REFLEN, REFLEN_CLOSEST);
|
||||
if (reflen == REFLEN_AVERAGE) {
|
||||
m_ref_length_type = AVERAGE;
|
||||
} else if (reflen == REFLEN_SHORTEST) {
|
||||
@ -28,18 +97,15 @@ BleuScorer::BleuScorer(const string& config)
|
||||
} else {
|
||||
throw runtime_error("Unknown reference length strategy: " + reflen);
|
||||
}
|
||||
// cerr << "Using reference length strategy: " << reflen << endl;
|
||||
}
|
||||
|
||||
BleuScorer::~BleuScorer() {}
|
||||
|
||||
size_t BleuScorer::countNgrams(const string& line, counts_t& counts, unsigned int n)
|
||||
size_t BleuScorer::countNgrams(const string& line, NgramCounts& counts,
|
||||
unsigned int n)
|
||||
{
|
||||
vector<int> encoded_tokens;
|
||||
//cerr << line << endl;
|
||||
TokenizeAndEncode(line, encoded_tokens);
|
||||
//copy(encoded_tokens.begin(), encoded_tokens.end(), ostream_iterator<int>(cerr," "));
|
||||
//cerr << endl;
|
||||
for (size_t k = 1; k <= n; ++k) {
|
||||
//ngram order longer than sentence - no point
|
||||
if (k > encoded_tokens.size()) {
|
||||
@ -50,18 +116,9 @@ size_t BleuScorer::countNgrams(const string& line, counts_t& counts, unsigned in
|
||||
for (size_t j = i; j < i+k && j < encoded_tokens.size(); ++j) {
|
||||
ngram.push_back(encoded_tokens[j]);
|
||||
}
|
||||
int count = 1;
|
||||
counts_iterator oldcount = counts.find(ngram);
|
||||
if (oldcount != counts.end()) {
|
||||
count = (oldcount->second) + 1;
|
||||
}
|
||||
//cerr << count << endl;
|
||||
counts[ngram] = count;
|
||||
//cerr << endl;
|
||||
counts.add(ngram);
|
||||
}
|
||||
}
|
||||
//cerr << "counted ngrams" << endl;
|
||||
//dump_counts(counts);
|
||||
return encoded_tokens.size();
|
||||
}
|
||||
|
||||
@ -82,9 +139,8 @@ void BleuScorer::setReferenceFiles(const vector<string>& referenceFiles)
|
||||
string line;
|
||||
size_t sid = 0; //sentence counter
|
||||
while (getline(refin,line)) {
|
||||
//cerr << line << endl;
|
||||
if (i == 0) {
|
||||
counts_t *counts = new counts_t; //these get leaked
|
||||
NgramCounts *counts = new NgramCounts; //these get leaked
|
||||
m_ref_counts.push_back(counts);
|
||||
vector<size_t> lengths;
|
||||
m_ref_lengths.push_back(lengths);
|
||||
@ -92,11 +148,12 @@ void BleuScorer::setReferenceFiles(const vector<string>& referenceFiles)
|
||||
if (m_ref_counts.size() <= sid) {
|
||||
throw runtime_error("File " + referenceFiles[i] + " has too many sentences");
|
||||
}
|
||||
counts_t counts;
|
||||
size_t length = countNgrams(line,counts,kLENGTH);
|
||||
NgramCounts counts;
|
||||
size_t length = countNgrams(line, counts, kLENGTH);
|
||||
|
||||
//for any counts larger than those already there, merge them in
|
||||
for (counts_iterator ci = counts.begin(); ci != counts.end(); ++ci) {
|
||||
counts_iterator oldcount_it = m_ref_counts[sid]->find(ci->first);
|
||||
for (NgramCounts::const_iterator ci = counts.begin(); ci != counts.end(); ++ci) {
|
||||
NgramCounts::const_iterator oldcount_it = m_ref_counts[sid]->find(ci->first);
|
||||
int oldcount = 0;
|
||||
if (oldcount_it != m_ref_counts[sid]->end()) {
|
||||
oldcount = oldcount_it->second;
|
||||
@ -113,83 +170,55 @@ void BleuScorer::setReferenceFiles(const vector<string>& referenceFiles)
|
||||
}
|
||||
++sid;
|
||||
}
|
||||
TRACE_ERR(endl);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void BleuScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry)
|
||||
{
|
||||
// cerr << text << endl;
|
||||
// cerr << sid << endl;
|
||||
//dump_counts(*m_ref_counts[sid]);
|
||||
if (sid >= m_ref_counts.size()) {
|
||||
stringstream msg;
|
||||
msg << "Sentence id (" << sid << ") not found in reference set";
|
||||
throw runtime_error(msg.str());
|
||||
}
|
||||
counts_t testcounts;
|
||||
//stats for this line
|
||||
vector<float> stats(kLENGTH*2);;
|
||||
size_t length = countNgrams(text,testcounts,kLENGTH);
|
||||
//dump_counts(testcounts);
|
||||
if (m_ref_length_type == SHORTEST) {
|
||||
//cerr << reflengths.size() << " " << sid << endl;
|
||||
int shortest = *min_element(m_ref_lengths[sid].begin(), m_ref_lengths[sid].end());
|
||||
stats.push_back(shortest);
|
||||
} else if (m_ref_length_type == AVERAGE) {
|
||||
int total = 0;
|
||||
for (size_t i = 0; i < m_ref_lengths[sid].size(); ++i) {
|
||||
total += m_ref_lengths[sid][i];
|
||||
}
|
||||
const float mean = static_cast<float>(total) / m_ref_lengths[sid].size();
|
||||
stats.push_back(mean);
|
||||
} else if (m_ref_length_type == CLOSEST) {
|
||||
int min_diff = INT_MAX;
|
||||
int min_idx = 0;
|
||||
for (size_t i = 0; i < m_ref_lengths[sid].size(); ++i) {
|
||||
const int reflength = m_ref_lengths[sid][i];
|
||||
const int diff = reflength - static_cast<int>(length);
|
||||
const int absolute_diff = abs(diff) - abs(min_diff);
|
||||
NgramCounts testcounts;
|
||||
// stats for this line
|
||||
vector<ScoreStatsType> stats(kLENGTH * 2);;
|
||||
const size_t length = countNgrams(text, testcounts, kLENGTH);
|
||||
|
||||
if (absolute_diff < 0) { //look for the closest reference
|
||||
min_diff = diff;
|
||||
min_idx = i;
|
||||
} else if (absolute_diff == 0) { // if two references has the same closest length, take the shortest
|
||||
if (reflength < static_cast<int>(m_ref_lengths[sid][min_idx])) {
|
||||
min_idx = i;
|
||||
}
|
||||
}
|
||||
}
|
||||
stats.push_back(m_ref_lengths[sid][min_idx]);
|
||||
} else {
|
||||
throw runtime_error("Unsupported reflength strategy");
|
||||
// Calculate effective reference length.
|
||||
switch (m_ref_length_type) {
|
||||
case SHORTEST:
|
||||
CalcShortest(sid, stats);
|
||||
break;
|
||||
case AVERAGE:
|
||||
CalcAverage(sid, stats);
|
||||
break;
|
||||
case CLOSEST:
|
||||
CalcClosest(sid, length, stats);
|
||||
break;
|
||||
default:
|
||||
throw runtime_error("Unsupported reflength strategy");
|
||||
}
|
||||
//cerr << "computed length" << endl;
|
||||
|
||||
//precision on each ngram type
|
||||
for (counts_iterator testcounts_it = testcounts.begin();
|
||||
for (NgramCounts::const_iterator testcounts_it = testcounts.begin();
|
||||
testcounts_it != testcounts.end(); ++testcounts_it) {
|
||||
counts_iterator refcounts_it = m_ref_counts[sid]->find(testcounts_it->first);
|
||||
NgramCounts::const_iterator refcounts_it = m_ref_counts[sid]->find(testcounts_it->first);
|
||||
int correct = 0;
|
||||
int guess = testcounts_it->second;
|
||||
const int guess = testcounts_it->second;
|
||||
if (refcounts_it != m_ref_counts[sid]->end()) {
|
||||
correct = min(refcounts_it->second,guess);
|
||||
}
|
||||
size_t len = testcounts_it->first.size();
|
||||
const size_t len = testcounts_it->first.size();
|
||||
stats[len*2-2] += correct;
|
||||
stats[len*2-1] += guess;
|
||||
}
|
||||
stringstream sout;
|
||||
copy(stats.begin(),stats.end(),ostream_iterator<float>(sout," "));
|
||||
//TRACE_ERR(sout.str() << endl);
|
||||
string stats_str = sout.str();
|
||||
entry.set(stats_str);
|
||||
entry.set(stats);
|
||||
}
|
||||
|
||||
float BleuScorer::calculateScore(const vector<int>& comps) const
|
||||
{
|
||||
//cerr << "BLEU: ";
|
||||
//copy(comps.begin(),comps.end(), ostream_iterator<int>(cerr," "));
|
||||
float logbleu = 0.0;
|
||||
for (int i = 0; i < kLENGTH; ++i) {
|
||||
if (comps[2*i] == 0) {
|
||||
@ -203,15 +232,64 @@ float BleuScorer::calculateScore(const vector<int>& comps) const
|
||||
if (brevity < 0.0) {
|
||||
logbleu += brevity;
|
||||
}
|
||||
//cerr << " " << exp(logbleu) << endl;
|
||||
return exp(logbleu);
|
||||
}
|
||||
|
||||
void BleuScorer::dump_counts(counts_t& counts) const {
|
||||
for (counts_const_iterator i = counts.begin(); i != counts.end(); ++i) {
|
||||
cerr << "(";
|
||||
copy(i->first.begin(), i->first.end(), ostream_iterator<int>(cerr," "));
|
||||
cerr << ") " << i->second << ", ";
|
||||
void BleuScorer::dump_counts(ostream* os,
|
||||
const NgramCounts& counts) const {
|
||||
for (NgramCounts::const_iterator it = counts.begin();
|
||||
it != counts.end(); ++it) {
|
||||
*os << "(";
|
||||
const NgramCounts::Key& keys = it->first;
|
||||
for (size_t i = 0; i < keys.size(); ++i) {
|
||||
if (i != 0) {
|
||||
*os << " ";
|
||||
}
|
||||
*os << keys[i];
|
||||
}
|
||||
*os << ") : " << it->second << ", ";
|
||||
}
|
||||
cerr << endl;
|
||||
*os << endl;
|
||||
}
|
||||
|
||||
void BleuScorer::CalcAverage(size_t sentence_id,
|
||||
vector<ScoreStatsType>& stats) const {
|
||||
int total = 0;
|
||||
for (size_t i = 0;
|
||||
i < m_ref_lengths[sentence_id].size(); ++i) {
|
||||
total += m_ref_lengths[sentence_id][i];
|
||||
}
|
||||
const float mean = static_cast<float>(total) /
|
||||
m_ref_lengths[sentence_id].size();
|
||||
stats.push_back(static_cast<ScoreStatsType>(mean));
|
||||
}
|
||||
|
||||
void BleuScorer::CalcClosest(size_t sentence_id,
|
||||
size_t length,
|
||||
vector<ScoreStatsType>& stats) const {
|
||||
int min_diff = INT_MAX;
|
||||
int min_idx = 0;
|
||||
for (size_t i = 0; i < m_ref_lengths[sentence_id].size(); ++i) {
|
||||
const int reflength = m_ref_lengths[sentence_id][i];
|
||||
const int length_diff = abs(reflength - static_cast<int>(length));
|
||||
|
||||
// Look for the closest reference
|
||||
if (length_diff < abs(min_diff)) {
|
||||
min_diff = reflength - length;
|
||||
min_idx = i;
|
||||
// if two references has the same closest length, take the shortest
|
||||
} else if (length_diff == abs(min_diff)) {
|
||||
if (reflength < static_cast<int>(m_ref_lengths[sentence_id][min_idx])) {
|
||||
min_idx = i;
|
||||
}
|
||||
}
|
||||
}
|
||||
stats.push_back(m_ref_lengths[sentence_id][min_idx]);
|
||||
}
|
||||
|
||||
void BleuScorer::CalcShortest(size_t sentence_id,
|
||||
vector<ScoreStatsType>& stats) const {
|
||||
const int shortest = *min_element(m_ref_lengths[sentence_id].begin(),
|
||||
m_ref_lengths[sentence_id].end());
|
||||
stats.push_back(shortest);
|
||||
}
|
||||
|
@ -1,7 +1,7 @@
|
||||
#ifndef MERT_BLEU_SCORER_H_
|
||||
#define MERT_BLEU_SCORER_H_
|
||||
|
||||
#include <iostream>
|
||||
#include <ostream>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
@ -24,55 +24,42 @@ public:
|
||||
virtual void setReferenceFiles(const vector<string>& referenceFiles);
|
||||
virtual void prepareStats(size_t sid, const string& text, ScoreStats& entry);
|
||||
virtual float calculateScore(const vector<int>& comps) const;
|
||||
|
||||
virtual size_t NumberOfScores() const {
|
||||
return 2 * kLENGTH + 1;
|
||||
}
|
||||
virtual size_t NumberOfScores() const { return 2 * kLENGTH + 1; }
|
||||
|
||||
private:
|
||||
enum ReferenceLengthType {
|
||||
AVERAGE,
|
||||
SHORTEST,
|
||||
CLOSEST,
|
||||
CLOSEST
|
||||
};
|
||||
|
||||
//Used to construct the ngram map
|
||||
struct CompareNgrams {
|
||||
bool operator()(const vector<int>& a, const vector<int>& b) const {
|
||||
size_t i;
|
||||
const size_t as = a.size();
|
||||
const size_t bs = b.size();
|
||||
for (i = 0; i < as && i < bs; ++i) {
|
||||
if (a[i] < b[i]) {
|
||||
//cerr << "true" << endl;
|
||||
return true;
|
||||
}
|
||||
if (a[i] > b[i]) {
|
||||
//cerr << "false" << endl;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
//entries are equal, shortest wins
|
||||
return as < bs;;
|
||||
}
|
||||
};
|
||||
|
||||
typedef map<vector<int>,int,CompareNgrams> counts_t;
|
||||
typedef map<vector<int>,int,CompareNgrams>::iterator counts_iterator;
|
||||
typedef map<vector<int>,int,CompareNgrams>::const_iterator counts_const_iterator;
|
||||
/**
|
||||
* A NgramCounts is a key-value store.
|
||||
* Clients don't have to worry about the actual implementation
|
||||
* since this type is used in internal only.
|
||||
*/
|
||||
class NgramCounts;
|
||||
|
||||
/**
|
||||
* Count the ngrams of each type, up to the given length in the input line.
|
||||
*/
|
||||
size_t countNgrams(const string& line, counts_t& counts, unsigned int n);
|
||||
size_t countNgrams(const string& line, NgramCounts& counts, unsigned int n);
|
||||
|
||||
void dump_counts(counts_t& counts) const;
|
||||
void dump_counts(std::ostream* os, const NgramCounts& counts) const;
|
||||
|
||||
// For calculating effective reference length.
|
||||
void CalcAverage(size_t sentence_id,
|
||||
vector<ScoreStatsType>& stats) const;
|
||||
void CalcClosest(size_t sentence_id, size_t length,
|
||||
vector<ScoreStatsType>& stats) const;
|
||||
void CalcShortest(size_t sentence_id,
|
||||
vector<ScoreStatsType>& stats) const;
|
||||
|
||||
const int kLENGTH;
|
||||
ReferenceLengthType m_ref_length_type;
|
||||
|
||||
// data extracted from reference files
|
||||
ScopedVector<counts_t> m_ref_counts;
|
||||
ScopedVector<NgramCounts> m_ref_counts;
|
||||
vector<vector<size_t> > m_ref_lengths;
|
||||
|
||||
// no copying allowed
|
||||
|
@ -1,6 +1,6 @@
|
||||
#include "CderScorer.h"
|
||||
|
||||
#include <iterator>
|
||||
#include <algorithm>
|
||||
#include <fstream>
|
||||
#include <stdexcept>
|
||||
|
||||
@ -42,11 +42,7 @@ void CderScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry)
|
||||
{
|
||||
vector<int> stats;
|
||||
prepareStatsVector(sid, text, stats);
|
||||
|
||||
stringstream sout;
|
||||
copy(stats.begin(), stats.end(), ostream_iterator<float>(sout," "));
|
||||
string stats_str = sout.str();
|
||||
entry.set(stats_str);
|
||||
entry.set(stats);
|
||||
}
|
||||
|
||||
void CderScorer::prepareStatsVector(size_t sid, const string& text, vector<int>& stats)
|
||||
@ -55,9 +51,11 @@ void CderScorer::prepareStatsVector(size_t sid, const string& text, vector<int>&
|
||||
TokenizeAndEncode(text, cand);
|
||||
|
||||
float max = -2;
|
||||
vector<int> tmp;
|
||||
for (size_t rid = 0; rid < m_ref_sentences.size(); ++rid) {
|
||||
sent_t& ref = m_ref_sentences[rid][sid];
|
||||
vector<int> tmp = computeCD(cand, ref);
|
||||
const sent_t& ref = m_ref_sentences[rid][sid];
|
||||
tmp.clear();
|
||||
computeCD(cand, ref, tmp);
|
||||
if (calculateScore(tmp) > max) {
|
||||
stats = tmp;
|
||||
}
|
||||
@ -66,16 +64,14 @@ void CderScorer::prepareStatsVector(size_t sid, const string& text, vector<int>&
|
||||
|
||||
float CderScorer::calculateScore(const vector<int>& comps) const
|
||||
{
|
||||
if (comps.size() != 2)
|
||||
{
|
||||
if (comps.size() != 2) {
|
||||
throw runtime_error("Size of stat vector for CDER is not 2");
|
||||
}
|
||||
|
||||
return 1 - (comps[0] / static_cast<float>(comps[1]));
|
||||
return 1.0f - (comps[0] / static_cast<float>(comps[1]));
|
||||
}
|
||||
|
||||
vector<int> CderScorer::computeCD(const sent_t& cand, const sent_t& ref) const
|
||||
{
|
||||
void CderScorer::computeCD(const sent_t& cand, const sent_t& ref,
|
||||
vector<int>& stats) const {
|
||||
int I = cand.size() + 1; // Number of inter-words positions in candidate sentence
|
||||
int L = ref.size() + 1; // Number of inter-words positions in reference sentence
|
||||
|
||||
@ -113,10 +109,9 @@ vector<int> CderScorer::computeCD(const sent_t& cand, const sent_t& ref) const
|
||||
row = nextRow;
|
||||
}
|
||||
|
||||
vector<int> stats(2);
|
||||
stats.resize(2);
|
||||
stats[0] = *(row->rbegin()); // CD distance is the cost of path from (0,0) to (I,L)
|
||||
stats[1] = ref.size();
|
||||
|
||||
delete row;
|
||||
return stats;
|
||||
}
|
||||
|
@ -1,8 +1,6 @@
|
||||
#ifndef MERT_CDER_SCORER_H_
|
||||
#define MERT_CDER_SCORER_H_
|
||||
|
||||
#include <algorithm>
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include "Types.h"
|
||||
@ -10,9 +8,8 @@
|
||||
|
||||
using namespace std;
|
||||
|
||||
class CderScorer: public StatisticsBasedScorer
|
||||
{
|
||||
public:
|
||||
class CderScorer: public StatisticsBasedScorer {
|
||||
public:
|
||||
explicit CderScorer(const string& config);
|
||||
~CderScorer();
|
||||
|
||||
@ -22,17 +19,16 @@ public:
|
||||
|
||||
virtual void prepareStatsVector(size_t sid, const string& text, vector<int>& stats);
|
||||
|
||||
virtual size_t NumberOfScores() const {
|
||||
return 2;
|
||||
}
|
||||
virtual size_t NumberOfScores() const { return 2; }
|
||||
|
||||
virtual float calculateScore(const vector<int>& comps) const;
|
||||
|
||||
private:
|
||||
private:
|
||||
typedef vector<int> sent_t;
|
||||
vector<vector<sent_t> > m_ref_sentences;
|
||||
|
||||
vector<int> computeCD(const sent_t& cand, const sent_t& ref) const;
|
||||
void computeCD(const sent_t& cand, const sent_t& ref,
|
||||
vector<int>& stats) const;
|
||||
|
||||
// no copying allowed
|
||||
CderScorer(const CderScorer&);
|
||||
|
161
mert/InterpolatedScorer.cpp
Normal file
161
mert/InterpolatedScorer.cpp
Normal file
@ -0,0 +1,161 @@
|
||||
#include "ScorerFactory.h"
|
||||
#include "InterpolatedScorer.h"
|
||||
#include "Util.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
|
||||
// Builds one sub-scorer per comma-separated entry of "name" and reads the
// interpolation weights from the "weights" config entry ('+'-separated).
// Without a "weights" entry every scorer gets the same weight; otherwise the
// given weights are rescaled so that they sum to one.
// Throws runtime_error when there is no scorer, when the number of weights
// differs from the number of scorers, or when the weights sum to zero.
InterpolatedScorer::InterpolatedScorer (const string& name, const string& config): Scorer(name,config)
{
  // name would be: HAMMING,BLEU or similar
  string scorers = name;
  while (scorers.length() > 0) {
    string scorertype = "";
    getNextPound(scorers,scorertype,",");
    Scorer *theScorer=ScorerFactory::getScorer(scorertype,config);
    _scorers.push_back(theScorer);
  }
  if (_scorers.size() == 0) {
    throw runtime_error("There are no scorers");
  }
  cerr << "Number of scorers: " << _scorers.size() << endl;

  //TODO debug this
  string wtype = getConfig("weights","");
  if (wtype.length() == 0) {
    // Default weights are uniform, i.e. with two scorers 0.5 each.
    const float weight = 1.0/_scorers.size();
    _scorerWeights.assign(_scorers.size(), weight);
  } else {
    float tot = 0;
    while (wtype.length() > 0) {
      string scoreweight = "";
      getNextPound(wtype,scoreweight,"+");
      const float weight = atof(scoreweight.c_str());
      _scorerWeights.push_back(weight);
      tot += weight;
    }

    // Validate before normalising so that a bad configuration is reported
    // as such instead of silently producing unusable weights.
    if (_scorers.size() != _scorerWeights.size()) {
      throw runtime_error("The number of weights does not equal the number of scorers!");
    }
    if (tot == 0) {
      // Dividing by a zero total would turn every weight into NaN/inf.
      throw runtime_error("The weights of the interpolated scorers sum to zero");
    }

    // Weights should add up to 1 (dividing by a total of exactly 1 is a no-op).
    for (vector<float>::iterator it = _scorerWeights.begin(); it != _scorerWeights.end(); ++it) {
      *it /= tot;
    }
  }

  cerr << "The weights for the interpolated scorers are: " << endl;
  for (vector<float>::const_iterator it = _scorerWeights.begin(); it != _scorerWeights.end(); ++it) {
    cerr << *it << " " ;
  }
  cerr <<endl;
}
|
||||
|
||||
// Stores "data" as this scorer's score data and hands every sub-scorer its
// own ScoreData holding only that scorer's slice of each ScoreStats entry.
// The slices are consecutive: the first scorer gets the first
// NumberOfScores() values, the next scorer the following ones, and so on.
void InterpolatedScorer::setScoreData(ScoreData* data)
{
  size_t last = 0;  // offset of the current scorer's first statistic
  m_score_data = data;
  for (ScopedVector<Scorer>::iterator itsc = _scorers.begin(); itsc != _scorers.end(); ++itsc) {
    const size_t numScoresScorer = (*itsc)->NumberOfScores();
    ScoreData* newData = new ScoreData(**itsc);

    // NOTE: This class takes the ownership of the heap allocated
    // ScoreData objects to avoid the memory leak issues. Ownership is
    // handed over right away so that the object is not leaked if filling
    // it below throws.
    m_scorers_score_data.push_back(newData);

    for (size_t i = 0; i < data->size(); i++) {
      // Only the number of n-best entries is needed from the source array.
      const size_t numNBest = data->get(i).size();

      // The index of the new array is the decimal representation of i.
      std::stringstream out;
      out << i;
      std::string istr = out.str();

      ScoreArray newScoreArray;
      for (size_t j = 0; j < numNBest; j++) {
        ScoreStats scoreStats = data->get(i, j);
        ScoreStats newScoreStats;
        for (size_t k = last; k < numScoresScorer + last; k++) {
          ScoreStatsType score = scoreStats.get(k);
          newScoreStats.add(score);
        }
        newScoreArray.add(newScoreStats);
      }
      newScoreArray.setIndex(istr);
      newData->add(newScoreArray);
    }

    (*itsc)->setScoreData(newData);
    last += numScoresScorer;
  }
}
|
||||
|
||||
|
||||
/** The interpolated scorer calls a vector of scorers and combines them with
|
||||
weights **/
|
||||
void InterpolatedScorer::score(const candidates_t& candidates, const diffs_t& diffs,
|
||||
statscores_t& scores) const
|
||||
{
|
||||
//cout << "*******InterpolatedScorer::score" << endl;
|
||||
size_t scorerNum = 0;
|
||||
for (ScopedVector<Scorer>::const_iterator itsc = _scorers.begin(); itsc!=_scorers.end(); itsc++) {
|
||||
//int numScores = (*itsc)->NumberOfScores();
|
||||
statscores_t tscores;
|
||||
(*itsc)->score(candidates,diffs,tscores);
|
||||
size_t inc = 0;
|
||||
for (statscores_t::iterator itstatsc = tscores.begin(); itstatsc!=tscores.end(); itstatsc++) {
|
||||
//cout << "Scores " << (*itstatsc) << endl;
|
||||
float weight = _scorerWeights[scorerNum];
|
||||
if (weight == 0) {
|
||||
stringstream msg;
|
||||
msg << "No weights for scorer" << scorerNum ;
|
||||
throw runtime_error(msg.str());
|
||||
}
|
||||
if (scorerNum == 0) {
|
||||
scores.push_back(weight * (*itstatsc));
|
||||
} else {
|
||||
scores[inc] += weight * (*itstatsc);
|
||||
}
|
||||
//cout << "Scorer:" << scorerNum << " scoreNum:" << inc << " score: " << (*itstatsc) << " weight:" << weight << endl;
|
||||
inc++;
|
||||
|
||||
}
|
||||
scorerNum++;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
void InterpolatedScorer::setReferenceFiles(const vector<string>& referenceFiles)
|
||||
{
|
||||
for (ScopedVector<Scorer>::iterator itsc = _scorers.begin(); itsc!=_scorers.end(); itsc++) {
|
||||
(*itsc)->setReferenceFiles(referenceFiles);
|
||||
}
|
||||
}
|
||||
|
||||
void InterpolatedScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry)
|
||||
{
|
||||
stringstream buff;
|
||||
int i=0;
|
||||
for (ScopedVector<Scorer>::iterator itsc = _scorers.begin(); itsc!=_scorers.end(); itsc++) {
|
||||
ScoreStats tempEntry;
|
||||
(*itsc)->prepareStats(sid, text, tempEntry);
|
||||
if (i > 0) buff << " ";
|
||||
buff << tempEntry;
|
||||
i++;
|
||||
}
|
||||
//cout << " Scores for interpolated: " << buff << endl;
|
||||
string str = buff.str();
|
||||
entry.set(str);
|
||||
}
|
55
mert/InterpolatedScorer.h
Normal file
55
mert/InterpolatedScorer.h
Normal file
@ -0,0 +1,55 @@
|
||||
#ifndef __INTERPOLATED_SCORER_H__
|
||||
#define __INTERPOLATED_SCORER_H__
|
||||
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
#include <iostream>
|
||||
#include <iterator>
|
||||
#include <limits>
|
||||
#include <set>
|
||||
#include <sstream>
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include "Types.h"
|
||||
#include "ScoreData.h"
|
||||
#include "Scorer.h"
|
||||
#include "ScopedVector.h"
|
||||
|
||||
/**
|
||||
* Class that includes other scorers eg.
|
||||
* Interpolated HAMMING and BLEU scorer **/
|
||||
class InterpolatedScorer : public Scorer
|
||||
{
|
||||
public:
|
||||
// name would be: "HAMMING,BLEU" or similar
|
||||
InterpolatedScorer(const string& name, const string& config);
|
||||
virtual ~InterpolatedScorer() {}
|
||||
|
||||
virtual void score(const candidates_t& candidates, const diffs_t& diffs,
|
||||
statscores_t& scores) const;
|
||||
|
||||
virtual void setReferenceFiles(const vector<string>& referenceFiles);
|
||||
virtual void prepareStats(size_t sid, const string& text, ScoreStats& entry);
|
||||
|
||||
virtual size_t NumberOfScores() const {
|
||||
size_t sz=0;
|
||||
for (ScopedVector<Scorer>::const_iterator itsc = _scorers.begin(); itsc != _scorers.end(); itsc++) {
|
||||
sz += (*itsc)->NumberOfScores();
|
||||
}
|
||||
return sz;
|
||||
};
|
||||
|
||||
virtual void setScoreData(ScoreData* data);
|
||||
|
||||
protected:
|
||||
ScopedVector<Scorer> _scorers;
|
||||
|
||||
// Take the ownership of the heap-allocated the objects
|
||||
// by Scorer objects.
|
||||
ScopedVector<ScoreData> m_scorers_score_data;
|
||||
|
||||
vector<float> _scorerWeights;
|
||||
};
|
||||
|
||||
#endif // __INTERPOLATED_SCORER_H__
|
@ -12,6 +12,7 @@ FeatureStats.cpp FeatureArray.cpp FeatureData.cpp
|
||||
FeatureDataIterator.cpp
|
||||
Data.cpp
|
||||
BleuScorer.cpp
|
||||
InterpolatedScorer.cpp
|
||||
Point.cpp
|
||||
PerScorer.cpp
|
||||
Scorer.cpp
|
||||
@ -44,6 +45,7 @@ exe pro : pro.cpp mert_lib ..//boost_program_options ;
|
||||
alias programs : mert extractor evaluator pro ;
|
||||
|
||||
unit-test data_test : DataTest.cpp mert_lib ..//boost_unit_test_framework ;
|
||||
unit-test timer_test : TimerTest.cpp mert_lib ..//boost_unit_test_framework ;
|
||||
unit-test util_test : UtilTest.cpp mert_lib ..//boost_unit_test_framework ;
|
||||
|
||||
install legacy : programs : <location>. ;
|
||||
|
@ -24,6 +24,11 @@ public:
|
||||
virtual void setReferenceFiles(const vector<string>& referenceFiles);
|
||||
virtual void prepareStats(size_t sid, const string& text, ScoreStats& entry);
|
||||
|
||||
virtual size_t NumberOfScores() const
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Writes a line identifying this scorer to the standard error stream.
void whoami() const
{
  std::cerr << "I AM MergeScorer" << std::endl;
}
|
||||
|
@ -72,7 +72,6 @@ statscore_t Optimizer::GetStatScore(const Point& param) const
|
||||
{
|
||||
vector<unsigned> bests;
|
||||
Get1bests(param, bests);
|
||||
//copy(bests.begin(),bests.end(),ostream_iterator<unsigned>(cerr," "));
|
||||
statscore_t score = GetStatScore(bests);
|
||||
return score;
|
||||
}
|
||||
|
@ -24,12 +24,6 @@ ScoreStats::ScoreStats(const size_t size)
|
||||
memset(array_, 0, GetArraySizeWithBytes());
|
||||
}
|
||||
|
||||
ScoreStats::ScoreStats(std::string &theString)
|
||||
: available_(0), entries_(0), array_(NULL)
|
||||
{
|
||||
set(theString);
|
||||
}
|
||||
|
||||
ScoreStats::~ScoreStats()
|
||||
{
|
||||
if (array_) {
|
||||
@ -73,14 +67,14 @@ void ScoreStats::add(ScoreStatsType v)
|
||||
array_[entries_++]=v;
|
||||
}
|
||||
|
||||
void ScoreStats::set(std::string &theString)
|
||||
void ScoreStats::set(const std::string& str)
|
||||
{
|
||||
std::string substring, stringBuf;
|
||||
reset();
|
||||
|
||||
while (!theString.empty()) {
|
||||
getNextPound(theString, substring);
|
||||
add(ConvertStringToScoreStatsType(substring));
|
||||
vector<string> out;
|
||||
Tokenize(str.c_str(), ' ', &out);
|
||||
for (vector<string>::const_iterator it = out.begin();
|
||||
it != out.end(); ++it) {
|
||||
add(ConvertStringToScoreStatsType(*it));
|
||||
}
|
||||
}
|
||||
|
||||
@ -144,7 +138,7 @@ bool operator==(const ScoreStats& s1, const ScoreStats& s2) {
|
||||
if (s1.get(k) != s2.get(k))
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
return true;
|
||||
}
|
||||
//END_ADDED
|
||||
|
@ -31,7 +31,7 @@ private:
|
||||
public:
|
||||
ScoreStats();
|
||||
explicit ScoreStats(const size_t size);
|
||||
explicit ScoreStats(std::string &theString);
|
||||
|
||||
~ScoreStats();
|
||||
|
||||
// We intentionally allow copying.
|
||||
@ -66,7 +66,15 @@ public:
|
||||
return array_;
|
||||
}
|
||||
|
||||
void set(std::string &theString);
|
||||
void set(const std::string& str);
|
||||
|
||||
// Much more efficient than the above.
|
||||
void set(const std::vector<ScoreStatsType>& stats) {
|
||||
reset();
|
||||
for (size_t i = 0; i < stats.size(); ++i) {
|
||||
add(stats[i]);
|
||||
}
|
||||
}
|
||||
|
||||
inline size_t bytes() const {
|
||||
return GetArraySizeWithBytes();
|
||||
|
@ -28,10 +28,7 @@ class Scorer
|
||||
/**
|
||||
* Return the number of statistics needed for the computation of the score.
|
||||
*/
|
||||
virtual size_t NumberOfScores() const {
|
||||
cerr << "Scorer: 0" << endl;
|
||||
return 0;
|
||||
}
|
||||
virtual size_t NumberOfScores() const = 0;
|
||||
|
||||
/**
|
||||
* Set the reference files. This must be called before prepareStats().
|
||||
@ -57,7 +54,9 @@ class Scorer
|
||||
* applying each in turn, and calculating a new score each time.
|
||||
*/
|
||||
virtual void score(const candidates_t& candidates, const diffs_t& diffs,
|
||||
statscores_t& scores) const {
|
||||
statscores_t& scores) const = 0;
|
||||
/*
|
||||
{
|
||||
//dummy impl
|
||||
if (!m_score_data) {
|
||||
throw runtime_error("score data not loaded");
|
||||
@ -67,6 +66,7 @@ class Scorer
|
||||
scores.push_back(0);
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
/**
|
||||
* Calculate the score of the sentences corresponding to the list of candidate
|
||||
@ -93,7 +93,7 @@ class Scorer
|
||||
/**
|
||||
* Set the score data, prior to scoring.
|
||||
*/
|
||||
void setScoreData(ScoreData* data) {
|
||||
virtual void setScoreData(ScoreData* data) {
|
||||
m_score_data = data;
|
||||
}
|
||||
|
||||
|
@ -7,6 +7,7 @@
|
||||
#include "TerScorer.h"
|
||||
#include "CderScorer.h"
|
||||
#include "MergeScorer.h"
|
||||
#include "InterpolatedScorer.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
@ -32,6 +33,11 @@ Scorer* ScorerFactory::getScorer(const string& type, const string& config) {
|
||||
} else if (type == "MERGE") {
|
||||
return (MergeScorer*) new MergeScorer(config);
|
||||
} else {
|
||||
throw runtime_error("Unknown scorer type: " + type);
|
||||
if (type.find(',') != string::npos) {
|
||||
return new InterpolatedScorer(type, config);
|
||||
}
|
||||
else {
|
||||
throw runtime_error("Unknown scorer type: " + type);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
135
mert/Timer.cpp
135
mert/Timer.cpp
@ -1,73 +1,106 @@
|
||||
#include "Timer.h"
|
||||
#include "Util.h"
|
||||
#include <cstdio>
|
||||
|
||||
double Timer::elapsed_time()
|
||||
{
|
||||
time_t now;
|
||||
time(&now);
|
||||
return difftime(now, start_time);
|
||||
#if !defined(_WIN32) && !defined(_WIN64)
|
||||
#include <sys/resource.h>
|
||||
#include <sys/time.h>
|
||||
#endif
|
||||
|
||||
namespace {
|
||||
|
||||
#if !defined(_WIN32) && !defined(_WIN64)
|
||||
// Collapse a timeval (whole seconds plus a microsecond remainder) into a
// single microsecond count.
uint64_t GetMicroSeconds(const struct timeval& tv) {
  const uint64_t kMicroSecondsPerSecond = 1000000;
  const uint64_t whole_seconds = static_cast<uint64_t>(tv.tv_sec);
  return whole_seconds * kMicroSecondsPerSecond + tv.tv_usec;
}
|
||||
|
||||
double Timer::get_elapsed_time()
|
||||
{
|
||||
return elapsed_time();
|
||||
// Read the current time of day via gettimeofday() and express it as a single
// microsecond count.
uint64_t GetTimeOfDayMicroSeconds() {
  struct timeval now;
  gettimeofday(&now, NULL);
  const uint64_t seconds_as_micros =
      static_cast<uint64_t>(now.tv_sec) * 1000000;
  return seconds_as_micros + now.tv_usec;
}
|
||||
#endif
|
||||
|
||||
} // namespace
|
||||
|
||||
// Take a snapshot of the CPU time consumed by this process so far, split into
// user and system components (both in microseconds).
Timer::CPUTime Timer::GetCPUTimeMicroSeconds() const {
  // CPUTime's constructor zeroes both fields; on Windows that zero snapshot is
  // returned unchanged.
  CPUTime snapshot;
#if !defined(_WIN32) && !defined(_WIN64)
  struct rusage process_usage;
  const int status = getrusage(RUSAGE_SELF, &process_usage);
  if (status != 0) {
    TRACE_ERR("Error occurred: getrusage().\n");
    exit(1);
  }
  snapshot.user_time = GetMicroSeconds(process_usage.ru_utime);
  snapshot.sys_time = GetMicroSeconds(process_usage.ru_stime);
#else  // Windows
  // Not implemented yet.
  // TODO: implement the Windows version using native APIs.
#endif
  return snapshot;
}
|
||||
|
||||
double Timer::get_elapsed_cpu_time() const {
|
||||
return static_cast<double>(get_elapsed_cpu_time_microseconds()) * 1e-6;
|
||||
}
|
||||
|
||||
uint64_t Timer::get_elapsed_cpu_time_microseconds() const {
|
||||
const CPUTime e = GetCPUTimeMicroSeconds();
|
||||
return (e.user_time - m_start_time.user_time) +
|
||||
(e.sys_time - m_start_time.sys_time);
|
||||
}
|
||||
|
||||
double Timer::get_elapsed_wall_time() const {
|
||||
return static_cast<double>(get_elapsed_wall_time_microseconds()) * 1e-6;
|
||||
}
|
||||
|
||||
uint64_t Timer::get_elapsed_wall_time_microseconds() const {
|
||||
return GetTimeOfDayMicroSeconds() - m_wall;
|
||||
}
|
||||
|
||||
void Timer::start(const char* msg)
|
||||
{
|
||||
// Print an optional message, something like "Starting timer t";
|
||||
if (msg) TRACE_ERR( msg << std::endl);
|
||||
|
||||
// Return immediately if the timer is already running
|
||||
if (running) return;
|
||||
|
||||
// Change timer status to running
|
||||
running = true;
|
||||
|
||||
// Set the start time;
|
||||
time(&start_time);
|
||||
if (m_is_running) return;
|
||||
m_is_running = true;
|
||||
m_wall = GetTimeOfDayMicroSeconds();
|
||||
m_start_time = GetCPUTimeMicroSeconds();
|
||||
}
|
||||
|
||||
/***
|
||||
* Turn the timer off and start it again from 0. Print an optional message.
|
||||
*/
|
||||
/*
|
||||
inline void Timer::restart(const char* msg)
|
||||
void Timer::restart(const char* msg)
|
||||
{
|
||||
// Print an optional message, something like "Restarting timer t";
|
||||
if (msg) TRACE_ERR( msg << std::endl;
|
||||
|
||||
// Set the timer status to running
|
||||
running = true;
|
||||
|
||||
// Set the accumulated time to 0 and the start time to now
|
||||
acc_time = 0;
|
||||
start_clock = clock();
|
||||
start_time = time(0);
|
||||
if (msg) {
|
||||
TRACE_ERR(msg << std::endl);
|
||||
}
|
||||
m_wall = GetTimeOfDayMicroSeconds();
|
||||
m_start_time = GetCPUTimeMicroSeconds();
|
||||
}
|
||||
*/
|
||||
|
||||
/***
|
||||
* Stop the timer and print an optional message.
|
||||
*/
|
||||
/*
|
||||
inline void Timer::stop(const char* msg)
|
||||
{
|
||||
// Print an optional message, something like "Stopping timer t";
|
||||
check(msg);
|
||||
|
||||
// Recalculate and store the total accumulated time up until now
|
||||
if (running) acc_time += elapsed_time();
|
||||
|
||||
running = false;
|
||||
}
|
||||
*/
|
||||
|
||||
void Timer::check(const char* msg)
|
||||
{
|
||||
// Print an optional message, something like "Checking timer t";
|
||||
if (msg) TRACE_ERR( msg << " : ");
|
||||
|
||||
// TRACE_ERR( "[" << std::setiosflags(std::ios::fixed) << std::setprecision(2) << (running ? elapsed_time() : 0) << "] seconds\n");
|
||||
TRACE_ERR( "[" << (running ? elapsed_time() : 0) << "] seconds\n");
|
||||
if (m_is_running) {
|
||||
TRACE_ERR("[Wall " << get_elapsed_wall_time()
|
||||
<< " CPU " << get_elapsed_cpu_time() << "] seconds.\n");
|
||||
} else {
|
||||
TRACE_ERR("WARNING: the timer is not running.\n");
|
||||
}
|
||||
}
|
||||
|
||||
std::string Timer::ToString() const {
|
||||
std::string res;
|
||||
char tmp[64];
|
||||
const double wall = get_elapsed_wall_time();
|
||||
const CPUTime e = GetCPUTimeMicroSeconds();
|
||||
const double utime = (e.user_time - m_start_time.user_time) * 1e-6;
|
||||
const double stime = (e.sys_time - m_start_time.sys_time) * 1e-6;
|
||||
std::snprintf(tmp, sizeof(tmp), "wall %f user %f sec. sys %f sec. total %f sec.",
|
||||
wall, utime, stime, utime + stime);
|
||||
res.append(tmp);
|
||||
return res;
|
||||
}
|
||||
|
106
mert/Timer.h
106
mert/Timer.h
@ -1,46 +1,50 @@
|
||||
#ifndef MERT_TIMER_H_
|
||||
#define MERT_TIMER_H_
|
||||
|
||||
#include <ctime>
|
||||
#include <iostream>
|
||||
#include <iomanip>
|
||||
#include <ostream>
|
||||
#include <string>
|
||||
#include <stdint.h>
|
||||
|
||||
class Timer
|
||||
{
|
||||
/**
|
||||
* Allow timers to be printed to ostreams using the syntax 'os << t'
|
||||
* for an ostream 'os' and a timer 't'. For example, "cout << t" will
|
||||
* print out the total amount of time 't' has been "running".
|
||||
*/
|
||||
friend std::ostream& operator<<(std::ostream& os, Timer& t);
|
||||
private:
|
||||
// Time values are stored in microseconds.
|
||||
struct CPUTime {
|
||||
uint64_t user_time; // user CPU time
|
||||
uint64_t sys_time; // system CPU time
|
||||
|
||||
private:
|
||||
bool running;
|
||||
time_t start_time;
|
||||
CPUTime() : user_time(0), sys_time(0) { }
|
||||
};
|
||||
|
||||
/**
|
||||
* Return the total time that the timer has been in the "running"
|
||||
* state since it was first "started" or last "restarted". For
|
||||
* "short" time periods (less than an hour), the actual cpu time
|
||||
* used is reported instead of the elapsed time.
|
||||
* TODO in seconds?
|
||||
*/
|
||||
double elapsed_time();
|
||||
CPUTime GetCPUTimeMicroSeconds() const;
|
||||
|
||||
public:
|
||||
bool m_is_running;
|
||||
uint64_t m_wall; // wall-clock time in microseconds
|
||||
CPUTime m_start_time;
|
||||
|
||||
public:
|
||||
/**
|
||||
* 'running' is initially false. A timer needs to be explicitly started
|
||||
* using 'start' or 'restart'.
|
||||
* 'm_is_running' is initially false. A timer needs to be explicitly started
|
||||
* using 'start'.
|
||||
*/
|
||||
Timer() : running(false), start_time(0) { }
|
||||
Timer()
|
||||
: m_is_running(false),
|
||||
m_wall(0),
|
||||
m_start_time() {}
|
||||
|
||||
~Timer() {}
|
||||
|
||||
/**
|
||||
* Start a timer. If it is already running, let it continue running.
|
||||
* Print an optional message.
|
||||
*/
|
||||
void start(const char* msg = 0);
|
||||
// void restart(const char* msg = 0);
|
||||
// void stop(const char* msg = 0);
|
||||
|
||||
/**
|
||||
* Restart the timer iff the timer is already running.
|
||||
* If the timer is not running, just start the timer.
|
||||
*/
|
||||
void restart(const char* msg = 0);
|
||||
|
||||
/**
|
||||
* Print out an optional message followed by the current timer timing.
|
||||
@ -48,19 +52,49 @@ public:
|
||||
void check(const char* msg = 0);
|
||||
|
||||
/**
|
||||
* Return the total time that the timer has been in the "running"
|
||||
* state since it was first "started" or last "restarted". For
|
||||
* "short" time periods (less than an hour), the actual cpu time
|
||||
* used is reported instead of the elapsed time.
|
||||
* This function is the public version of elapsed_time()
|
||||
*/
|
||||
double get_elapsed_time();
|
||||
bool is_running() const { return m_is_running; }
|
||||
|
||||
/**
|
||||
* Return the total time in seconds that the timer has been in the
|
||||
* "running" state since it was first "started" or last "restarted".
|
||||
* For "short" time periods (less than an hour), the actual cpu time
|
||||
* used is reported instead of the elapsed time.
|
||||
*/
|
||||
double get_elapsed_cpu_time() const;
|
||||
|
||||
/**
|
||||
* Return the total time in microseconds.
|
||||
*/
|
||||
uint64_t get_elapsed_cpu_time_microseconds() const;
|
||||
|
||||
/**
|
||||
* Get elapsed wall-clock time in seconds.
|
||||
*/
|
||||
double get_elapsed_wall_time() const;
|
||||
|
||||
/**
|
||||
* Get elapsed wall-clock time in microseconds.
|
||||
*/
|
||||
uint64_t get_elapsed_wall_time_microseconds() const;
|
||||
|
||||
/**
|
||||
* Return a string that has the user CPU time, system time, and total time.
|
||||
*/
|
||||
std::string ToString() const;
|
||||
};
|
||||
|
||||
inline std::ostream& operator<<(std::ostream& os, Timer& t)
|
||||
{
|
||||
//os << std::setprecision(2) << std::setiosflags(std::ios::fixed) << (t.running ? t.elapsed_time() : 0);
|
||||
os << (t.running ? t.elapsed_time() : 0);
|
||||
/**
|
||||
* Allow timers to be printed to ostreams using the syntax 'os << t'
|
||||
* for an ostream 'os' and a timer 't'. For example, "cout << t" will
|
||||
* print out the total amount of time 't' has been "running".
|
||||
*/
|
||||
inline std::ostream& operator<<(std::ostream& os, const Timer& t) {
|
||||
if (t.is_running()) {
|
||||
os << t.ToString();
|
||||
} else {
|
||||
os << "timer is not running.";
|
||||
}
|
||||
return os;
|
||||
}
|
||||
|
||||
|
27
mert/TimerTest.cpp
Normal file
27
mert/TimerTest.cpp
Normal file
@ -0,0 +1,27 @@
|
||||
#include "Timer.h"
|
||||
|
||||
#define BOOST_TEST_MODULE TimerTest
|
||||
#include <boost/test/unit_test.hpp>
|
||||
|
||||
#include <string>
|
||||
#include <iostream>
|
||||
|
||||
BOOST_AUTO_TEST_CASE(timer_basic_test) {
|
||||
Timer timer;
|
||||
timer.start();
|
||||
BOOST_REQUIRE(timer.is_running());
|
||||
BOOST_REQUIRE(timer.get_elapsed_cpu_time() > 0.0);
|
||||
BOOST_REQUIRE(timer.get_elapsed_cpu_time_microseconds() > 0);
|
||||
BOOST_REQUIRE(timer.get_elapsed_wall_time() > 0.0);
|
||||
BOOST_REQUIRE(timer.get_elapsed_wall_time_microseconds() > 0);
|
||||
|
||||
timer.restart();
|
||||
BOOST_REQUIRE(timer.is_running());
|
||||
BOOST_REQUIRE(timer.get_elapsed_cpu_time() > 0.0);
|
||||
BOOST_REQUIRE(timer.get_elapsed_cpu_time_microseconds() > 0);
|
||||
BOOST_REQUIRE(timer.get_elapsed_wall_time() > 0.0);
|
||||
BOOST_REQUIRE(timer.get_elapsed_wall_time_microseconds() > 0);
|
||||
|
||||
const std::string s = timer.ToString();
|
||||
BOOST_REQUIRE(!s.empty());
|
||||
}
|
@ -84,5 +84,5 @@ void PrintUserTime(const std::string &message)
|
||||
|
||||
double GetUserTime()
|
||||
{
|
||||
return g_timer.get_elapsed_time();
|
||||
return g_timer.get_elapsed_cpu_time();
|
||||
}
|
||||
|
@ -138,10 +138,19 @@ void usage()
|
||||
cerr << "[--help|-h] print this message and exit" << endl;
|
||||
cerr << endl;
|
||||
cerr << "Evaluator is able to compute more metrics at once. To do this," << endl;
|
||||
cerr << "separate scorers with semicolon (note that comma is used to separate" << endl;
|
||||
cerr << "scorers in the interpolated scorer)." << endl;
|
||||
cerr << "specify more --sctype arguments. You can also specify more --scconfig strings." << endl;
|
||||
cerr << endl;
|
||||
cerr << "If you specify only one metric and one candidate file, only the final score" << endl;
|
||||
cerr << "The example below prints BLEU score, PER score and interpolated" << endl;
|
||||
cerr << "score of CDER and PER with the given weights." << endl;
|
||||
cerr << endl;
|
||||
cerr << "./evaluator \\" << endl;
|
||||
cerr << "\t--sctype BLEU --scconfig reflen:closest \\" << endl;
|
||||
cerr << "\t--sctype PER \\" << endl;
|
||||
cerr << "\t--sctype CDER,PER --scconfig weights:0.25+0.75 \\" << endl;
|
||||
cerr << "\t--candidate CANDIDATE \\" << endl;
|
||||
cerr << "\t--reference REFERENCE" << endl;
|
||||
cerr << endl;
|
||||
cerr << "If you specify only one scorer and one candidate file, only the final score" << endl;
|
||||
cerr << "will be printed to stdout. Otherwise each line will contain metric name" << endl;
|
||||
cerr << "and/or filename and the final score. Since most of the metrics prints some" << endl;
|
||||
cerr << "debuging info, consider redirecting stderr to /dev/null." << endl;
|
||||
@ -161,8 +170,8 @@ static struct option long_options[] = {
|
||||
|
||||
// Options used in evaluator.
|
||||
struct ProgramOption {
|
||||
string scorer_type;
|
||||
string scorer_config;
|
||||
vector<string> scorer_types;
|
||||
vector<string> scorer_configs;
|
||||
string reference;
|
||||
string candidate;
|
||||
int bootstrap;
|
||||
@ -170,9 +179,7 @@ struct ProgramOption {
|
||||
bool has_seed;
|
||||
|
||||
ProgramOption()
|
||||
: scorer_type("BLEU"),
|
||||
scorer_config(""),
|
||||
reference(""),
|
||||
: reference(""),
|
||||
candidate(""),
|
||||
bootstrap(0),
|
||||
seed(0),
|
||||
@ -182,13 +189,16 @@ struct ProgramOption {
|
||||
void ParseCommandOptions(int argc, char** argv, ProgramOption* opt) {
|
||||
int c;
|
||||
int option_index;
|
||||
int last_scorer_index = -1;
|
||||
while ((c = getopt_long(argc, argv, "s:c:R:C:b:r:h", long_options, &option_index)) != -1) {
|
||||
switch(c) {
|
||||
case 's':
|
||||
opt->scorer_type = string(optarg);
|
||||
opt->scorer_types.push_back(string(optarg));
|
||||
opt->scorer_configs.push_back(string(""));
|
||||
last_scorer_index++;
|
||||
break;
|
||||
case 'c':
|
||||
opt->scorer_config = string(optarg);
|
||||
opt->scorer_configs[last_scorer_index] = string(optarg);
|
||||
break;
|
||||
case 'R':
|
||||
opt->reference = string(optarg);
|
||||
@ -207,6 +217,13 @@ void ParseCommandOptions(int argc, char** argv, ProgramOption* opt) {
|
||||
usage();
|
||||
}
|
||||
}
|
||||
|
||||
// Add default scorer if no scorer provided
|
||||
if (opt->scorer_types.size() == 0)
|
||||
{
|
||||
opt->scorer_types.push_back(string("BLEU"));
|
||||
opt->scorer_configs.push_back(string(""));
|
||||
}
|
||||
}
|
||||
|
||||
void InitSeed(const ProgramOption *opt) {
|
||||
@ -236,7 +253,6 @@ int main(int argc, char** argv)
|
||||
try {
|
||||
vector<string> refFiles;
|
||||
vector<string> candFiles;
|
||||
vector<string> scorerTypes;
|
||||
|
||||
if (option.reference.length() == 0) throw runtime_error("You have to specify at least one reference file.");
|
||||
split(option.reference, ',', refFiles);
|
||||
@ -244,17 +260,14 @@ int main(int argc, char** argv)
|
||||
if (option.candidate.length() == 0) throw runtime_error("You have to specify at least one candidate file.");
|
||||
split(option.candidate, ',', candFiles);
|
||||
|
||||
if (option.scorer_type.length() == 0) throw runtime_error("You have to specify at least one scorer.");
|
||||
split(option.scorer_type, ';', scorerTypes);
|
||||
|
||||
if (candFiles.size() > 1) g_has_more_files = true;
|
||||
if (scorerTypes.size() > 1) g_has_more_scorers = true;
|
||||
if (option.scorer_types.size() > 1) g_has_more_scorers = true;
|
||||
|
||||
for (vector<string>::const_iterator fileIt = candFiles.begin(); fileIt != candFiles.end(); ++fileIt)
|
||||
{
|
||||
for (vector<string>::const_iterator scorerIt = scorerTypes.begin(); scorerIt != scorerTypes.end(); ++scorerIt)
|
||||
for (size_t i = 0; i < option.scorer_types.size(); i++)
|
||||
{
|
||||
g_scorer = ScorerFactory::getScorer(*scorerIt, option.scorer_config);
|
||||
g_scorer = ScorerFactory::getScorer(option.scorer_types[i], option.scorer_configs[i]);
|
||||
g_scorer->setReferenceFiles(refFiles);
|
||||
EvaluatorUtil::evaluate(*fileIt, option.bootstrap);
|
||||
delete g_scorer;
|
||||
|
@ -206,16 +206,9 @@ int main(int argc, char** argv)
|
||||
data.remove_duplicates();
|
||||
//END_ADDED
|
||||
|
||||
if (option.binmode)
|
||||
cerr << "Binary write mode is selected" << endl;
|
||||
else
|
||||
cerr << "Binary write mode is NOT selected" << endl;
|
||||
|
||||
data.save(option.featureDataFile, option.scoreDataFile, option.binmode);
|
||||
PrintUserTime("Stopping...");
|
||||
|
||||
// timer.stop("Stopping...");
|
||||
|
||||
delete scorer;
|
||||
|
||||
return EXIT_SUCCESS;
|
||||
|
@ -181,7 +181,7 @@ if ($opt_hierarchical)
|
||||
my %PHRASE_USED;
|
||||
if (!$opt_hierarchical) {
|
||||
# get the phrase pairs appearing in the input text, up to the $MAX_LENGTH
|
||||
open(INPUT,$input) or die "Can't read $input";
|
||||
open(INPUT,mk_open_string($input)) or die "Can't read $input";
|
||||
while(my $line = <INPUT>) {
|
||||
chomp($line);
|
||||
my @WORD = split(/ +/,$line);
|
||||
@ -207,6 +207,22 @@ if (!$opt_hierarchical) {
|
||||
close(INPUT);
|
||||
}
|
||||
|
||||
# Build the string handed to open() for reading $path.
#   - a sibling "$path.gz" exists and $path itself is not .gz: read it via $ZCAT
#   - $path ends in .gz: read it via $ZCAT
#   - hierarchical mode: pipe the file through cat
#   - otherwise: plain read of the file
sub mk_open_string {
    my ($path) = @_;
    my $is_gzipped = ($path =~ /\.gz$/);

    return "$ZCAT $path.gz |" if !$is_gzipped && -e "$path.gz";
    return "$ZCAT $path |"    if $is_gzipped;
    return "cat $path |"      if $opt_hierarchical;
    return "< $path";
}
|
||||
|
||||
|
||||
# filter files
|
||||
for(my $i=0;$i<=$#TABLE;$i++) {
|
||||
my ($used,$total) = (0,0);
|
||||
@ -215,16 +231,7 @@ for(my $i=0;$i<=$#TABLE;$i++) {
|
||||
my $new_file = $TABLE_NEW_NAME[$i];
|
||||
print STDERR "filtering $file -> $new_file...\n";
|
||||
|
||||
my $openstring;
|
||||
if ($file !~ /\.gz$/ && -e "$file.gz") {
|
||||
$openstring = "$ZCAT $file.gz |";
|
||||
} elsif ($file =~ /\.gz$/) {
|
||||
$openstring = "$ZCAT $file |";
|
||||
} elsif ($opt_hierarchical) {
|
||||
$openstring = "cat $file |";
|
||||
} else {
|
||||
$openstring = "< $file";
|
||||
}
|
||||
my $openstring = mk_open_string($file);
|
||||
|
||||
my $new_openstring;
|
||||
if ($new_file =~ /\.gz$/) {
|
||||
@ -303,7 +310,7 @@ close(INFO);
|
||||
|
||||
|
||||
print "To run the decoder, please call:
|
||||
moses -f $dir/moses.ini < $input\n";
|
||||
moses -f $dir/moses.ini -i $input\n";
|
||||
|
||||
sub safesystem {
|
||||
print STDERR "Executing: @_\n";
|
||||
|
Loading…
Reference in New Issue
Block a user