Merge pull request #10 from moses-smt/apply-coding-standard

Clean up SemposScorer and Overlapping classes
This commit is contained in:
Matous Machacek 2012-03-23 11:12:56 -07:00
commit bbe9a06d62
5 changed files with 258 additions and 207 deletions

View File

@ -18,6 +18,7 @@ FeatureDataIterator.cpp
Data.cpp
BleuScorer.cpp
SemposScorer.cpp
SemposOverlapping.cpp
InterpolatedScorer.cpp
Point.cpp
PerScorer.cpp

View File

@ -0,0 +1,91 @@
#include "SemposOverlapping.h"
#include <algorithm>
#include <stdexcept>
using namespace std;
namespace {
// File-local pointer recorded by SemposOverlappingFactory::SetOverlapping().
// NOTE(review): nothing in this file reads it back -- GetOverlapping() always
// allocates a fresh object -- so confirm whether the injected instance is
// meant to be returned by the factory.
SemposOverlapping* g_overlapping = NULL;
} // namespace
/**
 * Creates the overlapping formula selected by its configuration name.
 *
 * @param str  either "cap-micro" or "cap-macro".
 * @return a newly allocated overlapping object; the caller takes ownership.
 * @throws std::runtime_error if str names no known overlapping.
 */
SemposOverlapping* SemposOverlappingFactory::GetOverlapping(const string& str) {
  if (str == "cap-micro") return new CapMicroOverlapping;
  if (str == "cap-macro") return new CapMacroOverlapping;

  // Neither of the supported names matched.
  throw runtime_error("Unknown overlapping: " + str);
}
// Records ovr in the file-local g_overlapping pointer (intended as a
// dependency-injection hook for unit tests). The pointer is stored as-is:
// this function neither copies nor deletes the object it points to.
void SemposOverlappingFactory::SetOverlapping(SemposOverlapping* ovr) {
g_overlapping = ovr;
}
vector<int> CapMicroOverlapping::prepareStats(const sentence_t& cand, const sentence_t& ref)
{
vector<int> stats(2);
sentence_t intersection;
set_intersection(cand.begin(), cand.end(), ref.begin(), ref.end(),
inserter(intersection, intersection.begin()));
stats[0] = static_cast<int>(intersection.size());
stats[1] = static_cast<int>(ref.size());
return stats;
}
/**
 * Computes the cap-micro score from the counts made by prepareStats().
 *
 * @param stats  two-element vector: [0] intersection size, [1] reference size.
 * @return stats[0] / stats[1], or 1 when the reference size is zero.
 * @throws std::runtime_error if stats does not have exactly two elements.
 */
float CapMicroOverlapping::calculateScore(const vector<int>& stats) const
{
  if (stats.size() != 2) {
    throw std::runtime_error("Size of stats vector has to be 2");
  }

  const int matched = stats[0];
  const int ref_total = stats[1];

  // An empty reference is scored as 1 rather than dividing by zero.
  return (ref_total == 0) ? 1.0f : matched / static_cast<float>(ref_total);
}
vector<int> CapMacroOverlapping::prepareStats(const sentence_t& cand, const sentence_t& ref)
{
vector<int> stats(2 * kMaxNOC);
sentence_t intersection;
set_intersection(cand.begin(), cand.end(), ref.begin(), ref.end(),
inserter(intersection, intersection.begin()));
for (int i = 0; i < 2 * kMaxNOC; ++i) stats[i] = 0;
for (sentence_t::const_iterator it = intersection.begin(); it != intersection.end(); ++it) {
const int sempos = it->second;
++stats[2 * sempos];
}
for (sentence_t::const_iterator it = ref.begin(); it != ref.end(); ++it) {
const int sempos = it->second;
++stats[2 * sempos + 1];
}
return stats;
}
/**
 * Computes the cap-macro score from the counters made by prepareStats().
 *
 * For every class whose reference count is positive, the ratio
 * matched / reference is taken; the score is the mean of those ratios.
 *
 * @param stats  vector of 2 * kMaxNOC counters: [2c] matched items and
 *               [2c + 1] reference items of class c.
 * @return the mean per-class ratio, or 1 when no class occurs in the reference.
 * @throws std::runtime_error if stats does not hold exactly 2 * kMaxNOC entries.
 */
float CapMacroOverlapping::calculateScore(const vector<int>& stats) const
{
  // The required size follows kMaxNOC; the message names the constant so it
  // cannot drift from the check (it used to say "38").
  if (stats.size() != static_cast<vector<int>::size_type>(2 * kMaxNOC)) {
    throw std::runtime_error("Size of stats vector has to be 2 * kMaxNOC");
  }

  int scored_classes = 0;
  float ratio_sum = 0.0f;
  for (int i = 0; i < kMaxNOC; ++i) {
    const int clipped = stats[2 * i];
    const int refsize = stats[2 * i + 1];
    // Classes absent from the reference do not take part in the average.
    if (refsize > 0) {
      ratio_sum += clipped / static_cast<float>(refsize);
      ++scored_classes;
    }
  }

  if (scored_classes == 0) return 1.0f;
  return ratio_sum / scored_classes;
}

86
mert/SemposOverlapping.h Normal file
View File

@ -0,0 +1,86 @@
#ifndef MERT_SEMPOSOVERLAPPING_H_
#define MERT_SEMPOSOVERLAPPING_H_
#include <map>
#include <set>
#include <string>
#include <utility>
#include <vector>
// Upper bound on the number of distinct sempos classes: SemposScorer refuses
// to register more classes than this, and CapMacroOverlapping keeps two
// counters per class, so its statistics vector holds 2 * kMaxNOC entries.
// TODO(review): "NOC" presumably stands for "number of classes"; the reason
// for choosing 30 is not recorded here -- confirm and document.
const int kMaxNOC = 30;
// One token in textual form: (item, class), as split from "item|class".
typedef std::pair<std::string, std::string> str_item_t;
// A sentence as an ordered list of textual tokens.
typedef std::vector<str_item_t> str_sentence_t;
typedef str_sentence_t::const_iterator str_sentence_it;
// One token in encoded form: (encoded item, encoded sempos class).
typedef std::pair<int,int> item_t;
// An encoded sentence: a sorted multiset, so repeated tokens are kept.
typedef std::multiset<item_t> sentence_t;
typedef sentence_t::const_iterator sentence_it;
/**
* An interface for classes representing overlapping formulas.
*
* An implementation first reduces a candidate/reference sentence pair to a
* vector of integer statistics and later turns such a vector into a score.
*/
class SemposOverlapping
{
public:
virtual ~SemposOverlapping() {}
// Computes the statistics vector for candidate cand against reference ref.
virtual std::vector<int> prepareStats(const sentence_t& cand, const sentence_t& ref) = 0;
// Converts a statistics vector into a score.
virtual float calculateScore(const std::vector<int>& stats) const = 0;
// Number of entries in the statistics vector used by this formula.
virtual std::size_t NumberOfScores() const = 0;
};
// Static-only factory for SemposOverlapping implementations; the private
// constructor and destructor prevent instantiation.
class SemposOverlappingFactory {
public:
// Returns a newly allocated overlapping for the name str ("cap-micro" or
// "cap-macro"); throws std::runtime_error for any other name.
static SemposOverlapping* GetOverlapping(const std::string& str);
// dependency injection for unit testing.
static void SetOverlapping(SemposOverlapping* ovr);
private:
SemposOverlappingFactory() {}
~SemposOverlappingFactory() {}
};
/**
* Overlapping proposed by (Bojar and Machacek, WMT 2011)
*
* Uses two statistics: the size of the multiset intersection of candidate
* and reference, and the size of the reference. The score is their ratio.
*
* Please refer to the paper for details:
* http://aclweb.org/anthology-new/W/W11/W11-2108.pdf
*/
class CapMicroOverlapping : public SemposOverlapping
{
public:
CapMicroOverlapping() {}
~CapMicroOverlapping() {}
// Returns { |cand intersect ref|, |ref| }.
virtual std::vector<int> prepareStats(const sentence_t& cand, const sentence_t& ref);
// Returns stats[0] / stats[1], or 1 when stats[1] is zero; throws
// std::runtime_error unless stats has exactly two elements.
virtual float calculateScore(const std::vector<int>& stats) const;
virtual std::size_t NumberOfScores() const { return 2; }
private:
// no copying allowed.
CapMicroOverlapping(const CapMicroOverlapping&);
CapMicroOverlapping& operator=(const CapMicroOverlapping&);
};
/**
* Overlapping proposed by (Kos and Bojar, 2009)
*
* Keeps two counters per sempos class (matched items and reference items)
* and scores a sentence as the mean of the per-class ratios, taken over the
* classes that occur in the reference.
*/
class CapMacroOverlapping : public SemposOverlapping
{
public:
CapMacroOverlapping() {}
~CapMacroOverlapping() {}
// Returns 2 * kMaxNOC counters: [2c] matched and [2c + 1] reference items
// of class c.
virtual std::vector<int> prepareStats(const sentence_t& cand, const sentence_t& ref);
// Returns the mean of matched / reference over classes with a positive
// reference count, or 1 when there is no such class; throws
// std::runtime_error unless stats has exactly 2 * kMaxNOC elements.
virtual float calculateScore(const std::vector<int>& stats) const;
virtual std::size_t NumberOfScores() const { return kMaxNOC * 2; }
private:
// no copying allowed.
CapMacroOverlapping(const CapMacroOverlapping&);
CapMacroOverlapping& operator=(const CapMacroOverlapping&);
};
#endif // MERT_SEMPOSOVERLAPPING_H_

View File

@ -1,39 +1,30 @@
#include <sys/types.h>
#include <unistd.h>
#include <sstream>
#include "SemposScorer.h"
#include <algorithm>
#include <vector>
#include <stdexcept>
#include <algorithm>
#include <set>
#include <map>
#include <iterator>
#include "SemposScorer.h"
#include "Util.h"
SemposScorer::SemposScorer(const string& config)
: StatisticsBasedScorer("SEMPOS",config),
debug(false)
{
string debugSwitch = getConfig("debug", "0");
if (debugSwitch == "1") debug = true;
string overlapping = getConfig("overlapping", "cap-micro");
if (overlapping == "cap-micro") {
ovr = new CapMicroOverlapping();
} else if (overlapping == "cap-macro") {
ovr = new CapMacroOverlapping();
} else {
throw runtime_error("Unknown overlapping: " + overlapping);
}
using namespace std;
semposMap.clear();
SemposScorer::SemposScorer(const string& config)
: StatisticsBasedScorer("SEMPOS", config),
m_ovr(SemposOverlappingFactory::GetOverlapping(getConfig("overlapping", "cap-micro"))),
m_enable_debug(false)
{
const string& debugSwitch = getConfig("debug", "0");
if (debugSwitch == "1") m_enable_debug = true;
m_semposMap.clear();
}
SemposScorer::~SemposScorer() {}
void SemposScorer::setReferenceFiles(const vector<string>& referenceFiles)
{
//make sure reference data is clear
ref_sentences.clear();
m_ref_sentences.clear();
//load reference data
for (size_t rid = 0; rid < referenceFiles.size(); ++rid) {
@ -41,10 +32,10 @@ void SemposScorer::setReferenceFiles(const vector<string>& referenceFiles)
if (!refin) {
throw runtime_error("Unable to open: " + referenceFiles[rid]);
}
ref_sentences.push_back(vector<sentence_t>());
m_ref_sentences.push_back(vector<sentence_t>());
string line;
while (getline(refin,line)) {
line = applyFactors(line);
line = applyFactors(line);
str_sentence_t sentence;
splitSentence(line, sentence);
@ -52,68 +43,58 @@ void SemposScorer::setReferenceFiles(const vector<string>& referenceFiles)
sentence_t encodedSentence;
encodeSentence(sentence, encodedSentence);
ref_sentences[rid].push_back(encodedSentence);
m_ref_sentences[rid].push_back(encodedSentence);
}
}
}
void SemposScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry)
{
vector<int> stats;
string sentence = applyFactors(text);
vector<ScoreStatsType> stats;
const string& sentence = applyFactors(text);
str_sentence_t splitCandSentence;
splitSentence(sentence, splitCandSentence);
sentence_t encodedCandSentence;
encodeSentence(splitCandSentence, encodedCandSentence);
if (ref_sentences.size() == 1) {
stats = ovr->prepareStats(encodedCandSentence, ref_sentences[0][sid]);
if (m_ref_sentences.size() == 1) {
stats = m_ovr->prepareStats(encodedCandSentence, m_ref_sentences[0][sid]);
} else {
float max = -1;
for (size_t rid = 0; rid < ref_sentences.size(); ++rid) {
vector<int> tmp = ovr->prepareStats(encodedCandSentence, ref_sentences[rid][sid]);
if (ovr->calculateScore(tmp) > max) {
float max = -1.0f;
for (size_t rid = 0; rid < m_ref_sentences.size(); ++rid) {
const vector<ScoreStatsType>& tmp = m_ovr->prepareStats(encodedCandSentence, m_ref_sentences[rid][sid]);
if (m_ovr->calculateScore(tmp) > max) {
stats = tmp;
}
}
}
stringstream sout;
copy(stats.begin(),stats.end(),ostream_iterator<int>(sout," "));
string stats_str = sout.str();
entry.set(stats_str);
}
float SemposScorer::calculateScore(const vector<int>& comps) const
{
return ovr->calculateScore(comps);
entry.set(stats);
}
void SemposScorer::splitSentence(const string& sentence, str_sentence_t& splitSentence)
{
splitSentence.clear();
vector<string> tokens;
vector<string> tokens;
split(sentence, ' ', tokens);
for (vector<string>::iterator it = tokens.begin(); it != tokens.end(); ++it)
{
vector<string> factors;
split(*it, '|', factors);
if (factors.size() != 2) throw runtime_error("Sempos scorer accepts two factors (item|class)");
string Item = factors[0];
string Class = factors[1];
splitSentence.push_back(make_pair(Item, Class));
const string& item = factors[0];
const string& klass = factors[1];
splitSentence.push_back(make_pair(item, klass));
}
}
void SemposScorer::encodeSentence(const str_sentence_t& sentence, sentence_t& encodedSentence)
{
for (str_sentence_it it = sentence.begin(); it != sentence.end(); ++it) {
int tlemma = encodeString(it->first);
int sempos = encodeSempos(it->second);
const int tlemma = encodeString(it->first);
const int sempos = encodeSempos(it->second);
if (sempos >= 0) {
encodedSentence.insert(make_pair(tlemma,sempos));
}
@ -122,11 +103,11 @@ void SemposScorer::encodeSentence(const str_sentence_t& sentence, sentence_t& en
int SemposScorer::encodeString(const string& str)
{
encoding_it encoding = stringMap.find(str);
encoding_it encoding = m_stringMap.find(str);
int encoded_str;
if (encoding == stringMap.end()) {
encoded_str = (int)stringMap.size();
stringMap[str] = encoded_str;
if (encoding == m_stringMap.end()) {
encoded_str = static_cast<int>(m_stringMap.size());
m_stringMap[str] = encoded_str;
} else {
encoded_str = encoding->second;
}
@ -136,15 +117,15 @@ int SemposScorer::encodeString(const string& str)
int SemposScorer::encodeSempos(const string& sempos)
{
if (sempos == "-") return -1;
encoding_it it = semposMap.find(sempos);
if (it == semposMap.end())
encoding_it it = m_semposMap.find(sempos);
if (it == m_semposMap.end())
{
if (semposMap.size() == maxNOC)
if (m_semposMap.size() == kMaxNOC)
{
throw std::runtime_error("Number of classes is greater than maxNOC");
throw std::runtime_error("Number of classes is greater than kMaxNOC");
}
int classNumber = semposMap.size();
semposMap[sempos] = classNumber;
const int classNumber = static_cast<int>(m_semposMap.size());
m_semposMap[sempos] = classNumber;
return classNumber;
}
else
@ -152,70 +133,3 @@ int SemposScorer::encodeSempos(const string& sempos)
return it->second;
}
}
SemposScorer::~SemposScorer()
{
delete ovr;
}
vector<int> CapMicroOverlapping::prepareStats(const sentence_t& cand, const sentence_t& ref)
{
vector<int> stats(2);
sentence_t intersection;
set_intersection(cand.begin(),cand.end(),ref.begin(),ref.end(), inserter(intersection, intersection.begin()));
stats[0] = intersection.size();
stats[1] = ref.size();
return stats;
}
float CapMicroOverlapping::calculateScore(const vector<int>& stats)
{
if (stats.size() != 2)
{
throw std::runtime_error("Size of stats vector has to be 2");
}
if (stats[1] == 0) return (float) 1;
return stats[0]/(float)stats[1];
}
vector<int> CapMacroOverlapping::prepareStats(const sentence_t& cand, const sentence_t& ref)
{
vector<int> stats(2*maxNOC);
sentence_t intersection;
set_intersection(cand.begin(),cand.end(),ref.begin(),ref.end(), inserter(intersection, intersection.begin()));
for (int i = 0; i < 2*maxNOC; ++i) stats[i]=0;
for (sentence_t::const_iterator it = intersection.begin(); it != intersection.end(); ++it) {
int sempos = it->second;
++stats[2*sempos];
}
for (sentence_t::const_iterator it = ref.begin(); it != ref.end(); ++it) {
int sempos = it->second;
++stats[2*sempos+1];
}
return stats;
}
float CapMacroOverlapping::calculateScore(const vector<int>& stats)
{
if (stats.size() != 2*maxNOC) throw std::runtime_error("Size of stats vector has to be 38");
int n = 0;
float sum = 0;
for (int i = 0; i < maxNOC; ++i) {
int clipped = stats[2*i];
int refsize = stats[2*i+1];
if (refsize > 0) {
sum += clipped / (float) refsize;
++n;
}
}
if (n == 0) return 1;
return sum / n;
}

View File

@ -1,101 +1,60 @@
#ifndef __SEMPOSSCORER_H__
#define __SEMPOSSCORER_H__
#ifndef MERT_SEMPOSSCORER_H_
#define MERT_SEMPOSSCORER_H_
#include <algorithm>
#include <cmath>
#include <iostream>
#include <iterator>
#include <set>
#include <sstream>
#include <stdexcept>
#include <map>
#include <string>
#include <utility>
#include <vector>
#include <limits.h>
#include <boost/scoped_ptr.hpp>
#include "Types.h"
#include "ScoreData.h"
#include "Scorer.h"
using namespace std;
// NOTE: This header should be included in .cpp file
// because SemposScorer wants to know what actual SemposOverlapping type is
// when we implement the scorer in .cpp file.
// However, currently SemposScorer uses a bunch of typedefs, which are
// used in SemposScorer as well as inherited SemposOverlapping classes.
#include "SemposOverlapping.h"
const int maxNOC = 30;
typedef pair<string,string> str_item_t;
typedef vector<str_item_t> str_sentence_t;
typedef str_sentence_t::const_iterator str_sentence_it;
typedef pair<int,int> item_t;
typedef multiset<item_t> sentence_t;
typedef sentence_t::const_iterator sentence_it;
// Base class for classes representing overlapping formulas
class SemposOverlapping
{
public:
virtual vector<int> prepareStats(const sentence_t& cand, const sentence_t& ref) = 0;
virtual float calculateScore(const vector<int>& stats) = 0;
virtual size_t NumberOfScores() const = 0;
};
// Overlapping proposed by (Bojar and Machacek,2011);
class CapMicroOverlapping : public SemposOverlapping
{
public:
virtual vector<int> prepareStats(const sentence_t& cand, const sentence_t& ref);
virtual float calculateScore(const vector<int>& stats);
virtual size_t NumberOfScores() const {
return 2;
}
};
//Overlapping proposed by (Bojar and Kos,2009)
class CapMacroOverlapping : public SemposOverlapping
{
public:
virtual vector<int> prepareStats(const sentence_t& cand, const sentence_t& ref);
virtual float calculateScore(const vector<int>& stats);
virtual size_t NumberOfScores() const {
return maxNOC*2;
}
};
// This class represents sempos based metrics
/**
* This class represents sempos based metrics.
*/
class SemposScorer: public StatisticsBasedScorer
{
public:
SemposScorer(const string& config);
virtual void setReferenceFiles(const vector<string>& referenceFiles);
virtual void prepareStats(size_t sindex, const string& text, ScoreStats& entry);
virtual size_t NumberOfScores() const {
return ovr->NumberOfScores();
};
explicit SemposScorer(const std::string& config);
~SemposScorer();
virtual float calculateScore(const vector<int>& comps) const;
virtual void setReferenceFiles(const std::vector<std::string>& referenceFiles);
virtual void prepareStats(std::size_t sindex, const std::string& text, ScoreStats& entry);
virtual std::size_t NumberOfScores() const { return m_ovr->NumberOfScores(); }
virtual float calculateScore(const std::vector<int>& comps) const {
return m_ovr->calculateScore(comps);
}
bool EnableDebug() const { return m_enable_debug; }
private:
SemposOverlapping* ovr;
vector<vector<sentence_t> > ref_sentences;
boost::scoped_ptr<SemposOverlapping> m_ovr;
std::vector<std::vector<sentence_t> > m_ref_sentences;
typedef map<string, int> encoding_t;
typedef encoding_t::iterator encoding_it;
typedef std::map<std::string, int> encoding_t;
typedef encoding_t::iterator encoding_it;
encoding_t semposMap;
encoding_t stringMap;
encoding_t m_semposMap;
encoding_t m_stringMap;
bool m_enable_debug;
void splitSentence(const string& sentence, str_sentence_t& splitSentence);
void splitSentence(const std::string& sentence, str_sentence_t& splitSentence);
void encodeSentence(const str_sentence_t& sentence, sentence_t& encodedSentence);
int encodeString(const string& str);
int encodeSempos(const string& sempos);
int encodeString(const std::string& str);
int encodeSempos(const std::string& sempos);
//no copy
// no copying allowed.
SemposScorer(const SemposScorer&);
SemposScorer& operator=(const SemposScorer&);
bool debug;
};
#endif //__BLEUSCORER_H
#endif // MERT_SEMPOSSCORER_H_