2012-03-23 20:52:24 +04:00
|
|
|
#include "SemposScorer.h"
|
|
|
|
|
|
|
|
#include <algorithm>
|
2012-03-23 17:39:24 +04:00
|
|
|
#include <vector>
|
|
|
|
#include <stdexcept>
|
2012-03-24 19:07:47 +04:00
|
|
|
#include <fstream>
|
2012-03-23 17:39:24 +04:00
|
|
|
|
|
|
|
#include "Util.h"
|
2012-03-24 19:07:47 +04:00
|
|
|
#include "SemposOverlapping.h"
|
2012-03-23 17:39:24 +04:00
|
|
|
|
2012-03-23 20:52:24 +04:00
|
|
|
using namespace std;
|
|
|
|
|
2012-06-30 23:23:45 +04:00
|
|
|
namespace MosesTuning
|
|
|
|
{
|
2013-05-29 21:16:15 +04:00
|
|
|
|
2012-06-30 23:23:45 +04:00
|
|
|
|
2012-03-23 17:39:24 +04:00
|
|
|
/**
 * Builds a scorer registered under the name "SEMPOS".
 *
 * Configuration keys read from @p config (through getConfig):
 *   - "overlapping" (default "cap-micro"): handed to
 *     SemposOverlappingFactory::GetOverlapping together with this scorer
 *     to obtain m_ovr.
 *   - "debug" (default "0"): the exact value "1" switches m_enable_debug on.
 *   - "weightsfile" (default ""): when non-empty, the named file is read
 *     with loadWeights().
 *
 * @param config scorer configuration string, forwarded to the base class
 */
SemposScorer::SemposScorer(const string& config)
  : StatisticsBasedScorer("SEMPOS", config),
    m_ovr(SemposOverlappingFactory::GetOverlapping(getConfig("overlapping", "cap-micro"),this)),
    m_enable_debug(false)
{
  // Debugging stays off unless the switch is exactly "1".
  if (getConfig("debug", "0") == "1") {
    m_enable_debug = true;
  }

  // Start with no known sempos classes.
  m_semposMap.clear();

  // Lemma weights are optional; an empty path means "no weights file".
  const string weightsPath = getConfig("weightsfile", "");
  if (!weightsPath.empty()) {
    loadWeights(weightsPath);
  }
}
|
|
|
|
|
2012-03-23 20:52:24 +04:00
|
|
|
/** Destructor; performs no explicit work of its own. */
SemposScorer::~SemposScorer()
{
}
|
|
|
|
|
2012-03-23 17:39:24 +04:00
|
|
|
void SemposScorer::setReferenceFiles(const vector<string>& referenceFiles)
|
|
|
|
{
|
|
|
|
//make sure reference data is clear
|
2012-03-23 20:52:24 +04:00
|
|
|
m_ref_sentences.clear();
|
2012-03-23 17:39:24 +04:00
|
|
|
|
|
|
|
//load reference data
|
|
|
|
for (size_t rid = 0; rid < referenceFiles.size(); ++rid) {
|
|
|
|
ifstream refin(referenceFiles[rid].c_str());
|
|
|
|
if (!refin) {
|
|
|
|
throw runtime_error("Unable to open: " + referenceFiles[rid]);
|
|
|
|
}
|
2012-03-23 20:52:24 +04:00
|
|
|
m_ref_sentences.push_back(vector<sentence_t>());
|
2012-03-23 17:39:24 +04:00
|
|
|
string line;
|
|
|
|
while (getline(refin,line)) {
|
2012-05-09 21:21:41 +04:00
|
|
|
line = preprocessSentence(line);
|
2012-03-23 17:39:24 +04:00
|
|
|
|
|
|
|
str_sentence_t sentence;
|
|
|
|
splitSentence(line, sentence);
|
|
|
|
|
|
|
|
sentence_t encodedSentence;
|
|
|
|
encodeSentence(sentence, encodedSentence);
|
|
|
|
|
2012-03-23 20:52:24 +04:00
|
|
|
m_ref_sentences[rid].push_back(encodedSentence);
|
2012-03-23 17:39:24 +04:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void SemposScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry)
|
|
|
|
{
|
2012-03-23 20:52:24 +04:00
|
|
|
vector<ScoreStatsType> stats;
|
2012-03-23 17:39:24 +04:00
|
|
|
|
2012-05-09 21:21:41 +04:00
|
|
|
const string& sentence = preprocessSentence(text);
|
2012-03-23 17:39:24 +04:00
|
|
|
str_sentence_t splitCandSentence;
|
|
|
|
splitSentence(sentence, splitCandSentence);
|
|
|
|
|
|
|
|
sentence_t encodedCandSentence;
|
|
|
|
encodeSentence(splitCandSentence, encodedCandSentence);
|
|
|
|
|
2012-03-23 20:52:24 +04:00
|
|
|
if (m_ref_sentences.size() == 1) {
|
|
|
|
stats = m_ovr->prepareStats(encodedCandSentence, m_ref_sentences[0][sid]);
|
2012-03-23 17:39:24 +04:00
|
|
|
} else {
|
2012-03-23 20:52:24 +04:00
|
|
|
float max = -1.0f;
|
|
|
|
for (size_t rid = 0; rid < m_ref_sentences.size(); ++rid) {
|
|
|
|
const vector<ScoreStatsType>& tmp = m_ovr->prepareStats(encodedCandSentence, m_ref_sentences[rid][sid]);
|
|
|
|
if (m_ovr->calculateScore(tmp) > max) {
|
2012-03-23 17:39:24 +04:00
|
|
|
stats = tmp;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2012-03-23 20:52:24 +04:00
|
|
|
entry.set(stats);
|
2012-03-23 17:39:24 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
void SemposScorer::splitSentence(const string& sentence, str_sentence_t& splitSentence)
|
|
|
|
{
|
2012-03-23 23:12:33 +04:00
|
|
|
splitSentence.clear();
|
|
|
|
|
|
|
|
vector<string> tokens;
|
|
|
|
split(sentence, ' ', tokens);
|
|
|
|
for (vector<string>::iterator it = tokens.begin(); it != tokens.end(); ++it) {
|
|
|
|
vector<string> factors;
|
2012-05-13 13:11:13 +04:00
|
|
|
if (it->empty()) continue;
|
2012-03-23 23:12:33 +04:00
|
|
|
split(*it, '|', factors);
|
|
|
|
if (factors.size() != 2) throw runtime_error("Sempos scorer accepts two factors (item|class)");
|
|
|
|
const string& item = factors[0];
|
|
|
|
const string& klass = factors[1];
|
|
|
|
splitSentence.push_back(make_pair(item, klass));
|
|
|
|
}
|
2012-03-23 17:39:24 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
void SemposScorer::encodeSentence(const str_sentence_t& sentence, sentence_t& encodedSentence)
|
|
|
|
{
|
|
|
|
for (str_sentence_it it = sentence.begin(); it != sentence.end(); ++it) {
|
2012-03-23 20:52:24 +04:00
|
|
|
const int tlemma = encodeString(it->first);
|
|
|
|
const int sempos = encodeSempos(it->second);
|
2012-03-23 17:39:24 +04:00
|
|
|
if (sempos >= 0) {
|
|
|
|
encodedSentence.insert(make_pair(tlemma,sempos));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
int SemposScorer::encodeString(const string& str)
|
|
|
|
{
|
2012-03-23 20:52:24 +04:00
|
|
|
encoding_it encoding = m_stringMap.find(str);
|
2012-03-23 17:39:24 +04:00
|
|
|
int encoded_str;
|
2012-03-23 20:52:24 +04:00
|
|
|
if (encoding == m_stringMap.end()) {
|
|
|
|
encoded_str = static_cast<int>(m_stringMap.size());
|
|
|
|
m_stringMap[str] = encoded_str;
|
2012-03-23 17:39:24 +04:00
|
|
|
} else {
|
|
|
|
encoded_str = encoding->second;
|
|
|
|
}
|
|
|
|
return encoded_str;
|
|
|
|
}
|
|
|
|
|
|
|
|
int SemposScorer::encodeSempos(const string& sempos)
|
|
|
|
{
|
|
|
|
if (sempos == "-") return -1;
|
2012-03-23 20:52:24 +04:00
|
|
|
encoding_it it = m_semposMap.find(sempos);
|
2012-03-23 23:12:33 +04:00
|
|
|
if (it == m_semposMap.end()) {
|
2012-04-05 07:49:49 +04:00
|
|
|
const int classNumber = static_cast<int>(m_semposMap.size());
|
|
|
|
if (classNumber == kMaxNOC) {
|
2012-03-23 20:52:24 +04:00
|
|
|
throw std::runtime_error("Number of classes is greater than kMaxNOC");
|
2012-03-23 17:39:24 +04:00
|
|
|
}
|
2012-03-23 20:52:24 +04:00
|
|
|
m_semposMap[sempos] = classNumber;
|
2012-03-23 17:39:24 +04:00
|
|
|
return classNumber;
|
2012-03-23 23:12:33 +04:00
|
|
|
} else {
|
2012-03-23 17:39:24 +04:00
|
|
|
return it->second;
|
|
|
|
}
|
|
|
|
}
|
2012-03-24 19:07:47 +04:00
|
|
|
|
|
|
|
float SemposScorer::weight(int item) const
|
|
|
|
{
|
2013-05-29 21:16:15 +04:00
|
|
|
std::map<int,float>::const_iterator it = weightsMap.find(item);
|
|
|
|
if (it == weightsMap.end()) {
|
|
|
|
return 1.0f;
|
|
|
|
} else {
|
|
|
|
return it->second;
|
|
|
|
}
|
2012-03-24 19:07:47 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
void SemposScorer::loadWeights(const string& weightsfile)
|
|
|
|
{
|
2013-05-29 21:16:15 +04:00
|
|
|
string line;
|
|
|
|
ifstream myfile;
|
|
|
|
myfile.open(weightsfile.c_str(), ifstream::in);
|
|
|
|
if (myfile.is_open()) {
|
|
|
|
while ( myfile.good() ) {
|
|
|
|
getline (myfile,line);
|
|
|
|
vector<string> fields;
|
|
|
|
if (line == "") continue;
|
|
|
|
split(line, '\t', fields);
|
|
|
|
if (fields.size() != 2) throw std::runtime_error("Bad format of a row in weights file.");
|
|
|
|
int encoded = encodeString(fields[0]);
|
|
|
|
float weight = atof(fields[1].c_str());
|
|
|
|
weightsMap[encoded] = weight;
|
2012-04-05 07:49:49 +04:00
|
|
|
}
|
2013-05-29 21:16:15 +04:00
|
|
|
myfile.close();
|
|
|
|
} else {
|
|
|
|
cerr << "Unable to open file "<< weightsfile << endl;
|
|
|
|
exit(1);
|
|
|
|
}
|
2012-03-24 19:07:47 +04:00
|
|
|
|
|
|
|
}
|
2012-06-30 23:23:45 +04:00
|
|
|
|
|
|
|
}
|
|
|
|
|